Commit df15698c authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] Clean Text before sending it to NLP micro services + tests + clean code for documentation

parent 0df45331
...@@ -65,6 +65,7 @@ library ...@@ -65,6 +65,7 @@ library
Gargantext.Core.Text.Terms Gargantext.Core.Text.Terms
Gargantext.Core.Text.Terms.Eleve Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono Gargantext.Core.Text.Terms.Mono
Gargantext.Core.Text.Terms.Multi
Gargantext.Core.Text.Terms.Multi.Lang.En Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.Fr Gargantext.Core.Text.Terms.Multi.Lang.Fr
Gargantext.Core.Text.Terms.Multi.RAKE Gargantext.Core.Text.Terms.Multi.RAKE
...@@ -221,7 +222,6 @@ library ...@@ -221,7 +222,6 @@ library
Gargantext.Core.Text.Terms.Mono.Stem.En Gargantext.Core.Text.Terms.Mono.Stem.En
Gargantext.Core.Text.Terms.Mono.Token Gargantext.Core.Text.Terms.Mono.Token
Gargantext.Core.Text.Terms.Mono.Token.En Gargantext.Core.Text.Terms.Mono.Token.En
Gargantext.Core.Text.Terms.Multi
Gargantext.Core.Text.Terms.Multi.Group Gargantext.Core.Text.Terms.Multi.Group
Gargantext.Core.Text.Terms.Multi.PosTagging Gargantext.Core.Text.Terms.Multi.PosTagging
Gargantext.Core.Text.Terms.Multi.PosTagging.Types Gargantext.Core.Text.Terms.Multi.PosTagging.Types
...@@ -463,6 +463,7 @@ library ...@@ -463,6 +463,7 @@ library
, rdf4h , rdf4h
, regex-compat , regex-compat
, regex-tdfa , regex-tdfa
, replace-attoparsec
, resource-pool , resource-pool
, resourcet , resourcet
, safe , safe
...@@ -842,6 +843,7 @@ test-suite garg-test ...@@ -842,6 +843,7 @@ test-suite garg-test
Ngrams.Lang.Fr Ngrams.Lang.Fr
Ngrams.Lang.Occurrences Ngrams.Lang.Occurrences
Ngrams.Metrics Ngrams.Metrics
Ngrams.NLP
Parsers.Date Parsers.Date
Parsers.Types Parsers.Types
Parsers.WOS Parsers.WOS
......
...@@ -91,6 +91,7 @@ library: ...@@ -91,6 +91,7 @@ library:
- Gargantext.Core.Text.Terms - Gargantext.Core.Text.Terms
- Gargantext.Core.Text.Terms.Eleve - Gargantext.Core.Text.Terms.Eleve
- Gargantext.Core.Text.Terms.Mono - Gargantext.Core.Text.Terms.Mono
- Gargantext.Core.Text.Terms.Multi
- Gargantext.Core.Text.Terms.Multi.Lang.En - Gargantext.Core.Text.Terms.Multi.Lang.En
- Gargantext.Core.Text.Terms.Multi.Lang.Fr - Gargantext.Core.Text.Terms.Multi.Lang.Fr
- Gargantext.Core.Text.Terms.Multi.RAKE - Gargantext.Core.Text.Terms.Multi.RAKE
...@@ -246,6 +247,7 @@ library: ...@@ -246,6 +247,7 @@ library:
- rake - rake
- random - random
- rdf4h - rdf4h
- replace-attoparsec
- regex-compat - regex-compat
- regex-tdfa - regex-tdfa
- resource-pool - resource-pool
......
...@@ -16,6 +16,7 @@ import qualified Core.Utils as Utils ...@@ -16,6 +16,7 @@ import qualified Core.Utils as Utils
--import qualified Ngrams.Lang.Fr as Fr --import qualified Ngrams.Lang.Fr as Fr
--import qualified Ngrams.Lang as Lang --import qualified Ngrams.Lang as Lang
import qualified Ngrams.Lang.Occurrences as Occ import qualified Ngrams.Lang.Occurrences as Occ
import qualified Ngrams.NLP as NLP
import qualified Ngrams.Metrics as Metrics import qualified Ngrams.Metrics as Metrics
import qualified Parsers.Date as PD import qualified Parsers.Date as PD
-- import qualified Graph.Distance as GD -- import qualified Graph.Distance as GD
...@@ -33,3 +34,4 @@ main = do ...@@ -33,3 +34,4 @@ main = do
PD.testFromRFC3339 PD.testFromRFC3339
-- GD.test -- GD.test
Crypto.test Crypto.test
NLP.main
{-|
Module : Ngrams.NLP
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Ngrams.NLP where
import Data.Text (Text)
import Test.Hspec
import Gargantext.Prelude
import Gargantext.Core.Text.Terms.Multi
main :: IO ()
main = hspec $ do
describe "Text that should be cleaned before sending it to NLP tools as micro-services." $ do
let text = "This is a url http://cnrs.gargantext.org to be remove and another one www.gargantext.org and digits 343242-2332 to be remove and some to keep: 232 231 33." :: Text
let result = "This is a url to be remove and another one and digits to be remove and some to keep: 232 231 33."
it "NLP Clean Text before sending to micro services:" $ cleanTextForNLP text `shouldBe` result
...@@ -204,8 +204,9 @@ newEnv port file = do ...@@ -204,8 +204,9 @@ newEnv port file = do
!config_mail <- Mail.readConfig file !config_mail <- Mail.readConfig file
!nlp_env <- nlpServerMap <$> NLP.readConfig file !nlp_env <- nlpServerMap <$> NLP.readConfig file
-- | An 'Env' by default doesn't have strict fields, but when constructing one in production {- An 'Env' by default doesn't have strict fields, but when constructing one in production
-- we want to force them to WHNF to avoid accumulating unnecessary thunks. we want to force them to WHNF to avoid accumulating unnecessary thunks.
-}
pure $ Env pure $ Env
{ _env_settings = settings' { _env_settings = settings'
, _env_logger = logger , _env_logger = logger
......
...@@ -11,28 +11,26 @@ Multi-terms are ngrams where n > 1. ...@@ -11,28 +11,26 @@ Multi-terms are ngrams where n > 1.
-} -}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags) module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
where where
import Data.Text hiding (map, group, filter, concat) import Control.Applicative
import Data.Attoparsec.Text as DAT
import Data.List (concat) import Data.List (concat)
import Data.Text hiding (map, group, filter, concat)
import Gargantext.Prelude
import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..)) import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
import Gargantext.Core.Types
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Core.Text.Terms.Multi.PosTagging import Gargantext.Core.Text.Terms.Multi.PosTagging
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
import Gargantext.Core.Types
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Prelude
import Replace.Attoparsec.Text as RAT
import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
-- import qualified Gargantext.Utils.JohnSnowNLP as JohnSnow
import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
------------------------------------------------------------------- -------------------------------------------------------------------
type NLP_API = Lang -> Text -> IO PosSentences type NLP_API = Lang -> Text -> IO PosSentences
...@@ -40,7 +38,7 @@ type NLP_API = Lang -> Text -> IO PosSentences ...@@ -40,7 +38,7 @@ type NLP_API = Lang -> Text -> IO PosSentences
------------------------------------------------------------------- -------------------------------------------------------------------
multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount] multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
multiterms nsc l txt = do multiterms nsc l txt = do
ret <- multiterms' tokenTag2terms l txt ret <- multiterms' tokenTag2terms l $ cleanTextForNLP txt
pure $ groupWithCounts ret pure $ groupWithCounts ret
where where
multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a] multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
...@@ -77,3 +75,16 @@ groupTokens :: Lang -> [TokenTag] -> [TokenTag] ...@@ -77,3 +75,16 @@ groupTokens :: Lang -> [TokenTag] -> [TokenTag]
groupTokens EN = En.groupTokens groupTokens EN = En.groupTokens
groupTokens FR = Fr.groupTokens groupTokens FR = Fr.groupTokens
groupTokens _ = panic $ pack "groupTokens :: Lang not implemeted yet" groupTokens _ = panic $ pack "groupTokens :: Lang not implemeted yet"
-- TODO: make tests here
cleanTextForNLP :: Text -> Text
cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls
where
remove x = RAT.streamEdit x (const "")
unifySpaces = RAT.streamEdit (many DAT.space) (const " ")
removeDigitsWith x = remove (many DAT.digit *> DAT.string x <* many DAT.digit)
removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)
...@@ -140,10 +140,7 @@ bridgeness (Bridgeness_Recursive sn f sim) m = ...@@ -140,10 +140,7 @@ bridgeness (Bridgeness_Recursive sn f sim) m =
bridgeness (Bridgeness_Advanced sim c) m = Map.fromList bridgeness (Bridgeness_Advanced sim c) m = Map.fromList
$ List.filter (\x -> if sim == Conditional then snd x > 0.2 else snd x > 0.02) $ List.filter (\x -> if sim == Conditional then snd x > 0.2 else snd x > 0.02)
$ map (\(ks, (v1,_v2)) -> (ks,v1)) $ map (\(ks, (v1,_v2)) -> (ks,v1))
-- $ List.take (if sim == Conditional then 2*n else 3*n)
-- $ List.sortOn (Down . (snd . snd))
$ Map.toList $ Map.toList
-- $ trace ("bridgeness3 m c" <> show (m,c))
$ Map.intersectionWithKey $ Map.intersectionWithKey
(\k v1 v2 -> trace ("intersectionWithKey " <> (show (k, v1, v2))) (v1, v2)) m c (\k v1 v2 -> trace ("intersectionWithKey " <> (show (k, v1, v2))) (v1, v2)) m c
......
...@@ -195,7 +195,6 @@ doSimilarityMap Conditional threshold strength myCooc = (distanceMap, toIndex ti ...@@ -195,7 +195,6 @@ doSimilarityMap Conditional threshold strength myCooc = (distanceMap, toIndex ti
$ List.sortOn snd $ List.sortOn snd
$ Map.toList $ Map.toList
$ Map.filter (> threshold) $ Map.filter (> threshold)
-- $ conditional myCooc
$ similarities `seq` mat2map similarities $ similarities `seq` mat2map similarities
doSimilarityMap Distributional threshold strength myCooc = (distanceMap, toIndex ti diag, ti) doSimilarityMap Distributional threshold strength myCooc = (distanceMap, toIndex ti diag, ti)
......
...@@ -435,7 +435,6 @@ reconstructTemporalLinks' frame periods similarity thr docs coocs roots groups = ...@@ -435,7 +435,6 @@ reconstructTemporalLinks' frame periods similarity thr docs coocs roots groups =
toPhylomemeticNetwork :: Int -> [Period] -> PhyloSimilarity -> Double -> Map Date Double -> Map Date Cooc -> Map Int [PhyloGroupId] -> [PhyloGroup] -> [Branch] toPhylomemeticNetwork :: Int -> [Period] -> PhyloSimilarity -> Double -> Map Date Double -> Map Date Cooc -> Map Int [PhyloGroupId] -> [PhyloGroup] -> [Branch]
toPhylomemeticNetwork timescale periods similarity thr docs coocs roots groups = toPhylomemeticNetwork timescale periods similarity thr docs coocs roots groups =
groupsToBranches $ fromList $ map (\g -> (getGroupId g, g)) groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
-- $ reconstructTemporalLinks timescale periods similarity thr docs coocs groups
$ reconstructTemporalLinks' timescale periods similarity thr docs coocs roots groups $ reconstructTemporalLinks' timescale periods similarity thr docs coocs roots groups
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment