Commit df15698c authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] Clean Text before sending it to NLP micro services + tests + clean code for documentation

parent 0df45331
......@@ -65,6 +65,7 @@ library
Gargantext.Core.Text.Terms
Gargantext.Core.Text.Terms.Eleve
Gargantext.Core.Text.Terms.Mono
Gargantext.Core.Text.Terms.Multi
Gargantext.Core.Text.Terms.Multi.Lang.En
Gargantext.Core.Text.Terms.Multi.Lang.Fr
Gargantext.Core.Text.Terms.Multi.RAKE
......@@ -221,7 +222,6 @@ library
Gargantext.Core.Text.Terms.Mono.Stem.En
Gargantext.Core.Text.Terms.Mono.Token
Gargantext.Core.Text.Terms.Mono.Token.En
Gargantext.Core.Text.Terms.Multi
Gargantext.Core.Text.Terms.Multi.Group
Gargantext.Core.Text.Terms.Multi.PosTagging
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
......@@ -463,6 +463,7 @@ library
, rdf4h
, regex-compat
, regex-tdfa
, replace-attoparsec
, resource-pool
, resourcet
, safe
......@@ -842,6 +843,7 @@ test-suite garg-test
Ngrams.Lang.Fr
Ngrams.Lang.Occurrences
Ngrams.Metrics
Ngrams.NLP
Parsers.Date
Parsers.Types
Parsers.WOS
......
......@@ -91,6 +91,7 @@ library:
- Gargantext.Core.Text.Terms
- Gargantext.Core.Text.Terms.Eleve
- Gargantext.Core.Text.Terms.Mono
- Gargantext.Core.Text.Terms.Multi
- Gargantext.Core.Text.Terms.Multi.Lang.En
- Gargantext.Core.Text.Terms.Multi.Lang.Fr
- Gargantext.Core.Text.Terms.Multi.RAKE
......@@ -246,6 +247,7 @@ library:
- rake
- random
- rdf4h
- replace-attoparsec
- regex-compat
- regex-tdfa
- resource-pool
......
......@@ -16,6 +16,7 @@ import qualified Core.Utils as Utils
--import qualified Ngrams.Lang.Fr as Fr
--import qualified Ngrams.Lang as Lang
import qualified Ngrams.Lang.Occurrences as Occ
import qualified Ngrams.NLP as NLP
import qualified Ngrams.Metrics as Metrics
import qualified Parsers.Date as PD
-- import qualified Graph.Distance as GD
......@@ -33,3 +34,4 @@ main = do
PD.testFromRFC3339
-- GD.test
Crypto.test
NLP.main
{-|
Module : Ngrams.NLP
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Ngrams.NLP where
import Data.Text (Text)
import Test.Hspec
import Gargantext.Prelude
import Gargantext.Core.Text.Terms.Multi
main :: IO ()
main = hspec $ do
describe "Text that should be cleaned before sending it to NLP tools as micro-services." $ do
let text = "This is a url http://cnrs.gargantext.org to be remove and another one www.gargantext.org and digits 343242-2332 to be remove and some to keep: 232 231 33." :: Text
let result = "This is a url to be remove and another one and digits to be remove and some to keep: 232 231 33."
it "NLP Clean Text before sending to micro services:" $ cleanTextForNLP text `shouldBe` result
......@@ -204,8 +204,9 @@ newEnv port file = do
!config_mail <- Mail.readConfig file
!nlp_env <- nlpServerMap <$> NLP.readConfig file
-- | An 'Env' by default doesn't have strict fields, but when constructing one in production
-- we want to force them to WHNF to avoid accumulating unnecessary thunks.
{- An 'Env' by default doesn't have strict fields, but when constructing one in production
we want to force them to WHNF to avoid accumulating unnecessary thunks.
-}
pure $ Env
{ _env_settings = settings'
, _env_logger = logger
......
......@@ -11,28 +11,26 @@ Multi-terms are ngrams where n > 1.
-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags)
module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
where
import Data.Text hiding (map, group, filter, concat)
import Control.Applicative
import Data.Attoparsec.Text as DAT
import Data.List (concat)
import Gargantext.Prelude
import Data.Text hiding (map, group, filter, concat)
import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
import Gargantext.Core.Types
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Core.Text.Terms.Multi.PosTagging
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
import Gargantext.Core.Types
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Prelude
import Replace.Attoparsec.Text as RAT
import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
-- import qualified Gargantext.Utils.JohnSnowNLP as JohnSnow
import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
-------------------------------------------------------------------
type NLP_API = Lang -> Text -> IO PosSentences
......@@ -40,7 +38,7 @@ type NLP_API = Lang -> Text -> IO PosSentences
-------------------------------------------------------------------
multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
multiterms nsc l txt = do
ret <- multiterms' tokenTag2terms l txt
ret <- multiterms' tokenTag2terms l $ cleanTextForNLP txt
pure $ groupWithCounts ret
where
multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
......@@ -77,3 +75,16 @@ groupTokens :: Lang -> [TokenTag] -> [TokenTag]
groupTokens EN = En.groupTokens
groupTokens FR = Fr.groupTokens
groupTokens _ = panic $ pack "groupTokens :: Lang not implemeted yet"
-- TODO: make tests here
cleanTextForNLP :: Text -> Text
cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls
where
remove x = RAT.streamEdit x (const "")
unifySpaces = RAT.streamEdit (many DAT.space) (const " ")
removeDigitsWith x = remove (many DAT.digit *> DAT.string x <* many DAT.digit)
removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)
......@@ -140,10 +140,7 @@ bridgeness (Bridgeness_Recursive sn f sim) m =
bridgeness (Bridgeness_Advanced sim c) m = Map.fromList
$ List.filter (\x -> if sim == Conditional then snd x > 0.2 else snd x > 0.02)
$ map (\(ks, (v1,_v2)) -> (ks,v1))
-- $ List.take (if sim == Conditional then 2*n else 3*n)
-- $ List.sortOn (Down . (snd . snd))
$ Map.toList
-- $ trace ("bridgeness3 m c" <> show (m,c))
$ Map.intersectionWithKey
(\k v1 v2 -> trace ("intersectionWithKey " <> (show (k, v1, v2))) (v1, v2)) m c
......
......@@ -195,7 +195,6 @@ doSimilarityMap Conditional threshold strength myCooc = (distanceMap, toIndex ti
$ List.sortOn snd
$ Map.toList
$ Map.filter (> threshold)
-- $ conditional myCooc
$ similarities `seq` mat2map similarities
doSimilarityMap Distributional threshold strength myCooc = (distanceMap, toIndex ti diag, ti)
......
......@@ -435,7 +435,6 @@ reconstructTemporalLinks' frame periods similarity thr docs coocs roots groups =
toPhylomemeticNetwork :: Int -> [Period] -> PhyloSimilarity -> Double -> Map Date Double -> Map Date Cooc -> Map Int [PhyloGroupId] -> [PhyloGroup] -> [Branch]
toPhylomemeticNetwork timescale periods similarity thr docs coocs roots groups =
groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
-- $ reconstructTemporalLinks timescale periods similarity thr docs coocs groups
$ reconstructTemporalLinks' timescale periods similarity thr docs coocs roots groups
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment