[FIX] NLP Ngrams parser works for French _and_ English.

ea51f50d · Alexandre Delanoë · 0f0feaac · ea51f50d · ea51f50d · ea51f50d
Commit ea51f50d authored Nov 22, 2017 by Alexandre Delanoë
6 changed files
--- a/src/Data/Gargantext/Ngrams/CoreNLP.hs
+++ b/src/Data/Gargantext/Ngrams/CoreNLP.hs
@@ -8,7 +8,9 @@ module Data.Gargantext.Ngrams.CoreNLP where
 import Data.Aeson
 import Data.Aeson.TH (deriveJSON)
 import GHC.Generics
+import Data.Monoid ((<>))

+import Data.Gargantext.Types.Main (Language(..))
 import Data.Gargantext.Prelude
 import Data.Gargantext.Utils.Prefix (unPrefix)
 import Data.Text (Text)
@@ -78,9 +80,13 @@ corenlpPretty txt = do
 --    print $ getResponseHeader "Content-Type" response
    S8.putStrLn $ Yaml.encode (getResponseBody response :: Sentences)

-corenlp :: String -> IO Sentences
-corenlp txt = do
-    url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
+corenlp :: Language -> String -> IO Sentences
+corenlp lang txt = do
+    let properties = case lang of
+            EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
+            -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
+            FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
+    url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
    let request = setRequestBodyJSON txt url
    response <- httpJSON request
    pure (getResponseBody response :: Sentences)
@@ -93,8 +99,8 @@ corenlp txt = do
 -- Named Entity Recognition example
 -- parseWith  _tokenNer     "Hello world of Peter."
 -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
-tokenWith :: (Token -> t) -> String -> IO [[(Text, t)]]
-tokenWith f s = pm (pm (\t -> (_tokenWord t, f t))) <$> pm _sentenceTokens <$> sentences <$> corenlp s
+tokenWith :: (Token -> t) -> Language -> String -> IO [[(Text, t)]]
+tokenWith f lang s = pm (pm (\t -> (_tokenWord t, f t))) <$> pm _sentenceTokens <$> sentences <$> corenlp lang s



--- a/src/Data/Gargantext/Ngrams/Lang/Fr.hs
+++ b/src/Data/Gargantext/Ngrams/Lang/Fr.hs
@@ -13,9 +13,9 @@ selectNgrams xs = pf selectNgrams' xs
        selectNgrams' (_,"N"    ,_       ) = True
        selectNgrams' (_,"NC"   ,_       ) = True
        selectNgrams' (_,"NN+CC",_       ) = True
-- FIXME NER in French must be improved 
--        selectNgrams' (_,_      ,"I-PERS") = True
--        selectNgrams' (_,_      ,"I-LIEU") = True
+        selectNgrams' (_,_      ,"PERSON"      ) = True
+        selectNgrams' (_,_      ,"ORGANIZATION") = True
+        selectNgrams' (_,_      ,"LOCATION"    ) = True
        selectNgrams' (_,_      ,_       ) = False


@@ -53,6 +53,11 @@ groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs)  = groupNgrams ((x <> " " <> y, "NC", y
 -- /!\ sometimes N instead of NC (why?)
 groupNgrams ((x,"ADJ",_):(y,"N",yy):xs)   = groupNgrams ((x <> " " <> y, "NC", yy):xs)

+
+groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
+groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
+
+
 -- Si aucune des règles précédentes n'est remplie
 groupNgrams (x:xs)                        = (x:(groupNgrams xs))


--- a/src/Data/Gargantext/Ngrams/Parser.hs
+++ b/src/Data/Gargantext/Ngrams/Parser.hs
@@ -12,17 +12,33 @@ import qualified Data.Gargantext.Ngrams.Lang.En as En
 import qualified Data.Gargantext.Ngrams.Lang.Fr as Fr


+-- | Ngrams selection algorithms
+-- A form is a list of characters seperated by one or more spaces in a sentence.
+-- A word is a form.
+
+-- type Form = [Char]
+-- For performance reasons, Type Text is used, then:
+-- type Form = Text
+
+
+-- Let be a form and its associated forms in contexts of a sentence.
+-- Forms and subfoorms can be representend as Tree whose top is the minimal form
+-- as a monogram whos occurrences are 
+
+-- ps : Common words function in Haskell do not take sentence into account
+
+
 -- TODO for scientific papers: add maesures
 -- TODO add the p score regex
 extractNgrams :: Language -> String -> IO [[Ngrams]]
-extractNgrams lang s = pm (groupNgrams lang) <$> extractNgrams' s
+extractNgrams lang s = pm (groupNgrams lang) <$> extractNgrams' lang s


-extractNgrams' :: String -> IO [[Ngrams]]
-extractNgrams' t =  pm (pm token2text)
-                <$> pm _sentenceTokens
-                <$> sentences
-                <$> corenlp t
+extractNgrams' :: Language -> String -> IO [[Ngrams]]
+extractNgrams' lang t =  pm (pm token2text)
+                     <$> pm _sentenceTokens
+                     <$> sentences
+                     <$> corenlp lang t

 -- | This function selects ngrams according to grammars specific
 --   of each language.

--- a/src/Data/Gargantext/Parsers.hs
+++ b/src/Data/Gargantext/Parsers.hs
-module Data.Gargantext.Parsers (module Data.Gargantext.Parsers.WOS)
+module Data.Gargantext.Parsers ( module Data.Gargantext.Parsers.WOS
+                               , module Data.Gargantext.Parsers.Date
+                               )
+                                
    where
 import Data.Gargantext.Parsers.WOS
+import Data.Gargantext.Parsers.Date
--- a/test/Main.hs
+++ b/test/Main.hs
@@ -6,7 +6,7 @@ import qualified Ngrams.Metrics as Metrics
 main :: IO ()
 main = do
    Occ.parsersTest
+    Lang.ngramsExtractionTest FR
    Lang.ngramsExtractionTest EN
    Metrics.main
-    --Lang.ngramsExtractionTest FR

--- a/test/Ngrams/Lang/Fr.hs
+++ b/test/Ngrams/Lang/Fr.hs
@@ -42,5 +42,5 @@ ngramsExtractionTest = hspec $ do
        it "Groupe: Nom commun + préposition + Nom commun + prép + Nom commun" $ do
            let textFr1 = "L'heure d'arrivée des coureurs dépend de la météo du jour."
            testFr1 <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr1
-            testFr1 `shouldBe` [[("heure d' arrivée des coureurs","NC","I-ORG"),("météo du jour","NC","O")]]
+            testFr1 `shouldBe` [[("heure d' arrivée des coureurs","NC","O"),("météo du jour","NC","O")]]