Commit ea51f50d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] NLP Ngrams parser works for French _and_ English.

parent 0f0feaac
......@@ -8,7 +8,9 @@ module Data.Gargantext.Ngrams.CoreNLP where
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import GHC.Generics
import Data.Monoid ((<>))
import Data.Gargantext.Types.Main (Language(..))
import Data.Gargantext.Prelude
import Data.Gargantext.Utils.Prefix (unPrefix)
import Data.Text (Text)
......@@ -78,9 +80,13 @@ corenlpPretty txt = do
-- print $ getResponseHeader "Content-Type" response
S8.putStrLn $ Yaml.encode (getResponseBody response :: Sentences)
corenlp :: String -> IO Sentences
corenlp txt = do
url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
corenlp :: Language -> String -> IO Sentences
corenlp lang txt = do
let properties = case lang of
EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
-- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
let request = setRequestBodyJSON txt url
response <- httpJSON request
pure (getResponseBody response :: Sentences)
......@@ -93,8 +99,8 @@ corenlp txt = do
-- Named Entity Recognition example
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> String -> IO [[(Text, t)]]
tokenWith f s = pm (pm (\t -> (_tokenWord t, f t))) <$> pm _sentenceTokens <$> sentences <$> corenlp s
tokenWith :: (Token -> t) -> Language -> String -> IO [[(Text, t)]]
tokenWith f lang s = pm (pm (\t -> (_tokenWord t, f t))) <$> pm _sentenceTokens <$> sentences <$> corenlp lang s
......@@ -13,9 +13,9 @@ selectNgrams xs = pf selectNgrams' xs
selectNgrams' (_,"N" ,_ ) = True
selectNgrams' (_,"NC" ,_ ) = True
selectNgrams' (_,"NN+CC",_ ) = True
-- FIXME NER in French must be improved
-- selectNgrams' (_,_ ,"I-PERS") = True
-- selectNgrams' (_,_ ,"I-LIEU") = True
selectNgrams' (_,_ ,"PERSON" ) = True
selectNgrams' (_,_ ,"ORGANIZATION") = True
selectNgrams' (_,_ ,"LOCATION" ) = True
selectNgrams' (_,_ ,_ ) = False
......@@ -53,6 +53,11 @@ groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs) = groupNgrams ((x <> " " <> y, "NC", y
-- /!\ sometimes N instead of NC (why?)
groupNgrams ((x,"ADJ",_):(y,"N",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
-- Si aucune des règles précédentes n'est remplie
groupNgrams (x:xs) = (x:(groupNgrams xs))
......
......@@ -12,17 +12,33 @@ import qualified Data.Gargantext.Ngrams.Lang.En as En
import qualified Data.Gargantext.Ngrams.Lang.Fr as Fr
-- | Ngrams selection algorithms
-- A form is a list of characters seperated by one or more spaces in a sentence.
-- A word is a form.
-- type Form = [Char]
-- For performance reasons, Type Text is used, then:
-- type Form = Text
-- Let be a form and its associated forms in contexts of a sentence.
-- Forms and subfoorms can be representend as Tree whose top is the minimal form
-- as a monogram whos occurrences are
-- ps : Common words function in Haskell do not take sentence into account
-- TODO for scientific papers: add maesures
-- TODO add the p score regex
extractNgrams :: Language -> String -> IO [[Ngrams]]
extractNgrams lang s = pm (groupNgrams lang) <$> extractNgrams' s
extractNgrams lang s = pm (groupNgrams lang) <$> extractNgrams' lang s
extractNgrams' :: String -> IO [[Ngrams]]
extractNgrams' t = pm (pm token2text)
<$> pm _sentenceTokens
<$> sentences
<$> corenlp t
extractNgrams' :: Language -> String -> IO [[Ngrams]]
extractNgrams' lang t = pm (pm token2text)
<$> pm _sentenceTokens
<$> sentences
<$> corenlp lang t
-- | This function selects ngrams according to grammars specific
-- of each language.
......
module Data.Gargantext.Parsers (module Data.Gargantext.Parsers.WOS)
module Data.Gargantext.Parsers ( module Data.Gargantext.Parsers.WOS
, module Data.Gargantext.Parsers.Date
)
where
import Data.Gargantext.Parsers.WOS
import Data.Gargantext.Parsers.Date
......@@ -6,7 +6,7 @@ import qualified Ngrams.Metrics as Metrics
main :: IO ()
main = do
Occ.parsersTest
Lang.ngramsExtractionTest FR
Lang.ngramsExtractionTest EN
Metrics.main
--Lang.ngramsExtractionTest FR
......@@ -42,5 +42,5 @@ ngramsExtractionTest = hspec $ do
it "Groupe: Nom commun + préposition + Nom commun + prép + Nom commun" $ do
let textFr1 = "L'heure d'arrivée des coureurs dépend de la météo du jour."
testFr1 <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr1
testFr1 `shouldBe` [[("heure d' arrivée des coureurs","NC","I-ORG"),("météo du jour","NC","O")]]
testFr1 `shouldBe` [[("heure d' arrivée des coureurs","NC","O"),("météo du jour","NC","O")]]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment