[FEAT+TEST][FR] Ngrams group + selection.

c68c7692 · Alexandre Delanoë · 8ae401ed · c68c7692 · c68c7692 · c68c7692
Commit c68c7692 authored Nov 16, 2017 by Alexandre Delanoë
Showing with 94 additions and 92 deletions

En.hs src/Data/Gargantext/Ngrams/Lang/En.hs +23 -23

Fr.hs src/Data/Gargantext/Ngrams/Lang/Fr.hs +34 -53

Parser.hs src/Data/Gargantext/Ngrams/Parser.hs +2 -2

Ngrams.hs test/Ngrams.hs +35 -14

No files found.
--- a/src/Data/Gargantext/Ngrams/Lang/En.hs
+++ b/src/Data/Gargantext/Ngrams/Lang/En.hs
 {-# LANGUAGE OverloadedStrings #-}

-module Data.Gargantext.Ngrams.Lang.En (selectNgrams, groupNgrams) where
+module Data.Gargantext.Ngrams.Lang.En (selectNgrams, groupNgrams, textTest) where

 import Data.Gargantext.Prelude
 import Data.Text (Text)
@@ -9,14 +9,14 @@ import Data.Monoid ((<>))
 selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
 selectNgrams xs = pf selectNgrams' xs
    where
-        selectNgrams' (_,"NN",_)           = True
-        selectNgrams' (_,"NNS",_)          = True
-        selectNgrams' (_,"NNP",_)          = True
-        selectNgrams' (_,"NN+CC",_)        = True
-        selectNgrams' (_,_,"PERSON")       = True
-        selectNgrams' (_,_,"ORGANIZATION") = True
-        selectNgrams' (_,_,"LOCATION")     = True
-        selectNgrams' (_,_,_)              = False
+        selectNgrams' (_,"NN"   ,_             ) = True
+        selectNgrams' (_,"NNS"  ,_             ) = True
+        selectNgrams' (_,"NNP"  ,_             ) = True
+        selectNgrams' (_,"NN+CC",_             ) = True
+        selectNgrams' (_,_      ,"PERSON"      ) = True
+        selectNgrams' (_,_      ,"ORGANIZATION") = True
+        selectNgrams' (_,_      ,"LOCATION"    ) = True
+        selectNgrams' (_,_      ,_             ) = False


 groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
@@ -64,16 +64,16 @@ groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <>

 groupNgrams (x:xs)                                          = (x:(groupNgrams xs))

--
--textTest :: [String]
--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
--           , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
--           , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
--          , "In both models, the standard drug used was aspirin 100 mg/kg. "
--          , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
--         , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
--         , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
--         , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
--         , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
--
--
+
+textTest :: [String]
+textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
+           , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
+           , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
+          , "In both models, the standard drug used was aspirin 100 mg/kg. "
+          , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
+         , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
+         , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
+         , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
+         , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
+
+
--- a/src/Data/Gargantext/Ngrams/Lang/Fr.hs
+++ b/src/Data/Gargantext/Ngrams/Lang/Fr.hs
 {-# LANGUAGE OverloadedStrings #-}

-module Data.Gargantext.Ngrams.Lang.Fr (selectNgrams, groupNgrams)
+module Data.Gargantext.Ngrams.Lang.Fr (selectNgrams, groupNgrams, textTest)
    where

 import Data.Gargantext.Prelude
@@ -10,72 +10,53 @@ import Data.Monoid ((<>))
 selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
 selectNgrams xs = pf selectNgrams' xs
    where
-        selectNgrams' (_,"NN",_)           = True
-        selectNgrams' (_,"NNS",_)          = True
-        selectNgrams' (_,"NNP",_)          = True
-        selectNgrams' (_,"NN+CC",_)        = True
-        selectNgrams' (_,_,"PERSON")       = True
-        selectNgrams' (_,_,"ORGANIZATION") = True
-        selectNgrams' (_,_,"LOCATION")     = True
-        selectNgrams' (_,_,_)              = False
+        selectNgrams' (_,"N"    ,_       ) = True
+        selectNgrams' (_,"NC"   ,_       ) = True
+        selectNgrams' (_,"NN+CC",_       ) = True
+-- FIXME NER in French must be improved 
+--        selectNgrams' (_,_      ,"I-PERS") = True
+--        selectNgrams' (_,_      ,"I-LIEU") = True
+        selectNgrams' (_,_      ,_       ) = False


 groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
 groupNgrams []       = []

-groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs)      = groupNgrams (jn1:cc:jn2:xs)
+-- "Groupe : nom commun et adjectifs avec conjonction"
+groupNgrams ((n,"NC",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
    where
-        jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
-        cc  = (c1,"CC",c1')
-        jn1 = (j1, "JJ", j1')
-        jn2 = jn j2 j3 j2'
+        n1 = (n <> " " <> j1, "NC", n')
+        n2 = (n <> " " <> j2, "NC", n')

-groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs)      = groupNgrams (jn1:jn2:xs)
+-- /!\ sometimes N instead of NC (why?)
+groupNgrams ((n,"N",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
    where
-        jn j m mm p = (j <> " " <> m, p, mm)
-        jn1 = jn j1 n nn ("NN+CC" :: Text)
-        jn2 = jn j2 n nn ("NN+CC" :: Text)
+        n1 = (n <> " " <> j1, "N", n')
+        n2 = (n <> " " <> j2, "N", n')

-groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs)      = groupNgrams (jn1:jn2:xs)
-    where
-        jn j m mm p = (j <> " " <> m, p, mm)
-        jn1 = jn j1 n nn ("NN+CC" :: Text)
-        jn2 = jn j2 n nn ("NN+CC" :: Text)
-
-groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs)    = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
-groupNgrams ((x,"JJ",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-
-groupNgrams ((x,"NNP",_):(y,"NN",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-groupNgrams ((x,"NN",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-groupNgrams ((x,"NN",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-groupNgrams ((x,"NP",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-
-groupNgrams ((x,"NN",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+-- Groupe : Adjectif + Conjonction de coordination + Adjectif
+-- groupNgrams ((j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",j2'):xs) = groupNgrams ((j1 <> " " <> j2, "ADJ", j2'):xs)

+-- Groupe : Nom commun + préposition + Nom commun
+groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NC",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)

-- extractNgrams "Test the antiinflammatory or analgesic activity?"
-- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
-- > should be (antiinflammatory activity) <> (analgesic activity)
+-- Groupe : Plusieurs adjectifs successifs
+groupNgrams ((x,"ADJ",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "ADJ", yy):xs)

-groupNgrams ((x,"NN",_):("of","IN",_):(y,"NN",yy):xs)      = groupNgrams ((x <> " " <> "of" <> " " <> y, "NN", yy):xs)
+-- Groupe : nom commun et adjectif
+groupNgrams ((x,"NC",_):(y,"ADJ",yy):xs)  = groupNgrams ((x <> " " <> y, "NC", yy):xs)
+-- /!\ sometimes N instead of NC (why?)
+groupNgrams ((x,"N",_):(y,"ADJ",yy):xs)   = groupNgrams ((x <> " " <> y, "NC", yy):xs)

-groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
-groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
+-- Groupe : adjectif et nom commun
+groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs)  = groupNgrams ((x <> " " <> y, "NC", yy):xs)
+-- /!\ sometimes N instead of NC (why?)
+groupNgrams ((x,"ADJ",_):(y,"N",yy):xs)   = groupNgrams ((x <> " " <> y, "NC", yy):xs)

+-- Si aucune des règles précédentes n'est remplie
 groupNgrams (x:xs)                        = (x:(groupNgrams xs))


--textTest :: [String]
--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
--           , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
--           , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
--          , "In both models, the standard drug used was aspirin 100 mg/kg. "
--          , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
--         , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
--         , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
--         , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
--         , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
--
--
+textTest :: [String]
+textTest = [ "L'heure d'arrivée des coureurs dépend de la météo du jour."]

--- a/src/Data/Gargantext/Ngrams/Parser.hs
+++ b/src/Data/Gargantext/Ngrams/Parser.hs
@@ -14,8 +14,8 @@ import qualified Data.Gargantext.Ngrams.Lang.Fr as Fr

 -- TODO for scientific papers: add maesures
 -- TODO add the p score regex
-extractNgrams :: String -> IO [[Ngrams]]
-extractNgrams t = pm (groupNgrams EN) <$> extractNgrams' t
+extractNgrams :: Language -> String -> IO [[Ngrams]]
+extractNgrams lang s = pm (groupNgrams lang) <$> extractNgrams' s


 extractNgrams' :: String -> IO [[Ngrams]]

--- a/test/Ngrams.hs
+++ b/test/Ngrams.hs
@@ -20,27 +20,48 @@ ngramsExtractionTest EN = hspec $ do
        let textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. ", "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. ", " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. ", "In both models, the standard drug used was aspirin 100 mg/kg. ", "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. ", "Analgesic activity was studied in rats using hot plate and tail-flick models. ", "Codeine 5 mg/kg and vehicle served as standard and control respectively. ", "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. ", "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "] :: [String]

        it "\"Of\" seperates two ngrams" $ do
-            t1 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 0) 
+            t1 <- pm (selectNgrams EN) <$> extractNgrams EN (textTest !! 0) 
            t1 `shouldBe` [[("Alcoholic extract","NN","O"),("Kaempferia galanga","NN","O"),("analgesic activities","NN+CC","O"),("antiinflammatory activities","NN+CC","O"),("animal models","NN","O")]]

        it "Tests the conjunction of coordination in two ngrams with its adjectives" $ do
-            t2 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 2) 
+            t2 <- pm (selectNgrams EN) <$> extractNgrams EN (textTest !! 2) 
            t2 `shouldBe` [[("Acute activities","NN+CC","O"),("sub acute inflammatory activities","NN+CC","O"),("rats","NNS","O"),("carrageenan","NN","O"),("paw edema","NN","O"),("cotton pellet","NN","O"),("granuloma models","NN","O")]]
            

 ngramsExtractionTest FR = hspec $ do
-    describe "Ngrams extraction in English Language" $ do
-        let textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. ", "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. ", " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. ", "In both models, the standard drug used was aspirin 100 mg/kg. ", "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. ", "Analgesic activity was studied in rats using hot plate and tail-flick models. ", "Codeine 5 mg/kg and vehicle served as standard and control respectively. ", "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. ", "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "] :: [String]
-
-        it "\"Of\" seperates two ngrams" $ do
-            t1 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 0) 
-            t1 `shouldBe` [[("Alcoholic extract","NN","O"),("Kaempferia galanga","NN","O"),("analgesic activities","NN+CC","O"),("antiinflammatory activities","NN+CC","O"),("animal models","NN","O")]]
-
-        it "Tests the conjunction of coordination in two ngrams with its adjectives" $ do
-            t2 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 2) 
-            t2 `shouldBe` [[("Acute activities","NN+CC","O"),("sub acute inflammatory activities","NN+CC","O"),("rats","NNS","O"),("carrageenan","NN","O"),("paw edema","NN","O"),("cotton pellet","NN","O"),("granuloma models","NN","O")]]
- 
-
+    describe "Behavioral tests: ngrams extraction in French Language" $ do
+
+        it "Groupe : adjectif et nom commun" $ do
+            let textFr = "Le beau texte fut écrit."
+            testFr <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr
+            testFr `shouldBe` [[("beau texte","NC","O")]]
+
+        it "Groupe : adjectifs et nom commun" $ do
+            let textFr = "Le beau petit texte fut écrit."
+            testFr <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr
+            testFr `shouldBe` [[("beau petit texte","NC","O")]]
+                -- `shouldBe` [[("beau texte","NC","O"),("petit texte","NC","O")]] ?
+
+        it "Groupe : nom commun et adjectif" $ do
+            let textFr = "Le livre blanc fut écrit."
+            testFr <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr
+            testFr `shouldBe` [[("livre blanc","NC","O")]]
+        
+        it "Groupe : nom commun et adjectifs avec conjonction" $ do
+            let textFr = "Le livre blanc et rouge."
+            testFr <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr
+            testFr `shouldBe` [[("livre blanc","N","O"),("livre rouge","N","O")]]
+                -- `shouldBe` [[("livre blanc et rouge","N","O")]] ?
+
+        it "Groupe: Nom commun + préposition + Nom commun" $ do
+            let textFr0 = "Le problème du jour est résolu."
+            testFr0 <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr0
+            testFr0 `shouldBe` [[("problème du jour","NC","O")]]
+
+        it "Groupe: Nom commun + préposition + Nom commun + prép + Nom commun" $ do
+            let textFr1 = "L'heure d'arrivée des coureurs dépend de la météo du jour."
+            testFr1 <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr1
+            testFr1 `shouldBe` [[("heure d' arrivée des coureurs","NC","I-ORG"),("météo du jour","NC","O")]]


 parsersTest = hspec $ do