Commit c68c7692 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT+TEST][FR] Ngrams group + selection.

parent 8ae401ed
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Data.Gargantext.Ngrams.Lang.En (selectNgrams, groupNgrams) where module Data.Gargantext.Ngrams.Lang.En (selectNgrams, groupNgrams, textTest) where
import Data.Gargantext.Prelude import Data.Gargantext.Prelude
import Data.Text (Text) import Data.Text (Text)
...@@ -9,14 +9,14 @@ import Data.Monoid ((<>)) ...@@ -9,14 +9,14 @@ import Data.Monoid ((<>))
selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)] selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
selectNgrams xs = pf selectNgrams' xs selectNgrams xs = pf selectNgrams' xs
where where
selectNgrams' (_,"NN",_) = True selectNgrams' (_,"NN" ,_ ) = True
selectNgrams' (_,"NNS",_) = True selectNgrams' (_,"NNS" ,_ ) = True
selectNgrams' (_,"NNP",_) = True selectNgrams' (_,"NNP" ,_ ) = True
selectNgrams' (_,"NN+CC",_) = True selectNgrams' (_,"NN+CC",_ ) = True
selectNgrams' (_,_,"PERSON") = True selectNgrams' (_,_ ,"PERSON" ) = True
selectNgrams' (_,_,"ORGANIZATION") = True selectNgrams' (_,_ ,"ORGANIZATION") = True
selectNgrams' (_,_,"LOCATION") = True selectNgrams' (_,_ ,"LOCATION" ) = True
selectNgrams' (_,_,_) = False selectNgrams' (_,_ ,_ ) = False
groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)] groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
...@@ -64,16 +64,16 @@ groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> ...@@ -64,16 +64,16 @@ groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <>
groupNgrams (x:xs) = (x:(groupNgrams xs)) groupNgrams (x:xs) = (x:(groupNgrams xs))
--
--textTest :: [String] textTest :: [String]
--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. " textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
-- , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. " , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
-- , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. " , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
-- , "In both models, the standard drug used was aspirin 100 mg/kg. " , "In both models, the standard drug used was aspirin 100 mg/kg. "
-- , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. " , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
-- , "Analgesic activity was studied in rats using hot plate and tail-flick models. " , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
-- , "Codeine 5 mg/kg and vehicle served as standard and control respectively. " , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
-- , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. " , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
-- , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "] , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
--
--
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Data.Gargantext.Ngrams.Lang.Fr (selectNgrams, groupNgrams) module Data.Gargantext.Ngrams.Lang.Fr (selectNgrams, groupNgrams, textTest)
where where
import Data.Gargantext.Prelude import Data.Gargantext.Prelude
...@@ -10,72 +10,53 @@ import Data.Monoid ((<>)) ...@@ -10,72 +10,53 @@ import Data.Monoid ((<>))
selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)] selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
selectNgrams xs = pf selectNgrams' xs selectNgrams xs = pf selectNgrams' xs
where where
selectNgrams' (_,"NN",_) = True selectNgrams' (_,"N" ,_ ) = True
selectNgrams' (_,"NNS",_) = True selectNgrams' (_,"NC" ,_ ) = True
selectNgrams' (_,"NNP",_) = True selectNgrams' (_,"NN+CC",_ ) = True
selectNgrams' (_,"NN+CC",_) = True -- FIXME NER in French must be improved
selectNgrams' (_,_,"PERSON") = True -- selectNgrams' (_,_ ,"I-PERS") = True
selectNgrams' (_,_,"ORGANIZATION") = True -- selectNgrams' (_,_ ,"I-LIEU") = True
selectNgrams' (_,_,"LOCATION") = True selectNgrams' (_,_ ,_ ) = False
selectNgrams' (_,_,_) = False
groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)] groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
groupNgrams [] = [] groupNgrams [] = []
groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs) -- "Groupe : nom commun et adjectifs avec conjonction"
groupNgrams ((n,"NC",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
where where
jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn') n1 = (n <> " " <> j1, "NC", n')
cc = (c1,"CC",c1') n2 = (n <> " " <> j2, "NC", n')
jn1 = (j1, "JJ", j1')
jn2 = jn j2 j3 j2'
groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs) = groupNgrams (jn1:jn2:xs) -- /!\ sometimes N instead of NC (why?)
groupNgrams ((n,"N",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
where where
jn j m mm p = (j <> " " <> m, p, mm) n1 = (n <> " " <> j1, "N", n')
jn1 = jn j1 n nn ("NN+CC" :: Text) n2 = (n <> " " <> j2, "N", n')
jn2 = jn j2 n nn ("NN+CC" :: Text)
groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs) = groupNgrams (jn1:jn2:xs) -- Groupe : Adjectif + Conjonction de coordination + Adjectif
where -- groupNgrams ((j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",j2'):xs) = groupNgrams ((j1 <> " " <> j2, "ADJ", j2'):xs)
jn j m mm p = (j <> " " <> m, p, mm)
jn1 = jn j1 n nn ("NN+CC" :: Text)
jn2 = jn j2 n nn ("NN+CC" :: Text)
groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs) = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
groupNgrams ((x,"JJ",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NNP",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NP",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-- Groupe : Nom commun + préposition + Nom commun
groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NC",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
-- extractNgrams "Test the antiinflammatory or analgesic activity?" -- Groupe : Plusieurs adjectifs successifs
-- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]] groupNgrams ((x,"ADJ",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "ADJ", yy):xs)
-- > should be (antiinflammatory activity) <> (analgesic activity)
groupNgrams ((x,"NN",_):("of","IN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> "of" <> " " <> y, "NN", yy):xs) -- Groupe : nom commun et adjectif
groupNgrams ((x,"NC",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
-- /!\ sometimes N instead of NC (why?)
groupNgrams ((x,"N",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs) -- Groupe : adjectif et nom commun
groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs) groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
-- /!\ sometimes N instead of NC (why?)
groupNgrams ((x,"ADJ",_):(y,"N",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
-- Si aucune des règles précédentes n'est remplie
groupNgrams (x:xs) = (x:(groupNgrams xs)) groupNgrams (x:xs) = (x:(groupNgrams xs))
--textTest :: [String] textTest :: [String]
--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. " textTest = [ "L'heure d'arrivée des coureurs dépend de la météo du jour."]
-- , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
-- , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
-- , "In both models, the standard drug used was aspirin 100 mg/kg. "
-- , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
-- , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
-- , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
-- , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
-- , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
--
--
...@@ -14,8 +14,8 @@ import qualified Data.Gargantext.Ngrams.Lang.Fr as Fr ...@@ -14,8 +14,8 @@ import qualified Data.Gargantext.Ngrams.Lang.Fr as Fr
-- TODO for scientific papers: add maesures -- TODO for scientific papers: add maesures
-- TODO add the p score regex -- TODO add the p score regex
extractNgrams :: String -> IO [[Ngrams]] extractNgrams :: Language -> String -> IO [[Ngrams]]
extractNgrams t = pm (groupNgrams EN) <$> extractNgrams' t extractNgrams lang s = pm (groupNgrams lang) <$> extractNgrams' s
extractNgrams' :: String -> IO [[Ngrams]] extractNgrams' :: String -> IO [[Ngrams]]
......
...@@ -20,27 +20,48 @@ ngramsExtractionTest EN = hspec $ do ...@@ -20,27 +20,48 @@ ngramsExtractionTest EN = hspec $ do
let textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. ", "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. ", " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. ", "In both models, the standard drug used was aspirin 100 mg/kg. ", "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. ", "Analgesic activity was studied in rats using hot plate and tail-flick models. ", "Codeine 5 mg/kg and vehicle served as standard and control respectively. ", "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. ", "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "] :: [String] let textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. ", "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. ", " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. ", "In both models, the standard drug used was aspirin 100 mg/kg. ", "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. ", "Analgesic activity was studied in rats using hot plate and tail-flick models. ", "Codeine 5 mg/kg and vehicle served as standard and control respectively. ", "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. ", "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "] :: [String]
it "\"Of\" seperates two ngrams" $ do it "\"Of\" seperates two ngrams" $ do
t1 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 0) t1 <- pm (selectNgrams EN) <$> extractNgrams EN (textTest !! 0)
t1 `shouldBe` [[("Alcoholic extract","NN","O"),("Kaempferia galanga","NN","O"),("analgesic activities","NN+CC","O"),("antiinflammatory activities","NN+CC","O"),("animal models","NN","O")]] t1 `shouldBe` [[("Alcoholic extract","NN","O"),("Kaempferia galanga","NN","O"),("analgesic activities","NN+CC","O"),("antiinflammatory activities","NN+CC","O"),("animal models","NN","O")]]
it "Tests the conjunction of coordination in two ngrams with its adjectives" $ do it "Tests the conjunction of coordination in two ngrams with its adjectives" $ do
t2 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 2) t2 <- pm (selectNgrams EN) <$> extractNgrams EN (textTest !! 2)
t2 `shouldBe` [[("Acute activities","NN+CC","O"),("sub acute inflammatory activities","NN+CC","O"),("rats","NNS","O"),("carrageenan","NN","O"),("paw edema","NN","O"),("cotton pellet","NN","O"),("granuloma models","NN","O")]] t2 `shouldBe` [[("Acute activities","NN+CC","O"),("sub acute inflammatory activities","NN+CC","O"),("rats","NNS","O"),("carrageenan","NN","O"),("paw edema","NN","O"),("cotton pellet","NN","O"),("granuloma models","NN","O")]]
ngramsExtractionTest FR = hspec $ do ngramsExtractionTest FR = hspec $ do
describe "Ngrams extraction in English Language" $ do describe "Behavioral tests: ngrams extraction in French Language" $ do
let textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. ", "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. ", " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. ", "In both models, the standard drug used was aspirin 100 mg/kg. ", "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. ", "Analgesic activity was studied in rats using hot plate and tail-flick models. ", "Codeine 5 mg/kg and vehicle served as standard and control respectively. ", "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. ", "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "] :: [String]
it "Groupe : adjectif et nom commun" $ do
it "\"Of\" seperates two ngrams" $ do let textFr = "Le beau texte fut écrit."
t1 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 0) testFr <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr
t1 `shouldBe` [[("Alcoholic extract","NN","O"),("Kaempferia galanga","NN","O"),("analgesic activities","NN+CC","O"),("antiinflammatory activities","NN+CC","O"),("animal models","NN","O")]] testFr `shouldBe` [[("beau texte","NC","O")]]
it "Tests the conjunction of coordination in two ngrams with its adjectives" $ do it "Groupe : adjectifs et nom commun" $ do
t2 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 2) let textFr = "Le beau petit texte fut écrit."
t2 `shouldBe` [[("Acute activities","NN+CC","O"),("sub acute inflammatory activities","NN+CC","O"),("rats","NNS","O"),("carrageenan","NN","O"),("paw edema","NN","O"),("cotton pellet","NN","O"),("granuloma models","NN","O")]] testFr <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr
testFr `shouldBe` [[("beau petit texte","NC","O")]]
-- `shouldBe` [[("beau texte","NC","O"),("petit texte","NC","O")]] ?
it "Groupe : nom commun et adjectif" $ do
let textFr = "Le livre blanc fut écrit."
testFr <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr
testFr `shouldBe` [[("livre blanc","NC","O")]]
it "Groupe : nom commun et adjectifs avec conjonction" $ do
let textFr = "Le livre blanc et rouge."
testFr <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr
testFr `shouldBe` [[("livre blanc","N","O"),("livre rouge","N","O")]]
-- `shouldBe` [[("livre blanc et rouge","N","O")]] ?
it "Groupe: Nom commun + préposition + Nom commun" $ do
let textFr0 = "Le problème du jour est résolu."
testFr0 <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr0
testFr0 `shouldBe` [[("problème du jour","NC","O")]]
it "Groupe: Nom commun + préposition + Nom commun + prép + Nom commun" $ do
let textFr1 = "L'heure d'arrivée des coureurs dépend de la météo du jour."
testFr1 <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr1
testFr1 `shouldBe` [[("heure d' arrivée des coureurs","NC","I-ORG"),("météo du jour","NC","O")]]
parsersTest = hspec $ do parsersTest = hspec $ do
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment