[NGRAMS] Improve n-gram extraction across prepositions ("of" in English, "de" in French).

f12df281 · Alexandre Delanoë · ea51f50d · f12df281 · f12df281 · f12df281
Commit f12df281 authored Nov 22, 2017 by Alexandre Delanoë
Showing with 32 additions and 5 deletions

En.hs src/Data/Gargantext/Ngrams/Lang/En.hs +6 -1

Fr.hs src/Data/Gargantext/Ngrams/Lang/Fr.hs +5 -1

En.hs test/Ngrams/Lang/En.hs +15 -3

Fr.hs test/Ngrams/Lang/Fr.hs +6 -0

No files found.
--- a/src/Data/Gargantext/Ngrams/Lang/En.hs
+++ b/src/Data/Gargantext/Ngrams/Lang/En.hs
@@ -47,6 +47,7 @@ groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN",

 groupNgrams ((x,"NNP",_):(y,"NN",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
 groupNgrams ((x,"NN",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+
 groupNgrams ((x,"NN",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
 groupNgrams ((x,"NP",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)

@@ -57,7 +58,11 @@ groupNgrams ((x,"NN",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN",
 -- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
 -- > should be (antiinflammatory activity) <> (analgesic activity)

-groupNgrams ((x,"NN",_):("of","IN",_):(y,"NN",yy):xs)       = groupNgrams ((x <> " " <> "of" <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs)       = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs)       = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
+
+groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs)       = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs)       = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)

 groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
 groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)

--- a/src/Data/Gargantext/Ngrams/Lang/Fr.hs
+++ b/src/Data/Gargantext/Ngrams/Lang/Fr.hs
@@ -22,6 +22,8 @@ selectNgrams xs = pf selectNgrams' xs
 groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
 groupNgrams []       = []

+--groupNgrams ((_,"DET",_):xs) = groupNgrams xs
+
 -- "Groupe : nom commun et adjectifs avec conjonction"
 groupNgrams ((n,"NC",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
    where
@@ -38,7 +40,9 @@ groupNgrams ((n,"N",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (
 -- groupNgrams ((j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",j2'):xs) = groupNgrams ((j1 <> " " <> j2, "ADJ", j2'):xs)

 -- Groupe : Nom commun + préposition + Nom commun
-groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NC",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
+groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NC",n2'):xs)  = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
+groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
+groupNgrams ((n1,"NC",_):(prep,"P",_):(det,"DET",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> prep <> " " <> det <> " " <> n2, "NC", n2'):xs)

 -- Groupe : Plusieurs adjectifs successifs
 groupNgrams ((x,"ADJ",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "ADJ", yy):xs)

--- a/test/Ngrams/Lang/En.hs
+++ b/test/Ngrams/Lang/En.hs
@@ -16,9 +16,21 @@ ngramsExtractionTest = hspec $ do

        it "\"Of\" seperates two ngrams" $ do
            t1 <- pm (selectNgrams EN) <$> extractNgrams EN (textTest !! 0) 
-            t1 `shouldBe` [[("Alcoholic extract","NN","O"),("Kaempferia galanga","NN","O"),("analgesic activities","NN+CC","O"),("antiinflammatory activities","NN+CC","O"),("animal models","NN","O")]]
-
+            t1 `shouldBe` [[("Alcoholic extract of Kaempferia galanga","NN","O"),("analgesic activities","NN+CC","O"),("antiinflammatory activities","NN+CC","O"),("animal models","NN","O")]]
+            
        it "Tests the conjunction of coordination in two ngrams with its adjectives" $ do
            t2 <- pm (selectNgrams EN) <$> extractNgrams EN (textTest !! 2) 
            t2 `shouldBe` [[("Acute activities","NN+CC","O"),("sub acute inflammatory activities","NN+CC","O"),("rats","NNS","O"),("carrageenan","NN","O"),("paw edema","NN","O"),("cotton pellet","NN","O"),("granuloma models","NN","O")]]
-            
+
+        it "Tests nouns with preposition and determinants" $ do
+            let t = "Donald Trump is president of the United-States of America."
+            t2 <- pm (selectNgrams EN) <$> extractNgrams EN t
+            t2 `shouldBe` [[("Donald Trump","NNP","PERSON"),("president of the United-States of America","NN","LOCATION")]]
+
+
+
+
+
+
+
+
--- a/test/Ngrams/Lang/Fr.hs
+++ b/test/Ngrams/Lang/Fr.hs
@@ -39,6 +39,12 @@ ngramsExtractionTest = hspec $ do
            testFr0 <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr0
            testFr0 `shouldBe` [[("problème du jour","NC","O")]]

+        it "Groupe: Nom commun + préposition + déterminant + Nom commun" $ do
+            let textFr0 = "Emmanuel Macron est le président de la France."
+            testFr0 <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr0
+            testFr0 `shouldBe` [[("Emmanuel Macron","NPP","PERSON"),("président de la France","NC","LOCATION")]]
+
+
        it "Groupe: Nom commun + préposition + Nom commun + prép + Nom commun" $ do
            let textFr1 = "L'heure d'arrivée des coureurs dépend de la météo du jour."
            testFr1 <- pm (selectNgrams FR) <$> (extractNgrams FR) textFr1