[FEAT] Ngrams extractor in English with tests : ok. Need to factor pattern...

[FEAT] Ngrams extractor in English with tests : ok. Need to factor pattern matching with FR examples.

[FEAT] Ngrams extractor in English with tests : ok. Need to factor pattern...
[FEAT] Ngrams extractor in English with tests : ok. Need to factor pattern matching with FR examples.
28461a06 · Alexandre Delanoë · b38cd1e3 · 28461a06 · 28461a06 · 28461a06
Commit 28461a06 authored Nov 15, 2017 by Alexandre Delanoë
16 changed files
--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -33,6 +33,7 @@ library
                  , lens
                  , opaleye
                  , path
+                  , parsec
                  , postgresql-simple
                  , pretty
                  , product-profunctors
@@ -65,9 +66,6 @@ library
  exposed-modules: Data.Gargantext
                  , Data.Gargantext.Analysis
                  , Data.Gargantext.DSL
-                  , Data.Gargantext.Ngrams.Occurrences
-                  , Data.Gargantext.NLP
-                  , Data.Gargantext.NLP.CoreNLP
                  , Data.Gargantext.Database
                  , Data.Gargantext.Database.Instances
                  , Data.Gargantext.Database.Ngram
@@ -78,13 +76,22 @@ library
                  , Data.Gargantext.Database.NodeNodeNgram
                  , Data.Gargantext.Database.Private
                  , Data.Gargantext.Database.User
+                  , Data.Gargantext.NLP
+                  , Data.Gargantext.NLP.CoreNLP
+                  , Data.Gargantext.Ngrams
+                  , Data.Gargantext.Ngrams.Count
+                  , Data.Gargantext.Ngrams.Parser
+                  , Data.Gargantext.Ngrams.Lang.En
+                  , Data.Gargantext.Ngrams.Lang.Fr
+                  , Data.Gargantext.Ngrams.TextMining
+                  , Data.Gargantext.Ngrams.Occurrences
                  , Data.Gargantext.Parsers
                  , Data.Gargantext.Parsers.WOS
                  , Data.Gargantext.Prelude
                  , Data.Gargantext.Server
                  , Data.Gargantext.Types
-                  , Data.Gargantext.Types.Node
                  , Data.Gargantext.Types.Main
+                  , Data.Gargantext.Types.Node
                  , Data.Gargantext.Utils.DateUtils
                  , Data.Gargantext.Utils.Prefix
  default-language:    Haskell2010
@@ -100,10 +107,10 @@ library
 --                     , hastext
 --  default-language:    Haskell2010
-test-suite garg-test-parsers
+test-suite garg-test-ngrams
  type:                exitcode-stdio-1.0
  hs-source-dirs:      test
-  main-is:             Parsers.hs
+  main-is:             Ngrams.hs
  build-depends:       base
                     , extra
                     , text

--- a/src/Data/Gargantext/NLP/CoreNLP.hs
+++ b/src/Data/Gargantext/NLP/CoreNLP.hs
@@ -26,11 +26,15 @@ data Token = Token { _tokenIndex                :: Int
                   , _tokenCharacterOffsetEnd   :: Int
                   , _tokenPos                  :: Text
                   , _tokenNer                  :: Text
-                   , _tokenBefore               :: Text
+                   , _tokenBefore               :: Maybe Text
-                   , _tokenAfter                :: Text
+                   , _tokenAfter                :: Maybe Text
                   } deriving (Show, Generic)
 $(deriveJSON (unPrefix "_token") ''Token)
+token2text :: Token -> (Text, Text, Text)
+token2text (Token _ w _ _ _ _ p n _ _) = (w,p,n)
 data Sentence  = Sentence { _sentenceIndex :: Int
                          , _sentenceTokens :: [Token]
                          } deriving (Show, Generic)
@@ -49,6 +53,19 @@ instance ToJSON Sentences
 instance FromJSON Sentences
+-- request = 
+-- "fr" : {
+--                 "tokenize.language" : "fr",
+--                 "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
+--                 "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
+--                 // dependency parser
+--                 "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
+--                 "depparse.language" : "french",
+--                 "ner.model":  DATA_ROOT+"/eunews.fr.crf.gz",
+--                 "ssplit.newlineIsSentenceBreak": "always" 
+--             },
+-- 
 corenlpPretty :: String -> IO ()
 corenlpPretty txt = do

--- a/src/Data/Gargantext/Ngrams.hs
+++ b/src/Data/Gargantext/Ngrams.hs
 module Data.Gargantext.Ngrams (
-    module Data.Gargantext.Ngrams.TextMining,
+    module Data.Gargantext.Ngrams.Count,
-    module Data.Gargantext.Ngrams.Words,
+    --module Data.Gargantext.Ngrams.Hetero,
-    module Data.Gargantext.Ngrams.Hetero,
+    module Data.Gargantext.Ngrams.Parser,
-    module Data.Gargantext.Ngrams.Count
+    module Data.Gargantext.Ngrams.Occurrences,
+    module Data.Gargantext.Ngrams.TextMining
+    --module Data.Gargantext.Ngrams.Words
    ) where
-import Data.Gargantext.Ngrams.TextMining
-import Data.Gargantext.Ngrams.Words
-import Data.Gargantext.Ngrams.Hetero
 import Data.Gargantext.Ngrams.Count
+--import Data.Gargantext.Ngrams.Hetero
+import Data.Gargantext.Ngrams.Parser
+import Data.Gargantext.Ngrams.Occurrences
+import Data.Gargantext.Ngrams.TextMining
+--import Data.Gargantext.Ngrams.Words
--- a/src/Data/Gargantext/Ngrams/Count.hs
+++ b/src/Data/Gargantext/Ngrams/Count.hs
@@ -4,7 +4,6 @@ module Data.Gargantext.Ngrams.Count where
 import System.Environment (getArgs)
-import Data.List (foldl', take)
 import Data.Foldable as F
 import Data.Map (Map)
@@ -37,9 +36,9 @@ occurrences xs = foldl' (\x y -> M.insertWith' (+) y 1 x) M.empty xs
 --occurrences' :: Ord a => [a] -> Map a Integer
 --occurrences' xs = DTL.foldl (\x y -> M.insertWith' (+) y 1 x) M.empty xs
+countMain :: IO ()
 countMain = do
-  (fichier:rest) <- getArgs
+  (fichier:_) <- getArgs
  c <- DTLIO.readFile fichier
  --print $ occurrences $ DTL.chunksOf 1 c
  print $ occurrences $ letters'' c

--- a/src/Data/Gargantext/Ngrams/Hetero.hs
+++ b/src/Data/Gargantext/Ngrams/Hetero.hs
@@ -10,7 +10,7 @@ import Opaleye.Internal.Column (Column)
 import Data.Gargantext.Database.Gargandb
 import Data.Gargantext.Database.Private
-import Data.Gargantext.Utils.Chronos
+--import Data.Gargantext.Utils.Chronos
 import Data.Gargantext.Ngrams.Words (cleanText)
 import Data.Gargantext.Ngrams.Count (occurrences)

--- a/src/Data/Gargantext/Ngrams/Lang/En.hs
+++ b/src/Data/Gargantext/Ngrams/Lang/En.hs
+{-# LANGUAGE OverloadedStrings #-}
+module Data.Gargantext.Ngrams.Lang.En (selectNgrams, groupNgrams) where
+import Data.Gargantext.Prelude
+import Data.Text (Text)
+import Data.Monoid ((<>))
+selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
+selectNgrams xs = pf selectNgrams' xs
+    where
+        selectNgrams' (_,"NN",_)           = True
+        selectNgrams' (_,"NNS",_)          = True
+        selectNgrams' (_,"NNP",_)          = True
+        selectNgrams' (_,"NN+CC",_)        = True
+        selectNgrams' (_,_,"PERSON")       = True
+        selectNgrams' (_,_,"ORGANIZATION") = True
+        selectNgrams' (_,_,"LOCATION")     = True
+        selectNgrams' (_,_,_)              = False
+groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
+groupNgrams []       = []
+groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs)      = groupNgrams (jn1:cc:jn2:xs)
+    where
+        jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
+        cc  = (c1,"CC",c1')
+        jn1 = (j1, "JJ", j1')
+        jn2 = jn j2 j3 j2'
+groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs)      = groupNgrams (jn1:jn2:xs)
+    where
+        jn j m mm p = (j <> " " <> m, p, mm)
+        jn1 = jn j1 n nn ("NN+CC" :: Text)
+        jn2 = jn j2 n nn ("NN+CC" :: Text)
+groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs)      = groupNgrams (jn1:jn2:xs)
+    where
+        jn j m mm p = (j <> " " <> m, p, mm)
+        jn1 = jn j1 n nn ("NN+CC" :: Text)
+        jn2 = jn j2 n nn ("NN+CC" :: Text)
+groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs)    = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
+groupNgrams ((x,"JJ",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NNP",_):(y,"NN",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NN",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NN",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NP",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NN",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+-- extractNgrams "Test the antiinflammatory or analgesic activity?"
+-- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
+-- > should be (antiinflammatory activity) <> (analgesic activity)
+groupNgrams ((x,"NN",_):("of","IN",_):(y,"NN",yy):xs)      = groupNgrams ((x <> " " <> "of" <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
+groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
+groupNgrams (x:xs)                                          = (x:(groupNgrams xs))
+--
+--textTest :: [String]
+--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
+--           , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
+--           , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
+--          , "In both models, the standard drug used was aspirin 100 mg/kg. "
+--          , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
+--         , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
+--         , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
+--         , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
+--         , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
+--
+--
--- a/src/Data/Gargantext/Ngrams/Lang/Fr.hs
+++ b/src/Data/Gargantext/Ngrams/Lang/Fr.hs
+{-# LANGUAGE OverloadedStrings #-}
+module Data.Gargantext.Ngrams.Lang.Fr (selectNgrams, groupNgrams)
+    where
+import Data.Gargantext.Prelude
+import Data.Text (Text)
+import Data.Monoid ((<>))
+selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
+selectNgrams xs = pf selectNgrams' xs
+    where
+        selectNgrams' (_,"NN",_)           = True
+        selectNgrams' (_,"NNS",_)          = True
+        selectNgrams' (_,"NNP",_)          = True
+        selectNgrams' (_,"NN+CC",_)        = True
+        selectNgrams' (_,_,"PERSON")       = True
+        selectNgrams' (_,_,"ORGANIZATION") = True
+        selectNgrams' (_,_,"LOCATION")     = True
+        selectNgrams' (_,_,_)              = False
+groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
+groupNgrams []       = []
+groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs)      = groupNgrams (jn1:cc:jn2:xs)
+    where
+        jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
+        cc  = (c1,"CC",c1')
+        jn1 = (j1, "JJ", j1')
+        jn2 = jn j2 j3 j2'
+groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs)      = groupNgrams (jn1:jn2:xs)
+    where
+        jn j m mm p = (j <> " " <> m, p, mm)
+        jn1 = jn j1 n nn ("NN+CC" :: Text)
+        jn2 = jn j2 n nn ("NN+CC" :: Text)
+groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs)      = groupNgrams (jn1:jn2:xs)
+    where
+        jn j m mm p = (j <> " " <> m, p, mm)
+        jn1 = jn j1 n nn ("NN+CC" :: Text)
+        jn2 = jn j2 n nn ("NN+CC" :: Text)
+groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs)    = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
+groupNgrams ((x,"JJ",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NNP",_):(y,"NN",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NN",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NN",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NP",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,"NN",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+-- extractNgrams "Test the antiinflammatory or analgesic activity?"
+-- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
+-- > should be (antiinflammatory activity) <> (analgesic activity)
+groupNgrams ((x,"NN",_):("of","IN",_):(y,"NN",yy):xs)      = groupNgrams ((x <> " " <> "of" <> " " <> y, "NN", yy):xs)
+groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
+groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
+groupNgrams (x:xs)                                          = (x:(groupNgrams xs))
+--textTest :: [String]
+--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
+--           , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
+--           , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
+--          , "In both models, the standard drug used was aspirin 100 mg/kg. "
+--          , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
+--         , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
+--         , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
+--         , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
+--         , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
+--
+--
--- a/src/Data/Gargantext/Ngrams/Parser.hs
+++ b/src/Data/Gargantext/Ngrams/Parser.hs
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+module Data.Gargantext.Ngrams.Parser where
+import Data.Gargantext.Prelude
+import Data.Gargantext.NLP.CoreNLP
+import Data.Gargantext.Types.Main (Language(..), Ngrams)
+import qualified Data.Gargantext.Ngrams.Lang.En as En
+import qualified Data.Gargantext.Ngrams.Lang.Fr as Fr
+-- TODO for scientific papers: add maesures
+-- TODO add the p score regex
+extractNgrams :: String -> IO [[Ngrams]]
+extractNgrams t = pm (groupNgrams EN) <$> extractNgrams' t
+extractNgrams' :: String -> IO [[Ngrams]]
+extractNgrams' t =  pm (pm token2text)
+                <$> pm _sentenceTokens
+                <$> sentences
+                <$> corenlp t
+-- | This function selects ngrams according to grammars specific
+--   of each language.
+--   In english, JJ is ADJectiv in french.
+selectNgrams :: Language -> [Ngrams] -> [Ngrams]
+selectNgrams EN = En.selectNgrams
+selectNgrams FR = Fr.selectNgrams
+-- | This function analyze and groups (or not) ngrams according to 
+--   grammars specific of each language.
+groupNgrams :: Language -> [Ngrams] -> [Ngrams]
+groupNgrams EN = En.groupNgrams
+groupNgrams FR = Fr.groupNgrams
--- a/src/Data/Gargantext/Ngrams/TFICF.hs
+++ b/src/Data/Gargantext/Ngrams/TFICF.hs
+module Data.Gargantext.Ngrams.TFICF where
+data TFICF = TFICF { _tficfTerms    :: Text
+                   , _tficfContext1 :: Context
+                   , _tficfContext2 :: Context
+                   , _tficfScore    :: Maybe Double
+                   } deriving (Read, Show, Generics)
+tfidf :: Text -> TFICF
+tfidf txt = TFICF txt Document Corpus score
+    where
+        score = Nothing
--- a/src/Data/Gargantext/Ngrams/TextMining.hs
+++ b/src/Data/Gargantext/Ngrams/TextMining.hs
@@ -4,11 +4,12 @@ import Data.Map (empty, Map, insertWith, toList)
 import Data.List (foldl, foldl')
 import qualified Data.List as L
+sortGT :: (Ord a, Ord b) => (a, b) -> (a, b) -> Ordering
 sortGT (a1, b1) (a2, b2)
-	| a1 < a2 = GT
+    | a1 < a2 = GT
-	| a1 > a2 = LT
+    | a1 > a2 = LT
-	| a1 == a2 = compare b1 b2
+    | a1 == a2 = compare b1 b2
+sortGT (_, _) (_, _) = error "What is this case ?"
 --histogram :: Ord a => [a] -> [(a, Int)]
@@ -21,7 +22,7 @@ countElem m e = Data.Map.insertWith (\n o -> n + o) e 1 m
 freqList :: (Ord k) => [k] -> Data.Map.Map k Int
 freqList = foldl countElem Data.Map.empty
--getMaxFromMap :: Data.Map.Map -> Maybe -> [a] -> [a]
+getMaxFromMap :: Ord a => Map a1 a -> [a1]
 getMaxFromMap m = go [] Nothing (toList m)
  where
    go ks _        []           = ks 
@@ -53,4 +54,4 @@ countYear' (xs) = foldl' (\x y -> insertWith (+) y 1 x) empty xs
 textMiningMain :: IO ()
 textMiningMain = do
-	print $ merge ["abc"] ["bcd"]
+    print $ merge ["abc"::String] ["bcd" :: String]
--- a/src/Data/Gargantext/Parsers/WOS.hs
+++ b/src/Data/Gargantext/Parsers/WOS.hs
+{-# LANGUAGE OverloadedStrings #-}
 module Data.Gargantext.Parsers.WOS where
 import Prelude hiding (takeWhile, take, concat, readFile)
@@ -6,6 +8,7 @@ import Data.Map as DM
 import Data.Attoparsec.ByteString
 import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
 import Data.ByteString (ByteString)
+import Data.ByteString.Char8 (pack)
 import Data.Either.Extra(Either(..))
 import Control.Applicative
@@ -47,8 +50,9 @@ data ParserType = WOS | CSV
 wosParser :: Parser [Maybe [ByteString]]
 wosParser = do
    -- TODO Warning if version /= 1.0
-    _ <- manyTill anyChar (string "\nVR 1.0")
+    -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
-    ns <- many1 wosNotice <* "\nEF"
+    _ <- manyTill anyChar (string $ pack "\nVR 1.0")
+    ns <- many1 wosNotice <* (string $ pack "\nEF")
    return ns
 startNotice :: Parser ByteString
@@ -56,7 +60,7 @@ startNotice = "\nPT " *> takeTill isEndOfLine
 wosNotice :: Parser (Maybe [ByteString])
 wosNotice = do
-    n <- startNotice *> wosFields <* manyTill anyChar (string "\nER\n")
+    n <- startNotice *> wosFields <* manyTill anyChar (string $ pack "\nER\n")
    return n
 field' :: Parser (ByteString, [ByteString])

--- a/src/Data/Gargantext/Types/Main.hs
+++ b/src/Data/Gargantext/Types/Main.hs
@@ -16,6 +16,19 @@ import Data.Gargantext.Types.Node ( NodePoly
                               , HyperdataNotebook
                               )
+data Language = EN | FR -- | DE | IT | SP
+    -- > EN == english
+    -- > FR == french
+    -- > DE == deutch  (not implemented yet)
+    -- > IT == italian (not implemented yet)
+    -- > SP == spanish (not implemented yet)
+    -- > ... add your language and help us to implement it (:
+type Ngrams = (Text, Text, Text)
 -- | TODO add Symbolic Node / Document
 --   TODO make instances of Nodes

--- a/stack.yaml
+++ b/stack.yaml
@@ -4,4 +4,5 @@ packages:
 - .
 - /home/alexandre/local/logiciels/haskell/servant/servant-multipart
 extra-deps:
+- utc-0.2.0.1
 resolver: lts-9.2
--- a/test/Ngrams.hs
+++ b/test/Ngrams.hs
+{-# LANGUAGE OverloadedStrings   #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+import Test.Hspec
+import Test.QuickCheck
+import Control.Exception (evaluate)
+import Data.Text (Text)
+import Data.Gargantext.Prelude
+import Data.Gargantext.Types.Main (Language(..))
+import Data.Gargantext.Ngrams
+import Data.Gargantext.Ngrams.Occurrences (parseOccurrences)
+import Data.Gargantext.Ngrams.Parser (extractNgrams, selectNgrams)
+ngramsExtractionTest EN = hspec $ do
+    describe "Ngrams extraction in English Language" $ do
+        let textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. ", "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. ", " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. ", "In both models, the standard drug used was aspirin 100 mg/kg. ", "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. ", "Analgesic activity was studied in rats using hot plate and tail-flick models. ", "Codeine 5 mg/kg and vehicle served as standard and control respectively. ", "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. ", "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "] :: [String]
+        it "\"Of\" seperates two ngrams" $ do
+            t1 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 0) 
+            t1 `shouldBe` [[("Alcoholic extract","NN","O"),("Kaempferia galanga","NN","O"),("analgesic activities","NN+CC","O"),("antiinflammatory activities","NN+CC","O"),("animal models","NN","O")]]
+        it "Tests the conjunction of coordination in two ngrams with its adjectives" $ do
+            t2 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 2) 
+            t2 `shouldBe` [[("Acute activities","NN+CC","O"),("sub acute inflammatory activities","NN+CC","O"),("rats","NNS","O"),("carrageenan","NN","O"),("paw edema","NN","O"),("cotton pellet","NN","O"),("granuloma models","NN","O")]]
+ngramsExtractionTest FR = hspec $ do
+    describe "Ngrams extraction in English Language" $ do
+        let textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. ", "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. ", " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. ", "In both models, the standard drug used was aspirin 100 mg/kg. ", "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. ", "Analgesic activity was studied in rats using hot plate and tail-flick models. ", "Codeine 5 mg/kg and vehicle served as standard and control respectively. ", "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. ", "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "] :: [String]
+        it "\"Of\" seperates two ngrams" $ do
+            t1 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 0) 
+            t1 `shouldBe` [[("Alcoholic extract","NN","O"),("Kaempferia galanga","NN","O"),("analgesic activities","NN+CC","O"),("antiinflammatory activities","NN+CC","O"),("animal models","NN","O")]]
+        it "Tests the conjunction of coordination in two ngrams with its adjectives" $ do
+            t2 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 2) 
+            t2 `shouldBe` [[("Acute activities","NN+CC","O"),("sub acute inflammatory activities","NN+CC","O"),("rats","NNS","O"),("carrageenan","NN","O"),("paw edema","NN","O"),("cotton pellet","NN","O"),("granuloma models","NN","O")]]
+parsersTest = hspec $ do
+  describe "Parser for occurrences" $ do
+    let txt     = "internet"
+    it "returns the result of one parsing" $ do
+        parseOccurrences "internet" "internet" `shouldBe` Right 1
+    -- | Context of Text should be toLower
+    it "returns the result of one parsing not case sensitive" $ do
+        let txtCase = "Internet"
+        parseOccurrences txtCase "internet"  `shouldBe` Right 1
+    it "returns the result of one parsing after space" $ do
+        parseOccurrences txt " internet"  
+            `shouldBe` Right 1
+    it "returns the result of one parsing after chars" $ do
+        parseOccurrences txt "l'internet"  
+            `shouldBe` Right 1
+    it "returns the result of multiple parsing" $ do
+        parseOccurrences txt "internet internet of things" 
+            `shouldBe` Right 2
+    it "returns the result of multiple parsing separated by text" $ do
+        parseOccurrences txt "internet in the internet of things" 
+            `shouldBe` Right 2
+    it "returns the result of multiple parsing separated by punctuation" $ do
+        parseOccurrences txt "internet. In the internet of things, internet like; internet?"
+            `shouldBe` Right 4
+--  describe "Parser for nodes" $ do
+--    it "returns the result of one parsing after space" $ do
+--      occOfCorpus 249509 "sciences" `shouldReturn` 7
+main :: IO ()
+main = do
+    -- parsersTest
+    -- ngramsExtractionTest EN
+    ngramsExtractionTest FR
--- a/test/Parsers.hs
+++ b/test/Parsers.hs
-{-# LANGUAGE OverloadedStrings #-}
-import Test.Hspec
-import Test.QuickCheck
-import Control.Exception (evaluate)
-import Data.Text (Text)
-import Data.Gargantext.Parsers.Occurrences (parseOccurrences)
-- import Data.Gargantext.Analysis (occOfCorpus)
-parsersTest = hspec $ do
-  describe "Parser for occurrences" $ do
-    let txt     = "internet"
-    it "returns the result of one parsing" $ do
-        parseOccurrences "internet" "internet" `shouldBe` Right 1
-    -- | Context of Text should be toLower
-    it "returns the result of one parsing not case sensitive" $ do
-        let txtCase = "Internet"
-        parseOccurrences txtCase "internet"  `shouldBe` Right 1
-    it "returns the result of one parsing after space" $ do
-        parseOccurrences txt " internet"  
-            `shouldBe` Right 1
-    it "returns the result of one parsing after chars" $ do
-        parseOccurrences txt "l'internet"  
-            `shouldBe` Right 1
-    it "returns the result of multiple parsing" $ do
-        parseOccurrences txt "internet internet of things" 
-            `shouldBe` Right 2
-    it "returns the result of multiple parsing separated by text" $ do
-        parseOccurrences txt "internet in the internet of things" 
-            `shouldBe` Right 2
-    it "returns the result of multiple parsing separated by punctuation" $ do
-        parseOccurrences txt "internet. In the internet of things, internet like; internet?"
-            `shouldBe` Right 4
--  describe "Parser for nodes" $ do
--    it "returns the result of one parsing after space" $ do
--      occOfCorpus 249509 "sciences" `shouldReturn` 7
-main :: IO ()
-main = do
-    parsersTest
--- a/test/Spec.hs
+++ b/test/Spec.hs
@@ -8,7 +8,8 @@ import Control.Exception (evaluate)
 import Data.Text (Text)
 import Data.Hastext.Parsers.Occurrences (parse)
-main = print "hspec $ do
+main :: IO ()
+main = print "hspec $ do"
  describe "Parser for occurrences" $ do
    let txt     = "internet"
@@ -43,5 +44,4 @@ main = print "hspec $ do
-main :: IO ()