[FEAT] grouping ngrams better written (simplified) with semigroup. TODO: update the tests

02e3e8d2 · Alexandre Delanoë · c20d8f5f · 02e3e8d2 · 02e3e8d2 · 02e3e8d2
Commit 02e3e8d2 authored May 09, 2018 by Alexandre Delanoë
13 changed files
--- a/package.yaml
+++ b/package.yaml
@@ -15,7 +15,6 @@ dependencies:
 - text
 library:
  source-dirs: src
-  default-extensions: 
  ghc-options:
  - -Wincomplete-uni-patterns
  - -Wincomplete-record-updates
@@ -33,6 +32,7 @@ library:
  - Gargantext.Text.Metrics.FrequentItemSet
  - Gargantext.Text.Ngrams.PosTagging.CoreNLP
  - Gargantext.Text.Ngrams.PosTagging.Parser
+  - Gargantext.Text.Ngrams.Token.Text
  - Gargantext.Text.Parsers.Date
  - Gargantext.Database
  - Gargantext.API
@@ -54,10 +54,11 @@ library:
  - data-time-segment
  - directory
  - duckling
+  - exceptions
  - filepath
  - fclabels
  - fast-logger
-    #  - haskell-gi-base
+  - full-text-search
  - http-client
  - http-client-tls
  - http-conduit
@@ -134,7 +135,6 @@ tests:
  garg-test:
    main: Main.hs
    source-dirs: src-test
-    default-extensions: 
    ghc-options:
    - -threaded
    - -rtsopts

--- a/src-test/Ngrams/Lang/En.hs
+++ b/src-test/Ngrams/Lang/En.hs
@@ -46,9 +46,3 @@ ngramsExtractionTest = hspec $ do
            t2 `shouldBe` [[("Donald Trump","NNP","PERSON"),("president of the United-States of America","NN","LOCATION")]]


-
-
-
-
-
-
--- a/src/Gargantext/Text.hs
+++ b/src/Gargantext/Text.hs
@@ -20,84 +20,37 @@ n non negative integer
 module Gargantext.Text
  where

-import Data.Char (Char, isAlphaNum, isSpace)
-import Data.Text (Text, filter, toLower, split, splitOn)
 import qualified Data.Text as DT
 --import Data.Text.IO (readFile)

-import Data.Set (Set)
-import qualified Data.Set as S
+
 import Data.Map.Strict  (Map
-                        , empty
-                        , insertWith, unionWith
                        , lookupIndex
                        --, fromList, keys
                        )
+
+import Data.Text (Text, split)
 import qualified Data.Map.Strict as M (filter)
-import Data.Foldable (foldl')

 -----------------------------------------------------------------
-import Gargantext.Text.Ngrams.Stem.En
+import Gargantext.Text.Ngrams
+import Gargantext.Text.Metrics.Occurrences

 import qualified Gargantext.Text.Metrics.FrequentItemSet as FIS
 import Gargantext.Prelude hiding (filter)
 -----------------------------------------------------------------

-
-data ListName = Stop | Candidate | Graph
-  deriving (Show, Eq)
-
-
-
-data Ngroup = Ngroup { _ngroup_label  ::  Ngrams
-                     , _ngroup_ngrams :: [Ngrams]
+data Group = Group { _group_label  ::  Ngrams
+                   , _group_ngrams :: [Ngrams]
                   } deriving (Show)


-data Ngrams = Ngrams { _ngrams_label :: [Text]
-                     , _ngrams_stem  :: Set Text
-                     } deriving (Show)
-
-text2ngrams :: Text -> Ngrams
-text2ngrams txt = Ngrams txt' (S.fromList $ map stem txt')
-  where
-    txt' = splitOn " " txt
-
-equivNgrams :: Ngrams -> Ngrams -> Bool
-equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
-                                        || s2 `S.isSubsetOf` s1
-
-
-type Occ     = Int
--type Index   = Int
-
-- Data Ngrams = Monograms | MultiGrams
-
-ngrams :: Text -> [Text]
-ngrams xs = monograms $ toLower $ filter isGram xs
-
 clean :: Text -> Text
 clean txt = DT.map clean' txt
  where
    clean' '’' = '\''
    clean' c  = c

-monograms :: Text -> [Text]
-monograms txt = split isWord txt
-  where
-    isWord c = c `elem` [' ', '\'', ',', ';']
-
-isGram :: Char -> Bool
-isGram  c  = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
-
-- | Compute the occurrences (occ)
-occ :: Ord a => [a] -> Map a Occ
-occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
-
-- TODO add groups and filter stops
-sumOcc :: Ord a => [Map a Occ] -> Map a Occ
-sumOcc xs = foldl' (unionWith (+)) empty xs
-
 --noApax :: Ord a => Map a Occ -> Map a Occ
 --noApax m = M.filter (>1) m

@@ -135,32 +88,35 @@ text2fis n xs = list2fis n (map ngrams xs)

 -------------------------------------------------------------------
 -- Contexts of text
-
 sentences :: Text -> [Text]
 sentences txt = split isStop txt

 isStop :: Char -> Bool
 isStop c = c `elem` ['.','?','!']

---- | https://en.wikipedia.org/wiki/Text_mining
--testText :: Text
--testText = DT.pack "Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning. Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 'High quality' in text mining usually refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities). Text analysis involves information retrieval, lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, information extraction, data mining techniques including link and association analysis, visualization, and predictive analytics. The overarching goal is, essentially, to turn text into data for analysis, via application of natural language processing (NLP) and analytical methods. A typical application is to scan a set of documents written in a natural language and either model the document set for predictive classification purposes or populate a database or search index with the information extracted."
--
--
--
---- | Tests
----ngramsTest :: [Text]
--ngramsTest =  ocs
--  where
--    --txt = concat <$> lines <$> clean <$> readFile filePath
--    txt = clean $ testText
--    -- | Number of sentences
--    ls   = sentences $ txt
--    -- | Number of monograms used in the full text
--    ws   = ngrams    $ txt
--    -- | stem ngrams
--    -- TODO
--    -- group ngrams
--    ocs  = occ       $ ws
--
--
+
+-- | https://en.wikipedia.org/wiki/Text_mining
+testText_en :: Text
+testText_en = DT.pack "Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning. Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 'High quality' in text mining usually refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities). Text analysis involves information retrieval, lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, information extraction, data mining techniques including link and association analysis, visualization, and predictive analytics. The overarching goal is, essentially, to turn text into data for analysis, via application of natural language processing (NLP) and analytical methods. A typical application is to scan a set of documents written in a natural language and either model the document set for predictive classification purposes or populate a database or search index with the information extracted."
+
+-- | https://fr.wikipedia.org/wiki/Fouille_de_textes
+testText_fr :: Text
+testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances » dans les textes est une spécialisation de la fouille de données et fait partie du domaine de l'intelligence artificielle. Cette technique est souvent désignée sous l'anglicisme text mining. Elle désigne un ensemble de traitements informatiques consistant à extraire des connaissances selon un critère de nouveauté ou de similarité dans des textes produits par des humains pour des humains. Dans la pratique, cela revient à mettre en algorithme un modèle simplifié des théories linguistiques dans des systèmes informatiques d'apprentissage et de statistiques. Les disciplines impliquées sont donc la linguistique calculatoire, l'ingénierie des langues, l'apprentissage artificiel, les statistiques et l'informatique."
+
+-- | Ngrams Test
+-- >>> ngramsTest testText
+-- 248
+ngramsTest :: Text -> Int
+ngramsTest x=  length ws
+  where
+    --txt = concat <$> lines <$> clean <$> readFile filePath
+    txt = clean x
+    -- | Number of sentences
+    --ls   = sentences $ txt
+    -- | Number of monograms used in the full text
+    ws   = ngrams    $ txt
+    -- | stem ngrams
+    -- TODO
+    -- group ngrams
+    --ocs  = occ       $ ws
+
--- a/src/Gargantext/Text/Metrics/Occurrences.hs
+++ b/src/Gargantext/Text/Metrics/Occurrences.hs
@@ -7,17 +7,34 @@ Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX

-Here is a longer description of this module, containing some
-commentary with @some markup@.
+Token and occurrence
+
+An occurrence is not necessarily a token. Considering the sentence:
+"A rose is a rose is a rose". We may equally correctly state that there
+are eight or three words in the sentence. There are, in fact, three word
+types in the sentence: "rose", "is" and "a". There are eight word tokens
+in a token copy of the line. The line itself is a type. There are not
+eight word types in the line. It contains (as stated) only the three
+word types, 'a', 'is' and 'rose', each of which is unique. So what do we
+call what there are eight of? They are occurrences of words. There are
+three occurrences of the word type 'a', two of 'is' and three of 'rose'.
+Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences
+
 -}

 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Text.Metrics.Occurrences where
+module Gargantext.Text.Metrics.Occurrences
+  where

 import Gargantext.Prelude

+import Data.Map.Strict  (Map
+                        , empty
+                        , insertWith, unionWith
+                        )
+
 import Control.Monad ((>>),(>>=))
 import Data.String (String())
 import Data.Attoparsec.Text
@@ -25,7 +42,18 @@ import Data.Text (Text)

 import Data.Either.Extra(Either(..))
 import qualified Data.Text as T
-import Control.Applicative
+import Control.Applicative hiding (empty)
+-----------------------------------------------------------
+
+type Occ = Int
+
+-- | Compute the occurrences (occ)
+occ :: Ord a => [a] -> Map a Occ
+occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
+
+-- TODO add groups and filter stops
+sumOcc :: Ord a => [Map a Occ] -> Map a Occ
+sumOcc xs = foldl' (unionWith (+)) empty xs


 occurrenceParser :: Text -> Parser Bool

--- a/src/Gargantext/Text/Ngrams.hs
+++ b/src/Gargantext/Text/Ngrams.hs
 {-|
-Module      : Gargantext.Parsers
-Description : All parsers of Gargantext in one file.
+Module      : Gargantext.Text.Ngrams
+Description : Ngrams definition and tools
 Copyright   : (c) CNRS, 2017 - present
 License     : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX

-Mainly reexport functions in @Data.Text.Metrics@
+An @n-gram@ is a contiguous sequence of n items from a given sample of
+text. In Gargantext application the items are words, n is a non negative
+integer.
+
+Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
+"unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
+3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
+"four-gram", "five-gram", and so on.
+
+Source: https://en.wikipedia.org/wiki/Ngrams
+
 -}

 {-# LANGUAGE NoImplicitPrelude #-}
@@ -15,3 +25,49 @@ Mainly reexport functions in @Data.Text.Metrics@
 module Gargantext.Text.Ngrams
  where

+import Data.Char (Char, isAlphaNum, isSpace)
+import Data.Text (Text, split, splitOn, pack)
+
+import Data.Set  (Set)
+import qualified Data.Set as S
+
+import Gargantext.Prelude
+import Gargantext.Core
+
+import Gargantext.Text.Ngrams.Stem (stem)
+
+
+data Ngrams = Ngrams { _ngrams_label :: [Text]
+                     , _ngrams_stem  :: Set Text
+                     } deriving (Show)
+
+
+data Terms = MonoGrams | MultiGrams
+type MonoGrams  = Text
+type MultiGrams = [Text]
+
+
+ngrams :: Text -> [Text]
+ngrams = monograms
+
+text2ngrams :: Lang -> Text -> Ngrams
+text2ngrams lang txt = Ngrams txt' (S.fromList $ map (stem lang) txt')
+  where
+    txt' = splitOn (pack " ") txt
+
+
+equivNgrams :: Ngrams -> Ngrams -> Bool
+equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
+                                       || s2 `S.isSubsetOf` s1
+
+--monograms :: Text -> [Text]
+--monograms xs = monograms $ toLower $ filter isGram xs
+
+monograms :: Text -> [Text]
+monograms txt = split isWord txt
+  where
+    isWord c = c `elem` [' ', '\'', ',', ';']
+
+isGram :: Char -> Bool
+isGram  c  = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
+
--- a/src/Gargantext/Text/Ngrams/Lists.hs
+++ b/src/Gargantext/Text/Ngrams/Lists.hs
@@ -19,8 +19,13 @@ module Gargantext.Text.Ngrams.Lists
 --import Data.Maybe
 --import Data.List (filter)
 --import Gargantext.Text
--import Gargantext.Prelude
+import Gargantext.Prelude
 --
+
+data ListName = Stop | Candidate | Graph
+  deriving (Show, Eq)
+
+
 --graph :: [Ngrams] -> [Ngrams]
 --graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs
 --
@@ -30,3 +35,4 @@ module Gargantext.Text.Ngrams.Lists
 --stop :: [Ngrams] -> [Ngrams]
 --stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs

+
--- a/src/Gargantext/Text/Ngrams/PosTagging/CoreNLP.hs
+++ b/src/Gargantext/Text/Ngrams/PosTagging/CoreNLP.hs
@@ -9,12 +9,13 @@ Portability : POSIX

 -}

-
 {-# LANGUAGE DataKinds         #-}
 {-# LANGUAGE DeriveGeneric     #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE TemplateHaskell   #-}
 {-# LANGUAGE TypeOperators     #-}
+{-# LANGUAGE FlexibleContexts  #-}
+

 module Gargantext.Text.Ngrams.PosTagging.CoreNLP
  where
@@ -22,9 +23,15 @@ module Gargantext.Text.Ngrams.PosTagging.CoreNLP
 import GHC.Generics
 import GHC.Show (Show(..))

+import Data.ByteString.Lazy.Internal (ByteString)
 import Data.Aeson.TH (deriveJSON)
-import Data.Monoid ((<>))
-import Data.Text (Text)
+import Data.Aeson
+import Data.Monoid
+import Data.Maybe (isJust)
+
+import Data.Set (Set, fromList, empty)
+
+import Data.Text (Text, splitOn, pack, toLower, unpack)

 import Gargantext.Core (Lang(..))
 import Gargantext.Core.Utils.Prefix (unPrefix)
@@ -32,24 +39,107 @@ import Gargantext.Prelude

 import Network.HTTP.Simple

-
+import Control.Monad.Catch      (MonadThrow)
+import Control.Monad.IO.Class  (MonadIO)
+import Data.String.Conversions (ConvertibleStrings)
+
+------------------------------------------------------------------------
+data Tag = POS | NER
+  deriving (Show, Eq)
+------------------------------------------------------------------------
+------------------------------------------------------------------------
+data POS = NP
+         | JJ  | VB
+         | CC  | IN | DT
+         | NoPos
+  deriving (Show, Generic, Eq)
+
+------------------------------------------------------------------------
+instance FromJSON POS where
+  parseJSON = withText "String" (\x -> pure (pos $ unpack x))
+    where
+      pos :: [Char] -> POS
+      pos "NP"  = NP
+      pos "NN"  = NP
+      pos "NC"  = NP
+      pos "NNS" = NP
+      pos "NNP" = NP
+      pos "JJ"  = JJ
+      pos "ADJ" = JJ
+      pos "VB"  = VB
+      pos "VBN" = VB
+      pos "VBG" = VB
+      pos "CC"  = CC
+      pos "IN"  = IN
+      pos "DT"  = DT
+      -- French specific
+      pos "P"  = IN
+      pos  _    = NoPos
+
+instance ToJSON POS
+------------------------------------------------------------------------
+data NER = PERSON | ORGANIZATION | LOCATION | NoNER
+  deriving (Show, Generic)
+------------------------------------------------------------------------
+instance FromJSON NER where
+  parseJSON = withText "String" (\x -> pure (ner $ unpack x))
+    where
+      ner :: [Char] -> NER
+      ner "PERSON"       = PERSON
+      ner "ORGANIZATION" = ORGANIZATION
+      ner "LOCATION"     = LOCATION
+      ner  _             = NoNER
+
+instance ToJSON NER
+------------------------------------------------------------------------
 data Token = Token { _tokenIndex                :: Int
                   , _tokenWord                 :: Text
                   , _tokenOriginalText         :: Text
                   , _tokenLemma                :: Text
                   , _tokenCharacterOffsetBegin :: Int
                   , _tokenCharacterOffsetEnd   :: Int
-                   , _tokenPos                  :: Text
-                   , _tokenNer                  :: Text
+                   , _tokenPos                  :: Maybe POS
+                   , _tokenNer                  :: Maybe NER
                   , _tokenBefore               :: Maybe Text
                   , _tokenAfter                :: Maybe Text
                   } deriving (Show, Generic)
 $(deriveJSON (unPrefix "_token") ''Token)
+------------------------------------------------------------------------
+data NgramsTag  = NgramsTag { _my_token_word :: [Text]
+                            , _my_token_stem :: Set Text
+                            , _my_token_pos  :: Maybe POS
+                            , _my_token_ner  :: Maybe NER
+                            } deriving (Show)
+------------------------------------------------------------------------
+tokens2ngramsTags :: [Token] -> [NgramsTag]
+tokens2ngramsTags ts = select $ map ngramsTag ts
+------------------------------------------------------------------------
+ngramsTag :: Token -> NgramsTag
+ngramsTag (Token _ _ w s _ _ p n _ _) = NgramsTag w' s' p n
+  where
+    w' = split w
+    s' = fromList (split s)
+    split = splitOn (pack " ") . toLower

-token2text :: Token -> (Text, Text, Text)
-token2text (Token _ w _ _ _ _ p n _ _) = (w,p,n)
+select :: [NgramsTag] -> [NgramsTag]
+select xs = filter isNgrams xs
+    where
+      isNgrams (NgramsTag _ _ p n) = isJust p || isJust n

+instance Monoid NgramsTag where
+  mempty = NgramsTag [] empty Nothing Nothing

+  mappend (NgramsTag w1 s1 p1 n1) (NgramsTag w2 s2 p2 _)
+         = NgramsTag (w1 <> w2) (s1 <> s2) p3 n1
+          where
+            p3 = case (p1,p2) of
+                   (Just JJ, Just NP) -> Just NP
+                   (Just VB, Just NP) -> Just NP
+                   _                  -> p1
+
+  mconcat = foldl mappend mempty
+
+------------------------------------------------------------------------
 data Sentence  = Sentence { _sentenceIndex :: Int
                          , _sentenceTokens :: [Token]
                          } deriving (Show, Generic)
@@ -82,16 +172,28 @@ $(deriveJSON (unPrefix "_") ''Sentences)
 -- 


-corenlp :: Lang -> Text -> IO Sentences
-corenlp lang txt = do
+corenlp' :: ( MonadThrow m, MonadIO m, FromJSON a
+            , ConvertibleStrings p ByteString) =>
+            Lang -> p -> m (Response a)
+corenlp' lang txt = do
    let properties = case lang of
            EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
            -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
            FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
    url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
    let request = setRequestBodyLBS (cs txt) url
-    response <- httpJSON request
-    pure (getResponseBody response :: Sentences)
+    httpJSON request
+
+corenlpRaw :: Lang -> Text -> IO Value
+corenlpRaw lang txt = do
+  response <- corenlp' lang txt
+  pure (getResponseBody response)
+
+
+corenlp :: Lang -> Text -> IO Sentences
+corenlp lang txt = do
+  response <- corenlp' lang txt
+  pure (getResponseBody response)

 -- | parseWith
 -- Part Of Speech example
@@ -102,7 +204,9 @@ corenlp lang txt = do
 -- parseWith  _tokenNer     "Hello world of Peter."
 -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
 tokenWith :: (Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
-tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> _sentences <$> corenlp lang s
-
+tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) 
+                  <$> map _sentenceTokens
+                  <$> _sentences 
+                  <$> corenlp lang s


--- a/src/Gargantext/Text/Ngrams/PosTagging/Lang/En.hs
+++ b/src/Gargantext/Text/Ngrams/PosTagging/Lang/En.hs
@@ -14,87 +14,99 @@ commentary with @some markup@.
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Text.Ngrams.PosTagging.Lang.En (selectNgrams, groupNgrams, textTest) 
+module Gargantext.Text.Ngrams.PosTagging.Lang.En (group)
  where

+--import Data.Text (Text)
+import Data.Maybe (Maybe(Just))
+import Gargantext.Text.Ngrams.PosTagging.CoreNLP
 import Gargantext.Prelude
-import Data.Text (Text)
-import Data.Monoid ((<>))
-
-selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
-selectNgrams xs = filter isNgrams xs
-    where
-        isNgrams (_,"NN"   ,_             ) = True
-        isNgrams (_,"NNS"  ,_             ) = True
-        isNgrams (_,"NNP"  ,_             ) = True
-        isNgrams (_,"NN+CC",_             ) = True
-        isNgrams (_,_      ,"PERSON"      ) = True
-        isNgrams (_,_      ,"ORGANIZATION") = True
-        isNgrams (_,_      ,"LOCATION"    ) = True
-        isNgrams (_,_      ,_             ) = False
-
-
-groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
-groupNgrams []       = []
-
-groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs)      = groupNgrams (jn1:cc:jn2:xs)
-    where
-        jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
-        cc  = (c1, "CC", c1')
-        jn1 = (j1, "JJ", j1')
-        jn2 = jn j2 j3 j2'
-
-groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs)      = groupNgrams (jn1:jn2:xs)
-    where
-        jn j m mm p = (j <> " " <> m, p, mm)
-        jn1 = jn j1 n nn ("NN+CC" :: Text)
-        jn2 = jn j2 n nn ("NN+CC" :: Text)
-
-groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs)      = groupNgrams (jn1:jn2:xs)
-    where
-        jn j m mm p = (j <> " " <> m, p, mm)
-        jn1 = jn j1 n nn ("NN+CC" :: Text)
-        jn2 = jn j2 n nn ("NN+CC" :: Text)
-
-groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs)    = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
-groupNgrams ((x,"JJ",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-
-groupNgrams ((x,"NNP",_):(y,"NN",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-groupNgrams ((x,"NN",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-
-groupNgrams ((x,"NN",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-groupNgrams ((x,"NP",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-
-groupNgrams ((x,"NN",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-
-
-- extractNgrams "Test the antiinflammatory or analgesic activity?"
-- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
-- > should be (antiinflammatory activity) <> (analgesic activity)
-
-groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs)       = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
-groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs)      = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
-
-groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
-groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs)   = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
-
-groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
-groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
-groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs)         = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
-
-groupNgrams (x:xs)                                          = (x:(groupNgrams xs))
-
-
-textTest :: [Text]
-textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
-           , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
-           , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
-          , "In both models, the standard drug used was aspirin 100 mg/kg. "
-          , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
-         , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
-         , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
-         , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
-         , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
-

+group :: [NgramsTag] -> [NgramsTag]
+group [] = []
+group ntags = group2 NP NP
+           $ group2 NP VB
+           $ group2 NP IN
+           $ group2 IN DT
+           $ group2 VB NP
+           $ group2 JJ NP
+           $ group2 JJ JJ
+           $ group2 JJ CC
+           $ ntags
+
+------------------------------------------------------------------------
+-- | FIXME p1 and p2 not really taken into account
+group2 :: POS -> POS -> [NgramsTag] -> [NgramsTag]
+group2 p1 p2 (x@(NgramsTag _ _ (Just p1') _):y@(NgramsTag _ _ (Just p2') _):z) =
+  if (p1 == p1') && (p2 == p2')
+     then group2 p1 p2 (x<>y : z)
+     else (x : group2 p1 p2 (y:z))
+group2 p1 p2 (x@(NgramsTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
+group2 _ _ [x@(NgramsTag _ _ (Just _) _)] = [x]
+group2 p1 p2 (x@(NgramsTag _ _ (Just _) _):y@(NgramsTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
+group2 _ _ [] = []
+
+-- group3 :: POS -> POS -> POS -> [NgramsTag] -> [NgramsTag]
+-- group xs = group3 NN IN DT xs
+
+
+-- TO BE REMOVED old code
+--groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs)      = groupNgrams (jn1:cc:jn2:xs)
+--    where
+--        jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
+--        cc  = (c1, "CC", c1')
+--        jn1 = (j1, "JJ", j1')
+--        jn2 = jn j2 j3 j2'
+--
+--groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs)      = groupNgrams (jn1:jn2:xs)
+--    where
+--        jn j m mm p = (j <> " " <> m, p, mm)
+--        jn1 = jn j1 n nn ("NN+CC" :: Text)
+--        jn2 = jn j2 n nn ("NN+CC" :: Text)
+--
+--groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs)      = groupNgrams (jn1:jn2:xs)
+--    where
+--        jn j m mm p = (j <> " " <> m, p, mm)
+--        jn1 = jn j1 n nn ("NN+CC" :: Text)
+--        jn2 = jn j2 n nn ("NN+CC" :: Text)
+--
+--groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs)    = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
+--groupNgrams ((x,"JJ",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+--groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+--
+--groupNgrams ((x,"NNP",_):(y,"NN",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+--groupNgrams ((x,"NN",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+--
+--groupNgrams ((x,"NN",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+--groupNgrams ((x,"NP",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+--
+--groupNgrams ((x,"NN",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
+--
+--
+---- extractNgrams "Test the antiinflammatory or analgesic activity?"
+---- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
+---- > should be (antiinflammatory activity) <> (analgesic activity)
+--
+--groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs)       = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
+--groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs)      = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
+--
+--groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
+--groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs)   = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
+--
+--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
+--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
+--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs)         = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
+--
+--groupNgrams (x:xs)                                          = (x:(groupNgrams xs))
+
+
+--textTest :: [Text]
+--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
+--           , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
+--           , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
+--          , "In both models, the standard drug used was aspirin 100 mg/kg. "
+--          , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
+--         , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
+--         , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
+--         , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
+--         , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
--- a/src/Gargantext/Text/Ngrams/PosTagging/Lang/Fr.hs
+++ b/src/Gargantext/Text/Ngrams/PosTagging/Lang/Fr.hs
@@ -7,79 +7,118 @@ Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX

-Here is a longer description of this module, containing some
-commentary with @some markup@.
+This @group@ function groups horizontally ngrams in their context of
+sentence according to grammars specific of each language. In english, JJ
+is ADJectiv in french. -
+
 -}

 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (selectNgrams, groupNgrams, textTest)
+module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (group)
  where

+import Data.Maybe (Maybe(Just))
+import Gargantext.Text.Ngrams.PosTagging.CoreNLP
 import Gargantext.Prelude
-import Data.Text (Text)
-import Data.Monoid ((<>))
-
-selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
-selectNgrams xs = filter selectNgrams' xs
-    where
-        selectNgrams' (_,"N"    ,_       ) = True
-        selectNgrams' (_,"NC"   ,_       ) = True
-        selectNgrams' (_,"NN+CC",_       ) = True
-        selectNgrams' (_,_      ,"PERSON"      ) = True
-        selectNgrams' (_,_      ,"ORGANIZATION") = True
-        selectNgrams' (_,_      ,"LOCATION"    ) = True
-        selectNgrams' (_,_      ,_       ) = False
-
-
-groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
-groupNgrams []       = []
-
--groupNgrams ((_,"DET",_):xs) = groupNgrams xs
-
-- "Groupe : nom commun et adjectifs avec conjonction"
-groupNgrams ((n,"NC",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
-    where
-        n1 = (n <> " " <> j1, "NC", n')
-        n2 = (n <> " " <> j2, "NC", n')
-
-- /!\ sometimes N instead of NC (why?)
-groupNgrams ((n,"N",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
-    where
-        n1 = (n <> " " <> j1, "N", n')
-        n2 = (n <> " " <> j2, "N", n')
-
-- Groupe : Adjectif + Conjonction de coordination + Adjectif
-- groupNgrams ((j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",j2'):xs) = groupNgrams ((j1 <> " " <> j2, "ADJ", j2'):xs)
-
-- Groupe : Nom commun + préposition + Nom commun
-groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NC",n2'):xs)  = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
-groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
-groupNgrams ((n1,"NC",_):(prep,"P",_):(det,"DET",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> prep <> " " <> det <> " " <> n2, "NC", n2'):xs)
-
-- Groupe : Plusieurs adjectifs successifs
-groupNgrams ((x,"ADJ",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "ADJ", yy):xs)
-
-- Groupe : nom commun et adjectif
-groupNgrams ((x,"NC",_):(y,"ADJ",yy):xs)  = groupNgrams ((x <> " " <> y, "NC", yy):xs)
-- /!\ sometimes N instead of NC (why?)
-groupNgrams ((x,"N",_):(y,"ADJ",yy):xs)   = groupNgrams ((x <> " " <> y, "NC", yy):xs)
-
-- Groupe : adjectif et nom commun
-groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs)  = groupNgrams ((x <> " " <> y, "NC", yy):xs)
-- /!\ sometimes N instead of NC (why?)
-groupNgrams ((x,"ADJ",_):(y,"N",yy):xs)   = groupNgrams ((x <> " " <> y, "NC", yy):xs)
-
-
-groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
-groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
-
-
-- Si aucune des règles précédentes n'est remplie
-groupNgrams (x:xs)                        = (x:(groupNgrams xs))
-

-textTest :: [Text]
-textTest = [ "L'heure d'arrivée des coureurs dépend de la météo du jour."]

+group :: [NgramsTag] -> [NgramsTag]
+group [] = []
+group ntags = group2 NP NP
+           $ group2 NP VB
+           $ group2 NP IN
+           $ group2 IN DT
+           $ group2 VB NP
+           $ group2 JJ NP
+           $ group2 NP JJ
+           $ group2 JJ JJ
+           $ group2 JJ CC
+           $ ntags
+
+------------------------------------------------------------------------
+-- | FIXME p1 and p2 not really taken into account
+group2 :: POS -> POS -> [NgramsTag] -> [NgramsTag]
+group2 p1 p2 (x@(NgramsTag _ _ (Just p1') _):y@(NgramsTag _ _ (Just p2') _):z) =
+  if (p1 == p1') && (p2 == p2')
+     then group2 p1 p2 (x<>y : z)
+     else (x : group2 p1 p2 (y:z))
+group2 p1 p2 (x@(NgramsTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
+group2 _ _ [x@(NgramsTag _ _ (Just _) _)] = [x]
+group2 p1 p2 (x@(NgramsTag _ _ (Just _) _):y@(NgramsTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
+group2 _ _ [] = []
+
+
+------------------------------------------------------------------------
+
+
+--module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (selectNgrams, groupNgrams, textTest)
+--    where
+--
+--import Gargantext.Prelude
+--import Data.Text (Text)
+--import Data.Monoid ((<>))
+--
+--selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
+--selectNgrams xs = filter selectNgrams' xs
+--    where
+--        selectNgrams' (_,"N"    ,_       ) = True
+--        selectNgrams' (_,"NC"   ,_       ) = True
+--        selectNgrams' (_,"NN+CC",_       ) = True
+--        selectNgrams' (_,_      ,"PERSON"      ) = True
+--        selectNgrams' (_,_      ,"ORGANIZATION") = True
+--        selectNgrams' (_,_      ,"LOCATION"    ) = True
+--        selectNgrams' (_,_      ,_       ) = False
+--
+--
+--groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
+--groupNgrams []       = []
+--
+----groupNgrams ((_,"DET",_):xs) = groupNgrams xs
+--
+---- "Groupe : nom commun et adjectifs avec conjonction"
+--groupNgrams ((n,"NC",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
+--    where
+--        n1 = (n <> " " <> j1, "NC", n')
+--        n2 = (n <> " " <> j2, "NC", n')
+--
+---- /!\ sometimes N instead of NC (why?)
+--groupNgrams ((n,"N",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
+--    where
+--        n1 = (n <> " " <> j1, "N", n')
+--        n2 = (n <> " " <> j2, "N", n')
+--
+---- Groupe : Adjectif + Conjonction de coordination + Adjectif
+---- groupNgrams ((j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",j2'):xs) = groupNgrams ((j1 <> " " <> j2, "ADJ", j2'):xs)
+--
+---- Groupe : Nom commun + préposition + Nom commun
+--groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NC",n2'):xs)  = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
+--groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
+--groupNgrams ((n1,"NC",_):(prep,"P",_):(det,"DET",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> prep <> " " <> det <> " " <> n2, "NC", n2'):xs)
+--
+---- Groupe : Plusieurs adjectifs successifs
+--groupNgrams ((x,"ADJ",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "ADJ", yy):xs)
+--
+---- Groupe : nom commun et adjectif
+--groupNgrams ((x,"NC",_):(y,"ADJ",yy):xs)  = groupNgrams ((x <> " " <> y, "NC", yy):xs)
+---- /!\ sometimes N instead of NC (why?)
+--groupNgrams ((x,"N",_):(y,"ADJ",yy):xs)   = groupNgrams ((x <> " " <> y, "NC", yy):xs)
+--
+---- Groupe : adjectif et nom commun
+--groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs)  = groupNgrams ((x <> " " <> y, "NC", yy):xs)
+---- /!\ sometimes N instead of NC (why?)
+--groupNgrams ((x,"ADJ",_):(y,"N",yy):xs)   = groupNgrams ((x <> " " <> y, "NC", yy):xs)
+--
+--
+--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
+--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
+--
+--
+---- Si aucune des règles précédentes n'est remplie
+--groupNgrams (x:xs)                        = (x:(groupNgrams xs))
+--
+--
+--textTest :: [Text]
+--textTest = [ "L'heure d'arrivée des coureurs dépend de la météo du jour."]
+--
--- a/src/Gargantext/Text/Ngrams/PosTagging/Parser.hs
+++ b/src/Gargantext/Text/Ngrams/PosTagging/Parser.hs
@@ -9,63 +9,43 @@ Portability : POSIX

 Here is a longer description of this module, containing some
 commentary with @some markup@.
+
+Ngrams selection algorithms
+A form is a list of characters seperated by one or more spaces in a sentence.
+A word is a form.
+
 -}

 {-# LANGUAGE NoImplicitPrelude   #-}
-{-# LANGUAGE OverloadedStrings   #-}
 {-# LANGUAGE ScopedTypeVariables #-}

 module Gargantext.Text.Ngrams.PosTagging.Parser
  where

-import Gargantext.Prelude
-import Data.Text hiding (map)
+import Data.Text hiding (map, group)

+import Gargantext.Prelude
 import Gargantext.Core (Lang(..))
 import Gargantext.Text.Ngrams.PosTagging.CoreNLP
+
 import qualified Gargantext.Text.Ngrams.PosTagging.Lang.En as En
 import qualified Gargantext.Text.Ngrams.PosTagging.Lang.Fr as Fr

-type SNgrams       = (Text, Text, Text)
-
-- | Ngrams selection algorithms
-- A form is a list of characters seperated by one or more spaces in a sentence.
-- A word is a form.
-
-- type Form = [Char]
-- For performance reasons, Type Text is used, then:
-- type Form = Text
-
-
-- Let be a form and its associated forms in contexts of a sentence.
-- Forms and subfoorms can be representend as Tree whose top is the minimal form
-- as a monogram whos occurrences are 
-
-- ps : Common words function in Haskell do not take sentence into account
-

-- TODO for scientific papers: add maesures
-- TODO add the p score regex
-extractNgrams :: Lang -> Text -> IO [[SNgrams]]
-extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s
+extractNgrams :: Lang -> Text -> IO [[NgramsTag]]
+extractNgrams lang s = map (group lang) <$> extractNgrams' lang s


-extractNgrams' :: Lang -> Text -> IO [[SNgrams]]
-extractNgrams' lang t =  map (map token2text)
+extractNgrams' :: Lang -> Text -> IO [[NgramsTag]]
+extractNgrams' lang t =  map tokens2ngramsTags
                     <$> map _sentenceTokens
                     <$> _sentences
                     <$> corenlp lang t

-- | This function selects ngrams according to grammars specific
--   of each language.
--   In english, JJ is ADJectiv in french.
-selectNgrams :: Lang -> [SNgrams] -> [SNgrams]
-selectNgrams EN = En.selectNgrams
-selectNgrams FR = Fr.selectNgrams

-- | This function analyze and groups (or not) ngrams according to 
--   grammars specific of each language.
-groupNgrams :: Lang -> [SNgrams] -> [SNgrams]
-groupNgrams EN = En.groupNgrams
-groupNgrams FR = Fr.groupNgrams
+---- | This function analyse and groups (or not) ngrams according to 
+----   grammars specific of each language.
+group :: Lang -> [NgramsTag] -> [NgramsTag]
+group EN = En.group
+group FR = Fr.group

--- a/src/Gargantext/Text/Ngrams/Stem.hs
+++ b/src/Gargantext/Text/Ngrams/Stem.hs
@@ -7,8 +7,14 @@ Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX

-Here is a longer description of this module, containing some
-commentary with @some markup@.
+In linguistic morphology and information retrieval, stemming is the
+process of reducing inflected (or sometimes derived) words to their word
+stem, base or root form—generally a written word form. The @stem@ needs
+not be identical to the morphological root of the word; it is usually
+sufficient that related words map to the same stem, even if this stem is
+not in itself a valid root.
+Source : https://en.wikipedia.org/wiki/Stemming
+
 -}


@@ -26,6 +32,23 @@ import Gargantext.Core (Lang(..))
 --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
 --import Language.Aspell.Options (ACOption(..))

+
+-- | Stemmer
+
+-- A stemmer for English, for example, should identify the string "cats"
+-- (and possibly "catlike", "catty" etc.) as based on the root "cat".
+
+
+-- and
+-- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
+-- algorithm reduces the words "fishing", "fished", and "fisher" to the
+-- root word, "fish". On the other hand, "argue", "argued", "argues",
+-- "arguing", and "argus" reduce to the stem "argu" (illustrating the
+-- case where the stem is not itself a word or root) but "argument" and
+-- "arguments" reduce to the stem "argument".
+
+
+
 stem :: Lang -> Text -> Text
 stem lang = DT.pack . N.stem lang' . DT.unpack
  where
@@ -33,3 +56,5 @@ stem lang = DT.pack . N.stem lang' . DT.unpack
              EN -> N.English
              FR -> N.French

+
+
--- a/src/Gargantext/Text/Ngrams/Token.hs
+++ b/src/Gargantext/Text/Ngrams/Token.hs
+{-|
+Module      : Gargantext.Text.Ngrams.Token
+Description : Tokens and tokenizing a text
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+In computer science, lexical analysis, lexing or tokenization is the
+process of converting a sequence of characters (such as in a computer
+program or web page) into a sequence of tokens (strings with an assigned
+and thus identified meaning).
+Source: https://en.wikipedia.org/wiki/Tokenize
+
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+
+module Gargantext.Text.Ngrams.Token (tokenize)
+  where
+
+import Data.Text (Text)
+import qualified Gargantext.Text.Ngrams.Token.Text as En
+
+type Token = Text
+
+-- >>> tokenize "A rose is a rose is a rose."
+-- ["A","rose","is","a","rose","is","a","rose", "."]
+-- 
+tokenize :: Text -> [Token]
+tokenize = En.tokenize
+
--- a/stack.yaml
+++ b/stack.yaml
@@ -16,6 +16,7 @@ extra-deps:
 - aeson-lens-0.5.0.0
 - duckling-0.1.3.0
 - extra-1.5.3
+- full-text-search-0.2.1.4
 - haskell-src-exts-1.18.2
 - http-types-0.12.1
 - protolude-0.2