Merge branch 'pipeline'

7e930b57 · Alexandre Delanoë · 6b1bba16 · 0be01d72 · 7e930b57 · 7e930b57
Commit 7e930b57 authored May 31, 2018 by Alexandre Delanoë
52 changed files
--- a/README.md
+++ b/README.md
@@ -48,6 +48,10 @@ sudo apt-get install libbz2-dev lipq-dev

 git clone https://github.com/np/servant-job.git

+### Get the clustering louvain library
+
+git clone https://gitlab.iscpif.fr/gargantext/clustering-louvain.git
+
 ## Building and installing
 stack install


--- a/app/CleanCsvCorpus.hs
+++ b/app/CleanCsvCorpus.hs
+{-|
+Module      : CleanCsvCorpus.hs
+Description : Gargantext starter
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Given a Gargantext CSV File and its Query This script cleans and
+compress the contexts around the main terms of the query.
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+
+module CleanCsvCorpus  where
+
+--import GHC.IO (FilePath)
+import Data.SearchEngine as S
+import qualified Data.Set as S
+import Data.Text (pack)
+import Data.Vector (Vector)
+import qualified Data.Vector as V
+
+import Gargantext.Prelude
+import Gargantext.Text.Search
+import Gargantext.Text.Parsers.CSV
+
+------------------------------------------------------------------------
+
+type Query = [S.Term]
+
+filterDocs :: [DocId] -> Vector Doc -> Vector Doc
+filterDocs docIds = V.filter (\doc -> S.member (d_docId doc) $ S.fromList docIds )
+
+
+main :: IO ()
+main = do
+  let rPath = "/tmp/Gargantext_Corpus.csv"
+  let wPath = "/tmp/Gargantext_Corpus_bis.csv"
+  --let q = ["water", "scarcity", "morocco", "shortage","flood"]
+  let q = ["gratuit", "gratuité", "culture", "culturel"]
+
+  (h,csvDocs) <- readCsv rPath
+  
+  putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
+  putStrLn $ "Mean size of docs:" <> show ( docsSize csvDocs)
+  
+  let docs   = toDocs csvDocs
+  let engine = insertDocs docs initialDocSearchEngine
+  let docIds = S.query engine (map pack q)
+  let docs'  = fromDocs $ filterDocs docIds (V.fromList docs)
+  
+  putStrLn $ "Number of documents after:" <> show (V.length docs')
+  putStrLn $ "Mean size of docs:" <> show (docsSize docs')
+  
+  writeCsv wPath (h, docs')
+
+
--- a/app/Main.hs
+++ b/app/Main.hs
@@ -6,27 +6,35 @@ License     : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX
-}

+Script to start gargantext with different modes (Dev, Prod, Mock).

+-}

 {-# LANGUAGE DataKinds         #-}
 {-# LANGUAGE DeriveGeneric     #-}
 {-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE StandaloneDeriving #-}
 {-# LANGUAGE TypeOperators     #-}
-
+{-# LANGUAGE Strict             #-}

 module Main where

-import Prelude (putStrLn)

 import Options.Generic
 import Data.Text (unpack)

 import Gargantext.Prelude
 import Gargantext.API (startGargantext, startGargantextMock)
+
+--------------------------------------------------------
+-- Graph Tests
+--import qualified Gargantext.Graph.Utils                    as U
+--import qualified Gargantext.Graph.Distances.Conditional    as C
+--import qualified Gargantext.Graph.Distances.Distributional as D
+--import qualified Gargantext.Graph.Distances.Matrice        as M
 --------------------------------------------------------

 data Mode = Dev | Mock | Prod 
@@ -45,12 +53,13 @@ data MyOptions w = MyOptions { run  :: w ::: Mode        <?> "Possible modes: De
 instance ParseRecord (MyOptions Wrapped)
 deriving instance Show (MyOptions Unwrapped)

+
+
 main :: IO ()
 main = do 
-
    MyOptions myMode myPort myIniFile  <- unwrapRecord
            "Gargantext: collaborative platform for text-mining"
-    
+
    let myPort' = case myPort of
            Just p  -> p
            Nothing -> 8008
@@ -64,6 +73,11 @@ main = do
                                       Just i  -> i
            Mock -> startGargantextMock myPort'
            _  -> startGargantextMock myPort'
-    
+
    putStrLn $ "Starting Gargantext with mode: " <> show myMode
    start
+
+-- main' :: IO ()
+--main' = putStrLn $ show $ M.conditional $ M.myMat 10
+
+
--- a/package.yaml
+++ b/package.yaml
@@ -18,26 +18,24 @@ library:
  ghc-options:
  - -Wincomplete-uni-patterns
  - -Wincomplete-record-updates
-  - -Werror
+  - -Wmissing-signatures
+  - -Wunused-binds
+  - -Wunused-imports
+    #  - -Werror
  exposed-modules:
  - Gargantext
  - Gargantext.Prelude
  - Gargantext.Core
-  - Gargantext.Core.Utils
  - Gargantext.Core.Types
-  - Gargantext.Core.Types.Node
  - Gargantext.Text
-  - Gargantext.Text.Metrics
-  - Gargantext.Text.Metrics.Occurrences
-  - Gargantext.Text.Metrics.FrequentItemSet
-  - Gargantext.Text.Ngrams.PosTagging.CoreNLP
-  - Gargantext.Text.Ngrams.PosTagging.Parser
-  - Gargantext.Text.Ngrams.Token.Text
-  - Gargantext.Text.Parsers.Date
-  - Gargantext.Database
+  - Gargantext.Text.Search
+  - Gargantext.Text.Parsers.CSV
  - Gargantext.API
+  - Gargantext.Viz.Graph.Distances.Matrice
  dependencies:
  - QuickCheck
+  - accelerate
+  - accelerate-io
  - aeson
  - aeson-lens
  - aeson-pretty
@@ -47,6 +45,7 @@ library:
  - base16-bytestring
  - bytestring
  - case-insensitive
+  - cassava
  - conduit
  - conduit-extra
  - containers
@@ -56,6 +55,7 @@ library:
  - duckling
  - exceptions
  - filepath
+  - fullstop
  - fclabels
  - fast-logger
  - full-text-search
@@ -70,6 +70,7 @@ library:
  - jose-jwt
  - lens
  - logging-effect
+  - matrix
  - monad-logger
  - mtl
  - natural-transformation
@@ -122,14 +123,23 @@ library:

 executable:
  main: Main.hs
-  source-dirs:      app
-  ghc-options:         -threaded -rtsopts -with-rtsopts=-N -O2
+  source-dirs: app
+  ghc-options: 
+  - -threaded 
+  - -rtsopts 
+  - -with-rtsopts=-N 
+  - -O2 
+  - -Wmissing-signatures
  dependencies:
    - base
+    - containers
    - gargantext
+    - vector
+    - cassava
    - ini
    - optparse-generic
    - unordered-containers
+    - full-text-search

 tests:
  garg-test:
@@ -157,6 +167,7 @@ tests:
    - -threaded
    - -rtsopts
    - -with-rtsopts=-N
+    - -Wmissing-signatures
    dependencies:
    - doctest
    - Glob

--- a/src/Gargantext.hs
+++ b/src/Gargantext.hs
@@ -11,7 +11,7 @@ Portability : POSIX
 -}

 module Gargantext (
-                     module Gargantext.Database,
+                     module Gargantext.Database
 --                     module Gargantext.Ngrams,
 --                     module Gargantext.Utils,
                    ) where
@@ -19,3 +19,9 @@ module Gargantext (
 import Gargantext.Database
 -- import Gargantext.Ngrams
 -- import Gargantext.Utils
+
+
+
+
+
+
--- a/src/Gargantext/API/FrontEnd.hs
+++ b/src/Gargantext/API/FrontEnd.hs
@@ -6,18 +6,17 @@ License     : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX
-}

-{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
+Loads all static file for the front-end.

+-}

 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE DataKinds         #-}
 {-# LANGUAGE TemplateHaskell   #-}

 ---------------------------------------------------------------------
-module Gargantext.API.FrontEnd
-      where
+module Gargantext.API.FrontEnd where

 import           Servant.Static.TH (createApiAndServerDecs)


--- a/src/Gargantext/API/Node.hs
+++ b/src/Gargantext/API/Node.hs
@@ -23,8 +23,6 @@ module Gargantext.API.Node
      where
 -------------------------------------------------------------------

-import System.IO (putStrLn)
-
 import Control.Monad.IO.Class (liftIO)
 import Control.Monad ((>>))
 --import System.IO (putStrLn, readFile)
@@ -98,13 +96,13 @@ type FacetDocAPI = "table"

 -- | Node API functions
 roots :: Connection -> Server Roots
-roots conn = liftIO (putStrLn "Log Needed" >> getNodesWithParentId conn 0 Nothing)
+roots conn = liftIO (putStrLn ( "Log Needed" :: Text) >> getNodesWithParentId conn 0 Nothing)
          :<|> pure (panic "not implemented yet")
          :<|> pure (panic "not implemented yet")
          :<|> pure (panic "not implemented yet")

 nodeAPI :: Connection -> NodeId -> Server NodeAPI
-nodeAPI conn id =  liftIO (putStrLn "getNode" >> getNode              conn id )
+nodeAPI conn id =  liftIO (putStrLn ("getNode" :: Text) >> getNode              conn id )
              :<|> deleteNode'   conn id
              :<|> getNodesWith' conn id
              :<|> getFacet      conn id

--- a/src/Gargantext/Core/Types.hs
+++ b/src/Gargantext/Core/Types.hs
@@ -11,11 +11,113 @@ Here is a longer description of this module, containing some
 commentary with @some markup@.
 -}

+------------------------------------------------------------------------
 {-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE DeriveGeneric     #-}

 module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
-                          , module Gargantext.Core.Types.Node
-                          ) where
+                             , module Gargantext.Core.Types.Node
+                             , Term, Terms(..)
+                             , TokenTag(..), POS(..), NER(..)
+                             , Label, Stems
+                             ) where
+
+import GHC.Generics
+import Data.Aeson
+import Data.Monoid
+import Data.Set (Set, empty)
+--import qualified Data.Set as S
+
+import Data.Text (Text, unpack)

 import Gargantext.Core.Types.Main
 import Gargantext.Core.Types.Node
+import Gargantext.Prelude
+
+------------------------------------------------------------------------
+
+type Term  = Text
+type Stems = Set Text
+type Label = [Text]
+
+data Terms = Terms { _terms_label :: Label
+                   , _terms_stem  :: Stems
+                   } deriving (Ord)
+
+instance Show Terms where
+  show (Terms l s) = show l
+-- class Inclusion where include
+--instance Eq Terms where
+--  (==) (Terms _ s1) (Terms _ s2) = s1 `S.isSubsetOf` s2
+--                                || s2 `S.isSubsetOf` s1
+
+instance Eq Terms where
+  (==) (Terms _ s1) (Terms _ s2) = s1 == s2
+
+
+------------------------------------------------------------------------
+data Tag = POS | NER
+  deriving (Show, Eq)
+------------------------------------------------------------------------
+data POS = NP
+         | JJ  | VB
+         | CC  | IN | DT
+         | NoPos
+  deriving (Show, Generic, Eq)
+------------------------------------------------------------------------
+instance FromJSON POS where
+  parseJSON = withText "String" (\x -> pure (pos $ unpack x))
+    where
+      pos :: [Char] -> POS
+      pos "NP"  = NP
+      pos "NN"  = NP
+      pos "NC"  = NP
+      pos "NNS" = NP
+      pos "NNP" = NP
+      pos "JJ"  = JJ
+      pos "ADJ" = JJ
+      pos "VB"  = VB
+      pos "VBN" = VB
+      pos "VBG" = VB
+      pos "CC"  = CC
+      pos "IN"  = IN
+      pos "DT"  = DT
+      -- French specific
+      pos "P"  = IN
+      pos  _    = NoPos
+
+instance ToJSON POS
+------------------------------------------------------------------------
+data NER = PERSON | ORGANIZATION | LOCATION | NoNER
+  deriving (Show, Generic)
+------------------------------------------------------------------------
+instance FromJSON NER where
+  parseJSON = withText "String" (\x -> pure (ner $ unpack x))
+    where
+      ner :: [Char] -> NER
+      ner "PERSON"       = PERSON
+      ner "ORGANIZATION" = ORGANIZATION
+      ner "LOCATION"     = LOCATION
+      ner  _             = NoNER
+
+instance ToJSON NER
+
+data TokenTag  = TokenTag { _my_token_word :: [Text]
+                          , _my_token_stem :: Set Text
+                          , _my_token_pos  :: Maybe POS
+                          , _my_token_ner  :: Maybe NER
+                          } deriving (Show)
+
+instance Monoid TokenTag where
+  mempty = TokenTag [] empty Nothing Nothing
+
+  mappend (TokenTag w1 s1 p1 n1) (TokenTag w2 s2 p2 _)
+         = TokenTag (w1 <> w2) (s1 <> s2) p3 n1
+          where
+            p3 = case (p1,p2) of
+                   (Just JJ, Just NP) -> Just NP
+                   (Just VB, Just NP) -> Just NP
+                   _                  -> p1
+
+  mconcat = foldl mappend mempty
+
--- a/src/Gargantext/Core/Types/Main.hs
+++ b/src/Gargantext/Core/Types/Main.hs
@@ -34,7 +34,7 @@ import Gargantext.Prelude
 ------------------------------------------------------------------------
 -- All the Database is structred like a hierarchical Tree
 data Tree a = NodeT a [Tree a]
-    deriving (Show, Read, Eq)
+  deriving (Show, Read, Eq)

 -- data Tree a = NodeT a [Tree a]
 -- same as Data.Tree

--- a/src/Gargantext/Core/Types/Phylo.hs
+++ b/src/Gargantext/Core/Types/Phylo.hs
@@ -40,9 +40,9 @@ import Gargantext.Core.Utils.Prefix (unPrefix)
 -- Duration : time Segment of the whole phylomemy in UTCTime format (start,end)
 -- Ngrams   : list of all (possible) terms contained in the phylomemy (with their id)
 -- Steps    : list of all steps to build the phylomemy
-data Phylo = Phylo { _phyloDuration :: (Start, End)
-                   , _phyloNgrams   :: [Ngram]
-                   , _phyloPeriods  :: [PhyloPeriod]
+data Phylo = Phylo { _phylo_Duration :: (Start, End)
+                   , _phylo_Ngrams   :: [Ngram]
+                   , _phylo_Periods  :: [PhyloPeriod]
                   } deriving (Generic)

 -- | UTCTime in seconds since UNIX epoch
@@ -55,20 +55,20 @@ type NgramId = Int
 -- | PhyloStep : steps of phylomemy on temporal axis
 -- Period: tuple (start date, end date) of the step of the phylomemy
 -- Levels: levels of granularity
-data PhyloPeriod = PhyloPeriod { _phyloPeriodId     :: PhyloPeriodId
-                               , _phyloPeriodLevels :: [PhyloLevel]
+data PhyloPeriod = PhyloPeriod { _phylo_PeriodId     :: PhyloPeriodId
+                               , _phylo_PeriodLevels :: [PhyloLevel]
                               } deriving (Generic)

 type PhyloPeriodId = (Start, End)

 -- | PhyloLevel : levels of phylomemy on level axis
 -- Levels description:
-- Level -1: Ngram equals itself         (by identity) == _phyloNgrams
+-- Level -1: Ngram equals itself         (by identity) == _phylo_Ngrams
 -- Level  0: Group of synonyms           (by stems + by qualitative expert meaning)
 -- Level  1: First level of clustering
 -- Level  N: Nth   level of clustering
-data PhyloLevel = PhyloLevel { _phyloLevelId     :: PhyloLevelId
-                             , _phyloLevelGroups :: [PhyloGroup]
+data PhyloLevel = PhyloLevel { _phylo_LevelId     :: PhyloLevelId
+                             , _phylo_LevelGroups :: [PhyloGroup]
                             } deriving (Generic)

 type PhyloLevelId = (PhyloPeriodId, Int)
@@ -78,15 +78,15 @@ type PhyloLevelId = (PhyloPeriodId, Int)
 -- Ngrams: set of terms that build the group
 -- Period Parents|Childs: weighted link to Parents|Childs (Temporal Period   axis)
 -- Level  Parents|Childs: weighted link to Parents|Childs (Level Granularity axis)
-data PhyloGroup = PhyloGroup { _phyloGroupId            :: PhyloGroupId
-                             , _phyloGroupLabel         :: Maybe Text
-                             , _phyloGroupNgrams        :: [NgramId]
+data PhyloGroup = PhyloGroup { _phylo_GroupId            :: PhyloGroupId
+                             , _phylo_GroupLabel         :: Maybe Text
+                             , _phylo_GroupNgrams        :: [NgramId]
                   
-                             , _phyloGroupPeriodParents :: [Edge]
-                             , _phyloGroupPeriodChilds  :: [Edge]
+                             , _phylo_GroupPeriodParents :: [Edge]
+                             , _phylo_GroupPeriodChilds  :: [Edge]
                   
-                             , _phyloGroupLevelParents  :: [Edge]
-                             , _phyloGroupLevelChilds   :: [Edge]
+                             , _phylo_GroupLevelParents  :: [Edge]
+                             , _phylo_GroupLevelChilds   :: [Edge]
                             } deriving (Generic)

 type PhyloGroupId = (PhyloLevelId, Int)
@@ -94,7 +94,7 @@ type Edge         = (PhyloGroupId, Weight)
 type Weight       = Double

 -- | JSON instances
-$(deriveJSON (unPrefix "_phylo"       ) ''Phylo       )
-$(deriveJSON (unPrefix "_phyloPeriod" ) ''PhyloPeriod )
-$(deriveJSON (unPrefix "_phyloLevel"  ) ''PhyloLevel  )
-$(deriveJSON (unPrefix "_phyloGroup"  ) ''PhyloGroup  )
+$(deriveJSON (unPrefix "_phylo_"       ) ''Phylo       )
+$(deriveJSON (unPrefix "_phylo_Period" ) ''PhyloPeriod )
+$(deriveJSON (unPrefix "_phylo_Level"  ) ''PhyloLevel  )
+$(deriveJSON (unPrefix "_phylo_Group"  ) ''PhyloGroup  )
--- a/src/Gargantext/Database.hs
+++ b/src/Gargantext/Database.hs
@@ -19,7 +19,7 @@ module Gargantext.Database (
  , module Gargantext.Database.User
  , module Gargantext.Database.Node
  , module Gargantext.Database.NodeNode
-  , module Gargantext.Database.Ngram
+--  , module Gargantext.Database.Ngram
  , module Gargantext.Database.NodeNgram
  , module Gargantext.Database.NodeNodeNgram
  , module Gargantext.Database.NodeNgramNgram
@@ -34,7 +34,7 @@ import Gargantext.Database.Utils
 import Gargantext.Database.User
 import Gargantext.Database.Node
 import Gargantext.Database.NodeNode
-import Gargantext.Database.Ngram
+--import Gargantext.Database.Ngram
 import Gargantext.Database.NodeNgram
 import Gargantext.Database.NodeNodeNgram
 import Gargantext.Database.NodeNgramNgram

--- a/src/Gargantext/Database/Cooc.hs
+++ b/src/Gargantext/Database/Cooc.hs
@@ -11,16 +11,18 @@ Here is a longer description of this module, containing some
 commentary with @some markup@.
 -}

--{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE NoImplicitPrelude #-}
 --{-# LANGUAGE OverloadedStrings #-}

 {-# LANGUAGE QuasiQuotes #-}

 module Gargantext.Database.Cooc where

+import Control.Monad ((>>=))
 import Database.PostgreSQL.Simple
 import Database.PostgreSQL.Simple.SqlQQ

+import Gargantext.Prelude
 import Gargantext (connectGargandb)

 type CorpusId    = Int
@@ -29,10 +31,10 @@ type GroupListId = Int

 coocTest :: IO [(Int, Int, Int)]
 coocTest = connectGargandb "gargantext.ini"
-  >>= \conn -> cooc conn 421968 446602 446599 
+  >>= \conn -> dBcooc conn 421968 446602 446599 

-cooc :: Connection -> CorpusId -> MainListId -> GroupListId -> IO [(Int, Int, Int)]
-cooc conn corpus mainList groupList = query conn [sql|
+dBcooc :: Connection -> CorpusId -> MainListId -> GroupListId -> IO [(Int, Int, Int)]
+dBcooc conn corpus mainList groupList = query conn [sql|
  set work_mem='1GB';

  --EXPLAIN ANALYZE

--- a/src/Gargantext/Database/Ngram.hs
+++ b/src/Gargantext/Database/Ngram.hs
@@ -81,5 +81,5 @@ findWith f t = find (\x -> f x == t)
 --userWithId t xs = userWith userUserId t xs

 -- | not optimized (get all ngrams without filters)
-getNgrams :: PGS.Connection -> IO [Ngram]
-getNgrams conn = runQuery conn queryNgramTable
+dbGetNgrams :: PGS.Connection -> IO [Ngram]
+dbGetNgrams conn = runQuery conn queryNgramTable
--- a/src/Gargantext/Database/Node.hs
+++ b/src/Gargantext/Database/Node.hs
@@ -45,7 +45,7 @@ import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
 import Data.Typeable (Typeable)
 import qualified Data.ByteString.Internal as DBI
 import Database.PostgreSQL.Simple (Connection)
-import Opaleye
+import Opaleye hiding (FromField)

 -- | Types for Node Database Management
 data PGTSVector

--- a/src/Gargantext/Database/Utils.hs
+++ b/src/Gargantext/Database/Utils.hs
@@ -33,7 +33,6 @@ import Database.PostgreSQL.Simple (Connection, connect)
 import Opaleye (Query, Unpackspec, showSqlForPostgres)
 import Data.Profunctor.Product.Default (Default)
 import Data.Maybe (maybe)
-import Prelude (id, putStrLn)
 -- TODO add a reader Monad here
 -- read this in the init file

@@ -61,5 +60,5 @@ connectGargandb fp = do
    connect parameters

 printSql :: Default Unpackspec a a => Query a -> IO ()
-printSql = putStrLn . maybe "Empty query" id . showSqlForPostgres
+printSql = putStrLn . maybe "Empty query" identity . showSqlForPostgres

--- a/src/Gargantext/Pipeline.hs
+++ b/src/Gargantext/Pipeline.hs
+{-|
+Module      : Gargantext.Pipeline
+Description : Server API
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+-}
+
+{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
+{-# LANGUAGE NoImplicitPrelude           #-}
+
+module Gargantext.Pipeline
+  where
+
+import Data.Text.IO (readFile)
+
+----------------------------------------------
+----------------------------------------------
+import Gargantext.Core
+import Gargantext.Prelude
+
+import Gargantext.Viz.Graph.Index (score)
+import Gargantext.Viz.Graph.Distances.Matrice (distributional)
+import Gargantext.Text.Metrics.Occurrences
+import Gargantext.Text.Terms
+import Gargantext.Text.Context
+
+
+pipeline path = do
+  -- Text  <- IO Text <- FilePath
+  text     <- readFile path  
+  let contexts = splitBy (Sentences 3) text
+  myterms <- extractTerms Multi FR contexts
+  -- TODO    filter (\t -> not . elem t stopList) myterms
+  -- TODO    groupBy (Stem | GroupList)
+  let myCooc = removeApax $ cooc myterms
+  -- Cooc -> Matrix
+  pure $ score distributional myCooc
+  -- Matrix -> Clustering -> Graph -> JSON
+
--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -30,21 +30,26 @@ module Gargantext.Prelude
 import Data.Maybe (isJust, fromJust, maybe)
 import Protolude ( Bool(True, False), Int, Double, Integer
                 , Fractional, Num, Maybe(Just,Nothing)
+                 , Enum, Bounded, Float
                 , Floating, Char, IO
                 , pure, (<*>), (<$>), panic
+                 , putStrLn
                 , head, flip
                 , Ord, Integral, Foldable, RealFrac, Monad, filter
-                 , reverse, map, zip, drop, take, zipWith
+                 , reverse, map, mapM, zip, drop, take, zipWith
                 , sum, fromIntegral, length, fmap, foldl, foldl'
                 , takeWhile, sqrt, undefined, identity
                 , abs, min, max, maximum, minimum, return, snd, truncate
-                 , (+), (*), (/), (-), (.), ($), (**), (^), (<), (>), log
+                 , (+), (*), (/), (-), (.), ($), (&), (**), (^), (<), (>), log
                 , Eq, (==), (>=), (<=), (<>), (/=)
-                 , (&&), (||), not
+                 , (&&), (||), not, any
                 , fst, snd, toS
                 , elem, die, mod, div, const, either
-                 , curry, uncurry
+                 , curry, uncurry, repeat
                 , otherwise, when
+                 , undefined
+                 , IO()
+                 , compare
                 )

 -- TODO import functions optimized in Utils.Count
@@ -75,18 +80,6 @@ pr = reverse
 map2 :: (t -> b) -> [[t]] -> [[b]]
 map2 fun = map (map fun)

-pz :: [a] -> [b] -> [(a, b)]
-pz  = zip
-
-pd :: Int -> [a] -> [a]
-pd  = drop
-
-ptk :: Int -> [a] -> [a]
-ptk = take
-
-pzw :: (a -> b -> c) -> [a] -> [b] -> [c]
-pzw = zipWith
-
 -- Exponential Average
 eavg :: [Double] -> Double
 eavg (x:xs) = a*x + (1-a)*(eavg xs)
@@ -114,6 +107,12 @@ movingAverage steps xs = map mean $ chunkAlong steps 1 xs
 ma :: [Double] -> [Double]
 ma = movingAverage 3

+-- | splitEvery n == chunkAlong n n
+splitEvery :: Int -> [a] -> [[a]]
+splitEvery _ [] = L.cycle [[]]
+splitEvery n xs =
+  let (h,t) = L.splitAt n xs
+  in  h : splitEvery n t

 -- | Function to split a range into chunks
 chunkAlong :: Int -> Int -> [a] -> [[a]]
@@ -235,3 +234,8 @@ zipSnd f xs = zip xs (f xs)
 unMaybe :: [Maybe a] -> [a]
 unMaybe = map fromJust . L.filter isJust

+-- maximumWith
+maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))
+
+
+
--- a/src/Gargantext/Text.hs
+++ b/src/Gargantext/Text.hs
@@ -7,10 +7,7 @@ Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX

-Ngrams exctration.
-
-Definitions of ngrams.
-n non negative integer
+Text gathers terms in unit of contexts.

 -}

@@ -20,80 +17,44 @@ n non negative integer
 module Gargantext.Text
  where

+import Data.Maybe
 import qualified Data.Text as DT
--import Data.Text.IO (readFile)
-
-
-import Data.Map.Strict  (Map
-                        , lookupIndex
-                        --, fromList, keys
-                        )

+import qualified Data.Set as S
 import Data.Text (Text, split)
-import qualified Data.Map.Strict as M (filter)

+import NLP.FullStop (segment)
 -----------------------------------------------------------------
-import Gargantext.Text.Ngrams
-import Gargantext.Text.Metrics.Occurrences
-
-import qualified Gargantext.Text.Metrics.FrequentItemSet as FIS
+import Gargantext.Core
+import Gargantext.Core.Types
+import Gargantext.Text.Metrics.Occurrences (Occ, occurrences, cooc)
 import Gargantext.Prelude hiding (filter)
 -----------------------------------------------------------------

-data Group = Group { _group_label  ::  Ngrams
-                   , _group_ngrams :: [Ngrams]
-                   } deriving (Show)
-
+type Config  = Lang -> Context

-clean :: Text -> Text
-clean txt = DT.map clean' txt
-  where
-    clean' '’' = '\''
-    clean' c  = c
-
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
-
-- | /!\ indexes are not the same:
+type Context = Text -> [Text]

-- | Index ngrams from Map
--indexNgram :: Ord a => Map a Occ -> Map Index a
--indexNgram m = fromList (zip [1..] (keys m))
+data Viz = Graph | Phylo | Chart

-- | Index ngrams from Map
--ngramIndex :: Ord a => Map a Occ -> Map a Index
--ngramIndex m = fromList (zip (keys m) [1..])
+pipeline :: Config -> Text -> Viz
+pipeline = undefined

-indexWith :: Ord a => Map a Occ -> [a] -> [Int]
-indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
-
-indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
-indexIt xs = (m, is)
-  where
-    m  = sumOcc (map occ  xs)
-    is = map    (indexWith m) xs
-
-list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
-list2fis n xs = (m', fs)
-  where
-    (m, is) = indexIt xs
-    m'      = M.filter (>50000) m
-    fs      = FIS.all n is
-
-text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
-text2fis n xs = list2fis n (map ngrams xs)
-
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined

+-----------------------------------------------------------------
 -------------------------------------------------------------------
 -- Contexts of text
 sentences :: Text -> [Text]
-sentences txt = split isStop txt
+sentences txt = map DT.pack $ segment $ DT.unpack txt
+
+sentences' :: Text -> [Text]
+sentences' txt = split isStop txt

 isStop :: Char -> Bool
 isStop c = c `elem` ['.','?','!']

+unsentences :: [Text] -> Text
+unsentences txts = DT.intercalate " " txts

 -- | https://en.wikipedia.org/wiki/Text_mining
 testText_en :: Text
@@ -103,19 +64,23 @@ testText_en = DT.pack "Text mining, also referred to as text data mining, roughl
 testText_fr :: Text
 testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances » dans les textes est une spécialisation de la fouille de données et fait partie du domaine de l'intelligence artificielle. Cette technique est souvent désignée sous l'anglicisme text mining. Elle désigne un ensemble de traitements informatiques consistant à extraire des connaissances selon un critère de nouveauté ou de similarité dans des textes produits par des humains pour des humains. Dans la pratique, cela revient à mettre en algorithme un modèle simplifié des théories linguistiques dans des systèmes informatiques d'apprentissage et de statistiques. Les disciplines impliquées sont donc la linguistique calculatoire, l'ingénierie des langues, l'apprentissage artificiel, les statistiques et l'informatique."

+termTests :: Text
+termTests = "It is hard to detect important articles in a specific context. Information retrieval techniques based on full text search can be inaccurate to identify main topics and they are not able to provide an indication about the importance of the article. Generating a citation network is a good way to find most popular articles but this approach is not context aware. The text around a citation mark is generally a good summary of the referred article. So citation context analysis presents an opportunity to use the wisdom of crowd for detecting important articles in a context sensitive way. In this work, we analyze citation contexts to rank articles properly for a given topic. The model proposed uses citation contexts in order to create a directed and edge-labeled citation network based on the target topic. Then we apply common ranking algorithms in order to find important articles in this newly created network. We showed that this method successfully detects a good subset of most prominent articles in a given topic. The biggest contribution of this approach is that we are able to identify important articles for a given search term even though these articles do not contain this search term. This technique can be used in other linked documents including web pages, legal documents, and patents as well as scientific papers."
+
+
 -- | Ngrams Test
 -- >>> ngramsTest testText
 -- 248
-ngramsTest :: Text -> Int
-ngramsTest x=  length ws
-  where
-    --txt = concat <$> lines <$> clean <$> readFile filePath
-    txt = clean x
-    -- | Number of sentences
-    --ls   = sentences $ txt
-    -- | Number of monograms used in the full text
-    ws   = ngrams    $ txt
-    -- | stem ngrams
+--ngramsTest :: Text -> Int
+--ngramsTest x =  length ws
+--  where
+--    --txt = concat <$> lines <$> clean <$> readFile filePath
+--    txt = clean x
+--    -- | Number of sentences
+--    --ls   = sentences $ txt
+--    -- | Number of monograms used in the full text
+--    ws   = ngrams    $ txt
+--    -- | stem ngrams
    -- TODO
    -- group ngrams
    --ocs  = occ       $ ws

--- a/src/Gargantext/Text/Context.hs
+++ b/src/Gargantext/Text/Context.hs
+{-|
+Module      : Gargantext.Text.Context
+Description : 
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Context of text management tool
+
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+module Gargantext.Text.Context
+  where
+
+import Data.Text (Text, pack, unpack, length)
+import Data.String (IsString)
+
+import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
+import Gargantext.Text
+import Gargantext.Prelude hiding (length)
+
+
+data SplitContext = Chars Int | Sentences Int | Paragraphs Int
+
+tag = parseTags
+-- | splitBy contexts of Chars or Sentences or Paragraphs
+-- >> splitBy (Chars 0) "abcde"
+-- ["a","b","c","d","e"]
+-- >> splitBy (Chars 1) "abcde"
+-- ["ab","bc","cd","de"]
+-- >> splitBy (Chars 2) "abcde"
+-- ["abc","bcd","cde"]
+splitBy :: SplitContext -> Text -> [Text]
+splitBy (Chars     n)  = map pack        . chunkAlong (n+1) 1 . unpack
+splitBy (Sentences n)  = map unsentences . chunkAlong (n+1) 1 . sentences
+splitBy (Paragraphs _) = map unTag       . filter isTagText   . tag
+  where
+    unTag :: IsString p => Tag p -> p
+    unTag (TagText x) = x
+    unTag _           = ""
+
+
--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -12,84 +12,14 @@ Mainly reexport functions in @Data.Text.Metrics@

 {-# LANGUAGE NoImplicitPrelude #-}

+module Gargantext.Text.Metrics where

-module Gargantext.Text.Metrics (levenshtein
-                                      , levenshteinNorm
-                                      , damerauLevenshtein
-                                      , damerauLevenshteinNorm
-                                      , overlap
-                                      , jaccard
-                                      , hamming
-                                      ) where
-
-
-import Data.Text (Text)
-import GHC.Real (Ratio)
-import qualified Data.Text.Metrics as DTM
-
-import Gargantext.Prelude
-{- * Example de titre
-}
-
-- | This module provide metrics to compare Text
-- starting as an API rexporting main functions of the great lib
-- text-metrics of Mark Karpov
-
-- | Levenshtein Distance
-- In information theory, Linguistics and computer science, 
-- the Levenshtein distance is a string metric for measuring 
-- the difference between two sequences.
-- See: https://en.wikipedia.org/wiki/Levenshtein_distance
+--import Data.Text (Text)
+--import GHC.Real (Ratio)
+--import qualified Data.Text.Metrics as DTM
 --
-levenshtein :: Text -> Text -> Int
-levenshtein = DTM.levenshtein
-
-- | Return normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
-- strings, while 1 means exact match.
+--import Gargantext.Prelude
 --
-levenshteinNorm :: Text -> Text -> Ratio Int
-levenshteinNorm = DTM.levenshteinNorm
-
-- | Return Damerau-Levenshtein distance between two 'Text' values. The 
-- function works like 'levenshtein', but the collection of allowed     
-- operations also includes transposition of two /adjacent/ characters. 
-- See also:                                                            
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance> 
--
-damerauLevenshtein :: Text -> Text -> Int
-damerauLevenshtein = DTM.damerauLevenshtein
-
-- damerau-Levenshtein distance normalized
--
-damerauLevenshteinNorm :: Text -> Text -> Ratio Int
-damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
-
-- Treating inputs like sets
-
-- | Return overlap coefficient for two 'Text' values. Returned value   
-- is in the range from 0 (no similarity) to 1 (exact match). Return 1  
-- if both 'Text' values are empty.                                     
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
-overlap :: Text -> Text -> Ratio Int
-overlap = DTM.overlap
-
-
-- | Jaccard distance
-- measures dissimilarity between sample sets
-jaccard :: Text -> Text -> Ratio Int
-jaccard = DTM.jaccard
-
-- | Hamming Distance
-- In information theory, the Hamming distance between two strings of
-- equal length is the number of positions at which the corresponding
-- symbols are different. In other words, it measures the minimum number of
-- substitutions required to change one string into the other
-- See:  https://en.wikipedia.org/wiki/Hamming_distance
-
-hamming :: Text -> Text -> Maybe Int
-hamming = DTM.hamming
-
+--noApax :: Ord a => Map a Occ -> Map a Occ
+--noApax m = M.filter (>1) m

--- a/src/Gargantext/Text/Metrics/CharByChar.hs
+++ b/src/Gargantext/Text/Metrics/CharByChar.hs
+{-|
+Module      : Gargantext.Text.Metrics.CharByChar
+Description : All parsers of Gargantext in one file.
+Copyright   : (c) CNRS, 2017 - present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Mainly reexport functions in @Data.Text.Metrics@
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+
+
+module Gargantext.Text.Metrics.CharByChar (levenshtein
+                                      , levenshteinNorm
+                                      , damerauLevenshtein
+                                      , damerauLevenshteinNorm
+                                      , overlap
+                                      , jaccard
+                                      , hamming
+                                      ) where
+
+
+import Data.Text (Text)
+import GHC.Real (Ratio)
+import qualified Data.Text.Metrics as DTM
+
+import Gargantext.Prelude
+
+--noApax :: Ord a => Map a Occ -> Map a Occ
+--noApax m = M.filter (>1) m
+
+
+{- * Example de titre
+-}
+
+-- | This module provide metrics to compare Text
+-- starting as an API rexporting main functions of the great lib
+-- text-metrics of Mark Karpov
+
+-- | Levenshtein Distance
+-- In information theory, Linguistics and computer science, 
+-- the Levenshtein distance is a string metric for measuring 
+-- the difference between two sequences.
+-- See: https://en.wikipedia.org/wiki/Levenshtein_distance
+--
+levenshtein :: Text -> Text -> Int
+levenshtein = DTM.levenshtein
+
+-- | Return normalized Levenshtein distance between two 'Text' values.
+-- Result is a non-negative rational number (represented as @'Ratio'
+-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
+-- strings, while 1 means exact match.
+--
+levenshteinNorm :: Text -> Text -> Ratio Int
+levenshteinNorm = DTM.levenshteinNorm
+
+-- | Return Damerau-Levenshtein distance between two 'Text' values. The 
+-- function works like 'levenshtein', but the collection of allowed     
+-- operations also includes transposition of two /adjacent/ characters. 
+-- See also:                                                            
+-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance> 
+--
+damerauLevenshtein :: Text -> Text -> Int
+damerauLevenshtein = DTM.damerauLevenshtein
+
+-- damerau-Levenshtein distance normalized
+--
+damerauLevenshteinNorm :: Text -> Text -> Ratio Int
+damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
+
+-- Treating inputs like sets
+
+-- | Return overlap coefficient for two 'Text' values. Returned value   
+-- is in the range from 0 (no similarity) to 1 (exact match). Return 1  
+-- if both 'Text' values are empty.                                     
+--
+-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
+overlap :: Text -> Text -> Ratio Int
+overlap = DTM.overlap
+
+
+-- | Jaccard distance
+-- measures dissimilarity between sample sets
+jaccard :: Text -> Text -> Ratio Int
+jaccard = DTM.jaccard
+
+-- | Hamming Distance
+-- In information theory, the Hamming distance between two strings of
+-- equal length is the number of positions at which the corresponding
+-- symbols are different. In other words, it measures the minimum number of
+-- substitutions required to change one string into the other
+-- See:  https://en.wikipedia.org/wiki/Hamming_distance
+
+hamming :: Text -> Text -> Maybe Int
+hamming = DTM.hamming
+
+
--- a/src/Gargantext/Text/Metrics/FrequentItemSet.hs
+++ b/src/Gargantext/Text/Metrics/FrequentItemSet.hs
@@ -15,7 +15,7 @@ Domain Specific Language to manage Frequent Item Set (FIS)

 module Gargantext.Text.Metrics.FrequentItemSet
  ( Fis, Size
-  , occ, cooc
+  , occ_hlcm, cooc_hlcm
  , all, between
  , module HLCM
  )
@@ -34,12 +34,12 @@ type Size = Either Int (Int, Int)

 ------------------------------------------------------------------------
 -- | Occurrence is Frequent Item Set of size 1
-occ  :: Frequency -> [[Item]] -> [Fis]
-occ f is  = fisWithSize (Left 1) f is
+occ_hlcm  :: Frequency -> [[Item]] -> [Fis]
+occ_hlcm f is  = fisWithSize (Left 1) f is

 -- | Cooccurrence is Frequent Item Set of size 2
-cooc :: Frequency -> [[Item]] -> [Fis]
-cooc f is = fisWithSize (Left 2) f is
+cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
+cooc_hlcm f is = fisWithSize (Left 2) f is

 all :: Frequency -> [[Item]] -> [Fis]
 all f is  = fisWith Nothing f is
@@ -89,3 +89,39 @@ fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f

 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
+
+
+--
+---- | /!\ indexes are not the same:
+--
+---- | Index ngrams from Map
+----indexNgram :: Ord a => Map a Occ -> Map Index a
+----indexNgram m = fromList (zip [1..] (keys m))
+--
+---- | Index ngrams from Map
+----ngramIndex :: Ord a => Map a Occ -> Map a Index
+----ngramIndex m = fromList (zip (keys m) [1..])
+--
+--indexWith :: Ord a => Map a Occ -> [a] -> [Int]
+--indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
+--
+--indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
+--indexIt xs = (m, is)
+--  where
+--    m  = sumOcc (map occ  xs)
+--    is = map    (indexWith m) xs
+--
+--list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
+--list2fis n xs = (m', fs)
+--  where
+--    (m, is) = indexIt xs
+--    m'      = M.filter (>50000) m
+--    fs      = FIS.all n is
+--
+--text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
+--text2fis n xs = list2fis n (map terms xs)
+--
+----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
+----text2fisWith = undefined
+--
+
--- a/src/Gargantext/Text/Metrics/Occurrences.hs
+++ b/src/Gargantext/Text/Metrics/Occurrences.hs
@@ -28,43 +28,92 @@ Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrence
 module Gargantext.Text.Metrics.Occurrences
  where

-import Gargantext.Prelude

+import Control.Arrow ((***))
+import qualified Data.List as List
 import Data.Map.Strict  (Map
-                        , empty
-                        , insertWith, unionWith
+                        , empty, singleton
+                        , insertWith, insertWithKey, unionWith
+                        , toList, lookup, mapKeys
                        )
+import Data.Set (Set)
+import qualified Data.Set as Set
+import Data.Text (pack)

+import qualified Data.Map.Strict as DMS
 import Control.Monad ((>>),(>>=))
 import Data.String (String())
 import Data.Attoparsec.Text
-import Data.Text (Text)

-import Data.Either.Extra(Either(..))
-import qualified Data.Text as T
-import Control.Applicative hiding (empty)
-----------------------------------------------------------
+------------------------------------------------------------------------
+import Gargantext.Prelude
+import Gargantext.Core.Types
+------------------------------------------------------------------------
+type Occ  a = Map      a  Int
+type Cooc a = Map (a,  a) Int
+type FIS  a = Map (Set a) Int
+
+data Group = ByStem | ByOntology
+
+type Grouped = Stems
+
+
+{-
+-- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
+-- >> map occurrences <$> Prelude.mapM (terms Mono EN) 
+-- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
+--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
+--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
+--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
+--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
+--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
+--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
+--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
+--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
+---- 
+
+           -}
+
+type Occs = Int
+type Coocs = Int
+
+removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
+removeApax = DMS.filter (> 1)
+
+cooc :: [[Terms]] -> Map (Label, Label) Int
+cooc tss =
+  mapKeys (delta $ labelPolicy terms_occs) $ cooc' (map (Set.fromList . map _terms_stem) tss)
+  where
+    terms_occs = occurrences (List.concat tss)
+    delta f = f *** f

-type Occ = Int

-- | Compute the occurrences (occ)
-occ :: Ord a => [a] -> Map a Occ
-occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
+labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
+labelPolicy m g =  case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
+                     Just label -> label
+                     Nothing    -> panic $ "Label of Grouped not found: " <> (pack $ show g)

-- TODO add groups and filter stops
-sumOcc :: Ord a => [Map a Occ] -> Map a Occ
-sumOcc xs = foldl' (unionWith (+)) empty xs
+cooc' :: Ord b => [Set b] -> Map (b, b) Coocs
+cooc' tss = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
+  where
+      xs = [ ((x, y), 1)
+           | xs <- tss
+           , ys <- tss
+           , x <- Set.toList xs
+           , y <- Set.toList ys
+           , x < y
+           ]
+
+
+-- | Compute the grouped occurrences (occ)
+occurrences :: [Terms] -> Map Grouped (Map Terms Int)
+occurrences = occurrences' _terms_stem

+occurrences' :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
+occurrences' f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty

-occurrenceParser :: Text -> Parser Bool
-occurrenceParser txt = manyTill anyChar (string txt) >> pure True
+-- TODO add groups and filter stops
+sumOcc :: Ord a => [Occ a] -> Occ a
+sumOcc xs = foldl' (unionWith (+)) empty xs

-occurrencesParser :: Text -> Parser Int
-occurrencesParser txt = case txt of
-                    "" -> pure 0
-                    _  -> many (occurrenceParser txt') >>= \matches -> pure (length matches)
-    where
-        txt' = T.toLower txt

-parseOccurrences :: Text -> Text -> Either String Int
-parseOccurrences x = parseOnly (occurrencesParser x)
--- a/src/Gargantext/Text/Ngrams/PosTagging/Lang/En.hs
+++ b/src/Gargantext/Text/Ngrams/PosTagging/Lang/En.hs
-{-|
-Module      : Gargantext.Text.Ngrams.PosTagging.Lang.En
-Description : 
-Copyright   : (c) CNRS, 2017-Present
-License     : AGPL + CECILL v3
-Maintainer  : team@gargantext.org
-Stability   : experimental
-Portability : POSIX
-
-Here is a longer description of this module, containing some
-commentary with @some markup@.
-}
-
-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE OverloadedStrings #-}
-
-module Gargantext.Text.Ngrams.PosTagging.Lang.En (group)
-  where
-
--import Data.Text (Text)
-import Data.Maybe (Maybe(Just))
-import Gargantext.Text.Ngrams.PosTagging.CoreNLP
-import Gargantext.Prelude
-
-group :: [NgramsTag] -> [NgramsTag]
-group [] = []
-group ntags = group2 NP NP
-           $ group2 NP VB
-           $ group2 NP IN
-           $ group2 IN DT
-           $ group2 VB NP
-           $ group2 JJ NP
-           $ group2 JJ JJ
-           $ group2 JJ CC
-           $ ntags
-
------------------------------------------------------------------------
-- | FIXME p1 and p2 not really taken into account
-group2 :: POS -> POS -> [NgramsTag] -> [NgramsTag]
-group2 p1 p2 (x@(NgramsTag _ _ (Just p1') _):y@(NgramsTag _ _ (Just p2') _):z) =
-  if (p1 == p1') && (p2 == p2')
-     then group2 p1 p2 (x<>y : z)
-     else (x : group2 p1 p2 (y:z))
-group2 p1 p2 (x@(NgramsTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
-group2 _ _ [x@(NgramsTag _ _ (Just _) _)] = [x]
-group2 p1 p2 (x@(NgramsTag _ _ (Just _) _):y@(NgramsTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
-group2 _ _ [] = []
-
-- group3 :: POS -> POS -> POS -> [NgramsTag] -> [NgramsTag]
-- group xs = group3 NN IN DT xs
-
-
-- TO BE REMOVED old code
--groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs)      = groupNgrams (jn1:cc:jn2:xs)
--    where
--        jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
--        cc  = (c1, "CC", c1')
--        jn1 = (j1, "JJ", j1')
--        jn2 = jn j2 j3 j2'
--
--groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs)      = groupNgrams (jn1:jn2:xs)
--    where
--        jn j m mm p = (j <> " " <> m, p, mm)
--        jn1 = jn j1 n nn ("NN+CC" :: Text)
--        jn2 = jn j2 n nn ("NN+CC" :: Text)
--
--groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs)      = groupNgrams (jn1:jn2:xs)
--    where
--        jn j m mm p = (j <> " " <> m, p, mm)
--        jn1 = jn j1 n nn ("NN+CC" :: Text)
--        jn2 = jn j2 n nn ("NN+CC" :: Text)
--
--groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs)    = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
--groupNgrams ((x,"JJ",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NNP",_):(y,"NN",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(y,"NNS",yy):xs)   = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NP",_):(y,"NP",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--
---- extractNgrams "Test the antiinflammatory or analgesic activity?"
---- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
---- > should be (antiinflammatory activity) <> (analgesic activity)
--
--groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs)       = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs)      = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs)    = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs)   = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs)         = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
--
--groupNgrams (x:xs)                                          = (x:(groupNgrams xs))
-
-
--textTest :: [Text]
--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
--           , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
--           , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
--          , "In both models, the standard drug used was aspirin 100 mg/kg. "
--          , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
--         , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
--         , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
--         , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
--         , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
--- a/src/Gargantext/Text/Ngrams/PosTagging/Lang/Fr.hs
+++ b/src/Gargantext/Text/Ngrams/PosTagging/Lang/Fr.hs
-{-|
-Module      : Gargantext.Text.Ngrams.PosTagging.Lang.Fr
-Description : 
-Copyright   : (c) CNRS, 2017-Present
-License     : AGPL + CECILL v3
-Maintainer  : team@gargantext.org
-Stability   : experimental
-Portability : POSIX
-
-This @group@ function groups horizontally ngrams in their context of
-sentence according to grammars specific of each language. In english, JJ
-is ADJectiv in french. -
-
-}
-
-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE OverloadedStrings #-}
-
-module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (group)
-  where
-
-import Data.Maybe (Maybe(Just))
-import Gargantext.Text.Ngrams.PosTagging.CoreNLP
-import Gargantext.Prelude
-
-
-group :: [NgramsTag] -> [NgramsTag]
-group [] = []
-group ntags = group2 NP NP
-           $ group2 NP VB
-           $ group2 NP IN
-           $ group2 IN DT
-           $ group2 VB NP
-           $ group2 JJ NP
-           $ group2 NP JJ
-           $ group2 JJ JJ
-           $ group2 JJ CC
-           $ ntags
-
------------------------------------------------------------------------
-- | FIXME p1 and p2 not really taken into account
-group2 :: POS -> POS -> [NgramsTag] -> [NgramsTag]
-group2 p1 p2 (x@(NgramsTag _ _ (Just p1') _):y@(NgramsTag _ _ (Just p2') _):z) =
-  if (p1 == p1') && (p2 == p2')
-     then group2 p1 p2 (x<>y : z)
-     else (x : group2 p1 p2 (y:z))
-group2 p1 p2 (x@(NgramsTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
-group2 _ _ [x@(NgramsTag _ _ (Just _) _)] = [x]
-group2 p1 p2 (x@(NgramsTag _ _ (Just _) _):y@(NgramsTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
-group2 _ _ [] = []
-
-
------------------------------------------------------------------------
-
-
--module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (selectNgrams, groupNgrams, textTest)
--    where
--
--import Gargantext.Prelude
--import Data.Text (Text)
--import Data.Monoid ((<>))
--
--selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
--selectNgrams xs = filter selectNgrams' xs
--    where
--        selectNgrams' (_,"N"    ,_       ) = True
--        selectNgrams' (_,"NC"   ,_       ) = True
--        selectNgrams' (_,"NN+CC",_       ) = True
--        selectNgrams' (_,_      ,"PERSON"      ) = True
--        selectNgrams' (_,_      ,"ORGANIZATION") = True
--        selectNgrams' (_,_      ,"LOCATION"    ) = True
--        selectNgrams' (_,_      ,_       ) = False
--
--
--groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
--groupNgrams []       = []
--
----groupNgrams ((_,"DET",_):xs) = groupNgrams xs
--
---- "Groupe : nom commun et adjectifs avec conjonction"
--groupNgrams ((n,"NC",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
--    where
--        n1 = (n <> " " <> j1, "NC", n')
--        n2 = (n <> " " <> j2, "NC", n')
--
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((n,"N",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
--    where
--        n1 = (n <> " " <> j1, "N", n')
--        n2 = (n <> " " <> j2, "N", n')
--
---- Groupe : Adjectif + Conjonction de coordination + Adjectif
---- groupNgrams ((j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",j2'):xs) = groupNgrams ((j1 <> " " <> j2, "ADJ", j2'):xs)
--
---- Groupe : Nom commun + préposition + Nom commun
--groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NC",n2'):xs)  = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
--groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
--groupNgrams ((n1,"NC",_):(prep,"P",_):(det,"DET",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> prep <> " " <> det <> " " <> n2, "NC", n2'):xs)
--
---- Groupe : Plusieurs adjectifs successifs
--groupNgrams ((x,"ADJ",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "ADJ", yy):xs)
--
---- Groupe : nom commun et adjectif
--groupNgrams ((x,"NC",_):(y,"ADJ",yy):xs)  = groupNgrams ((x <> " " <> y, "NC", yy):xs)
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((x,"N",_):(y,"ADJ",yy):xs)   = groupNgrams ((x <> " " <> y, "NC", yy):xs)
--
---- Groupe : adjectif et nom commun
--groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs)  = groupNgrams ((x <> " " <> y, "NC", yy):xs)
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((x,"ADJ",_):(y,"N",yy):xs)   = groupNgrams ((x <> " " <> y, "NC", yy):xs)
--
--
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--
--
---- Si aucune des règles précédentes n'est remplie
--groupNgrams (x:xs)                        = (x:(groupNgrams xs))
--
--
--textTest :: [Text]
--textTest = [ "L'heure d'arrivée des coureurs dépend de la météo du jour."]
--
--- a/src/Gargantext/Text/Ngrams/PosTagging/Parser.hs
+++ b/src/Gargantext/Text/Ngrams/PosTagging/Parser.hs
-{-|
-Module      : Gargantext.Text.Ngrams.PosTagging.Parser
-Description : 
-Copyright   : (c) CNRS, 2017-Present
-License     : AGPL + CECILL v3
-Maintainer  : team@gargantext.org
-Stability   : experimental
-Portability : POSIX
-
-Here is a longer description of this module, containing some
-commentary with @some markup@.
-
-Ngrams selection algorithms
-A form is a list of characters seperated by one or more spaces in a sentence.
-A word is a form.
-
-}
-
-{-# LANGUAGE NoImplicitPrelude   #-}
-{-# LANGUAGE ScopedTypeVariables #-}
-
-module Gargantext.Text.Ngrams.PosTagging.Parser
-  where
-
-import Data.Text hiding (map, group)
-
-import Gargantext.Prelude
-import Gargantext.Core (Lang(..))
-import Gargantext.Text.Ngrams.PosTagging.CoreNLP
-
-import qualified Gargantext.Text.Ngrams.PosTagging.Lang.En as En
-import qualified Gargantext.Text.Ngrams.PosTagging.Lang.Fr as Fr
-
-
-extractNgrams :: Lang -> Text -> IO [[NgramsTag]]
-extractNgrams lang s = map (group lang) <$> extractNgrams' lang s
-
-
-extractNgrams' :: Lang -> Text -> IO [[NgramsTag]]
-extractNgrams' lang t =  map tokens2ngramsTags
-                     <$> map _sentenceTokens
-                     <$> _sentences
-                     <$> corenlp lang t
-
-
---- | This function analyse and groups (or not) ngrams according to 
----   grammars specific of each language.
-group :: Lang -> [NgramsTag] -> [NgramsTag]
-group EN = En.group
-group FR = Fr.group
-
--- a/src/Gargantext/Text/Parsers.hs
+++ b/src/Gargantext/Text/Parsers.hs
@@ -37,6 +37,7 @@ import Data.Either.Extra(Either())

 import Data.Text (Text)
 import Data.Text.Encoding (decodeUtf8)
+import qualified Data.Text as DT
 ----
 --import Control.Monad (join)
 import Codec.Archive.Zip (withArchive, getEntry, getEntries)
@@ -51,16 +52,16 @@ import Gargantext.Text.Parsers.WOS (wosParser)
 ---- import Gargantext.Parsers.DOC (docParser)
 ---- import Gargantext.Parsers.ODT (odtParser)

--import Gargantext.Prelude (pm)
 --import Gargantext.Types.Main (ErrorMessage(), Corpus)

-- FIXME
--type Field = Text
 type ParseError = String
--
--data Corpus = Corpus { _corpusErrors :: [ParseError]
--                     , _corpusMap    :: Map FilePath (Map Field Text)
--                    }
+type Field      = Text
+type Document   = DM.Map Field Text
+
+type FilesParsed = DM.Map FilePath FileParsed
+data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
+                              , _fileParsed_result :: [Document]
+                              } deriving (Show)


 -- | According to the format of Input file,
@@ -75,37 +76,43 @@ data FileFormat = WOS        -- Implemented (ISI Format)
 -- TODO: to debug maybe add the filepath in error message


-parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
-parse format path = do
-    files <- case takeExtension path of
-              ".zip" -> openZip              path
-              _      -> pure <$> DB.readFile path
-    (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
-    pure (as, map toText $ concat bs)
-      where
-        -- TODO : decode with bayesian inference on encodings
-        toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
-
-
-- | withParser:
-- According the format of the text, choosing the right parser.
-- TODO  withParser :: FileFormat -> Parser [Document]
-withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
-withParser WOS = wosParser
--withParser DOC = docParser
--withParser ODT = odtParser
--withParser XML = xmlParser
--withParser _   = error "[ERROR] Parser not implemented yet"
-
-runParser :: FileFormat -> DB.ByteString 
-          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
-runParser format text = pure $ parseOnly (withParser format) text
-
-openZip :: FilePath -> IO [DB.ByteString]
-openZip fp = do
-    path    <- resolveFile' fp
-    entries <- withArchive path (DM.keys <$> getEntries)
-    bs      <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
-    pure bs
+--parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
+--parse format path = do
+--    files <- case takeExtension path of
+--              ".zip" -> openZip              path
+--              _      -> pure <$> DB.readFile path
+--    (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
+--    pure (as, map toText $ concat bs)
+--      where
+--        -- TODO : decode with bayesian inference on encodings
+--        toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
+--
+--
+---- | withParser:
+---- According the format of the text, choosing the right parser.
+---- TODO  withParser :: FileFormat -> Parser [Document]
+--withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
+--withParser WOS = wosParser
+----withParser DOC = docParser
+----withParser ODT = odtParser
+----withParser XML = xmlParser
+----withParser _   = error "[ERROR] Parser not implemented yet"
+--
+--runParser :: FileFormat -> DB.ByteString 
+--          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
+--runParser format text = pure $ parseOnly (withParser format) text
+--
+--openZip :: FilePath -> IO [DB.ByteString]
+--openZip fp = do
+--    path    <- resolveFile' fp
+--    entries <- withArchive path (DM.keys <$> getEntries)
+--    bs      <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
+--    pure bs
+
+clean :: Text -> Text
+clean txt = DT.map clean' txt
+  where
+    clean' '’' = '\''
+    clean' c  = c


--- a/src/Gargantext/Text/Parsers/CSV.hs
+++ b/src/Gargantext/Text/Parsers/CSV.hs
+{-|
+Module      : Gargantext.Text.Parsers.CSV
+Description : 
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+CSV parser for Gargantext corpus files.
+
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE DeriveGeneric     #-}
+
+module Gargantext.Text.Parsers.CSV where
+
+import GHC.Real (round)
+import GHC.IO (FilePath)
+
+import Control.Applicative
+
+import Data.Char (ord)
+import Data.Csv
+import Data.Either (Either(Left, Right))
+import Data.Text (Text, pack, length)
+import qualified Data.ByteString.Lazy as BL
+
+import Data.Vector (Vector)
+import qualified Data.Vector as V
+import Safe (tailMay)
+
+import Gargantext.Text
+import Gargantext.Text.Context
+import Gargantext.Prelude hiding (length)
+
+---------------------------------------------------------------
+data Doc = Doc
+    { d_docId  :: !Int
+    , d_title  :: !Text
+    , d_source :: !Text
+    , d_publication_year  :: !Int
+    , d_publication_month :: !Int
+    , d_publication_day   :: !Int
+    , d_abstract          :: !Text
+    , d_authors           :: !Text
+    }
+    deriving (Show)
+---------------------------------------------------------------
+toDocs :: Vector CsvDoc -> [Doc]
+toDocs v = V.toList
+         $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
+                       -> Doc nId t s py pm pd abst auth )
+                       (V.enumFromN 1 (V.length v'')) v''
+          where
+            v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
+            seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
+
+---------------------------------------------------------------
+fromDocs :: Vector Doc -> Vector CsvDoc
+fromDocs docs = V.map fromDocs' docs
+  where
+    fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
+
+---------------------------------------------------------------
+-- | Split a document in its context
+-- TODO adapt the size of the paragraph according to the corpus average
+
+
+splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
+splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
+                          if docSize > 1000
+                            then
+                              if (mod (round m) docSize) >= 10
+                                then
+                                  splitDoc' splt doc
+                                else
+                                  V.fromList [doc]
+                            else
+                              V.fromList [doc]
+
+
+splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
+splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
+    where
+      firstDoc = CsvDoc t s py pm pd firstAbstract auth
+      firstAbstract = head' abstracts
+      
+      nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
+      
+      abstracts    = (splitBy $ contextSize) abst
+      head' x = maybe ""   identity (head x)
+      tail' x = maybe [""] identity (tailMay x)
+
+---------------------------------------------------------------
+---------------------------------------------------------------
+type Mean = Double
+
+docsSize :: Vector CsvDoc -> Mean
+docsSize csvDoc = mean ls
+  where
+    ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
+
+
+---------------------------------------------------------------
+data CsvDoc = CsvDoc
+    { c_title  :: !Text
+    , c_source :: !Text
+    , c_publication_year  :: !Int
+    , c_publication_month :: !Int
+    , c_publication_day   :: !Int
+    , c_abstract          :: !Text
+    , c_authors           :: !Text
+    }
+    deriving (Show)
+
+instance FromNamedRecord CsvDoc where
+  parseNamedRecord r = CsvDoc <$> r .: "title"
+                              <*> r .: "source"
+                              <*> r .: "publication_year"
+                              <*> r .: "publication_month"
+                              <*> r .: "publication_day"
+                              <*> r .: "abstract"
+                              <*> r .: "authors"
+
+instance ToNamedRecord CsvDoc where
+  toNamedRecord (CsvDoc t s py pm pd abst aut) = 
+    namedRecord [ "title"  .= t
+                , "source" .= s
+                , "publication_year"  .= py
+                , "publication_month" .= pm
+                , "publication_day"   .= pd
+                , "abstract"          .= abst
+                , "authors"           .= aut
+                ]
+
+
+csvDecodeOptions :: DecodeOptions
+csvDecodeOptions = (defaultDecodeOptions
+                      {decDelimiter = fromIntegral $ ord '\t'}
+                    )
+
+csvEncodeOptions :: EncodeOptions
+csvEncodeOptions = ( defaultEncodeOptions 
+                      {encDelimiter = fromIntegral $ ord '\t'}
+                    )
+
+
+readCsv :: FilePath -> IO (Header, Vector CsvDoc)
+readCsv fp = do
+    csvData <- BL.readFile fp
+    case decodeByNameWith csvDecodeOptions csvData of
+      Left e    -> panic (pack e)
+      Right csvDocs -> pure csvDocs
+
+
+writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
+writeCsv fp (h, vs) = BL.writeFile fp $
+                      encodeByNameWith csvEncodeOptions h (V.toList vs)
+
--- a/src/Gargantext/Text/Parsers/CSV_hs
+++ b/src/Gargantext/Text/Parsers/CSV_hs
--- a/src/Gargantext/Text/Parsers/Date.hs
+++ b/src/Gargantext/Text/Parsers/Date.hs
@@ -18,8 +18,9 @@ DGP.parseDate1 DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Text.Parsers.Date (parseDate1, Lang(FR, EN), parseDate, fromRFC3339, parseTimeOfDay, getMultiplicator) where
+module Gargantext.Text.Parsers.Date (parseDate1, parseDate, fromRFC3339, parseTimeOfDay, getMultiplicator) where

+import Gargantext.Core (Lang(FR,EN))
 import Gargantext.Prelude
 import Prelude (toInteger, div, otherwise, (++))
 --import Gargantext.Types.Main as G
@@ -30,7 +31,8 @@ import Data.Time.LocalTime (utc)
 import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale)
                        , DucklingTime(DucklingTime)
                        )
-import Duckling.Core (makeLocale, Lang(FR,EN), Some(This), Dimension(Time))
+import Duckling.Core (makeLocale, Some(This), Dimension(Time))
+import qualified Duckling.Core as DC
 import Duckling.Types (jsonValue, Entity)

 import Duckling.Api (analyze, parse)
@@ -63,9 +65,9 @@ import Text.XML.HXT.DOM.Util (decimalStringToInt)
 -- TODO add Paris at Duckling.Locale Region datatype
 -- | To get Homogeinity of the languages
 --   TODO : put this in a more generic place in the source code
--parserLang :: G.Language -> Lang
--parserLang G.FR = FR
--parserLang G.EN = EN
+parserLang :: Lang -> DC.Lang
+parserLang FR = DC.FR
+parserLang EN = DC.EN


 -- | Final Date parser API
@@ -93,7 +95,7 @@ utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time

 -- | Local Context which depends on Lang and Time
 localContext :: Lang -> DucklingTime -> Context
-localContext lang dt = Context {referenceTime = dt, locale = makeLocale lang Nothing}
+localContext lang dt = Context {referenceTime = dt, locale = makeLocale (parserLang lang) Nothing}

 -- | Date parser with Duckling
 parseDateWithDuckling :: Lang -> Text -> IO [ResolvedToken]

--- a/src/Gargantext/Text/Search.hs
+++ b/src/Gargantext/Text/Search.hs
+{-|
+Module      : Gargantext.Text.Search
+Description : All parsers of Gargantext in one file.
+Copyright   : (c) CNRS, 2017 - present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+This search Engine is first made to clean CSV file according to a query.
+
+Starting from this model, a specific Gargantext engine will be made
+(using more metrics scores/features).
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings, NamedFieldPuns #-}
+
+module Gargantext.Text.Search where
+
+import Data.SearchEngine
+
+import Data.Ix
+
+-- Usefull to use stopwords
+-- import Data.Set (Set)
+-- import qualified Data.Set as Set
+import Data.Text (Text)
+
+import Gargantext.Prelude
+import Gargantext.Text.Terms.Mono (monoterms)
+import Gargantext.Text.Terms.Mono.Stem as ST
+import Gargantext.Text.Parsers.CSV
+
+type DocId = Int
+
+type DocSearchEngine = SearchEngine
+                         Doc
+                         DocId
+                         DocField
+                         NoFeatures
+
+data DocField = TitleField
+              | AbstractField
+  deriving (Eq, Ord, Enum, Bounded, Ix, Show)
+
+initialDocSearchEngine :: DocSearchEngine
+initialDocSearchEngine =
+    initSearchEngine docSearchConfig defaultSearchRankParameters
+
+docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures
+docSearchConfig =
+    SearchConfig {
+      documentKey           = d_docId,
+      extractDocumentTerms  = extractTerms,
+      transformQueryTerm    = normaliseQueryToken,
+      documentFeatureValue  = const noFeatures
+  }
+  where
+    extractTerms :: Doc -> DocField -> [Text]
+    extractTerms doc TitleField       = monoterms (d_title doc)
+    extractTerms doc AbstractField    = monoterms (d_abstract doc)
+
+    normaliseQueryToken :: Text -> DocField -> Text
+    normaliseQueryToken tok =
+      let tokStem = ST.stem ST.EN
+       in \field -> case field of
+                      TitleField    -> tokStem tok
+                      AbstractField -> tokStem tok
+
+defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
+defaultSearchRankParameters =
+    SearchRankParameters {
+      paramK1,
+      paramB,
+      paramFieldWeights,
+      paramFeatureWeights     = noFeatures,
+      paramFeatureFunctions   = noFeatures,
+      paramResultsetSoftLimit = 2000,
+      paramResultsetHardLimit = 4000,
+      paramAutosuggestPrefilterLimit  = 500,
+      paramAutosuggestPostfilterLimit = 500
+    }
+  where
+    paramK1 :: Float
+    paramK1 = 1.5
+
+    paramB :: DocField -> Float
+    paramB TitleField      = 0.9
+    paramB AbstractField   = 0.5
+
+    paramFieldWeights :: DocField -> Float
+    paramFieldWeights TitleField    = 20
+    paramFieldWeights AbstractField = 5
+
--- a/src/Gargantext/Text/Ngrams.hs
+++ b/src/Gargantext/Text/Ngrams.hs
@@ -18,56 +18,40 @@ Using Latin numerical prefixes, an n-gram of size 1 is referred to as a

 Source: https://en.wikipedia.org/wiki/Ngrams

+TODO
+group Ngrams -> Tree
+compute occ by node of Tree
+group occs according groups
+
+compute cooccurrences
+compute graph
+
 -}

 {-# LANGUAGE NoImplicitPrelude #-}

-module Gargantext.Text.Ngrams
+module Gargantext.Text.Terms
  where

-import Data.Char (Char, isAlphaNum, isSpace)
-import Data.Text (Text, split, splitOn, pack)
-
-import Data.Set  (Set)
-import qualified Data.Set as S
+import Data.Text (Text)
+import Data.Traversable

 import Gargantext.Prelude
 import Gargantext.Core
-
-import Gargantext.Text.Ngrams.Stem (stem)
-
-
-data Ngrams = Ngrams { _ngrams_label :: [Text]
-                     , _ngrams_stem  :: Set Text
-                     } deriving (Show)
-
-
-data Terms = MonoGrams | MultiGrams
-type MonoGrams  = Text
-type MultiGrams = [Text]
-
-
-ngrams :: Text -> [Text]
-ngrams = monograms
-
-text2ngrams :: Lang -> Text -> Ngrams
-text2ngrams lang txt = Ngrams txt' (S.fromList $ map (stem lang) txt')
-  where
-    txt' = splitOn (pack " ") txt
-
-
-equivNgrams :: Ngrams -> Ngrams -> Bool
-equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
-                                       || s2 `S.isSubsetOf` s1
-
--monograms :: Text -> [Text]
--monograms xs = monograms $ toLower $ filter isGram xs
-
-monograms :: Text -> [Text]
-monograms txt = split isWord txt
-  where
-    isWord c = c `elem` [' ', '\'', ',', ';']
-
-isGram :: Char -> Bool
-isGram  c  = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
+import Gargantext.Core.Types
+import Gargantext.Text.Terms.Multi (multiterms)
+import Gargantext.Text.Terms.Mono  (monoterms')
+
+data TermType = Mono | Multi
+
+-- remove Stop Words
+-- map (filter (\t -> not . elem t)) $ 
+------------------------------------------------------------------------
+extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms])
+extractTerms termType lang = mapM (terms termType lang)
+------------------------------------------------------------------------
+terms :: TermType -> Lang -> Text -> IO [Terms]
+terms Mono  lang txt = pure $ monoterms' lang txt
+terms Multi lang txt = multiterms lang txt
+------------------------------------------------------------------------

--- a/src/Gargantext/Text/Ngrams/Lists.hs
+++ b/src/Gargantext/Text/Ngrams/Lists.hs
@@ -13,7 +13,7 @@ commentary with @some markup@.

 {-# LANGUAGE NoImplicitPrelude #-}

-module Gargantext.Text.Ngrams.Lists
+module Gargantext.Text.Terms.Lists
  where

 --import Data.Maybe
@@ -35,4 +35,3 @@ data ListName = Stop | Candidate | Graph
 --stop :: [Ngrams] -> [Ngrams]
 --stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs

-
--- a/src/Gargantext/Text/Terms/Mono.hs
+++ b/src/Gargantext/Text/Terms/Mono.hs
+{-|
+Module      : Gargantext.Text.Terms.Mono
+Description : Mono Terms module
+Copyright   : (c) CNRS, 2017 - present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Mono-terms are Nterms where n == 1.
+
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+
+module Gargantext.Text.Terms.Mono (monoterms, monoterms')
+  where
+
+import Data.Text (Text, toLower, split, splitOn, pack)
+import qualified Data.Set as S
+
+import Gargantext.Core
+import Gargantext.Core.Types
+import Gargantext.Text.Terms.Mono.Stem (stem)
+
+import Gargantext.Prelude
+import Data.Char (isAlphaNum, isSpace)
+
+monoterms' :: Lang -> Text -> [Terms]
+monoterms' l txt = map (text2terms l) $ monoterms txt
+
+monoterms :: Text -> [Text]
+monoterms txt = map toLower $ split isWord txt
+  where
+    isWord c = c `elem` [' ', '\'', ',', ';']
+
+text2terms :: Lang -> Text -> Terms
+text2terms lang txt = Terms label stems
+  where
+    label = splitOn (pack " ") txt
+    stems = S.fromList $ map (stem lang) label
+
+  --monograms :: Text -> [Text]
+--monograms xs = monograms $ toLower $ filter isGram xs
+
+isGram :: Char -> Bool
+isGram  c  = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
+
--- a/src/Gargantext/Text/Ngrams/Stem.hs
+++ b/src/Gargantext/Text/Ngrams/Stem.hs
@@ -17,14 +17,16 @@ Source : https://en.wikipedia.org/wiki/Stemming

 -}

+{-# LANGUAGE NoImplicitPrelude #-}

-module Gargantext.Text.Ngrams.Stem
+module Gargantext.Text.Terms.Mono.Stem (stem, Lang(..))
  where

 import Data.Text (Text)
 import qualified Data.Text   as DT
 import qualified NLP.Stemmer as N

+import Gargantext.Prelude
 import Gargantext.Core (Lang(..))

 -- (stem, Stemmer(..))
@@ -38,7 +40,6 @@ import Gargantext.Core (Lang(..))
 -- A stemmer for English, for example, should identify the string "cats"
 -- (and possibly "catlike", "catty" etc.) as based on the root "cat".

-
 -- and
 -- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
 -- algorithm reduces the words "fishing", "fished", and "fisher" to the
@@ -48,7 +49,6 @@ import Gargantext.Core (Lang(..))
 -- "arguments" reduce to the stem "argument".


-
 stem :: Lang -> Text -> Text
 stem lang = DT.pack . N.stem lang' . DT.unpack
  where

--- a/src/Gargantext/Text/Ngrams/Stem/En.hs
+++ b/src/Gargantext/Text/Ngrams/Stem/En.hs
@@ -17,7 +17,7 @@ Adapted from:

 {-# LANGUAGE NoImplicitPrelude #-}

-module Gargantext.Text.Ngrams.Stem.En (stem)
+module Gargantext.Text.Terms.Mono.Stem.En (stemIt)
  where

 import Control.Monad
@@ -196,8 +196,8 @@ step5 = step5b . step5a
 allSteps :: [Char] -> [Char]
 allSteps = step5 . step4 . step3 . step2 . step1

-stem :: Text -> Text
-stem s = pack (stem' $ unpack s)
+stemIt :: Text -> Text
+stemIt s = pack (stem' $ unpack s)

 stem' :: [Char] -> [Char]
 stem' s | length s < 3 = s

--- a/src/Gargantext/Text/Ngrams/Token.hs
+++ b/src/Gargantext/Text/Ngrams/Token.hs
@@ -17,17 +17,26 @@ Source: https://en.wikipedia.org/wiki/Tokenize

 {-# LANGUAGE NoImplicitPrelude #-}

-module Gargantext.Text.Ngrams.Token (tokenize)
+module Gargantext.Text.Terms.Mono.Token (tokenize)
  where

 import Data.Text (Text)
-import qualified Gargantext.Text.Ngrams.Token.Text as En
+import qualified Gargantext.Text.Terms.Mono.Token.En as En
+
+import Gargantext.Core (Lang(..))
+import Gargantext.Prelude

 type Token = Text

 -- >>> tokenize "A rose is a rose is a rose."
 -- ["A","rose","is","a","rose","is","a","rose", "."]
 -- 
+
+data Context = Letter | Word | Sentence | Line | Paragraph
+
 tokenize :: Text -> [Token]
 tokenize = En.tokenize

+tokenize' :: Lang -> Context -> [Token]
+tokenize' = undefined
+
--- a/src/Gargantext/Text/Ngrams/Token/Text.hs
+++ b/src/Gargantext/Text/Ngrams/Token/Text.hs
@@ -7,13 +7,13 @@ Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX

-Inspired from https://bitbucket.org/gchrupala/lingo/overview
+First inspired from https://bitbucket.org/gchrupala/lingo/overview
 -}

-
+{-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Text.Ngrams.Token.Text
+module Gargantext.Text.Terms.Mono.Token.En
    ( EitherList(..)
    , Tokenizer
    , tokenize
@@ -30,13 +30,18 @@ module Gargantext.Text.Ngrams.Token.Text
    )
  where

+import Data.Foldable (concatMap)
 import qualified Data.Char as Char
 import Data.Maybe
 import Control.Monad
+import Control.Applicative (Applicative)

 import Data.Text (Text)
 import qualified Data.Text as T

+import Data.Either
+import Gargantext.Prelude
+
 -- | A Tokenizer is function which takes a list and returns a list of Eithers
 --  (wrapped in a newtype). Right Texts will be passed on for processing
 --  to tokenizers down

--- a/src/Gargantext/Text/Terms/Multi.hs
+++ b/src/Gargantext/Text/Terms/Multi.hs
+{-|
+Module      : Gargantext.Text.Terms.Multi
+Description : Multi Terms module
+Copyright   : (c) CNRS, 2017 - present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Multi-terms are ngrams where n > 1.
+
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+
+module Gargantext.Text.Terms.Multi (multiterms)
+  where
+
+import Data.Text hiding (map, group, filter, concat)
+import Data.List (concat)
+
+import Gargantext.Prelude
+import Gargantext.Core (Lang(..))
+import Gargantext.Core.Types
+
+import Gargantext.Text.Terms.Multi.PosTagging
+import qualified Gargantext.Text.Terms.Multi.Lang.En as En
+import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr
+
+multiterms :: Lang -> Text -> IO [Terms]
+multiterms lang txt = concat
+                   <$> map (map tokenTag2terms)
+                   <$> map (filter (\t -> _my_token_pos t == Just NP)) 
+                   <$> tokenTags lang txt
+
+tokenTag2terms :: TokenTag -> Terms
+tokenTag2terms (TokenTag w t _ _) =  Terms w t
+
+tokenTags :: Lang -> Text -> IO [[TokenTag]]
+tokenTags lang s = map (group lang) <$> tokenTags' lang s
+
+
+tokenTags' :: Lang -> Text -> IO [[TokenTag]]
+tokenTags' lang t =  map tokens2tokensTags
+                     <$> map _sentenceTokens
+                     <$> _sentences
+                     <$> corenlp lang t
+
+---- | This function analyses and groups (or not) ngrams according to
+----   specific grammars of each language.
+group :: Lang -> [TokenTag] -> [TokenTag]
+group EN = En.group
+group FR = Fr.group
+
--- a/src/Gargantext/Text/Terms/Multi/Group.hs
+++ b/src/Gargantext/Text/Terms/Multi/Group.hs
+{-|
+Module      : Gargantext.Text.Terms.Multi.Group
+Description : English Grammar rules to group postag tokens.
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Rule-based grammars are computed in this english module in order to
+group the tokens into extracted terms.
+
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+module Gargantext.Text.Terms.Multi.Group (group2)
+  where
+
+import Data.Maybe (Maybe(Just))
+
+import Gargantext.Core.Types
+import Gargantext.Prelude
+
+-- | FIXME p1 and p2 not really taken into account
+group2 :: POS -> POS -> [TokenTag] -> [TokenTag]
+group2 p1 p2 (x@(TokenTag _ _ (Just p1') _):y@(TokenTag _ _ (Just p2') _):z) =
+  if (p1 == p1') && (p2 == p2')
+     then group2 p1 p2 (x<>y : z)
+     else (x : group2 p1 p2 (y:z))
+group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
+group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x]
+group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
+group2 _ _ [] = []
+
+
+
--- a/src/Gargantext/Text/Terms/Multi/Lang/En.hs
+++ b/src/Gargantext/Text/Terms/Multi/Lang/En.hs
+{-|
+Module      : Gargantext.Text.Terms.Multi.Lang.En
+Description : English Grammar rules to group postag tokens.
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Rule-based grammars are computed in this english module in order to group
+the tokens into extracted terms.
+
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+module Gargantext.Text.Terms.Multi.Lang.En (group)
+  where
+
+import Gargantext.Prelude
+import Gargantext.Core.Types
+import Gargantext.Text.Terms.Multi.Group
+
+------------------------------------------------------------------------
+-- | Rule grammar to group tokens
+group :: [TokenTag] -> [TokenTag]
+group [] = []
+group ntags = group2 NP NP
+           $ group2 NP VB
+           $ group2 NP IN
+           $ group2 IN DT
+           $ group2 VB NP
+           $ group2 JJ NP
+           $ group2 JJ JJ
+           $ group2 JJ CC
+           $ ntags
+
+------------------------------------------------------------------------
+--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
+--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
+--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs)         = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
+--
+--groupNgrams (x:xs)                                          = (x:(groupNgrams xs))
+
--- a/src/Gargantext/Text/Terms/Multi/Lang/Fr.hs
+++ b/src/Gargantext/Text/Terms/Multi/Lang/Fr.hs
+{-|
+Module      : Gargantext.Text.Terms.Multi.Lang.Fr
+Description : French Grammar rules to group postag tokens.
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+This @group@ function groups horizontally ngrams in their context of
+sentence according to grammars specific of each language. In english, JJ
+is ADJectiv in french.
+
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+module Gargantext.Text.Terms.Multi.Lang.Fr (group)
+  where
+
+import Gargantext.Prelude
+import Gargantext.Core.Types
+import Gargantext.Text.Terms.Multi.Group (group2)
+
+group :: [TokenTag] -> [TokenTag]
+group [] = []
+group ntags = group2 NP NP
+            $ group2 NP VB
+            $ group2 NP IN
+            $ group2 IN DT
+            $ group2 VB NP
+            $ group2 JJ NP
+            $ group2 NP JJ
+            $ group2 JJ JJ
+            $ group2 JJ CC
+            $ ntags
+
+------------------------------------------------------------------------
+-- TODO
+--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs)             = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
+--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--- a/src/Gargantext/Text/Ngrams/PosTagging/CoreNLP.hs
+++ b/src/Gargantext/Text/Ngrams/PosTagging/CoreNLP.hs
 {-|
-Module      : Gargantext.Text.Ngrams.PosTagging.CoreNLP
-Description : CoreNLP module
+Module      : Gargantext.Text.Terms.Multi.PosTagging
+Description : PosTagging module using Stanford java REST API
 Copyright   : (c) CNRS, 2017
 License     : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX

+In corpus linguistics, part-of-speech tagging (POS tagging or PoS
+tagging or POST), also called grammatical tagging or word-category
+disambiguation, is the process of marking up a word in a text (corpus)
+as corresponding to a particular part of speech,[1] based on both its
+definition and its context—i.e., its relationship with adjacent and
+related words in a phrase, sentence, or paragraph. A simplified form of
+this is commonly taught to school-age children, in the identification of
+words as nouns, verbs, adjectives, adverbs, etc.
+
+Source: https://en.wikipedia.org/wiki/Part-of-speech_tagging
 -}

 {-# LANGUAGE DataKinds         #-}
 {-# LANGUAGE DeriveGeneric     #-}
+{-# LANGUAGE FlexibleContexts  #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE TemplateHaskell   #-}
 {-# LANGUAGE TypeOperators     #-}
-{-# LANGUAGE FlexibleContexts  #-}

-
-module Gargantext.Text.Ngrams.PosTagging.CoreNLP
+module Gargantext.Text.Terms.Multi.PosTagging
  where

 import GHC.Generics
@@ -26,14 +35,14 @@ import GHC.Show (Show(..))
 import Data.ByteString.Lazy.Internal (ByteString)
 import Data.Aeson.TH (deriveJSON)
 import Data.Aeson
-import Data.Monoid
 import Data.Maybe (isJust)

-import Data.Set (Set, fromList, empty)
+import Data.Set (fromList)

-import Data.Text (Text, splitOn, pack, toLower, unpack)
+import Data.Text (Text, splitOn, pack, toLower)

 import Gargantext.Core (Lang(..))
+import Gargantext.Core.Types
 import Gargantext.Core.Utils.Prefix (unPrefix)
 import Gargantext.Prelude

@@ -44,53 +53,6 @@ import Control.Monad.IO.Class  (MonadIO)
 import Data.String.Conversions (ConvertibleStrings)

 ------------------------------------------------------------------------
-data Tag = POS | NER
-  deriving (Show, Eq)
------------------------------------------------------------------------
------------------------------------------------------------------------
-data POS = NP
-         | JJ  | VB
-         | CC  | IN | DT
-         | NoPos
-  deriving (Show, Generic, Eq)
-
------------------------------------------------------------------------
-instance FromJSON POS where
-  parseJSON = withText "String" (\x -> pure (pos $ unpack x))
-    where
-      pos :: [Char] -> POS
-      pos "NP"  = NP
-      pos "NN"  = NP
-      pos "NC"  = NP
-      pos "NNS" = NP
-      pos "NNP" = NP
-      pos "JJ"  = JJ
-      pos "ADJ" = JJ
-      pos "VB"  = VB
-      pos "VBN" = VB
-      pos "VBG" = VB
-      pos "CC"  = CC
-      pos "IN"  = IN
-      pos "DT"  = DT
-      -- French specific
-      pos "P"  = IN
-      pos  _    = NoPos
-
-instance ToJSON POS
------------------------------------------------------------------------
-data NER = PERSON | ORGANIZATION | LOCATION | NoNER
-  deriving (Show, Generic)
------------------------------------------------------------------------
-instance FromJSON NER where
-  parseJSON = withText "String" (\x -> pure (ner $ unpack x))
-    where
-      ner :: [Char] -> NER
-      ner "PERSON"       = PERSON
-      ner "ORGANIZATION" = ORGANIZATION
-      ner "LOCATION"     = LOCATION
-      ner  _             = NoNER
-
-instance ToJSON NER
 ------------------------------------------------------------------------
 data Token = Token { _tokenIndex                :: Int
                   , _tokenWord                 :: Text
@@ -105,39 +67,21 @@ data Token = Token { _tokenIndex                :: Int
                   } deriving (Show, Generic)
 $(deriveJSON (unPrefix "_token") ''Token)
 ------------------------------------------------------------------------
-data NgramsTag  = NgramsTag { _my_token_word :: [Text]
-                            , _my_token_stem :: Set Text
-                            , _my_token_pos  :: Maybe POS
-                            , _my_token_ner  :: Maybe NER
-                            } deriving (Show)
 ------------------------------------------------------------------------
-tokens2ngramsTags :: [Token] -> [NgramsTag]
-tokens2ngramsTags ts = select $ map ngramsTag ts
+tokens2tokensTags :: [Token] -> [TokenTag]
+tokens2tokensTags ts = filter' $ map tokenTag ts
 ------------------------------------------------------------------------
-ngramsTag :: Token -> NgramsTag
-ngramsTag (Token _ _ w s _ _ p n _ _) = NgramsTag w' s' p n
+tokenTag :: Token -> TokenTag
+tokenTag (Token _ _ w s _ _ p n _ _) = TokenTag w' s' p n
  where
    w' = split w
    s' = fromList (split s)
    split = splitOn (pack " ") . toLower

-select :: [NgramsTag] -> [NgramsTag]
-select xs = filter isNgrams xs
+filter' :: [TokenTag] -> [TokenTag]
+filter' xs = filter isNgrams xs
    where
-      isNgrams (NgramsTag _ _ p n) = isJust p || isJust n
-
-instance Monoid NgramsTag where
-  mempty = NgramsTag [] empty Nothing Nothing
-
-  mappend (NgramsTag w1 s1 p1 n1) (NgramsTag w2 s2 p2 _)
-         = NgramsTag (w1 <> w2) (s1 <> s2) p3 n1
-          where
-            p3 = case (p1,p2) of
-                   (Just JJ, Just NP) -> Just NP
-                   (Just VB, Just NP) -> Just NP
-                   _                  -> p1
-
-  mconcat = foldl mappend mempty
+      isNgrams (TokenTag _ _ p n) = isJust p || isJust n

 ------------------------------------------------------------------------
 data Sentence  = Sentence { _sentenceIndex :: Int

--- a/src/Gargantext/Viz/Graph.hs
+++ b/src/Gargantext/Viz/Graph.hs
+{-|
+Module      : Gargantext.Viz.Graph
+Description : 
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE TemplateHaskell   #-}
+{-# LANGUAGE DeriveGeneric     #-}
+
+module Gargantext.Viz.Graph
+  where
+
+import GHC.Generics (Generic)
+import Data.Aeson.TH (deriveJSON)
+import Data.Text (Text)
+
+import Gargantext.Prelude
+import Gargantext.Core.Utils.Prefix (unPrefix)
+
+-----------------------------------------------------------
+data TypeNode = Terms | Unknown
+  deriving (Show, Generic)
+
+$(deriveJSON (unPrefix "") ''TypeNode)
+
+data Attributes = Attributes { clust_default :: Int }
+  deriving (Show, Generic)
+$(deriveJSON (unPrefix "") ''Attributes)
+
+data Node = Node { n_size :: Int
+                 , n_type :: TypeNode
+                 , n_id   :: Text
+                 , n_label :: Text
+                 , n_attributes :: Attributes
+                 }
+  deriving (Show, Generic)
+$(deriveJSON (unPrefix "n_") ''Node)
+
+data Edge = Edge { e_source :: Int
+                 , e_target :: Int
+                 , e_weight :: Double
+                 , e_id     :: Int
+                 }
+  deriving (Show, Generic)
+$(deriveJSON (unPrefix "e_") ''Edge)
+
+data Graph = Graph { g_nodes :: [Node]
+                   , g_edges :: [Edge]
+                   }
+  deriving (Show, Generic)
+$(deriveJSON (unPrefix "g_") ''Graph)
+
+-----------------------------------------------------------
+
+
+
+
+
+
+
+
--- a/src/Gargantext/Text/Ngrams/PosTagging.hs
+++ b/src/Gargantext/Text/Ngrams/PosTagging.hs
 {-|
-Module      : Gargantext.Text.Ngrams.PosTagging
-Description : All parsers of Gargantext in one file.
-Copyright   : (c) CNRS, 2017 - present
+Module      : Gargantext.Graph.Distances
+Description : Distance management tools
+Copyright   : (c) CNRS, 2017-Present
 License     : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX

-Mainly reexport functions in @Data.Text.Metrics@
 -}

 {-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE FlexibleContexts  #-}
+{-# LANGUAGE Strict            #-}

-module Gargantext.Text.Ngrams.PosTagging
+module Gargantext.Viz.Graph.Distances
  where


+
+
+
+
+
+
--- a/src/Gargantext/Viz/Graph/Distances/Conditional.hs
+++ b/src/Gargantext/Viz/Graph/Distances/Conditional.hs
+{-|
+Module      : Gargantext.Graph.Distances.Conditional
+Description : 
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Motivation and definition of the @Conditional@ distance.
+-}
+
+{-# LANGUAGE BangPatterns      #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE FlexibleContexts  #-}
+{-# LANGUAGE Strict            #-}
+module Gargantext.Viz.Graph.Distances.Conditional
+  where
+
+import Data.Matrix hiding (identity)
+import Data.String.Conversions (ConvertibleStrings(..))
+
+import Data.List (concat, sortOn)
+import qualified Data.List as L
+
+import Data.Map (Map)
+import qualified Data.Map as M
+
+import Data.Set (Set)
+import qualified Data.Set as S
+
+import Data.Vector (Vector)
+import qualified Data.Vector as V
+
+import Gargantext.Prelude
+import Gargantext.Viz.Graph.Utils
+
+------------------------------------------------------------------------
+------------------------------------------------------------------------
+-- | Optimisation issue
+
+toBeOptimized :: (Num a, Fractional a, Ord a) => Matrix a -> Matrix a
+toBeOptimized m = proba Col m
+
+------------------------------------------------------------------------
+-- | Main Functions
+-- Compute the probability from axis
+-- x' = x / (sum Col x)
+proba :: (Num a, Fractional a) => Axis -> Matrix a -> Matrix a
+proba a m = mapOn a (\c x -> x / V.sum (axis a c m)) m
+
+
+mapOn :: Axis -> (AxisId -> a -> a) -> Matrix a -> Matrix a
+mapOn a f m = V.foldl' f'  m (V.enumFromTo 1 (nOf a m))
+  where
+    f' m' c = mapOnly a f c m'
+
+mapOnly :: Axis -> (AxisId -> a -> a) -> AxisId -> Matrix a -> Matrix a
+mapOnly Col = mapCol
+mapOnly Row = mapRow
+
+mapAll :: (a -> a) -> Matrix a -> Matrix a
+mapAll f m = mapOn Col (\_ -> f) m
+
+
+---------------------------------------------------------------
+-- | Compute a distance from axis
+-- xs = (sum Col x') - x'
+distFromSum :: (Num a, Fractional a)
+         => Axis -> Matrix a -> Matrix a
+distFromSum  a m = mapOn a (\c x -> V.sum (axis a c m) - x) m
+---------------------------------------------------------------
+---------------------------------------------------------------
+-- | To compute included/excluded or specific/generic scores
+opWith  :: (Fractional a1, Num a1)
+         => (Matrix a2 -> t -> Matrix a1) -> Matrix a2 -> t -> Matrix a1
+opWith op xs ys = mapAll (\x -> x / (2*n -1)) (xs `op` ys)
+  where
+    n = fromIntegral $ nOf Col xs
+---------------------------------------------------------------
+
+
+-------------------------------------------------------
+conditional :: (Num a, Fractional a, Ord a) => Matrix a -> Matrix a
+conditional m = filter (threshold m') m'
+  where
+------------------------------------------------------------------------
+    -- | Main Operations
+    -- x' = x / (sum Col x)
+    x' = proba Col m
+
+------------------------------------------------------------------------
+    -- xs = (sum Col x') - x'
+    xs = distFromSum Col x'
+    -- ys = (sum Row x') - x'
+    ys = distFromSum Row x'
+
+------------------------------------------------------------------------
+--  | Top included or excluded
+    ie = opWith (+) xs ys
+--  ie = ( xs + ys) / (2 * (x.shape[0] - 1))
+
+--  | Top specific or generic
+    sg = opWith (-) xs ys
+--  sg = ( xs - ys) / (2 * (x.shape[0] - 1))
+    
+    nodes_kept :: [Int]
+    nodes_kept = take k' $ S.toList
+                         $ foldl' (\s (n1,n2) -> insert [n1,n2] s) S.empty
+                         $ map fst
+                         $ nodes_included k <> nodes_specific k
+
+    nodes_included n = take n $ sortOn snd $ toListsWithIndex ie
+    nodes_specific m = take m $ sortOn snd $ toListsWithIndex sg
+    insert as s = foldl' (\s' a -> S.insert a s') s as
+    k' = 2*k
+    k = 10
+    
+    dico_nodes :: Map Int Int
+    dico_nodes     = M.fromList $ zip [1..] nodes_kept
+    dico_nodes_rev = M.fromList $ zip nodes_kept [1..]
+
+    m' = matrix (length nodes_kept) 
+                (length nodes_kept) 
+                (\(i,j) -> getElem ((M.!) dico_nodes i) ((M.!) dico_nodes j) x')
+
+    threshold m = V.minimum $ V.map (\cId -> V.maximum $ getCol cId m) (V.enumFromTo 1 (nOf Col m))
+    
+    filter t m  = mapAll (\x -> filter' t x) m
+      where
+        filter' t x = case (x >= t) of
+                        True  -> x
+                        False -> 0
+
+------------------------------------------------------------------------
--- a/src/Gargantext/Viz/Graph/Distances/Distributional.hs
+++ b/src/Gargantext/Viz/Graph/Distances/Distributional.hs
+{-|
+Module      : Gargantext.Graph.Distances.Distributional
+Description : 
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Motivation and definition of the @Distributional@ distance.
+-}
+
+{-# LANGUAGE BangPatterns      #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE FlexibleContexts  #-}
+{-# LANGUAGE Strict            #-}
+
+
+module Gargantext.Viz.Graph.Distances.Distributional
+  where
+
+import Data.Matrix hiding (identity)
+import Data.String.Conversions (ConvertibleStrings(..))
+
+import Data.Map (Map)
+import qualified Data.Map as M
+
+import Data.Set (Set)
+import qualified Data.Set as S
+
+import Data.Vector (Vector)
+import qualified Data.Vector as V
+
+import Gargantext.Prelude
+import Gargantext.Viz.Graph.Utils
+
+
+distributional :: (Floating a, Ord a) => Matrix a -> [((Int, Int), a)]
+distributional m = filter (\((x,y), d) -> foldl' (&&) True (conditions x y d) ) distriList
+  where
+    conditions x y d  =  [ (x /= y)
+                         , (d > miniMax')
+                         , ((M.lookup (x,y) distriMap) > (M.lookup (y,x) distriMap))
+                         ]
+    distriList   = toListsWithIndex distriMatrix
+    distriMatrix = ri (mi m)
+    
+    distriMap    = M.fromList $ distriList
+    miniMax'     = miniMax distriMatrix
+
+ri :: (Ord a, Fractional a) => Matrix a -> Matrix a
+ri m = matrix c r doRi
+  where
+    doRi (x,y)     = doRi' x y m
+    doRi' x y mi'' = sumMin x y mi'' / (V.sum $ ax Col x y mi'')
+
+    sumMin x y mi' = V.sum $ V.map (\(a,b) -> min a b )
+                           $ V.zip (ax Col x y mi') (ax Row x y mi')
+    (c,r) = (nOf Col m, nOf Row m)
+
+
+mi :: (Ord a, Floating a) => Matrix a -> Matrix a
+mi m = matrix c r createMat
+  where
+    (c,r) = (nOf Col m, nOf Row m)
+    createMat (x,y) = doMi x y m
+    doMi x y m = if x == y then 0 else (max (log (doMi' x y m)) 0 )
+    
+    doMi' x y m = (getElem x y m) / ( cross x y m / total m )
+    
+    cross x y m = (V.sum $ ax Col x y m) * (V.sum $ ax Row x y m)
+    
+
+
+ax :: Axis -> Int -> Int -> Matrix a -> Vector a
+ax a  i j m  = dropAt j' $ axis a i' m
+                  where
+                    i' = div i c + 1
+                    j' = mod r j + 1
+                    (c,r) = (nOf Col m, nOf Row m)
+
+miniMax :: (Ord a) => Matrix a -> a
+miniMax m = V.minimum $ V.map (\c -> V.maximum $ getCol c m) (V.enumFromTo 1 (nOf Col m))
+
+
--- a/src/Gargantext/Viz/Graph/Distances/Matrice.hs
+++ b/src/Gargantext/Viz/Graph/Distances/Matrice.hs
+{-|
+Module      : Gargantext.Graph.Distances.Matrix
+Description : 
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Motivation and definition of the @Conditional@ distance.
+
+Implementation use Accelerate library :
+  * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
+    [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
+    In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
+
+  * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
+    [Optimising Purely Functional GPU Programs][MCKL13].
+    In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
+
+  * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
+    [Embedding Foreign Code][CMCK14].
+    In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
+
+  * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
+    [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
+    In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
+
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE FlexibleContexts  #-}
+{-# LANGUAGE TypeFamilies      #-}
+{-# LANGUAGE TypeOperators     #-}
+
+module Gargantext.Viz.Graph.Distances.Matrice
+  where
+
+import Data.Array.Accelerate
+import Data.Array.Accelerate.Interpreter (run)
+import Data.Array.Accelerate.Smart
+import Data.Array.Accelerate.Type
+import Data.Array.Accelerate.Array.Sugar (fromArr, Array, Z)
+
+import Data.Maybe (Maybe(Just))
+import qualified Gargantext.Prelude as P
+import qualified Data.Array.Accelerate.Array.Representation     as Repr
+
+import Gargantext.Text.Metrics.Occurrences
+
+
+-----------------------------------------------------------------------
+-- Test perf.
+distriTest = distributional $ myMat 100
+-----------------------------------------------------------------------
+
+vector :: Int -> (Array (Z :. Int) Int)
+vector n = fromList (Z :. n) [0..n]
+
+matrix :: Elt c => Int -> [c] -> Matrix c
+matrix n l = fromList (Z :. n :. n) l
+
+myMat :: Int -> Matrix Int
+myMat n = matrix n [1..]
+
+-- | Two ways to get the rank (as documentation)
+rank :: (Matrix a) -> Int
+rank m = arrayRank $ arrayShape m
+
+rank' :: (Matrix a) -> Int
+rank' m = n
+  where
+    Z :. _ :. n = arrayShape m
+
+-----------------------------------------------------------------------
+-- | Conditional Distance
+
+type Rank = Int
+
+proba :: Rank -> Acc (Matrix Double) -> Acc (Matrix Double)
+proba r mat = zipWith (/) mat (mkSum r mat)
+
+mkSum :: Rank -> Acc (Matrix Double) -> Acc (Matrix Double)
+mkSum r mat = replicate (constant (Z :. (r :: Int) :. All)) 
+            $ fold (+) 0 mat
+
+
+type Matrix' a = Acc (Matrix a)
+type InclusionExclusion    = Double
+type SpecificityGenericity = Double
+
+conditional :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
+conditional m = (run $ ie (use m), run $ sg (use m))
+  where
+
+    ie :: Matrix' Double -> Matrix' Double
+    ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
+    sg :: Acc (Matrix Double) -> Acc (Matrix Double)
+    sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
+
+    n :: Exp Double
+    n = P.fromIntegral r
+
+    r :: Rank
+    r = rank' m
+
+    xs :: Matrix' Double -> Matrix' Double
+    xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
+    ys :: Acc (Matrix Double) -> Acc (Matrix Double)
+    ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)
+
+-- filter with threshold
+-----------------------------------------------------------------------
+
+-- | Distributional Distance
+
+distributional :: Matrix Int -> Matrix Double
+distributional m = run $ filter $ ri (map fromIntegral $ use m)
+  where
+    n    = rank' m
+    
+    miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
+      where
+        miniMax' = (the $ minimum $ maximum m)
+    
+    filter  m = zipWith (\a b -> max a b) m (transpose m)
+    
+    ri mat = zipWith (/) mat1 mat2
+      where
+        mat1 = mkSum n $ zipWith min (mi mat) (mi $ transpose mat)
+        mat2 = mkSum n mat
+    
+    mi    m'  = zipWith (\a b -> max (log $ a/b) 0)  m'
+              $ zipWith (/) (crossProduct m') (total m')
+
+    total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
+    
+    crossProduct m = zipWith (*) (cross m  ) (cross (transpose m))
+    cross mat      = zipWith (-) (mkSum n mat) (mat)
+
+
+int2double :: Matrix Int -> Matrix Double
+int2double m = run (map fromIntegral $ use m)
+
--- a/src/Gargantext/Viz/Graph/Index.hs
+++ b/src/Gargantext/Viz/Graph/Index.hs
+{-|
+Module      : Gargantext.Graph.Distances.Utils
+Description : Tools to compute distances from Cooccurrences
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Basically @compute@ takes an accelerate function as first input, a Map
+of coccurrences as second input and outputs a Map automatically using
+indexes.
+
+TODO:
+--cooc2fgl :: Ord t, Integral n => Map (t, t) n -> Graph
+--fgl2json
+
+-}
+
+{-# LANGUAGE BangPatterns      #-}
+{-# LANGUAGE FlexibleContexts  #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE TypeOperators     #-}
+
+
+module Gargantext.Viz.Graph.Index
+  where
+
+import qualified Data.Array.Accelerate as A
+import qualified Data.Array.Accelerate.Interpreter as A
+import Data.Array.Accelerate (Matrix, Elt, Shape, (:.)(..), Z(..))
+
+import qualified Data.Vector.Unboxed as DVU
+import Data.Maybe (fromMaybe)
+
+import Data.Set (Set)
+import qualified Data.Set as S
+
+import Data.Map (Map)
+import qualified Data.Map.Strict    as M
+
+import Gargantext.Prelude
+
+type Index    = Int
+
+-------------------------------------------------------------------------------
+-------------------------------------------------------------------------------
+score :: (Ord t) => (A.Matrix Int -> A.Matrix Double)
+                -> Map (t, t) Int
+                -> Map (t, t) Double
+score f m = fromIndex fromI . mat2map . f $ cooc2mat toI m
+  where
+    (toI, fromI) = createIndexes m
+
+-------------------------------------------------------------------------------
+-------------------------------------------------------------------------------
+cooc2mat :: Ord t => Map t Index -> Map (t, t) Int -> Matrix Int
+cooc2mat ti m = map2mat 0 n idx
+  where
+    n = M.size ti
+    idx = toIndex ti m -- it is important to make sure that toIndex is ran only once.
+
+map2mat :: Elt a => a -> Int -> Map (Index, Index) a -> Matrix a
+map2mat def n m = A.fromFunction shape (\(Z :. x :. y) -> fromMaybe def $ M.lookup (x, y) m)
+  where
+    shape = (Z :. n :. n)
+
+mat2map :: (Elt a, Shape (Z :. Index)) =>
+            A.Array (Z :. Index :. Index) a -> Map (Index, Index) a
+mat2map m = M.fromList . map f . A.toList . A.run . A.indexed $ A.use m
+  where
+    Z :. _ :. n = A.arrayShape m
+    f ((Z :. i :. j), x) = ((i, j), x)
+
+-------------------------------------------------------------------------------
+-------------------------------------------------------------------------------
+toIndex :: Ord t => Map t Index -> Map (t,t) a -> Map (Index,Index) a
+toIndex ni ns = indexConversion ni ns
+
+fromIndex :: Ord t => Map Index t -> Map (Index, Index) a -> Map (t,t) a
+fromIndex ni ns = indexConversion ni ns
+---------------------------------------------------------------------------------
+indexConversion :: (Ord b, Ord k) => Map k b -> Map (k,k) a -> Map (b, b) a
+indexConversion index ms = M.fromList $ map (\((k1,k2),c) -> ( ((M.!) index k1, (M.!) index k2), c)) (M.toList ms)
+-------------------------------------------------------------------------------
+-------------------------------------------------------------------------------
+createIndexes :: Ord t => Map (t, t) b -> (Map t Index, Map Index t)
+createIndexes = set2indexes . cooc2set
+  where
+    cooc2set :: Ord t => Map (t, t) a -> Set t
+    cooc2set cs' = foldl' (\s ((t1,t2),_) -> insert [t1,t2] s ) S.empty (M.toList cs')
+      where
+        insert as s = foldl' (\s' t -> S.insert t s') s as
+
+    set2indexes :: Ord t => Set t -> (Map t Index, Map Index t)
+    set2indexes s = (M.fromList toIndex', M.fromList fromIndex')
+      where
+        fromIndex' = zip [0..] xs
+        toIndex'   = zip xs [0..]
+        xs         = S.toList s
+
+
--- a/src/Gargantext/Viz/Graph/Utils.hs
+++ b/src/Gargantext/Viz/Graph/Utils.hs
+{-|
+Module      : Gargantext.Graph.Distances.Utils
+Description : 
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+These functions are used for Vector.Matrix only.
+
+-}
+
+{-# LANGUAGE BangPatterns      #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE FlexibleContexts  #-}
+{-# LANGUAGE Strict            #-}
+
+module Gargantext.Viz.Graph.Utils
+  where
+
+import Data.Matrix hiding (identity)
+
+import Data.Map (Map)
+import qualified Data.Map    as M
+
+import Data.Set (Set)
+import qualified Data.Set    as S
+
+import Data.Vector (Vector)
+import qualified Data.Vector as V
+
+import qualified Data.List   as L
+import Gargantext.Prelude
+
+------------------------------------------------------------------------
+-- | Some utils to build the matrix from cooccurrence results
+
+-- | For tests only, to be removed
+-- m1 :: Matrix Double
+-- m1 = fromList 300 300 [1..]
+------------------------------------------------------------------------
+------------------------------------------------------------------------
+data Axis = Col | Row
+------------------------------------------------------------------------
+-- | Matrix functions
+type AxisId = Int
+
+-- Data.Vector.Additions
+dropAt :: Int -> Vector a -> Vector a
+dropAt n v = debut <> (V.tail fin)
+  where
+    debut = V.take n v
+    fin   = V.drop n v
+
+total :: Num a => Matrix a -> a
+total m = V.sum $ V.map (\c -> V.sum (getCol c m)) (V.enumFromTo 1 (nOf Col m))
+
+nOf :: Axis -> Matrix a -> Int
+nOf Row = nrows
+nOf Col = ncols
+
+axis :: Axis -> AxisId -> Matrix a -> Vector a
+axis Col = getCol
+axis Row = getRow
+
+
+toListsWithIndex :: Matrix a ->  [((Int, Int), a)]
+toListsWithIndex m = concat' $ zip [1..] $ map (\c -> zip [1..] c) $ toLists m
+  where
+    concat' :: [(Int, [(Int, a)])] -> [((Int, Int), a)]
+    concat' xs = L.concat $ map (\(x, ys) -> map (\(y, a) -> ((x,y), a)) ys ) xs
+
+
--- a/stack.yaml
+++ b/stack.yaml
@@ -3,6 +3,7 @@ extra-package-dbs: []
 packages:
 - .
 - servant-job
+- clustering-louvain

 allow-newer: true
 extra-deps:
@@ -12,11 +13,14 @@ extra-deps:
  commit: 6f0595d2421005837d59151a8b26eee83ebb67b5
 - git: https://github.com/delanoe/servant-static-th.git
  commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434
+- accelerate-1.2.0.0
+- accelerate-io-1.2.0.0
 - aeson-1.2.4.0
 - aeson-lens-0.5.0.0
 - duckling-0.1.3.0
 - extra-1.5.3
 - full-text-search-0.2.1.4
+- fullstop-0.1.4
 - haskell-src-exts-1.18.2
 - http-types-0.12.1
 - protolude-0.2
@@ -32,4 +36,4 @@ extra-deps:
 - text-1.2.3.0
 - text-show-3.6.2
 - servant-flatten-0.2
-resolver: lts-10.6
+resolver: lts-11.10
--- a/swagger.json
+++ b/swagger.json