[ngrams] use the corenlp-types library

Now we no longer keep custom CoreNLP types in our own code, which makes
it more portable. Spacy and JohnSnowNLP are likewise refactored to use
their own types.
parent bdeed282
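
In short: each NLP backend now produces `[[TokenTag]]` directly, so the shared `PosSentences` intermediate and the `tokenTagsWith` adapter disappear. A condensed sketch of the new dispatch, with names taken from the diffs below (the real code pattern-matches EN and FR separately; this excerpt is not compilable on its own):

-- Condensed sketch of the new flow (names from the diffs below):
tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
tokenTags (NLPServerConfig { server = CoreNLP, url }) lang txt = do
  tt <- CoreNLP.coreNLPTokenTags url lang txt  -- new corenlp-types-based module
  pure $ map (groupTokens lang) tt
tokenTags (NLPServerConfig { server = Spacy, url }) lang txt = do
  tt <- SpacyNLP.nlpTokenTags url txt          -- Spacy now builds TokenTag directly
  pure $ map (groupTokens lang) tt
tokenTags _ lang _ = panic $ "Lang NLP API not implemented yet " <> show lang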
@@ -11,7 +11,7 @@ STORE_DIR="${1:-$DEFAULT_STORE}"
 # `expected_cabal_project_freeze_hash` with the
 # `sha256sum` result calculated on the `cabal.project` and `cabal.project.freeze`.
 # This ensures the files stay deterministic so that CI cache can kick in.
-expected_cabal_project_hash="f2efe6832145c093cfe16832b17b06cd4e2d94e85dd0390e713c46c40ee9e461"
+expected_cabal_project_hash="eb74b26c841d56bfeb59506b95ebb964f90d9255c64e48580f56dfa57f3d4a6c"
 expected_cabal_project_freeze_hash="796f0109611f3381278b1885ae1fa257c4177b99885eb04701938f1107c06ee5"

 cabal --store-dir=$STORE_DIR v2-update 'hackage.haskell.org,2023-11-23T20:05:40Z'
...
 -- Generated by stack2cabal
-index-state: 2023-12-04T09:05:40Z
+index-state: 2023-11-23T20:05:40Z
 with-compiler: ghc-8.10.7
@@ -76,6 +76,11 @@ source-repository-package
     location: https://gitlab.iscpif.fr/cgenie/patches-class.git
     tag: 125c7cb90ab8f0cd6ac4a526dbdf302d10c945e9

+source-repository-package
+    type: git
+    location: https://gitlab.iscpif.fr/gargantext/corenlp-types
+    tag: 429117fad5f8414d8eaa85b5f3c474c9d003cfeb
+
 source-repository-package
     type: git
     location: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git
@@ -141,6 +146,11 @@ source-repository-package
     location: https://gitlab.iscpif.fr/gargantext/iso639.git
     tag: eab929d106833ded8011a0d6705135e3fc506a9c

+source-repository-package
+    type: git
+    location: https://gitlab.iscpif.fr/gargantext/servant-xml-conduit.git
+    tag: 339fd608341bd2652cf5c0e9e76a3293acffbea6
+
 source-repository-package
     type: git
     location: https://github.com/haskell-servant/servant.git
...
@@ -273,7 +273,7 @@ library
       Gargantext.Core.Text.Terms.Mono.Token
       Gargantext.Core.Text.Terms.Mono.Token.En
       Gargantext.Core.Text.Terms.Multi.Group
-      Gargantext.Core.Text.Terms.Multi.PosTagging
+      Gargantext.Core.Text.Terms.Multi.CoreNLP
       Gargantext.Core.Text.Terms.Multi.PosTagging.Types
       Gargantext.Core.Text.Upload
      Gargantext.Core.Types.Search
@@ -426,6 +426,7 @@ library
      , conduit-extra ^>= 1.3.5
      , containers ^>= 0.6.5.1
      , contravariant ^>= 1.5.5
+     , corenlp-types ^>= 0.1.0.0
      , crawlerArxiv
      , crawlerHAL
      , crawlerISTEX
...
@@ -13,7 +13,11 @@ Multi-terms are ngrams where n > 1.
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
+module Gargantext.Core.Text.Terms.Multi
+  ( multiterms
+  , multiterms_rake
+  , tokenTags
+  , cleanTextForNLP )
   where

 import Control.Applicative
@@ -22,8 +26,7 @@ import Data.Text hiding (map, group, filter, concat)
 import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
 import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En
 import Gargantext.Core.Text.Terms.Multi.Lang.Fr qualified as Fr
-import Gargantext.Core.Text.Terms.Multi.PosTagging
-import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
+import Gargantext.Core.Text.Terms.Multi.CoreNLP qualified as CoreNLP
 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
 import Gargantext.Core.Types
 import Gargantext.Core.Utils (groupWithCounts)
@@ -31,9 +34,6 @@ import Gargantext.Prelude
 import Gargantext.Utils.SpacyNLP qualified as SpacyNLP
 import Replace.Attoparsec.Text as RAT

--------------------------------------------------------------------
-type NLP_API = Lang -> Text -> IO PosSentences
-
 -------------------------------------------------------------------
 multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
 multiterms nsc l txt = do
@@ -54,14 +54,20 @@ multiterms nsc l txt = do
 -------------------------------------------------------------------
 tokenTag2terms :: TokenTag -> Terms
-tokenTag2terms (TokenTag ws t _ _) = Terms ws t
+tokenTag2terms (TokenTag ws t _ _ _ _) = Terms ws t

 tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
-tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = tokenTagsWith EN txt $ corenlp url
-tokenTags (NLPServerConfig { server = CoreNLP, url }) FR txt = tokenTagsWith FR txt $ corenlp url
+tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = do
+  tt <- CoreNLP.coreNLPTokenTags url EN txt
+  pure $ map (groupTokens EN) tt
+tokenTags (NLPServerConfig { server = CoreNLP, url }) FR txt = do
+  tt <- CoreNLP.coreNLPTokenTags url FR txt
+  pure $ map (groupTokens FR) tt
 tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
   -- printDebug "NLP Debug" txt
-  tokenTagsWith l txt $ SpacyNLP.nlp url
+  tt <- SpacyNLP.nlpTokenTags url txt
+  pure $ map (groupTokens l) tt
+  -- tokenTagsWith l txt $ SpacyNLP.nlp url
 -- tokenTags FR txt = do
 --   -- printDebug "[Spacy Debug]" txt
 --   if txt == ""
@@ -69,12 +75,6 @@ tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
 --   else tokenTagsWith FR txt SpacyNLP.nlp
 tokenTags _ l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (show l)

-tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
-tokenTagsWith lang txt nlp = map (groupTokens lang)
-                         <$> map tokens2tokensTags
-                         <$> map _sentenceTokens
-                         <$> _sentences
-                         <$> nlp lang txt
-
 ---- | This function analyses and groups (or not) ngrams according to
...
@@ -21,12 +21,12 @@ import Gargantext.Prelude
 -- | FIXME p1 and p2 not really taken into account
 group2 :: POS -> POS -> [TokenTag] -> [TokenTag]
-group2 p1 p2 (x@(TokenTag _ _ (Just p1') _):y@(TokenTag _ _ (Just p2') _):z) =
+group2 p1 p2 (x@(TokenTag _ _ (Just p1') _ _ _):y@(TokenTag _ _ (Just p2') _ _ _):z) =
   if (p1 == p1') && (p2 == p2')
-    then group2 p1 p2 (x<>y : z)
+    then group2 p1 p2 (combineTokenTags x y : z)
 --      then (x : y : group2 p1 p2 (x<>y : z))
     else (x : group2 p1 p2 (y:z))
-group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
-group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x]
-group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
+group2 p1 p2 (x@(TokenTag _ _ Nothing _ _ _):y) = (x: group2 p1 p2 y)
+group2 _ _ [x@(TokenTag _ _ (Just _) _ _ _)] = [x]
+group2 p1 p2 (x@(TokenTag _ _ (Just _) _ _ _):y@(TokenTag _ _ Nothing _ _ _):z) = (x:y: group2 p1 p2 (y:z))
 group2 _ _ [] = []
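
A hypothetical trace of the new merging, with invented token values (offsets are character positions):

-- Hypothetical example (all values invented):
--   black = TokenTag ["black"] (fromList ["black"]) (Just JJ) Nothing 0 5
--   cat   = TokenTag ["cat"]   (fromList ["cat"])   (Just NP) Nothing 6 9
--   group2 JJ NP [black, cat]
--     == [TokenTag ["black","cat"] (fromList ["black","cat"]) (Just NP) Nothing 0 9]
-- Adjacent JJ+NP tokens collapse into one TokenTag: words and lemmas are
-- concatenated, the POS is promoted to NP, the NER tag of the first token
-- wins, and the offsets span both tokens.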
@@ -33,30 +33,37 @@ import Data.Map qualified as Map
 import Data.Set (fromList)
 import Data.Text (splitOn, pack, toLower)
 import Gargantext.Core (Lang(..))
-import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
 import Gargantext.Core.Types
 import Gargantext.Prelude hiding (ByteString, toLower)
 import Network.HTTP.Simple
 import Network.URI (URI(..))
+import Text.CoreNLP.Types qualified as CoreNLP
 -- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP

 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
-tokens2tokensTags :: [Token] -> [TokenTag]
+tokens2tokensTags :: [CoreNLP.Token] -> [TokenTag]
 tokens2tokensTags ts = filter' $ map tokenTag ts
 ------------------------------------------------------------------------
-tokenTag :: Token -> TokenTag
-tokenTag (Token { .. }) = TokenTag { _my_token_word = w'
-                                   , _my_token_lemma = l'
-                                   , _my_token_pos = _tokenPos
-                                   , _my_token_ner = _tokenNer }
+tokenTag :: CoreNLP.Token -> TokenTag
+tokenTag (CoreNLP.Token { .. }) = TokenTag { _my_token_word = w'
+                                           , _my_token_lemma = l'
+                                           , _my_token_pos = fromCoreNLPPOS pos
+                                           , _my_token_ner = fromCoreNLPNER ner }
   where
-    w' = split _tokenWord
-    l' = fromList (split _tokenLemma)
+    w' = split word
+    l' = fromList (split lemma)
     split :: Text -> [Text]
     split = splitOn " " . toLower

+fromCoreNLPPOS :: CoreNLP.PennPOS -> Maybe POS
+fromCoreNLPPOS = decode . encode
+
+fromCoreNLPNER :: CoreNLP.NamedEntity -> Maybe NER
+fromCoreNLPNER = decode . encode
+
 filter' :: [TokenTag] -> [TokenTag]
 filter' xs = filter isNgrams xs
   where
@@ -76,13 +83,41 @@ filter' xs = filter isNgrams xs
 --   },
 --

+properties :: Lang -> [(Text, Text)]
+properties EN = [ ("annotators", "tokenize,ssplit,pos,ner" ) ]
+properties FR = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
+                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
+                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger")
+                , ("tokenize.language", "fr") ]
+properties DE = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
+                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
+                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/german-hgc.tagger")
+                , ("tokenize.language", "de") ]
+properties ES = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
+                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
+                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/spanish.tagger")
+                , ("tokenize.language", "es") ]
+properties IT = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
+                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
+                -- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
+                , ("tokenize.language", "it") ]
+properties PL = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
+                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
+                -- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
+                , ("tokenize.language", "pl") ]
+properties ZH = [ ("annotators", "tokenize,pos,lemma,ner")
+                -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
+                , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/chinese-distsim.tagger")
+                , ("tokenize.language", "zh") ]
+properties l = panic $ pack $ "corenlp for language " <> show l <> " is not implemented yet"
+
 corenlp' :: ( FromJSON a
             , ConvertibleStrings p ByteString
             )
          => URI -> Lang -> p -> IO (Response a)
 corenlp' uri lang txt = do
-  req <- parseRequest $
-    "POST " <> show (uri { uriQuery = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList properties) })
+  req <- parseRequest $ "POST " <> show (uri { uriQuery = query })
   -- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
   -- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
   catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
@@ -94,41 +129,12 @@ corenlp' uri lang txt = do
         throwIO e
       JSONConversionException _req _res _err -> throwIO e
   where
-    properties_ :: [(Text, Text)]
-    properties_ = case lang of
-      -- TODO: Add: Aeson.encode $ Aeson.toJSON $ Map.fromList [()] instead of these hardcoded JSON strings
-      EN -> [ ("annotators", "tokenize,ssplit,pos,ner" ) ]
-      FR -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-            , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger")
-            , ("tokenize.language", "fr") ]
-      DE -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-            , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/german-hgc.tagger")
-            , ("tokenize.language", "de") ]
-      ES -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-            , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/spanish.tagger")
-            , ("tokenize.language", "es") ]
-      IT -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-            -- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
-            , ("tokenize.language", "it") ]
-      PL -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-            -- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
-            , ("tokenize.language", "pl") ]
-      ZH -> [ ("annotators", "tokenize,pos,lemma,ner")
-            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-            , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/chinese-distsim.tagger")
-            , ("tokenize.language", "zh") ]
-      l -> panic $ pack $ "corenlp for language " <> show l <> " is not implemented yet"
-    properties = properties_ <> [ ("outputFormat", "json") ]
+    query = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList props)
+    props = (properties lang) <> [ ("outputFormat", "json") ]

-corenlp :: URI -> Lang -> Text -> IO PosSentences
+corenlp :: URI -> Lang -> Text -> IO CoreNLP.Document
 corenlp uri lang txt = do
   response <- corenlp' uri lang txt
   pure (getResponseBody response)
@@ -141,14 +147,14 @@ corenlp uri lang txt = do
 -- Named Entity Recognition example
 -- parseWith  _tokenNer "Hello world of Peter."
 -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
-tokenWith :: URI -> (Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
-tokenWith uri f lang s = map (map (\t -> (_tokenWord t, f t)))
-                       <$> map _sentenceTokens
-                       <$> _sentences
-                       <$> corenlp uri lang s
+tokenWith :: URI -> (CoreNLP.Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
+tokenWith uri f lang s = do
+  CoreNLP.Document { sentences } <- corenlp uri lang s
+  pure (map (map (\t -> (CoreNLP.word t, f t)))
+    $ map CoreNLP.tokens sentences)

 ----------------------------------------------------------------------------------
 -- Here connect to the JohnSnow Server as it has been done above with the corenlp'
 -- We need the PosTagging according to the language and the lems
-serverNLP :: Lang -> Text -> IO PosSentences
+serverNLP :: Lang -> Text -> IO CoreNLP.Document
 serverNLP = undefined
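
For reference, a hypothetical round trip through the new corenlp-types-based API (the server address is an assumption lifted from the curl example above, and error handling is elided):

-- Hypothetical usage; localhost:9000 is an assumption:
--   Just uri <- pure $ parseURI "http://localhost:9000"   -- Network.URI
--   doc <- corenlp uri EN "Hello world of Peter."
--   -- doc :: CoreNLP.Document; the sentences/tokens/word/lemma/pos/ner
--   -- fields come from the corenlp-types library.
--   print $ map (map CoreNLP.word . CoreNLP.tokens) (CoreNLP.sentences doc)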
@@ -7,6 +7,11 @@ Maintainer : team@gargantext.org
 Stability   : experimental
 Portability : POSIX

+NOTE: This started out as a wrapper around CoreNLP types, then was
+forced onto Spacy and JohnSnowNLP. It is now deprecated: CoreNLP and
+Spacy use their own types, and while JohnSnowNLP still imports this
+module, JohnSnowNLP itself is hardly used either.
+
 -}

 {-# LANGUAGE BangPatterns #-}
@@ -46,7 +51,7 @@ data Properties = Properties { _propertiesAnnotators :: !Text
 $(deriveJSON (unPrefix "_properties") ''Properties)

-data PosSentences = PosSentences { _sentences :: [Sentence]}
+data PosSentences = PosSentences { _sentences :: [Sentence] }
   deriving (Show, Generic)

 $(deriveJSON (unPrefix "_") ''PosSentences)
@@ -22,6 +22,7 @@ module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
                              , DebugMode(..), withDebugMode
                              , Term(..), Terms(..), TermsCount, TermsWithCount
                              , TokenTag(..), POS(..), NER(..)
+                             , combineTokenTags, emptyTokenTag
                              , Label, Stems
                              , HasValidationError(..), assertValid
                              , Name
@@ -36,7 +37,6 @@ import Data.Aeson
 import Data.Aeson.TH (deriveJSON)
 import Data.Maybe
 import Data.Monoid
-import Data.Semigroup
 import Data.Set (empty)
 import Data.String
 import Data.Swagger (ToParamSchema)
@@ -150,25 +150,33 @@ instance FromJSON NER where
 instance ToJSON NER

 data TokenTag = TokenTag { _my_token_word  :: [Text]
                          , _my_token_lemma :: Set Text
                          , _my_token_pos   :: Maybe POS
                          , _my_token_ner   :: Maybe NER
+                         , _my_token_offset_begin :: Int
+                         , _my_token_offset_end   :: Int
                          } deriving (Show)

-instance Semigroup TokenTag where
-  (<>) (TokenTag w1 s1 p1 n1) (TokenTag w2 s2 p2 _) = TokenTag (w1 <> w2) (s1 <> s2) p3 n1
-    where
-      p3 = case (p1,p2) of
-        (Just JJ, Just NP) -> Just NP
-        (Just VB, Just NP) -> Just NP
-        _ -> p1
-
-instance Monoid TokenTag where
-  mempty = TokenTag [] empty Nothing Nothing
-  mconcat = foldl mappend mempty
---  mappend t1 t2 = (<>) t1 t2
+-- | NOTE: Combining 'TokenTag' doesn't make much sense to me. And
+-- lemma combining is just wrong. You can't just "cat" <> "woman" to
+-- get a "Catwoman".
+combineTokenTags :: TokenTag -> TokenTag -> TokenTag
+combineTokenTags (TokenTag w1 l1 p1 n1 s1 e1) (TokenTag w2 l2 p2 _ s2 e2) = TokenTag (w1 <> w2) (l1 <> l2) p3 n1 (minimum [s1, s2]) (maximum [e1, e2])
+  where
+    p3 = case (p1,p2) of
+      (Just JJ, Just NP) -> Just NP
+      (Just VB, Just NP) -> Just NP
+      _ -> p1
+
+emptyTokenTag :: TokenTag
+emptyTokenTag = TokenTag [] empty Nothing Nothing 0 0
+
+-- instance Monoid TokenTag where
+--   mempty = TokenTag [] empty Nothing Nothing 0 0
+--   mconcat = foldl mappend mempty
+--   -- mappend t1 t2 = (<>) t1 t2

 class HasValidationError e where
...
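
The POS promotion rule in `combineTokenTags` is easiest to see with `emptyTokenTag` (hypothetical snippets, not from the diff):

-- Hypothetical: only JJ+NP and VB+NP promote to NP; anything else keeps
-- the left tag's POS.
--   combineTokenTags (emptyTokenTag { _my_token_pos = Just VB })
--                    (emptyTokenTag { _my_token_pos = Just NP })
--   -- ==> _my_token_pos = Just NP
--   combineTokenTags (emptyTokenTag { _my_token_pos = Just NP })
--                    (emptyTokenTag { _my_token_pos = Just JJ })
--   -- ==> _my_token_pos = Just NP (left tag kept)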
@@ -20,22 +20,23 @@ module Gargantext.Utils.SpacyNLP where
 import Control.Lens
 import Data.Aeson (encode)
 import Data.Aeson.TH (deriveJSON)
+import Data.Set qualified as Set
 import Data.Text hiding (map, group, filter, concat, zip)
-import Gargantext.Core (Lang(..))
-import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
-import Gargantext.Core.Types (POS(..), NER(..))
+import Gargantext.Core.Types (POS(..), NER(..), TokenTag(..))
 import Gargantext.Core.Utils.Prefix (unPrefix)
 import Gargantext.Prelude
 import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
 import Network.URI (URI(..))

-data SpacyData = SpacyData { _spacy_data :: ![SpacyText]}
+data SpacyData = SpacyData { _spacy_data :: ![SpacyText] }
   deriving (Show)

 data SpacyText = SpacyText { _spacy_text :: !Text
                            , _spacy_tags :: ![SpacyTags]
                            } deriving (Show)
+
+-- | https://spacy.io/api/token/#attributes
 data SpacyTags =
   SpacyTags { _spacyTags_text :: !Text
             , _spacyTags_text_with_ws :: !Text
@@ -100,28 +101,24 @@ makeLenses ''SpacyTags
 makeLenses ''SpacyRequest

 ----------------------------------------------------------------
-spacyTagsToToken :: SpacyTags -> Token
-spacyTagsToToken st = Token (st ^. spacyTags_index)
-                            (st ^. spacyTags_normalized)
-                            (st ^. spacyTags_text)
-                            (st ^. spacyTags_lemma)
-                            (st ^. spacyTags_head_index)
-                            (st ^. spacyTags_char_offset)
-                            (Just $ st ^. spacyTags_pos)
-                            (Just $ st ^. spacyTags_ent_type)
-                            (Just $ st ^. spacyTags_prefix)
-                            (Just $ st ^. spacyTags_suffix)
-
-spacyDataToPosSentences :: SpacyData -> PosSentences
-spacyDataToPosSentences (SpacyData ds) = PosSentences
-  $ map (\(i, ts) -> Sentence i ts)
-  $ zip [1..]
-  $ map (\(SpacyText _ tags)-> map spacyTagsToToken tags) ds
+spacyDataToTokenTags :: SpacyData -> [[TokenTag]]
+spacyDataToTokenTags (SpacyData ds) =
+  map (\(SpacyText _ tags) -> spacyTagsToTokenTag <$> tags) ds
+
+spacyTagsToTokenTag :: SpacyTags -> TokenTag
+spacyTagsToTokenTag st = TokenTag { _my_token_word = split' (st ^. spacyTags_normalized)
+                                  , _my_token_lemma = Set.fromList (split' $ st ^. spacyTags_lemma)
+                                  , _my_token_pos = Just $ st ^. spacyTags_pos
+                                  , _my_token_ner = Just $ st ^. spacyTags_ent_type
+                                  , _my_token_offset_begin = 0 -- TODO
+                                  , _my_token_offset_end = 0 -- TODO
+                                  }
+  where
+    split' :: Text -> [Text]
+    split' = splitOn " " . Data.Text.toLower

 -----------------------------------------------------------------
-nlp :: URI -> Lang -> Text -> IO PosSentences
-nlp uri _lang txt = spacyDataToPosSentences <$> spacyRequest uri txt
--- nlp FR txt = spacyDataToPosSentences <$> spacyRequest txt
--- nlp _ _ = panic "Make sure you have the right model for your lang for spacy Server"
+nlpTokenTags :: URI -> Text -> IO [[TokenTag]]
+nlpTokenTags uri txt = spacyDataToTokenTags <$> spacyRequest uri txt
+-- nlp _ _ _ = panic "Make sure you have the right model for your lang for spacy Server"
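
And the matching Spacy side, which no longer needs a `Lang` argument (a hypothetical call; the server address is an assumption):

-- Hypothetical usage; the spacy-server address is an assumption:
--   Just uri <- pure $ parseURI "http://localhost:8001"
--   tags <- nlpTokenTags uri "hello world"   -- tags :: [[TokenTag]]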
@@ -43,6 +43,8 @@ extra-deps:
 - servant-auth/servant-auth/
 - servant-auth/servant-auth-client/
 - servant-auth/servant-auth-server/
+- git: https://gitlab.iscpif.fr/gargantext/corenlp-types
+  commit: 429117fad5f8414d8eaa85b5f3c474c9d003cfeb
 - git: https://gitlab.iscpif.fr/gargantext/servant-xml-conduit.git
   commit: 339fd608341bd2652cf5c0e9e76a3293acffbea6
 - git: https://github.com/alpmestan/ekg-json.git
...