[ngrams] use the corenlp-types library

Now we no longer keep custom CoreNLP types in our own code, which makes
it more portable. Spacy is also refactored to use its own types, and so
is JohnSnowNLP.
parent bdeed282
@@ -11,7 +11,7 @@ STORE_DIR="${1:-$DEFAULT_STORE}"
# `expected_cabal_project_freeze_hash` with the
# `sha256sum` result calculated on the `cabal.project` and `cabal.project.freeze`.
# This ensures the files stay deterministic so that CI cache can kick in.
expected_cabal_project_hash="f2efe6832145c093cfe16832b17b06cd4e2d94e85dd0390e713c46c40ee9e461"
expected_cabal_project_hash="eb74b26c841d56bfeb59506b95ebb964f90d9255c64e48580f56dfa57f3d4a6c"
expected_cabal_project_freeze_hash="796f0109611f3381278b1885ae1fa257c4177b99885eb04701938f1107c06ee5"
cabal --store-dir=$STORE_DIR v2-update 'hackage.haskell.org,2023-11-23T20:05:40Z'
......
-- Generated by stack2cabal
index-state: 2023-12-04T09:05:40Z
index-state: 2023-11-23T20:05:40Z
with-compiler: ghc-8.10.7
@@ -76,6 +76,11 @@ source-repository-package
location: https://gitlab.iscpif.fr/cgenie/patches-class.git
tag: 125c7cb90ab8f0cd6ac4a526dbdf302d10c945e9
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/corenlp-types
tag: 429117fad5f8414d8eaa85b5f3c474c9d003cfeb
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/arxiv-api.git
@@ -141,6 +146,11 @@ source-repository-package
location: https://gitlab.iscpif.fr/gargantext/iso639.git
tag: eab929d106833ded8011a0d6705135e3fc506a9c
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/servant-xml-conduit.git
tag: 339fd608341bd2652cf5c0e9e76a3293acffbea6
source-repository-package
type: git
location: https://github.com/haskell-servant/servant.git
......
@@ -273,7 +273,7 @@ library
Gargantext.Core.Text.Terms.Mono.Token
Gargantext.Core.Text.Terms.Mono.Token.En
Gargantext.Core.Text.Terms.Multi.Group
Gargantext.Core.Text.Terms.Multi.PosTagging
Gargantext.Core.Text.Terms.Multi.CoreNLP
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Gargantext.Core.Text.Upload
Gargantext.Core.Types.Search
@@ -426,6 +426,7 @@ library
, conduit-extra ^>= 1.3.5
, containers ^>= 0.6.5.1
, contravariant ^>= 1.5.5
, corenlp-types ^>= 0.1.0.0
, crawlerArxiv
, crawlerHAL
, crawlerISTEX
......
@@ -13,7 +13,11 @@ Multi-terms are ngrams where n > 1.
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
module Gargantext.Core.Text.Terms.Multi
( multiterms
, multiterms_rake
, tokenTags
, cleanTextForNLP )
where
import Control.Applicative
@@ -22,8 +26,7 @@ import Data.Text hiding (map, group, filter, concat)
import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En
import Gargantext.Core.Text.Terms.Multi.Lang.Fr qualified as Fr
import Gargantext.Core.Text.Terms.Multi.PosTagging
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Text.Terms.Multi.CoreNLP qualified as CoreNLP
import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
import Gargantext.Core.Types
import Gargantext.Core.Utils (groupWithCounts)
@@ -31,9 +34,6 @@ import Gargantext.Prelude
import Gargantext.Utils.SpacyNLP qualified as SpacyNLP
import Replace.Attoparsec.Text as RAT
-------------------------------------------------------------------
type NLP_API = Lang -> Text -> IO PosSentences
-------------------------------------------------------------------
multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
multiterms nsc l txt = do
@@ -54,14 +54,20 @@ multiterms nsc l txt = do
-------------------------------------------------------------------
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms (TokenTag ws t _ _) = Terms ws t
tokenTag2terms (TokenTag ws t _ _ _ _) = Terms ws t
tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = tokenTagsWith EN txt $ corenlp url
tokenTags (NLPServerConfig { server = CoreNLP, url }) FR txt = tokenTagsWith FR txt $ corenlp url
tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = do
tt <- CoreNLP.coreNLPTokenTags url EN txt
pure $ map (groupTokens EN) tt
tokenTags (NLPServerConfig { server = CoreNLP, url }) FR txt = do
tt <- CoreNLP.coreNLPTokenTags url FR txt
pure $ map (groupTokens FR) tt
tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
-- printDebug "NLP Debug" txt
tokenTagsWith l txt $ SpacyNLP.nlp url
tt <- SpacyNLP.nlpTokenTags url txt
pure $ map (groupTokens l) tt
-- tokenTagsWith l txt $ SpacyNLP.nlp url
-- tokenTags FR txt = do
-- -- printDebug "[Spacy Debug]" txt
-- if txt == ""
@@ -69,12 +75,6 @@ tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
-- else tokenTagsWith FR txt SpacyNLP.nlp
tokenTags _ l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (show l)
tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
tokenTagsWith lang txt nlp = map (groupTokens lang)
<$> map tokens2tokensTags
<$> map _sentenceTokens
<$> _sentences
<$> nlp lang txt
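For illustration, a hedged usage sketch of the new `tokenTags` dispatch (the URL and input text are invented; assumes the project modules above are in scope):

```haskell
-- Hypothetical usage only; parseURI comes from Network.URI.
exampleTokenTags :: IO ()
exampleTokenTags = do
  Just uri <- pure $ parseURI "http://localhost:9000"
  let cfg = NLPServerConfig { server = CoreNLP, url = uri }
  tags <- tokenTags cfg EN "Hello world of Peter."
  mapM_ print tags  -- one [TokenTag] per sentence, already grouped
```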
---- | This function analyses and groups (or not) ngrams according to
......
@@ -21,12 +21,12 @@ import Gargantext.Prelude
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [TokenTag] -> [TokenTag]
group2 p1 p2 (x@(TokenTag _ _ (Just p1') _):y@(TokenTag _ _ (Just p2') _):z) =
group2 p1 p2 (x@(TokenTag _ _ (Just p1') _ _ _):y@(TokenTag _ _ (Just p2') _ _ _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
then group2 p1 p2 (combineTokenTags x y : z)
-- then (x : y : group2 p1 p2 (x<>y : z))
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 p1 p2 (x@(TokenTag _ _ Nothing _ _ _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(TokenTag _ _ (Just _) _ _ _)] = [x]
group2 p1 p2 (x@(TokenTag _ _ (Just _) _ _ _):y@(TokenTag _ _ Nothing _ _ _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
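As a standalone model of what `group2` does, here is a minimal sketch with invented, simplified types: it merges each adjacent pair whose tags match `p1` then `p2`, and recurses on the merged element so longer runs collapse left to right:

```haskell
-- Simplified stand-in for TokenTag: some words plus an optional tag.
data Tok = Tok { tokWords :: [String], tokPos :: Maybe String }
  deriving (Show)

-- Keep the left-hand tag when merging, roughly like combineTokenTags.
merge :: Tok -> Tok -> Tok
merge a b = Tok (tokWords a ++ tokWords b) (tokPos a)

group2' :: String -> String -> [Tok] -> [Tok]
group2' p1 p2 (x : y : z)
  | tokPos x == Just p1 && tokPos y == Just p2 = group2' p1 p2 (merge x y : z)
  | otherwise                                  = x : group2' p1 p2 (y : z)
group2' _ _ xs = xs

-- ghci> group2' "JJ" "NN" [Tok ["big"] (Just "JJ"), Tok ["cat"] (Just "NN")]
-- [Tok {tokWords = ["big","cat"], tokPos = Just "JJ"}]
```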
@@ -33,30 +33,37 @@ import Data.Map qualified as Map
import Data.Set (fromList)
import Data.Text (splitOn, pack, toLower)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Types
import Gargantext.Prelude hiding (ByteString, toLower)
import Network.HTTP.Simple
import Network.URI (URI(..))
import Text.CoreNLP.Types qualified as CoreNLP
-- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
------------------------------------------------------------------------
------------------------------------------------------------------------
tokens2tokensTags :: [Token] -> [TokenTag]
tokens2tokensTags :: [CoreNLP.Token] -> [TokenTag]
tokens2tokensTags ts = filter' $ map tokenTag ts
------------------------------------------------------------------------
tokenTag :: Token -> TokenTag
tokenTag (Token { .. }) = TokenTag { _my_token_word = w'
tokenTag :: CoreNLP.Token -> TokenTag
tokenTag (CoreNLP.Token { .. }) = TokenTag { _my_token_word = w'
, _my_token_lemma = l'
, _my_token_pos = _tokenPos
, _my_token_ner = _tokenNer }
, _my_token_pos = fromCoreNLPPOS pos
, _my_token_ner = fromCoreNLPNER ner }
where
w' = split _tokenWord
l' = fromList (split _tokenLemma)
w' = split word
l' = fromList (split lemma)
split :: Text -> [Text]
split = splitOn " " . toLower
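-- e.g. split "New York" == ["new","york"]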
fromCoreNLPPOS :: CoreNLP.PennPOS -> Maybe POS
fromCoreNLPPOS = decode . encode
fromCoreNLPNER :: CoreNLP.NamedEntity -> Maybe NER
fromCoreNLPNER = decode . encode
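The `decode . encode` trick presumably relies on the two tag types sharing their JSON tag strings: a tag that exists on both sides round-trips to `Just`, anything else to `Nothing`. A self-contained sketch of the same pattern, with types invented for the demo:

```haskell
{-# LANGUAGE DeriveGeneric #-}
import Data.Aeson (FromJSON (..), ToJSON (..), decode, encode)
import Data.Aeson.Types (Options (..), defaultOptions, genericParseJSON, genericToJSON)
import GHC.Generics (Generic)

-- "Their" tag set (a superset).
data Penn = Penn_NN | Penn_VB | Penn_FW deriving (Show, Generic)
instance ToJSON Penn where
  toJSON = genericToJSON defaultOptions { constructorTagModifier = drop 5 }

-- "Our" tag set: the overlap uses the same JSON tag strings.
data Pos = Pos_NN | Pos_VB deriving (Show, Generic)
instance FromJSON Pos where
  parseJSON = genericParseJSON defaultOptions { constructorTagModifier = drop 4 }

fromPenn :: Penn -> Maybe Pos
fromPenn = decode . encode

main :: IO ()
main = do
  print (fromPenn Penn_NN)  -- Just Pos_NN
  print (fromPenn Penn_FW)  -- Nothing: FW has no counterpart on our side
```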
filter' :: [TokenTag] -> [TokenTag]
filter' xs = filter isNgrams xs
where
@@ -76,59 +83,58 @@ filter' xs = filter isNgrams xs
-- },
--
corenlp' :: ( FromJSON a
, ConvertibleStrings p ByteString
)
=> URI -> Lang -> p -> IO (Response a)
corenlp' uri lang txt = do
req <- parseRequest $
"POST " <> show (uri { uriQuery = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList properties) })
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
-- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
case e of
JSONParseException _req res _err -> do
let body = getResponseBody res
printDebug "[corenlp'] request text" (cs txt :: ByteString)
printDebug "[corenlp'] response body (error)" body
throwIO e
JSONConversionException _req _res _err -> throwIO e
where
properties_ :: [(Text, Text)]
properties_ = case lang of
-- TODO: Add: Aeson.encode $ Aeson.toJSON $ Map.fromList [()] instead of these hardcoded JSON strings
EN -> [ ("annotators", "tokenize,ssplit,pos,ner" ) ]
FR -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
properties :: Lang -> [(Text, Text)]
properties EN = [ ("annotators", "tokenize,ssplit,pos,ner" ) ]
properties FR = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
, ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger")
, ("tokenize.language", "fr") ]
DE -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
properties DE = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
, ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/german-hgc.tagger")
, ("tokenize.language", "de") ]
ES -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
properties ES = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
, ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/spanish.tagger")
, ("tokenize.language", "es") ]
IT -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
properties IT = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
, ("tokenize.language", "it") ]
PL -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
properties PL = [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
-- , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
, ("tokenize.language", "pl") ]
ZH -> [ ("annotators", "tokenize,pos,lemma,ner")
properties ZH = [ ("annotators", "tokenize,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
, ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/chinese-distsim.tagger")
, ("tokenize.language", "zh") ]
l -> panic $ pack $ "corenlp for language " <> show l <> " is not implemented yet"
properties l = panic $ pack $ "corenlp for language " <> show l <> " is not implemented yet"
properties = properties_ <> [ ("outputFormat", "json") ]
corenlp' :: ( FromJSON a
, ConvertibleStrings p ByteString
)
=> URI -> Lang -> p -> IO (Response a)
corenlp' uri lang txt = do
req <- parseRequest $ "POST " <> show (uri { uriQuery = query })
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
-- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
case e of
JSONParseException _req res _err -> do
let body = getResponseBody res
printDebug "[corenlp'] request text" (cs txt :: ByteString)
printDebug "[corenlp'] response body (error)" body
throwIO e
JSONConversionException _req _res _err -> throwIO e
where
query = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList props)
props = (properties lang) <> [ ("outputFormat", "json") ]
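The query is the same URL-embedded JSON shown by the curl line above; a standalone sketch of how `props` turns into `?properties=...` (a real request would also percent-encode it, as curl shows):

```haskell
{-# LANGUAGE OverloadedStrings #-}
import Data.Aeson (encode, toJSON)
import qualified Data.ByteString.Lazy.Char8 as BSL
import qualified Data.Map as Map
import Data.Text (Text)

main :: IO ()
main = do
  let props = [("annotators", "tokenize,ssplit,pos,ner"), ("outputFormat", "json")] :: [(Text, Text)]
  putStrLn $ "?properties=" <> BSL.unpack (encode $ toJSON $ Map.fromList props)
-- prints (modulo key order):
-- ?properties={"annotators":"tokenize,ssplit,pos,ner","outputFormat":"json"}
```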
corenlp :: URI -> Lang -> Text -> IO PosSentences
corenlp :: URI -> Lang -> Text -> IO CoreNLP.Document
corenlp uri lang txt = do
response <- corenlp' uri lang txt
pure (getResponseBody response)
@@ -141,14 +147,14 @@ corenlp uri lang txt = do
-- Named Entity Recognition example
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: URI -> (Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
tokenWith uri f lang s = map (map (\t -> (_tokenWord t, f t)))
<$> map _sentenceTokens
<$> _sentences
<$> corenlp uri lang s
tokenWith :: URI -> (CoreNLP.Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
tokenWith uri f lang s = do
CoreNLP.Document { sentences } <- corenlp uri lang s
pure (map (map (\t -> (CoreNLP.word t, f t)))
$ map CoreNLP.tokens sentences)
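-- Post-refactor, the `parseWith _tokenNer` example above would presumably
-- use the new field accessor instead (hypothetical, but CoreNLP.ner is the
-- field read in tokenTag):
-- tokenWith uri CoreNLP.ner EN "Hello world of Peter."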
----------------------------------------------------------------------------------
-- Here, connect to the JohnSnow server as was done above with corenlp'.
-- We need the PoS tagging according to the language, plus the lemmas.
serverNLP :: Lang -> Text -> IO PosSentences
serverNLP :: Lang -> Text -> IO CoreNLP.Document
serverNLP = undefined
@@ -7,6 +7,11 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
NOTE: This started out as a wrapper around CoreNLP types, and was then
forced onto Spacy and JohnSnowNLP as well. It is now deprecated: CoreNLP
and Spacy use their own types; JohnSnowNLP still uses this module, but
JohnSnowNLP itself is hardly used either.
-}
{-# LANGUAGE BangPatterns #-}
@@ -46,7 +51,7 @@ data Properties = Properties { _propertiesAnnotators :: !Text
$(deriveJSON (unPrefix "_properties") ''Properties)
data PosSentences = PosSentences { _sentences :: [Sentence]}
data PosSentences = PosSentences { _sentences :: [Sentence] }
deriving (Show, Generic)
$(deriveJSON (unPrefix "_") ''PosSentences)
@@ -22,6 +22,7 @@ module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, DebugMode(..), withDebugMode
, Term(..), Terms(..), TermsCount, TermsWithCount
, TokenTag(..), POS(..), NER(..)
, combineTokenTags, emptyTokenTag
, Label, Stems
, HasValidationError(..), assertValid
, Name
@@ -36,7 +37,6 @@ import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Maybe
import Data.Monoid
import Data.Semigroup
import Data.Set (empty)
import Data.String
import Data.Swagger (ToParamSchema)
@@ -154,10 +154,15 @@ data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_lemma :: Set Text
, _my_token_pos :: Maybe POS
, _my_token_ner :: Maybe NER
, _my_token_offset_begin :: Int
, _my_token_offset_end :: Int
} deriving (Show)
instance Semigroup TokenTag where
(<>) (TokenTag w1 s1 p1 n1) (TokenTag w2 s2 p2 _) = TokenTag (w1 <> w2) (s1 <> s2) p3 n1
-- | NOTE: Combining 'TokenTag' doesn't make much sense to me, and
-- combining lemmas is just wrong: you can't append "cat" <> "woman"
-- and get "Catwoman".
combineTokenTags :: TokenTag -> TokenTag -> TokenTag
combineTokenTags (TokenTag w1 l1 p1 n1 s1 e1) (TokenTag w2 l2 p2 _ s2 e2) = TokenTag (w1 <> w2) (l1 <> l2) p3 n1 (minimum [s1, s2]) (maximum [e1, e2])
where
p3 = case (p1,p2) of
(Just JJ, Just NP) -> Just NP
@@ -165,10 +170,13 @@ instance Semigroup TokenTag where
_ -> p1
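-- e.g. merging an adjective into a following proper noun keeps NP
-- ("big" JJ + "Apple" NP ==> "big Apple" NP), while the offsets widen
-- to span both tokens (minimum begin, maximum end).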
instance Monoid TokenTag where
mempty = TokenTag [] empty Nothing Nothing
mconcat = foldl mappend mempty
-- mappend t1 t2 = (<>) t1 t2
emptyTokenTag :: TokenTag
emptyTokenTag = TokenTag [] empty Nothing Nothing 0 0
-- instance Monoid TokenTag where
-- mempty = TokenTag [] empty Nothing Nothing 0 0
-- mconcat = foldl mappend mempty
-- -- mappend t1 t2 = (<>) t1 t2
class HasValidationError e where
......
@@ -20,22 +20,23 @@ module Gargantext.Utils.SpacyNLP where
import Control.Lens
import Data.Aeson (encode)
import Data.Aeson.TH (deriveJSON)
import Data.Set qualified as Set
import Data.Text hiding (map, group, filter, concat, zip)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Types (POS(..), NER(..))
import Gargantext.Core.Types (POS(..), NER(..), TokenTag(..))
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
import Network.URI (URI(..))
data SpacyData = SpacyData { _spacy_data :: ![SpacyText]}
data SpacyData = SpacyData { _spacy_data :: ![SpacyText] }
deriving (Show)
data SpacyText = SpacyText { _spacy_text :: !Text
, _spacy_tags :: ![SpacyTags]
} deriving (Show)
-- | https://spacy.io/api/token/#attributes
data SpacyTags =
SpacyTags { _spacyTags_text :: !Text
, _spacyTags_text_with_ws :: !Text
@@ -100,28 +101,24 @@ makeLenses ''SpacyTags
makeLenses ''SpacyRequest
----------------------------------------------------------------
spacyTagsToToken :: SpacyTags -> Token
spacyTagsToToken st = Token (st ^. spacyTags_index)
(st ^. spacyTags_normalized)
(st ^. spacyTags_text)
(st ^. spacyTags_lemma)
(st ^. spacyTags_head_index)
(st ^. spacyTags_char_offset)
(Just $ st ^. spacyTags_pos)
(Just $ st ^. spacyTags_ent_type)
(Just $ st ^. spacyTags_prefix)
(Just $ st ^. spacyTags_suffix)
spacyDataToPosSentences :: SpacyData -> PosSentences
spacyDataToPosSentences (SpacyData ds) = PosSentences
$ map (\(i, ts) -> Sentence i ts)
$ zip [1..]
$ map (\(SpacyText _ tags)-> map spacyTagsToToken tags) ds
spacyDataToTokenTags :: SpacyData -> [[TokenTag]]
spacyDataToTokenTags (SpacyData ds) =
map (\(SpacyText _ tags) -> spacyTagsToTokenTag <$> tags) ds
spacyTagsToTokenTag :: SpacyTags -> TokenTag
spacyTagsToTokenTag st = TokenTag { _my_token_word = split' (st ^. spacyTags_normalized)
, _my_token_lemma = Set.fromList (split' $ st ^. spacyTags_lemma)
, _my_token_pos = Just $ st ^. spacyTags_pos
, _my_token_ner = Just $ st ^. spacyTags_ent_type
, _my_token_offset_begin = 0 -- TODO
, _my_token_offset_end = 0 -- TODO
}
where
split' :: Text -> [Text]
split' = splitOn " " . Data.Text.toLower
-----------------------------------------------------------------
nlp :: URI -> Lang -> Text -> IO PosSentences
nlp uri _lang txt = spacyDataToPosSentences <$> spacyRequest uri txt
-- nlp _ _ _ = panic "Make sure you have the right model for your lang for spacy Server"
-- nlp FR txt = spacyDataToPosSentences <$> spacyRequest txt
-- nlp _ _ = panic "Make sure you have the right model for your lang for spacy Server"
nlpTokenTags :: URI -> Text -> IO [[TokenTag]]
nlpTokenTags uri txt = spacyDataToTokenTags <$> spacyRequest uri txt
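A hedged usage sketch (the endpoint path and port are invented; `spacyRequest` is assumed to POST the text to the given URI):

```haskell
-- Hypothetical usage only; parseURI comes from Network.URI.
exampleSpacy :: IO ()
exampleSpacy = do
  Just uri <- pure $ parseURI "http://localhost:8001/pos"  -- invented endpoint
  tts <- nlpTokenTags uri "Bears like honey."
  mapM_ print tts  -- one [TokenTag] per SpacyText block
```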
@@ -43,6 +43,8 @@ extra-deps:
- servant-auth/servant-auth/
- servant-auth/servant-auth-client/
- servant-auth/servant-auth-server/
- git: https://gitlab.iscpif.fr/gargantext/corenlp-types
commit: 429117fad5f8414d8eaa85b5f3c474c9d003cfeb
- git: https://gitlab.iscpif.fr/gargantext/servant-xml-conduit.git
commit: 339fd608341bd2652cf5c0e9e76a3293acffbea6
- git: https://github.com/alpmestan/ekg-json.git
......