Commit d3ca8202 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] NLP using Spacy Server

parent 3a8af37d
......@@ -319,6 +319,7 @@ library
Gargantext.Utils.Aeson
Gargantext.Utils.JohnSnowNLP
Gargantext.Utils.Servant
Gargantext.Utils.SpacyNLP
Gargantext.Utils.UTCTime
Paths_gargantext
hs-source-dirs:
......
......@@ -75,6 +75,7 @@ library:
- Gargantext.Core.Types.Individu
- Gargantext.Core.Types.Main
- Gargantext.Core.Utils.Prefix
- Gargantext.Utils.SpacyNLP
- Gargantext.Database.Action.Flow
- Gargantext.Database.Action.Flow.Types
- Gargantext.Database.Action.User.New
......
......@@ -28,7 +28,8 @@ import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
import qualified Gargantext.Utils.JohnSnowNLP as JohnSnow
-- import qualified Gargantext.Utils.JohnSnowNLP as JohnSnow
import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
-------------------------------------------------------------------
......@@ -51,7 +52,7 @@ tokenTag2terms (TokenTag ws t _ _) = Terms ws t
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags EN txt = tokenTagsWith EN txt corenlp
tokenTags FR txt = tokenTagsWith FR txt JohnSnow.nlp
tokenTags FR txt = tokenTagsWith FR txt SpacyNLP.nlp
tokenTags _ _ = panic "[G.C.T.T.Multi] NLP API not implemented yet"
tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
......
......@@ -9,6 +9,7 @@ Portability : POSIX
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
......@@ -22,27 +23,27 @@ import Gargantext.Prelude
import GHC.Generics
data Token = Token { _tokenIndex :: Int
, _tokenWord :: Text
, _tokenOriginalText :: Text
, _tokenLemma :: Text
, _tokenCharacterOffsetBegin :: Int
, _tokenCharacterOffsetEnd :: Int
, _tokenPos :: Maybe POS
, _tokenNer :: Maybe NER
, _tokenBefore :: Maybe Text
, _tokenAfter :: Maybe Text
data Token = Token { _tokenIndex :: !Int
, _tokenWord :: !Text
, _tokenOriginalText :: !Text
, _tokenLemma :: !Text
, _tokenCharacterOffsetBegin :: !Int
, _tokenCharacterOffsetEnd :: !Int
, _tokenPos :: !(Maybe POS)
, _tokenNer :: !(Maybe NER)
, _tokenBefore :: !(Maybe Text)
, _tokenAfter :: !(Maybe Text)
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)
data Sentence = Sentence { _sentenceIndex :: Int
, _sentenceTokens :: [Token]
data Sentence = Sentence { _sentenceIndex :: !Int
, _sentenceTokens :: ![Token]
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_sentence") ''Sentence)
data Properties = Properties { _propertiesAnnotators :: Text
, _propertiesOutputFormat :: Text
data Properties = Properties { _propertiesAnnotators :: !Text
, _propertiesOutputFormat :: !Text
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_properties") ''Properties)
......
......@@ -126,7 +126,7 @@ instance FromJSON POS where
instance ToJSON POS
instance Hashable POS
------------------------------------------------------------------------
data NER = PERSON | ORGANIZATION | LOCATION | NoNER
data NER = PERSON | ORGANIZATION | LOCATION | NoNER { noNer :: !Text }
deriving (Show, Generic)
------------------------------------------------------------------------
instance FromJSON NER where
......@@ -134,9 +134,11 @@ instance FromJSON NER where
where
ner :: [Char] -> NER
ner "PERSON" = PERSON
ner "PER" = PERSON
ner "ORGANIZATION" = ORGANIZATION
ner "LOCATION" = LOCATION
ner _ = NoNER
ner "LOC" = LOCATION
ner x = NoNER (cs x)
instance ToJSON NER
......
{-|
Module : Gargantext.Utils.SpacyNLP
Description : John Snow NLP API connexion
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Spacy ecosystem: https://github.com/explosion/spaCy
Server to be used: https://gitlab.iscpif.fr/gargantext/spacy-server
-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Utils.SpacyNLP where
import Control.Lens
import Data.Aeson (encode)
import Data.Aeson.TH (deriveJSON)
import Data.Text hiding (map, group, filter, concat, zip)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Types (POS(..), NER(..))
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
data SpacyData = SpacyData { _spacy_data :: ![SpacyText]}
deriving (Show)
data SpacyText = SpacyText { _spacy_text :: !Text
, _spacy_tags :: ![SpacyTags]
} deriving (Show)
data SpacyTags =
SpacyTags { _spacyTags_text :: !Text
, _spacyTags_text_with_ws :: !Text
, _spacyTags_whitespace :: !Text
, _spacyTags_head :: !Text
, _spacyTags_head_index :: !Int
, _spacyTags_left_edge :: !Text
, _spacyTags_right_edge :: !Text
, _spacyTags_index :: Int
, _spacyTags_ent_type :: !NER
, _spacyTags_ent_iob :: !Text
, _spacyTags_lemma :: !Text
, _spacyTags_normalized :: !Text
, _spacyTags_shape :: !Text
, _spacyTags_prefix :: !Text
, _spacyTags_suffix :: !Text
, _spacyTags_is_alpha :: Bool
, _spacyTags_is_ascii :: Bool
, _spacyTags_is_digit :: Bool
, _spacyTags_is_title :: Bool
, _spacyTags_is_punct :: Bool
, _spacyTags_is_left_punct :: Bool
, _spacyTags_is_right_punct :: Bool
, _spacyTags_is_space :: Bool
, _spacyTags_is_bracket :: Bool
, _spacyTags_is_quote :: Bool
, _spacyTags_is_currency :: Bool
, _spacyTags_like_url :: Bool
, _spacyTags_like_num :: Bool
, _spacyTags_like_email :: Bool
, _spacyTags_is_oov :: Bool
, _spacyTags_is_stop :: Bool
, _spacyTags_pos :: POS
, _spacyTags_tag :: POS
, _spacyTags_dep :: !Text
, _spacyTags_lang :: !Text
, _spacyTags_prob :: !Int
, _spacyTags_char_offset :: !Int
} deriving (Show)
data SpacyRequest = SpacyRequest { _spacyRequest_text :: !Text }
deriving (Show)
spacyRequest :: Text -> IO SpacyData
spacyRequest txt = do
url <- parseRequest $ unpack "POST http://localhost:8001/pos"
let request = setRequestBodyLBS (encode $ SpacyRequest txt) url
result <- httpJSON request :: IO (Response SpacyData)
pure $ getResponseBody result
-- Instances
deriveJSON (unPrefix "_spacy_") ''SpacyData
deriveJSON (unPrefix "_spacy_") ''SpacyText
deriveJSON (unPrefix "_spacyTags_") ''SpacyTags
deriveJSON (unPrefix "_spacyRequest_") ''SpacyRequest
makeLenses ''SpacyData
makeLenses ''SpacyText
makeLenses ''SpacyTags
makeLenses ''SpacyRequest
----------------------------------------------------------------
spacyTagsToToken :: SpacyTags -> Token
spacyTagsToToken st = Token (st ^. spacyTags_index)
(st ^. spacyTags_normalized)
(st ^. spacyTags_text)
(st ^. spacyTags_lemma)
(st ^. spacyTags_head_index)
(st ^. spacyTags_char_offset)
(Just $ st ^. spacyTags_pos)
(Just $ st ^. spacyTags_ent_type)
(Just $ st ^. spacyTags_prefix)
(Just $ st ^. spacyTags_suffix)
spacyDataToPosSentences :: SpacyData -> PosSentences
spacyDataToPosSentences (SpacyData ds) = PosSentences
$ map (\(i, ts) -> Sentence i ts)
$ zip [1..]
$ map (\(SpacyText _ tags)-> map spacyTagsToToken tags) ds
-----------------------------------------------------------------
nlp :: Lang -> Text -> IO PosSentences
nlp FR txt = spacyDataToPosSentences <$> spacyRequest txt
nlp _ _ = panic "Make sure you have the right model for your lang for spacy Server"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment