[nlp] add sample support for languages to corenlp

parent 29aee119
...@@ -5,7 +5,7 @@ cabal-version: 1.12 ...@@ -5,7 +5,7 @@ cabal-version: 1.12
-- see: https://github.com/sol/hpack -- see: https://github.com/sol/hpack
name: gargantext name: gargantext
version: 0.0.6.9.8.6.2 version: 0.0.6.9.8.6.2
synopsis: Search, map, share synopsis: Search, map, share
description: Please see README.md description: Please see README.md
category: Data category: Data
......
...@@ -28,13 +28,13 @@ import Servant.API ...@@ -28,13 +28,13 @@ import Servant.API
-- | Language of a Text -- | Language of a Text
-- For simplicity, we suppose text has a homogeneous language -- For simplicity, we suppose text has a homogeneous language
-- --
-- Next steps: | DE | IT | SP
--
-- - EN == english -- - EN == english
-- - FR == french -- - FR == french
-- - DE == deutsch (not implemented yet) -- - DE == deutsch
-- - IT == italian (not implemented yet) -- - IT == italian
-- - SP == spanish (not implemented yet) -- - ES == spanish
-- - PL == polish
-- - CN == chinese
-- --
-- ... add your language and help us to implement it (: -- ... add your language and help us to implement it (:
......
...@@ -27,6 +27,8 @@ module Gargantext.Core.Text.Terms.Multi.PosTagging ...@@ -27,6 +27,8 @@ module Gargantext.Core.Text.Terms.Multi.PosTagging
import Data.Aeson import Data.Aeson
import Data.ByteString.Lazy.Internal (ByteString) import Data.ByteString.Lazy.Internal (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BSL
import qualified Data.Map as Map
import Data.Set (fromList) import Data.Set (fromList)
import Data.Text (Text, splitOn, pack, toLower) import Data.Text (Text, splitOn, pack, toLower)
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
-- | Low-level call to a CoreNLP server.
--
-- Builds a POST request whose @properties@ query parameter is a JSON
-- object selecting the annotators (and, per language, the POS-tagger
-- model) and requesting JSON output, then sends @txt@ as the request
-- body and decodes the JSON response.
--
-- Equivalent to:
--
-- > curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
--
-- Calls 'panic' for languages that have no configuration below.
corenlp' :: ( FromJSON a
            , ConvertibleStrings p ByteString
            )
         => URI -> Lang -> p -> IO (Response a)
corenlp' uri lang txt = do
  -- NOTE(review): the encoded JSON is spliced into the query string
  -- without percent-encoding; parseRequest appears to tolerate it, but
  -- confirm special characters survive the round-trip to the server.
  req <- parseRequest $
    "POST " <> show (uri { uriQuery = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList properties) })
  httpJSON $ setRequestBodyLBS (cs txt) req
  where
    -- Per-language annotator pipeline and, where needed, the POS model.
    properties_ :: [(Text, Text)]
    properties_ = case lang of
      EN -> [ ("annotators", "tokenize,ssplit,pos,ner" ) ]
      FR -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
            -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
            , ("pos.model", "edu/stanford/nlp/models/pos-tagger/french/french.tagger")
            , ("tokenize.language", "fr") ]
      DE -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
            -- Fixed model path: the German tagger lives under german/, not french/.
            , ("pos.model", "edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger")
            , ("tokenize.language", "de") ]
      ES -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
            -- Fixed model path: the Spanish tagger lives under spanish/, not french/.
            , ("pos.model", "edu/stanford/nlp/models/pos-tagger/spanish/spanish.tagger")
            , ("tokenize.language", "es") ]
      IT -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
            -- No Italian POS model ships with CoreNLP; rely on the default.
            , ("tokenize.language", "it") ]
      PL -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
            -- No Polish POS model ships with CoreNLP; rely on the default.
            , ("tokenize.language", "pl") ]
      CN -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
            -- Fixed model path: the Chinese tagger lives under chinese-distsim/, not french/.
            , ("pos.model", "edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger")
            , ("tokenize.language", "zh") ]
      l  -> panic $ pack $ "corenlp for language " <> show l <> " is not implemented yet"
    -- All languages share the JSON output format.
    properties = properties_ <> [ ("outputFormat", "json") ]
corenlp :: URI -> Lang -> Text -> IO PosSentences corenlp :: URI -> Lang -> Text -> IO PosSentences
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment