Commit 7c074fc8 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'origin/fix/386' into dev

parents 294ed193 e53d4b86
......@@ -11,10 +11,10 @@ Multi-terms are ngrams where n > 1.
-}
module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
module Gargantext.Core.Text.Terms.Multi (multiterms, Terms(..), tokenTag2terms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
where
import Data.Attoparsec.Text as DAT ( digit, space, notChar, string )
import Data.Attoparsec.Text as DAT (space, notChar, string )
import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En
import Gargantext.Core.Text.Terms.Multi.Lang.Fr qualified as Fr
......@@ -82,12 +82,10 @@ groupTokens _ = Fr.groupTokens
-- TODO: make tests here
cleanTextForNLP :: Text -> Text
cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls
cleanTextForNLP = unifySpaces . removeUrls
where
remove x = RAT.streamEdit x (const "")
unifySpaces = RAT.streamEdit (many DAT.space) (const " ")
removeDigitsWith x = remove (many DAT.digit *> DAT.string x <* many DAT.digit)
removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)
......@@ -82,7 +82,7 @@ corenlp' :: ( FromJSON a
=> URI -> Lang -> p -> IO (Response a)
corenlp' uri lang txt = do
req <- parseRequest $
"POST " <> show (uri { uriQuery = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList properties) })
"POST " <> show (uri { uriQuery = "?properties=" <> BSL.unpack (encode $ toJSON $ Map.fromList properties) })
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
-- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
......@@ -97,7 +97,7 @@ corenlp' uri lang txt = do
properties_ :: [(Text, Text)]
properties_ = case lang of
-- TODO: Add: Aeson.encode $ Aeson.toJSON $ Map.fromList [()] instead of these hardcoded JSON strings
EN -> [ ("annotators", "tokenize,ssplit,pos,ner" ) ]
EN -> [ ("annotators", "tokenize,ssplit,pos,ner" ), ("tokenize.options", "splitHyphenated=false") ]
FR -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
, ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment