Commit 7c074fc8 authored by Alexandre Delanoë

Merge remote-tracking branch 'origin/fix/386' into dev

parents 294ed193 e53d4b86
...@@ -11,10 +11,10 @@ Multi-terms are ngrams where n > 1. ...@@ -11,10 +11,10 @@ Multi-terms are ngrams where n > 1.
-} -}
module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP) module Gargantext.Core.Text.Terms.Multi (multiterms, Terms(..), tokenTag2terms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
where where
import Data.Attoparsec.Text as DAT ( digit, space, notChar, string ) import Data.Attoparsec.Text as DAT (space, notChar, string )
import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..)) import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En
import Gargantext.Core.Text.Terms.Multi.Lang.Fr qualified as Fr import Gargantext.Core.Text.Terms.Multi.Lang.Fr qualified as Fr
...@@ -82,12 +82,10 @@ groupTokens _ = Fr.groupTokens ...@@ -82,12 +82,10 @@ groupTokens _ = Fr.groupTokens
-- TODO: make tests here -- TODO: make tests here
cleanTextForNLP :: Text -> Text cleanTextForNLP :: Text -> Text
cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls cleanTextForNLP = unifySpaces . removeUrls
where where
remove x = RAT.streamEdit x (const "") remove x = RAT.streamEdit x (const "")
unifySpaces = RAT.streamEdit (many DAT.space) (const " ") unifySpaces = RAT.streamEdit (many DAT.space) (const " ")
removeDigitsWith x = remove (many DAT.digit *> DAT.string x <* many DAT.digit)
removeUrls = removeUrlsWith "http" . removeUrlsWith "www" removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space) removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)
...@@ -82,7 +82,7 @@ corenlp' :: ( FromJSON a ...@@ -82,7 +82,7 @@ corenlp' :: ( FromJSON a
=> URI -> Lang -> p -> IO (Response a) => URI -> Lang -> p -> IO (Response a)
corenlp' uri lang txt = do corenlp' uri lang txt = do
req <- parseRequest $ req <- parseRequest $
"POST " <> show (uri { uriQuery = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList properties) }) "POST " <> show (uri { uriQuery = "?properties=" <> BSL.unpack (encode $ toJSON $ Map.fromList properties) })
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq . -- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
-- printDebug "[corenlp] sending body" $ (cs txt :: ByteString) -- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e -> catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
...@@ -97,7 +97,7 @@ corenlp' uri lang txt = do ...@@ -97,7 +97,7 @@ corenlp' uri lang txt = do
properties_ :: [(Text, Text)] properties_ :: [(Text, Text)]
properties_ = case lang of properties_ = case lang of
-- TODO: Add: Aeson.encode $ Aeson.toJSON $ Map.fromList [()] instead of these hardcoded JSON strings -- TODO: Add: Aeson.encode $ Aeson.toJSON $ Map.fromList [()] instead of these hardcoded JSON strings
EN -> [ ("annotators", "tokenize,ssplit,pos,ner" ) ] EN -> [ ("annotators", "tokenize,ssplit,pos,ner" ), ("tokenize.options", "splitHyphenated=false") ]
FR -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner") FR -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz") -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
, ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger") , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment