Commit 4e21f839 authored by Yoelis Acourt

configure CoreNLP tokenization to group hyphenated words

parent 238628a4
...@@ -82,7 +82,7 @@ corenlp' :: ( FromJSON a ...@@ -82,7 +82,7 @@ corenlp' :: ( FromJSON a
=> URI -> Lang -> p -> IO (Response a) => URI -> Lang -> p -> IO (Response a)
corenlp' uri lang txt = do corenlp' uri lang txt = do
req <- parseRequest $ req <- parseRequest $
"POST " <> show (uri { uriQuery = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList properties) }) "POST " <> show (uri { uriQuery = "?properties=" <> BSL.unpack (encode $ toJSON $ Map.fromList properties) })
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq . -- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
-- printDebug "[corenlp] sending body" $ (cs txt :: ByteString) -- printDebug "[corenlp] sending body" $ (cs txt :: ByteString)
catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e -> catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
...@@ -97,7 +97,7 @@ corenlp' uri lang txt = do ...@@ -97,7 +97,7 @@ corenlp' uri lang txt = do
properties_ :: [(Text, Text)] properties_ :: [(Text, Text)]
properties_ = case lang of properties_ = case lang of
-- TODO: Add: Aeson.encode $ Aeson.toJSON $ Map.fromList [()] instead of these hardcoded JSON strings -- TODO: Add: Aeson.encode $ Aeson.toJSON $ Map.fromList [()] instead of these hardcoded JSON strings
EN -> [ ("annotators", "tokenize,ssplit,pos,ner" ) ] EN -> [ ("annotators", "tokenize,ssplit,pos,ner" ), ("tokenize.options", "splitHyphenated=false") ]
FR -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner") FR -> [ ("annotators", "tokenize,ssplit,pos,lemma,ner")
-- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz") -- , ("parse.model", "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz")
, ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger") , ("pos.model", "edu/stanford/nlp/models/pos-tagger/models/french.tagger")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment