Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Christian Merten
haskell-gargantext
Commits
22d25fbe
Verified
Commit
22d25fbe
authored
Dec 04, 2023
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[ngrams] add multi/corenlp.hs
parent
ff01b79c
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
171 additions
and
0 deletions
+171
-0
CoreNLP.hs
src/Gargantext/Core/Text/Terms/Multi/CoreNLP.hs
+171
-0
No files found.
src/Gargantext/Core/Text/Terms/Multi/CoreNLP.hs
0 → 100644
View file @
22d25fbe
{-|
Module : Gargantext.Core.Text.Terms.Multi.CoreNLP
Description : PosTagging module using Stanford java REST API
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
In corpus linguistics, part-of-speech tagging (POS tagging or PoS
tagging or POST), also called grammatical tagging or word-category
disambiguation, is the process of marking up a word in a text (corpus)
as corresponding to a particular part of speech, based on both its
definition and its context—i.e., its relationship with adjacent and
related words in a phrase, sentence, or paragraph. A simplified form of
this is commonly taught to school-age children, in the identification of
words as nouns, verbs, adjectives, adverbs, etc.
Source: https://en.wikipedia.org/wiki/Part-of-speech_tagging
-}
{-# OPTIONS_GHC -fno-warn-deprecations #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module
Gargantext.Core.Text.Terms.Multi.CoreNLP
where
import
Data.Aeson
import
Data.ByteString.Lazy.Char8
qualified
as
BSL
import
Data.ByteString.Lazy.Internal
(
ByteString
)
import
Data.Map
qualified
as
Map
import
Data.Set
(
fromList
)
import
Data.Text
(
splitOn
,
pack
,
toLower
)
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Types
import
Gargantext.Prelude
hiding
(
ByteString
,
toLower
)
import
Network.HTTP.Simple
import
Network.URI
(
URI
(
..
))
import
Text.CoreNLP.Types
qualified
as
CoreNLP
-- import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Convert the CoreNLP tokens of one sentence into 'TokenTag's and keep
-- only those accepted by 'filter''.
tokens2tokensTags :: [CoreNLP.Token] -> [TokenTag]
tokens2tokensTags = filter' . map tokenTag
------------------------------------------------------------------------
-- | Convert a single CoreNLP token into a 'TokenTag'.
--
-- The surface form and the lemma are lower-cased and then split on single
-- spaces; the lemma pieces are additionally collected into a set. The POS
-- and NER labels are translated with 'fromCoreNLPPOS' and 'fromCoreNLPNER',
-- and the character offsets are copied over unchanged.
tokenTag :: CoreNLP.Token -> TokenTag
tokenTag CoreNLP.Token { .. } =
  TokenTag
    { _my_token_word         = lowerWords word
    , _my_token_lemma        = fromList (lowerWords lemma)
    , _my_token_pos          = fromCoreNLPPOS pos
    , _my_token_ner          = fromCoreNLPNER ner
    , _my_token_offset_begin = characterOffsetBegin
    , _my_token_offset_end   = characterOffsetEnd
    }
  where
    -- Lower-case first, then cut on single spaces.
    lowerWords :: Text -> [Text]
    lowerWords t = splitOn " " (toLower t)
-- | Translate a CoreNLP Penn POS tag into the project's 'POS' type by
-- encoding it to JSON and decoding that JSON again.
--
-- Yields 'Nothing' when the encoded tag does not decode as a 'POS'.
fromCoreNLPPOS :: CoreNLP.PennPOS -> Maybe POS
fromCoreNLPPOS tag = decode (encode tag)
-- | Translate a CoreNLP named-entity label into the project's 'NER' type
-- by encoding it to JSON and decoding that JSON again.
--
-- Yields 'Nothing' when the encoded label does not decode as a 'NER'.
fromCoreNLPNER :: CoreNLP.NamedEntity -> Maybe NER
fromCoreNLPNER entity = decode (encode entity)
-- | Keep only the tokens that carry a POS tag or a NER tag (or both).
filter' :: [TokenTag] -> [TokenTag]
filter' = filter hasTag
  where
    -- A token qualifies as soon as one of the two optional tags is present.
    hasTag :: TokenTag -> Bool
    hasTag TokenTag { _my_token_pos = p, _my_token_ner = n } =
      isJust p || isJust n
------------------------------------------------------------------------
-- request =
-- "fr" : {
-- "tokenize.language" : "fr",
-- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
-- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
-- // dependency parser
-- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
-- "depparse.language" : "french",
-- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
-- "ssplit.newlineIsSentenceBreak": "always"
-- },
--
-- | CoreNLP server properties to send for a given language.
--
-- Every supported language sets @annotators@; most also set
-- @tokenize.language@ and a @pos.model@. No @parse.model@ is configured for
-- any language, and 'IT' / 'PL' send no @pos.model@.
--
-- Any language without an entry below makes this function 'panic'.
properties :: Lang -> [(Text, Text)]
properties lang = case lang of
  EN -> [ annotators "tokenize,ssplit,pos,ner" ]
  FR -> [ fullPipeline
        , posModel "edu/stanford/nlp/models/pos-tagger/models/french.tagger"
        , language "fr"
        ]
  DE -> [ fullPipeline
        , posModel "edu/stanford/nlp/models/pos-tagger/models/german-hgc.tagger"
        , language "de"
        ]
  ES -> [ fullPipeline
        , posModel "edu/stanford/nlp/models/pos-tagger/models/spanish.tagger"
        , language "es"
        ]
  IT -> [ fullPipeline
        , language "it"
        ]
  PL -> [ fullPipeline
        , language "pl"
        ]
  -- Note: the Chinese pipeline is the only one listed without @ssplit@.
  ZH -> [ annotators "tokenize,pos,lemma,ner"
        , posModel "edu/stanford/nlp/models/pos-tagger/models/chinese-distsim.tagger"
        , language "zh"
        ]
  _  -> panic $ pack $ "corenlp for language " <> show lang <> " is not implemented yet"
  where
    annotators :: Text -> (Text, Text)
    annotators names = ("annotators", names)

    posModel :: Text -> (Text, Text)
    posModel path = ("pos.model", path)

    language :: Text -> (Text, Text)
    language code = ("tokenize.language", code)

    -- Annotator list shared by FR, DE, ES, IT and PL.
    fullPipeline :: (Text, Text)
    fullPipeline = annotators "tokenize,ssplit,pos,lemma,ner"
-- | POST @txt@ to the CoreNLP server at @uri@ and decode the JSON reply.
--
-- The query part of @uri@ is replaced by @?properties=<json>@, where the
-- JSON object is built from @'properties' lang@ plus
-- @("outputFormat", "json")@. The text itself is sent as the request body.
--
-- Equivalent request from the command line:
--
-- > curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'hello world, hello' | jq .
--
-- Exceptions: a 'JSONException' raised by 'httpJSON' is always re-thrown.
-- When the reply is not valid JSON ('JSONParseException'), the request text,
-- the HTTP status code and the parse error are written with 'printDebug'
-- first. The response carried by that exception has a @()@ body, so the
-- body itself cannot be logged at this point.
corenlp' :: ( FromJSON a
            , ConvertibleStrings p ByteString )
         => URI -> Lang -> p -> IO (Response a)
corenlp' uri lang txt = do
  req <- parseRequest $ "POST " <> show (uri { uriQuery = query })
  catch (httpJSON $ setRequestBodyLBS (cs txt) req) $ \e ->
    case e of
      JSONParseException _req res err -> do
        printDebug "[corenlp'] request text" (cs txt :: ByteString)
        printDebug "[corenlp'] response status code" (getResponseStatusCode res)
        printDebug "[corenlp'] response parse error" err
        throwIO e
      JSONConversionException _req _res _err ->
        throwIO e
  where
    -- Overrides whatever query string the caller's URI already had.
    query = "?properties=" <> (BSL.unpack $ encode $ toJSON $ Map.fromList props)
    props = properties lang <> [ ("outputFormat", "json") ]
-- | Annotate @txt@ with the CoreNLP server at @uri@ and return the decoded
-- document, i.e. the body of the response obtained from 'corenlp''.
corenlp :: URI -> Lang -> Text -> IO CoreNLP.Document
corenlp uri lang txt = getResponseBody <$> corenlp' uri lang txt
-- | Annotate @txt@ with CoreNLP and return, for each sentence of the
-- resulting document, the 'TokenTag's produced by 'tokens2tokensTags'.
coreNLPTokenTags :: URI -> Lang -> Text -> IO [[TokenTag]]
coreNLPTokenTags uri lang txt =
  map tokens2tokensTags . allTokens <$> corenlp uri lang txt
-- | Annotate @s@ with CoreNLP and, sentence by sentence, pair the word of
-- every token with the result of applying @f@ to that token.
--
-- NOTE(review): the examples below were written for an earlier function
-- named @parseWith@ and do not show the @uri@ / @lang@ arguments; confirm
-- them against the current signature before relying on them.
--
-- Part Of Speech example
-- parseWith _tokenPos "Hello world."
-- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named Entity Recognition example
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: URI -> (CoreNLP.Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
tokenWith uri f lang s =
  map (map labelled) . allTokens <$> corenlp uri lang s
  where
    labelled tok = (CoreNLP.word tok, f tok)
-- | Tokens of a CoreNLP document, grouped by sentence.
allTokens :: CoreNLP.Document -> [[CoreNLP.Token]]
allTokens CoreNLP.Document { sentences = ss } = map CoreNLP.tokens ss
----------------------------------------------------------------------------------
-- | Placeholder for a second NLP backend; not implemented.
--
-- TODO: connect to the JohnSnow server in the same way 'corenlp'' connects
-- to CoreNLP. We need the POS tagging for the given language and the lemmas.
--
-- Until then, using this function fails with an explicit message rather
-- than a bare @undefined@.
serverNLP :: Lang -> Text -> IO CoreNLP.Document
serverNLP = panic "serverNLP: JohnSnow backend is not implemented yet"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment