Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
157
Issues
157
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
ea51f50d
Commit
ea51f50d
authored
Nov 22, 2017
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FIX] NLP Ngrams parser works for French _and_ English.
parent
0f0feaac
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
48 additions
and
17 deletions
+48
-17
CoreNLP.hs
src/Data/Gargantext/Ngrams/CoreNLP.hs
+11
-5
Fr.hs
src/Data/Gargantext/Ngrams/Lang/Fr.hs
+8
-3
Parser.hs
src/Data/Gargantext/Ngrams/Parser.hs
+22
-6
Parsers.hs
src/Data/Gargantext/Parsers.hs
+5
-1
Main.hs
test/Main.hs
+1
-1
Fr.hs
test/Ngrams/Lang/Fr.hs
+1
-1
No files found.
src/Data/Gargantext/Ngrams/CoreNLP.hs
View file @
ea51f50d
...
...
@@ -8,7 +8,9 @@ module Data.Gargantext.Ngrams.CoreNLP where
import
Data.Aeson
import
Data.Aeson.TH
(
deriveJSON
)
import
GHC.Generics
import
Data.Monoid
((
<>
))
import
Data.Gargantext.Types.Main
(
Language
(
..
))
import
Data.Gargantext.Prelude
import
Data.Gargantext.Utils.Prefix
(
unPrefix
)
import
Data.Text
(
Text
)
...
...
@@ -78,9 +80,13 @@ corenlpPretty txt = do
-- print $ getResponseHeader "Content-Type" response
S8
.
putStrLn
$
Yaml
.
encode
(
getResponseBody
response
::
Sentences
)
corenlp
::
String
->
IO
Sentences
corenlp
txt
=
do
url
<-
parseRequest
"POST http://localhost:9000/?properties={
\"
annotators
\"
:
\"
tokenize,ssplit,pos,ner
\"
,
\"
outputFormat
\"
:
\"
json
\"
}"
corenlp
::
Language
->
String
->
IO
Sentences
corenlp
lang
txt
=
do
let
properties
=
case
lang
of
EN
->
"{
\"
annotators
\"
:
\"
tokenize,ssplit,pos,ner
\"
,
\"
outputFormat
\"
:
\"
json
\"
}"
-- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
FR
->
"{
\"
annotators
\"
:
\"
tokenize,ssplit,pos,ner
\"
,
\"
parse.model
\"
:
\"
edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz
\"
,
\"
pos.model
\"
:
\"
edu/stanford/nlp/models/pos-tagger/french/french.tagger
\"
,
\"
tokenize.language
\"
:
\"
fr
\"
,
\"
outputFormat
\"
:
\"
json
\"
}"
url
<-
parseRequest
$
"POST http://localhost:9000/?properties="
<>
properties
let
request
=
setRequestBodyJSON
txt
url
response
<-
httpJSON
request
pure
(
getResponseBody
response
::
Sentences
)
...
...
@@ -93,8 +99,8 @@ corenlp txt = do
-- Named Entity Recognition example
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith
::
(
Token
->
t
)
->
String
->
IO
[[(
Text
,
t
)]]
tokenWith
f
s
=
pm
(
pm
(
\
t
->
(
_tokenWord
t
,
f
t
)))
<$>
pm
_sentenceTokens
<$>
sentences
<$>
corenlp
s
tokenWith
::
(
Token
->
t
)
->
Language
->
String
->
IO
[[(
Text
,
t
)]]
tokenWith
f
lang
s
=
pm
(
pm
(
\
t
->
(
_tokenWord
t
,
f
t
)))
<$>
pm
_sentenceTokens
<$>
sentences
<$>
corenlp
lang
s
src/Data/Gargantext/Ngrams/Lang/Fr.hs
View file @
ea51f50d
...
...
@@ -13,9 +13,9 @@ selectNgrams xs = pf selectNgrams' xs
selectNgrams'
(
_
,
"N"
,
_
)
=
True
selectNgrams'
(
_
,
"NC"
,
_
)
=
True
selectNgrams'
(
_
,
"NN+CC"
,
_
)
=
True
-- FIXME NER in French must be improved
-- selectNgrams' (_,_ ,"I-PERS
") = True
-- selectNgrams' (_,_ ,"I-LIEU"
) = True
selectNgrams'
(
_
,
_
,
"PERSON"
)
=
True
selectNgrams'
(
_
,
_
,
"ORGANIZATION
"
)
=
True
selectNgrams'
(
_
,
_
,
"LOCATION"
)
=
True
selectNgrams'
(
_
,
_
,
_
)
=
False
...
...
@@ -53,6 +53,11 @@ groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs) = groupNgrams ((x <> " " <> y, "NC", y
-- /!\ sometimes N instead of NC (why?)
groupNgrams
((
x
,
"ADJ"
,
_
)
:
(
y
,
"N"
,
yy
)
:
xs
)
=
groupNgrams
((
x
<>
" "
<>
y
,
"NC"
,
yy
)
:
xs
)
groupNgrams
((
x
,
_
,
"PERSON"
)
:
(
y
,
yy
,
"PERSON"
)
:
xs
)
=
groupNgrams
((
x
<>
" "
<>
y
,
yy
,
"PERSON"
)
:
xs
)
groupNgrams
((
x
,
_
,
"ORGANIZATION"
)
:
(
y
,
yy
,
"ORGANIZATION"
)
:
xs
)
=
groupNgrams
((
x
<>
" "
<>
y
,
yy
,
"ORGANIZATION"
)
:
xs
)
-- Si aucune des règles précédentes n'est remplie
groupNgrams
(
x
:
xs
)
=
(
x
:
(
groupNgrams
xs
))
...
...
src/Data/Gargantext/Ngrams/Parser.hs
View file @
ea51f50d
...
...
@@ -12,17 +12,33 @@ import qualified Data.Gargantext.Ngrams.Lang.En as En
import
qualified
Data.Gargantext.Ngrams.Lang.Fr
as
Fr
-- | Ngrams selection algorithms
-- A form is a list of characters seperated by one or more spaces in a sentence.
-- A word is a form.
-- type Form = [Char]
-- For performance reasons, Type Text is used, then:
-- type Form = Text
-- Let be a form and its associated forms in contexts of a sentence.
-- Forms and subfoorms can be representend as Tree whose top is the minimal form
-- as a monogram whos occurrences are
-- ps : Common words function in Haskell do not take sentence into account
-- TODO for scientific papers: add maesures
-- TODO add the p score regex
extractNgrams
::
Language
->
String
->
IO
[[
Ngrams
]]
extractNgrams
lang
s
=
pm
(
groupNgrams
lang
)
<$>
extractNgrams'
s
extractNgrams
lang
s
=
pm
(
groupNgrams
lang
)
<$>
extractNgrams'
lang
s
extractNgrams'
::
String
->
IO
[[
Ngrams
]]
extractNgrams'
t
=
pm
(
pm
token2text
)
<$>
pm
_sentenceTokens
<$>
sentences
<$>
corenlp
t
extractNgrams'
::
Language
->
String
->
IO
[[
Ngrams
]]
extractNgrams'
lang
t
=
pm
(
pm
token2text
)
<$>
pm
_sentenceTokens
<$>
sentences
<$>
corenlp
lang
t
-- | This function selects ngrams according to grammars specific
-- of each language.
...
...
src/Data/Gargantext/Parsers.hs
View file @
ea51f50d
module
Data.Gargantext.Parsers
(
module
Data
.
Gargantext
.
Parsers
.
WOS
)
module
Data.Gargantext.Parsers
(
module
Data
.
Gargantext
.
Parsers
.
WOS
,
module
Data
.
Gargantext
.
Parsers
.
Date
)
where
import
Data.Gargantext.Parsers.WOS
import
Data.Gargantext.Parsers.Date
test/Main.hs
View file @
ea51f50d
...
...
@@ -6,7 +6,7 @@ import qualified Ngrams.Metrics as Metrics
main
::
IO
()
main
=
do
Occ
.
parsersTest
Lang
.
ngramsExtractionTest
FR
Lang
.
ngramsExtractionTest
EN
Metrics
.
main
--Lang.ngramsExtractionTest FR
test/Ngrams/Lang/Fr.hs
View file @
ea51f50d
...
...
@@ -42,5 +42,5 @@ ngramsExtractionTest = hspec $ do
it
"Groupe: Nom commun + préposition + Nom commun + prép + Nom commun"
$
do
let
textFr1
=
"L'heure d'arrivée des coureurs dépend de la météo du jour."
testFr1
<-
pm
(
selectNgrams
FR
)
<$>
(
extractNgrams
FR
)
textFr1
testFr1
`
shouldBe
`
[[(
"heure d' arrivée des coureurs"
,
"NC"
,
"
I-ORG
"
),(
"météo du jour"
,
"NC"
,
"O"
)]]
testFr1
`
shouldBe
`
[[(
"heure d' arrivée des coureurs"
,
"NC"
,
"
O
"
),(
"météo du jour"
,
"NC"
,
"O"
)]]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment