Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
199
Issues
199
List
Board
Labels
Milestones
Merge Requests
12
Merge Requests
12
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
ea51f50d
Commit
ea51f50d
authored
Nov 22, 2017
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FIX] NLP Ngrams parser works for French _and_ English.
parent
0f0feaac
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
48 additions
and
17 deletions
+48
-17
CoreNLP.hs
src/Data/Gargantext/Ngrams/CoreNLP.hs
+11
-5
Fr.hs
src/Data/Gargantext/Ngrams/Lang/Fr.hs
+8
-3
Parser.hs
src/Data/Gargantext/Ngrams/Parser.hs
+22
-6
Parsers.hs
src/Data/Gargantext/Parsers.hs
+5
-1
Main.hs
test/Main.hs
+1
-1
Fr.hs
test/Ngrams/Lang/Fr.hs
+1
-1
No files found.
src/Data/Gargantext/Ngrams/CoreNLP.hs
View file @
ea51f50d
...
@@ -8,7 +8,9 @@ module Data.Gargantext.Ngrams.CoreNLP where
...
@@ -8,7 +8,9 @@ module Data.Gargantext.Ngrams.CoreNLP where
import
Data.Aeson
import
Data.Aeson
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.Aeson.TH
(
deriveJSON
)
import
GHC.Generics
import
GHC.Generics
import
Data.Monoid
((
<>
))
import
Data.Gargantext.Types.Main
(
Language
(
..
))
import
Data.Gargantext.Prelude
import
Data.Gargantext.Prelude
import
Data.Gargantext.Utils.Prefix
(
unPrefix
)
import
Data.Gargantext.Utils.Prefix
(
unPrefix
)
import
Data.Text
(
Text
)
import
Data.Text
(
Text
)
...
@@ -78,9 +80,13 @@ corenlpPretty txt = do
...
@@ -78,9 +80,13 @@ corenlpPretty txt = do
-- print $ getResponseHeader "Content-Type" response
-- print $ getResponseHeader "Content-Type" response
S8
.
putStrLn
$
Yaml
.
encode
(
getResponseBody
response
::
Sentences
)
S8
.
putStrLn
$
Yaml
.
encode
(
getResponseBody
response
::
Sentences
)
corenlp
::
String
->
IO
Sentences
corenlp
::
Language
->
String
->
IO
Sentences
corenlp
txt
=
do
corenlp
lang
txt
=
do
url
<-
parseRequest
"POST http://localhost:9000/?properties={
\"
annotators
\"
:
\"
tokenize,ssplit,pos,ner
\"
,
\"
outputFormat
\"
:
\"
json
\"
}"
let
properties
=
case
lang
of
EN
->
"{
\"
annotators
\"
:
\"
tokenize,ssplit,pos,ner
\"
,
\"
outputFormat
\"
:
\"
json
\"
}"
-- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
FR
->
"{
\"
annotators
\"
:
\"
tokenize,ssplit,pos,ner
\"
,
\"
parse.model
\"
:
\"
edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz
\"
,
\"
pos.model
\"
:
\"
edu/stanford/nlp/models/pos-tagger/french/french.tagger
\"
,
\"
tokenize.language
\"
:
\"
fr
\"
,
\"
outputFormat
\"
:
\"
json
\"
}"
url
<-
parseRequest
$
"POST http://localhost:9000/?properties="
<>
properties
let
request
=
setRequestBodyJSON
txt
url
let
request
=
setRequestBodyJSON
txt
url
response
<-
httpJSON
request
response
<-
httpJSON
request
pure
(
getResponseBody
response
::
Sentences
)
pure
(
getResponseBody
response
::
Sentences
)
...
@@ -93,8 +99,8 @@ corenlp txt = do
...
@@ -93,8 +99,8 @@ corenlp txt = do
-- Named Entity Recognition example
-- Named Entity Recognition example
-- parseWith _tokenNer "Hello world of Peter."
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith
::
(
Token
->
t
)
->
String
->
IO
[[(
Text
,
t
)]]
tokenWith
::
(
Token
->
t
)
->
Language
->
String
->
IO
[[(
Text
,
t
)]]
tokenWith
f
s
=
pm
(
pm
(
\
t
->
(
_tokenWord
t
,
f
t
)))
<$>
pm
_sentenceTokens
<$>
sentences
<$>
corenlp
s
tokenWith
f
lang
s
=
pm
(
pm
(
\
t
->
(
_tokenWord
t
,
f
t
)))
<$>
pm
_sentenceTokens
<$>
sentences
<$>
corenlp
lang
s
src/Data/Gargantext/Ngrams/Lang/Fr.hs
View file @
ea51f50d
...
@@ -13,9 +13,9 @@ selectNgrams xs = pf selectNgrams' xs
...
@@ -13,9 +13,9 @@ selectNgrams xs = pf selectNgrams' xs
selectNgrams'
(
_
,
"N"
,
_
)
=
True
selectNgrams'
(
_
,
"N"
,
_
)
=
True
selectNgrams'
(
_
,
"NC"
,
_
)
=
True
selectNgrams'
(
_
,
"NC"
,
_
)
=
True
selectNgrams'
(
_
,
"NN+CC"
,
_
)
=
True
selectNgrams'
(
_
,
"NN+CC"
,
_
)
=
True
-- FIXME NER in French must be improved
selectNgrams'
(
_
,
_
,
"PERSON"
)
=
True
-- selectNgrams' (_,_ ,"I-PERS
") = True
selectNgrams'
(
_
,
_
,
"ORGANIZATION
"
)
=
True
-- selectNgrams' (_,_ ,"I-LIEU"
) = True
selectNgrams'
(
_
,
_
,
"LOCATION"
)
=
True
selectNgrams'
(
_
,
_
,
_
)
=
False
selectNgrams'
(
_
,
_
,
_
)
=
False
...
@@ -53,6 +53,11 @@ groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs) = groupNgrams ((x <> " " <> y, "NC", y
...
@@ -53,6 +53,11 @@ groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs) = groupNgrams ((x <> " " <> y, "NC", y
-- /!\ sometimes N instead of NC (why?)
-- /!\ sometimes N instead of NC (why?)
groupNgrams
((
x
,
"ADJ"
,
_
)
:
(
y
,
"N"
,
yy
)
:
xs
)
=
groupNgrams
((
x
<>
" "
<>
y
,
"NC"
,
yy
)
:
xs
)
groupNgrams
((
x
,
"ADJ"
,
_
)
:
(
y
,
"N"
,
yy
)
:
xs
)
=
groupNgrams
((
x
<>
" "
<>
y
,
"NC"
,
yy
)
:
xs
)
groupNgrams
((
x
,
_
,
"PERSON"
)
:
(
y
,
yy
,
"PERSON"
)
:
xs
)
=
groupNgrams
((
x
<>
" "
<>
y
,
yy
,
"PERSON"
)
:
xs
)
groupNgrams
((
x
,
_
,
"ORGANIZATION"
)
:
(
y
,
yy
,
"ORGANIZATION"
)
:
xs
)
=
groupNgrams
((
x
<>
" "
<>
y
,
yy
,
"ORGANIZATION"
)
:
xs
)
-- Si aucune des règles précédentes n'est remplie
-- Si aucune des règles précédentes n'est remplie
groupNgrams
(
x
:
xs
)
=
(
x
:
(
groupNgrams
xs
))
groupNgrams
(
x
:
xs
)
=
(
x
:
(
groupNgrams
xs
))
...
...
src/Data/Gargantext/Ngrams/Parser.hs
View file @
ea51f50d
...
@@ -12,17 +12,33 @@ import qualified Data.Gargantext.Ngrams.Lang.En as En
...
@@ -12,17 +12,33 @@ import qualified Data.Gargantext.Ngrams.Lang.En as En
import
qualified
Data.Gargantext.Ngrams.Lang.Fr
as
Fr
import
qualified
Data.Gargantext.Ngrams.Lang.Fr
as
Fr
-- | Ngrams selection algorithms
-- A form is a list of characters separated by one or more spaces in a sentence.
-- A word is a form.
-- type Form = [Char]
-- For performance reasons, Type Text is used, then:
-- type Form = Text
-- Consider a form and its associated forms in the contexts of a sentence.
-- Forms and subforms can be represented as a Tree whose top is the minimal form
-- as a monogram whose occurrences are
-- PS: the common `words` function in Haskell does not take sentences into account
-- TODO for scientific papers: add measures
-- TODO for scientific papers: add measures
-- TODO add the p score regex
-- TODO add the p score regex
extractNgrams
::
Language
->
String
->
IO
[[
Ngrams
]]
extractNgrams
::
Language
->
String
->
IO
[[
Ngrams
]]
extractNgrams
lang
s
=
pm
(
groupNgrams
lang
)
<$>
extractNgrams'
s
extractNgrams
lang
s
=
pm
(
groupNgrams
lang
)
<$>
extractNgrams'
lang
s
extractNgrams'
::
String
->
IO
[[
Ngrams
]]
extractNgrams'
::
Language
->
String
->
IO
[[
Ngrams
]]
extractNgrams'
t
=
pm
(
pm
token2text
)
extractNgrams'
lang
t
=
pm
(
pm
token2text
)
<$>
pm
_sentenceTokens
<$>
pm
_sentenceTokens
<$>
sentences
<$>
sentences
<$>
corenlp
t
<$>
corenlp
lang
t
-- | This function selects ngrams according to grammars specific
-- | This function selects ngrams according to grammars specific
-- of each language.
-- of each language.
...
...
src/Data/Gargantext/Parsers.hs
View file @
ea51f50d
module
Data.Gargantext.Parsers
(
module
Data
.
Gargantext
.
Parsers
.
WOS
)
module
Data.Gargantext.Parsers
(
module
Data
.
Gargantext
.
Parsers
.
WOS
,
module
Data
.
Gargantext
.
Parsers
.
Date
)
where
where
import
Data.Gargantext.Parsers.WOS
import
Data.Gargantext.Parsers.WOS
import
Data.Gargantext.Parsers.Date
test/Main.hs
View file @
ea51f50d
...
@@ -6,7 +6,7 @@ import qualified Ngrams.Metrics as Metrics
...
@@ -6,7 +6,7 @@ import qualified Ngrams.Metrics as Metrics
main
::
IO
()
main
::
IO
()
main
=
do
main
=
do
Occ
.
parsersTest
Occ
.
parsersTest
Lang
.
ngramsExtractionTest
FR
Lang
.
ngramsExtractionTest
EN
Lang
.
ngramsExtractionTest
EN
Metrics
.
main
Metrics
.
main
--Lang.ngramsExtractionTest FR
test/Ngrams/Lang/Fr.hs
View file @
ea51f50d
...
@@ -42,5 +42,5 @@ ngramsExtractionTest = hspec $ do
...
@@ -42,5 +42,5 @@ ngramsExtractionTest = hspec $ do
it
"Groupe: Nom commun + préposition + Nom commun + prép + Nom commun"
$
do
it
"Groupe: Nom commun + préposition + Nom commun + prép + Nom commun"
$
do
let
textFr1
=
"L'heure d'arrivée des coureurs dépend de la météo du jour."
let
textFr1
=
"L'heure d'arrivée des coureurs dépend de la météo du jour."
testFr1
<-
pm
(
selectNgrams
FR
)
<$>
(
extractNgrams
FR
)
textFr1
testFr1
<-
pm
(
selectNgrams
FR
)
<$>
(
extractNgrams
FR
)
textFr1
testFr1
`
shouldBe
`
[[(
"heure d' arrivée des coureurs"
,
"NC"
,
"
I-ORG
"
),(
"météo du jour"
,
"NC"
,
"O"
)]]
testFr1
`
shouldBe
`
[[(
"heure d' arrivée des coureurs"
,
"NC"
,
"
O
"
),(
"météo du jour"
,
"NC"
,
"O"
)]]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment