Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
158
Issues
158
List
Board
Labels
Milestones
Merge Requests
11
Merge Requests
11
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
e5871a7d
Commit
e5871a7d
authored
Oct 26, 2020
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[CLEAN] improving groups with lemma (WIP)
parent
66ca6fd7
Pipeline
#1169
failed with stage
Changes
5
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
36 additions
and
39 deletions
+36
-39
Multi.hs
src/Gargantext/Core/Text/Terms/Multi.hs
+8
-12
En.hs
src/Gargantext/Core/Text/Terms/Multi/Lang/En.hs
+12
-12
Fr.hs
src/Gargantext/Core/Text/Terms/Multi/Lang/Fr.hs
+4
-4
PosTagging.hs
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
+8
-7
Types.hs
src/Gargantext/Core/Types.hs
+4
-4
No files found.
src/Gargantext/Core/Text/Terms/Multi.hs
View file @
e5871a7d
...
...
@@ -17,14 +17,12 @@ module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake)
import
Data.Text
hiding
(
map
,
group
,
filter
,
concat
)
import
Data.List
(
concat
)
import
qualified
Data.Set
as
S
import
Gargantext.Prelude
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Types
import
Gargantext.Core.Text.Terms.Multi.PosTagging
import
Gargantext.Core.Text.Terms.Mono.Stem
(
stem
)
import
qualified
Gargantext.Core.Text.Terms.Multi.Lang.En
as
En
import
qualified
Gargantext.Core.Text.Terms.Multi.Lang.Fr
as
Fr
...
...
@@ -32,17 +30,15 @@ import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
multiterms
::
Lang
->
Text
->
IO
[
Terms
]
multiterms
lang
txt
=
concat
<$>
map
(
map
(
tokenTag2terms
lang
)
)
<$>
map
(
map
tokenTag2terms
)
<$>
map
(
filter
(
\
t
->
_my_token_pos
t
==
Just
NP
))
<$>
tokenTags
lang
txt
tokenTag2terms
::
Lang
->
TokenTag
->
Terms
tokenTag2terms
lang
(
TokenTag
w
t
_
_
)
=
Terms
w
t'
where
t'
=
S
.
fromList
$
map
(
stem
lang
)
$
S
.
toList
t
tokenTag2terms
::
TokenTag
->
Terms
tokenTag2terms
(
TokenTag
ws
t
_
_
)
=
Terms
ws
t
tokenTags
::
Lang
->
Text
->
IO
[[
TokenTag
]]
tokenTags
lang
s
=
map
(
group
lang
)
<$>
tokenTags'
lang
s
tokenTags
lang
s
=
map
(
group
Tokens
lang
)
<$>
tokenTags'
lang
s
tokenTags'
::
Lang
->
Text
->
IO
[[
TokenTag
]]
...
...
@@ -53,7 +49,7 @@ tokenTags' lang t = map tokens2tokensTags
---- | This function analyses and groups (or not) ngrams according to
---- specific grammars of each language.
group
::
Lang
->
[
TokenTag
]
->
[
TokenTag
]
group
EN
=
En
.
group
group
FR
=
Fr
.
group
group
_
=
panic
$
pack
"group
:: Lang not implemeted yet"
group
Tokens
::
Lang
->
[
TokenTag
]
->
[
TokenTag
]
group
Tokens
EN
=
En
.
groupTokens
group
Tokens
FR
=
Fr
.
groupTokens
group
Tokens
_
=
panic
$
pack
"groupTokens
:: Lang not implemeted yet"
src/Gargantext/Core/Text/Terms/Multi/Lang/En.hs
View file @
e5871a7d
...
...
@@ -13,7 +13,7 @@ the tokens into extracted terms.
-}
module
Gargantext.Core.Text.Terms.Multi.Lang.En
(
group
)
module
Gargantext.Core.Text.Terms.Multi.Lang.En
(
group
Tokens
)
where
import
Gargantext.Prelude
...
...
@@ -22,17 +22,17 @@ import Gargantext.Core.Text.Terms.Multi.Group
------------------------------------------------------------------------
-- | Rule grammar to group tokens
group
::
[
TokenTag
]
->
[
TokenTag
]
group
[]
=
[]
group
ntags
=
group2
NP
NP
$
group2
NP
VB
--
$ group2 NP IN
$
group2
IN
DT
--
$ group2 VB NP
$
group2
JJ
NP
$
group2
JJ
JJ
$
group2
JJ
CC
$
ntags
group
Tokens
::
[
TokenTag
]
->
[
TokenTag
]
group
Tokens
[]
=
[]
group
Tokens
ntags
=
group2
NP
NP
$
group2
NP
VB
--
$ group2 NP IN
$
group2
IN
DT
--
$ group2 VB NP
$
group2
JJ
NP
$
group2
JJ
JJ
$
group2
JJ
CC
$
ntags
------------------------------------------------------------------------
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
...
...
src/Gargantext/Core/Text/Terms/Multi/Lang/Fr.hs
View file @
e5871a7d
...
...
@@ -14,16 +14,16 @@ is ADJectiv in french.
-}
module
Gargantext.Core.Text.Terms.Multi.Lang.Fr
(
group
)
module
Gargantext.Core.Text.Terms.Multi.Lang.Fr
(
group
Tokens
)
where
import
Gargantext.Prelude
import
Gargantext.Core.Types
import
Gargantext.Core.Text.Terms.Multi.Group
(
group2
)
group
::
[
TokenTag
]
->
[
TokenTag
]
group
[]
=
[]
group
ntags
=
group2
NP
NP
group
Tokens
::
[
TokenTag
]
->
[
TokenTag
]
group
Tokens
[]
=
[]
group
Tokens
ntags
=
group2
NP
NP
$
group2
NP
VB
-- group2 NP IN
-- group2 IN DT
...
...
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
View file @
e5871a7d
...
...
@@ -64,10 +64,10 @@ tokens2tokensTags :: [Token] -> [TokenTag]
tokens2tokensTags
ts
=
filter'
$
map
tokenTag
ts
------------------------------------------------------------------------
tokenTag
::
Token
->
TokenTag
tokenTag
(
Token
_
_
w
s
_
_
p
n
_
_
)
=
TokenTag
w'
s
'
p
n
tokenTag
(
Token
_
_
w
l
_
_
p
n
_
_
)
=
TokenTag
w'
l
'
p
n
where
w'
=
split
w
s'
=
fromList
(
split
s
)
l'
=
fromList
(
split
l
)
split
=
splitOn
(
pack
" "
)
.
toLower
filter'
::
[
TokenTag
]
->
[
TokenTag
]
...
...
@@ -76,7 +76,7 @@ filter' xs = filter isNgrams xs
isNgrams
(
TokenTag
_
_
p
n
)
=
isJust
p
||
isJust
n
------------------------------------------------------------------------
data
Sentence
=
Sentence
{
_sentenceIndex
::
Int
data
Sentence
=
Sentence
{
_sentenceIndex
::
Int
,
_sentenceTokens
::
[
Token
]
}
deriving
(
Show
,
Generic
)
...
...
@@ -110,8 +110,9 @@ $(deriveJSON (unPrefix "_") ''PosSentences)
corenlp'
::
(
FromJSON
a
,
ConvertibleStrings
p
ByteString
)
=>
Lang
->
p
->
IO
(
Response
a
)
,
ConvertibleStrings
p
ByteString
)
=>
Lang
->
p
->
IO
(
Response
a
)
corenlp'
lang
txt
=
do
let
properties
=
case
lang
of
EN
->
"{
\"
annotators
\"
:
\"
tokenize,ssplit,pos,ner
\"
,
\"
outputFormat
\"
:
\"
json
\"
}"
...
...
@@ -142,9 +143,9 @@ corenlp lang txt = do
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith
::
(
Token
->
t
)
->
Lang
->
Text
->
IO
[[(
Text
,
t
)]]
tokenWith
f
lang
s
=
map
(
map
(
\
t
->
(
_tokenWord
t
,
f
t
)))
tokenWith
f
lang
s
=
map
(
map
(
\
t
->
(
_tokenWord
t
,
f
t
)))
<$>
map
_sentenceTokens
<$>
_sentences
<$>
_sentences
<$>
corenlp
lang
s
src/Gargantext/Core/Types.hs
View file @
e5871a7d
...
...
@@ -109,10 +109,10 @@ instance FromJSON NER where
instance
ToJSON
NER
data
TokenTag
=
TokenTag
{
_my_token_word
::
[
Text
]
,
_my_token_
stem
::
Set
Text
,
_my_token_pos
::
Maybe
POS
,
_my_token_ner
::
Maybe
NER
data
TokenTag
=
TokenTag
{
_my_token_word
::
[
Text
]
,
_my_token_
lemma
::
Set
Text
,
_my_token_pos
::
Maybe
POS
,
_my_token_ner
::
Maybe
NER
}
deriving
(
Show
)
instance
Semigroup
TokenTag
where
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment