Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Julien Moutinho
haskell-gargantext
Commits
e5871a7d
Commit
e5871a7d
authored
Oct 26, 2020
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[CLEAN] improving groups with lemma (WIP)
parent
66ca6fd7
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
36 additions
and
39 deletions
+36
-39
Multi.hs
src/Gargantext/Core/Text/Terms/Multi.hs
+8
-12
En.hs
src/Gargantext/Core/Text/Terms/Multi/Lang/En.hs
+12
-12
Fr.hs
src/Gargantext/Core/Text/Terms/Multi/Lang/Fr.hs
+4
-4
PosTagging.hs
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
+8
-7
Types.hs
src/Gargantext/Core/Types.hs
+4
-4
No files found.
src/Gargantext/Core/Text/Terms/Multi.hs
View file @
e5871a7d
...
@@ -17,14 +17,12 @@ module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake)
...
@@ -17,14 +17,12 @@ module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake)
import
Data.Text
hiding
(
map
,
group
,
filter
,
concat
)
import
Data.Text
hiding
(
map
,
group
,
filter
,
concat
)
import
Data.List
(
concat
)
import
Data.List
(
concat
)
import
qualified
Data.Set
as
S
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Types
import
Gargantext.Core.Types
import
Gargantext.Core.Text.Terms.Multi.PosTagging
import
Gargantext.Core.Text.Terms.Multi.PosTagging
import
Gargantext.Core.Text.Terms.Mono.Stem
(
stem
)
import
qualified
Gargantext.Core.Text.Terms.Multi.Lang.En
as
En
import
qualified
Gargantext.Core.Text.Terms.Multi.Lang.En
as
En
import
qualified
Gargantext.Core.Text.Terms.Multi.Lang.Fr
as
Fr
import
qualified
Gargantext.Core.Text.Terms.Multi.Lang.Fr
as
Fr
...
@@ -32,17 +30,15 @@ import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
...
@@ -32,17 +30,15 @@ import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
multiterms
::
Lang
->
Text
->
IO
[
Terms
]
multiterms
::
Lang
->
Text
->
IO
[
Terms
]
multiterms
lang
txt
=
concat
multiterms
lang
txt
=
concat
<$>
map
(
map
(
tokenTag2terms
lang
)
)
<$>
map
(
map
tokenTag2terms
)
<$>
map
(
filter
(
\
t
->
_my_token_pos
t
==
Just
NP
))
<$>
map
(
filter
(
\
t
->
_my_token_pos
t
==
Just
NP
))
<$>
tokenTags
lang
txt
<$>
tokenTags
lang
txt
tokenTag2terms
::
Lang
->
TokenTag
->
Terms
tokenTag2terms
::
TokenTag
->
Terms
tokenTag2terms
lang
(
TokenTag
w
t
_
_
)
=
Terms
w
t'
tokenTag2terms
(
TokenTag
ws
t
_
_
)
=
Terms
ws
t
where
t'
=
S
.
fromList
$
map
(
stem
lang
)
$
S
.
toList
t
tokenTags
::
Lang
->
Text
->
IO
[[
TokenTag
]]
tokenTags
::
Lang
->
Text
->
IO
[[
TokenTag
]]
tokenTags
lang
s
=
map
(
group
lang
)
<$>
tokenTags'
lang
s
tokenTags
lang
s
=
map
(
group
Tokens
lang
)
<$>
tokenTags'
lang
s
tokenTags'
::
Lang
->
Text
->
IO
[[
TokenTag
]]
tokenTags'
::
Lang
->
Text
->
IO
[[
TokenTag
]]
...
@@ -53,7 +49,7 @@ tokenTags' lang t = map tokens2tokensTags
...
@@ -53,7 +49,7 @@ tokenTags' lang t = map tokens2tokensTags
---- | This function analyses and groups (or not) ngrams according to
---- | This function analyses and groups (or not) ngrams according to
---- specific grammars of each language.
---- specific grammars of each language.
group
::
Lang
->
[
TokenTag
]
->
[
TokenTag
]
group
Tokens
::
Lang
->
[
TokenTag
]
->
[
TokenTag
]
group
EN
=
En
.
group
group
Tokens
EN
=
En
.
groupTokens
group
FR
=
Fr
.
group
group
Tokens
FR
=
Fr
.
groupTokens
group
_
=
panic
$
pack
"group
:: Lang not implemeted yet"
group
Tokens
_
=
panic
$
pack
"groupTokens
:: Lang not implemeted yet"
src/Gargantext/Core/Text/Terms/Multi/Lang/En.hs
View file @
e5871a7d
...
@@ -13,7 +13,7 @@ the tokens into extracted terms.
...
@@ -13,7 +13,7 @@ the tokens into extracted terms.
-}
-}
module
Gargantext.Core.Text.Terms.Multi.Lang.En
(
group
)
module
Gargantext.Core.Text.Terms.Multi.Lang.En
(
group
Tokens
)
where
where
import
Gargantext.Prelude
import
Gargantext.Prelude
...
@@ -22,17 +22,17 @@ import Gargantext.Core.Text.Terms.Multi.Group
...
@@ -22,17 +22,17 @@ import Gargantext.Core.Text.Terms.Multi.Group
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Rule grammar to group tokens
-- | Rule grammar to group tokens
group
::
[
TokenTag
]
->
[
TokenTag
]
group
Tokens
::
[
TokenTag
]
->
[
TokenTag
]
group
[]
=
[]
group
Tokens
[]
=
[]
group
ntags
=
group2
NP
NP
group
Tokens
ntags
=
group2
NP
NP
$
group2
NP
VB
$
group2
NP
VB
--
$ group2 NP IN
--
$ group2 NP IN
$
group2
IN
DT
$
group2
IN
DT
--
$ group2 VB NP
--
$ group2 VB NP
$
group2
JJ
NP
$
group2
JJ
NP
$
group2
JJ
JJ
$
group2
JJ
JJ
$
group2
JJ
CC
$
group2
JJ
CC
$
ntags
$
ntags
------------------------------------------------------------------------
------------------------------------------------------------------------
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
...
...
src/Gargantext/Core/Text/Terms/Multi/Lang/Fr.hs
View file @
e5871a7d
...
@@ -14,16 +14,16 @@ is ADJectiv in french.
...
@@ -14,16 +14,16 @@ is ADJectiv in french.
-}
-}
module
Gargantext.Core.Text.Terms.Multi.Lang.Fr
(
group
)
module
Gargantext.Core.Text.Terms.Multi.Lang.Fr
(
group
Tokens
)
where
where
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Core.Types
import
Gargantext.Core.Types
import
Gargantext.Core.Text.Terms.Multi.Group
(
group2
)
import
Gargantext.Core.Text.Terms.Multi.Group
(
group2
)
group
::
[
TokenTag
]
->
[
TokenTag
]
group
Tokens
::
[
TokenTag
]
->
[
TokenTag
]
group
[]
=
[]
group
Tokens
[]
=
[]
group
ntags
=
group2
NP
NP
group
Tokens
ntags
=
group2
NP
NP
$
group2
NP
VB
$
group2
NP
VB
-- group2 NP IN
-- group2 NP IN
-- group2 IN DT
-- group2 IN DT
...
...
src/Gargantext/Core/Text/Terms/Multi/PosTagging.hs
View file @
e5871a7d
...
@@ -64,10 +64,10 @@ tokens2tokensTags :: [Token] -> [TokenTag]
...
@@ -64,10 +64,10 @@ tokens2tokensTags :: [Token] -> [TokenTag]
tokens2tokensTags
ts
=
filter'
$
map
tokenTag
ts
tokens2tokensTags
ts
=
filter'
$
map
tokenTag
ts
------------------------------------------------------------------------
------------------------------------------------------------------------
tokenTag
::
Token
->
TokenTag
tokenTag
::
Token
->
TokenTag
tokenTag
(
Token
_
_
w
s
_
_
p
n
_
_
)
=
TokenTag
w'
s
'
p
n
tokenTag
(
Token
_
_
w
l
_
_
p
n
_
_
)
=
TokenTag
w'
l
'
p
n
where
where
w'
=
split
w
w'
=
split
w
s'
=
fromList
(
split
s
)
l'
=
fromList
(
split
l
)
split
=
splitOn
(
pack
" "
)
.
toLower
split
=
splitOn
(
pack
" "
)
.
toLower
filter'
::
[
TokenTag
]
->
[
TokenTag
]
filter'
::
[
TokenTag
]
->
[
TokenTag
]
...
@@ -76,7 +76,7 @@ filter' xs = filter isNgrams xs
...
@@ -76,7 +76,7 @@ filter' xs = filter isNgrams xs
isNgrams
(
TokenTag
_
_
p
n
)
=
isJust
p
||
isJust
n
isNgrams
(
TokenTag
_
_
p
n
)
=
isJust
p
||
isJust
n
------------------------------------------------------------------------
------------------------------------------------------------------------
data
Sentence
=
Sentence
{
_sentenceIndex
::
Int
data
Sentence
=
Sentence
{
_sentenceIndex
::
Int
,
_sentenceTokens
::
[
Token
]
,
_sentenceTokens
::
[
Token
]
}
deriving
(
Show
,
Generic
)
}
deriving
(
Show
,
Generic
)
...
@@ -110,8 +110,9 @@ $(deriveJSON (unPrefix "_") ''PosSentences)
...
@@ -110,8 +110,9 @@ $(deriveJSON (unPrefix "_") ''PosSentences)
corenlp'
::
(
FromJSON
a
corenlp'
::
(
FromJSON
a
,
ConvertibleStrings
p
ByteString
)
=>
,
ConvertibleStrings
p
ByteString
Lang
->
p
->
IO
(
Response
a
)
)
=>
Lang
->
p
->
IO
(
Response
a
)
corenlp'
lang
txt
=
do
corenlp'
lang
txt
=
do
let
properties
=
case
lang
of
let
properties
=
case
lang
of
EN
->
"{
\"
annotators
\"
:
\"
tokenize,ssplit,pos,ner
\"
,
\"
outputFormat
\"
:
\"
json
\"
}"
EN
->
"{
\"
annotators
\"
:
\"
tokenize,ssplit,pos,ner
\"
,
\"
outputFormat
\"
:
\"
json
\"
}"
...
@@ -142,9 +143,9 @@ corenlp lang txt = do
...
@@ -142,9 +143,9 @@ corenlp lang txt = do
-- parseWith _tokenNer "Hello world of Peter."
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith
::
(
Token
->
t
)
->
Lang
->
Text
->
IO
[[(
Text
,
t
)]]
tokenWith
::
(
Token
->
t
)
->
Lang
->
Text
->
IO
[[(
Text
,
t
)]]
tokenWith
f
lang
s
=
map
(
map
(
\
t
->
(
_tokenWord
t
,
f
t
)))
tokenWith
f
lang
s
=
map
(
map
(
\
t
->
(
_tokenWord
t
,
f
t
)))
<$>
map
_sentenceTokens
<$>
map
_sentenceTokens
<$>
_sentences
<$>
_sentences
<$>
corenlp
lang
s
<$>
corenlp
lang
s
src/Gargantext/Core/Types.hs
View file @
e5871a7d
...
@@ -109,10 +109,10 @@ instance FromJSON NER where
...
@@ -109,10 +109,10 @@ instance FromJSON NER where
instance
ToJSON
NER
instance
ToJSON
NER
data
TokenTag
=
TokenTag
{
_my_token_word
::
[
Text
]
data
TokenTag
=
TokenTag
{
_my_token_word
::
[
Text
]
,
_my_token_
stem
::
Set
Text
,
_my_token_
lemma
::
Set
Text
,
_my_token_pos
::
Maybe
POS
,
_my_token_pos
::
Maybe
POS
,
_my_token_ner
::
Maybe
NER
,
_my_token_ner
::
Maybe
NER
}
deriving
(
Show
)
}
deriving
(
Show
)
instance
Semigroup
TokenTag
where
instance
Semigroup
TokenTag
where
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment