Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
195
Issues
195
List
Board
Labels
Milestones
Merge Requests
12
Merge Requests
12
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
a5bd188c
Commit
a5bd188c
authored
Jun 21, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[NGRAMS] Unsupervised extraction OK.
parent
3706928e
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
48 additions
and
26 deletions
+48
-26
Flow.hs
src/Gargantext/Database/Flow.hs
+0
-2
Terms.hs
src/Gargantext/Text/Terms.hs
+35
-12
Eleve.hs
src/Gargantext/Text/Terms/Eleve.hs
+12
-9
Mono.hs
src/Gargantext/Text/Terms/Mono.hs
+1
-3
No files found.
src/Gargantext/Database/Flow.hs
View file @
a5bd188c
...
@@ -302,7 +302,6 @@ instance ExtractNgramsT HyperdataContact
...
@@ -302,7 +302,6 @@ instance ExtractNgramsT HyperdataContact
instance
ExtractNgramsT
HyperdataDocument
instance
ExtractNgramsT
HyperdataDocument
where
where
extractNgramsT
=
extractNgramsT'
extractNgramsT
=
extractNgramsT'
...
@@ -362,7 +361,6 @@ documentIdWithNgrams f = mapM toDocumentIdWithNgrams
...
@@ -362,7 +361,6 @@ documentIdWithNgrams f = mapM toDocumentIdWithNgrams
pure
$
DocumentIdWithNgrams
d
e
pure
$
DocumentIdWithNgrams
d
e
-- FLOW LIST
-- FLOW LIST
-- | TODO check optimization
-- | TODO check optimization
mapNodeIdNgrams
::
[
DocumentIdWithNgrams
a
]
mapNodeIdNgrams
::
[
DocumentIdWithNgrams
a
]
...
...
src/Gargantext/Text/Terms.hs
View file @
a5bd188c
...
@@ -44,18 +44,23 @@ import Gargantext.Core
...
@@ -44,18 +44,23 @@ import Gargantext.Core
import
Gargantext.Core.Types
import
Gargantext.Core.Types
import
Gargantext.Text.Terms.Multi
(
multiterms
)
import
Gargantext.Text.Terms.Multi
(
multiterms
)
import
Gargantext.Text.Terms.Mono
(
monoTerms
)
import
Gargantext.Text.Terms.Mono
(
monoTerms
)
import
Gargantext.Text.Terms.Mono.Stem
(
stem
)
import
qualified
Data.Set
as
Set
import
qualified
Data.List
as
List
import
qualified
Data.List
as
List
import
qualified
Data.Text
as
Text
import
qualified
Data.Text
as
Text
import
Gargantext.Text
(
sentences
)
import
Gargantext.Text
(
sentences
)
import
Gargantext.Text.Terms.Mono.Token.En
(
tokenize
)
import
Gargantext.Text.Terms.Mono.Token.En
(
tokenize
)
import
Gargantext.Text.Terms.Eleve
(
mainEleve
)
import
Gargantext.Text.Terms.Eleve
(
mainEleve
With
,
Tries
,
Token
,
buildTries
,
toToken
)
data
TermType
lang
data
TermType
lang
=
Mono
{
_tt_lang
::
lang
}
=
Mono
{
_tt_lang
::
lang
}
|
Multi
{
_tt_lang
::
lang
}
|
Multi
{
_tt_lang
::
lang
}
|
MonoMulti
{
_tt_lang
::
lang
}
|
MonoMulti
{
_tt_lang
::
lang
}
|
Unsupervised
{
_tt_lang
::
lang
,
_tt_size
::
Int
,
_tt_model
::
Maybe
(
Tries
Token
()
)
}
makeLenses
''
T
ermType
makeLenses
''
T
ermType
--group :: [Text] -> [Text]
--group :: [Text] -> [Text]
...
@@ -67,7 +72,10 @@ makeLenses ''TermType
...
@@ -67,7 +72,10 @@ makeLenses ''TermType
-- | Sugar to extract terms from text (hiddeng mapM from end user).
-- | Sugar to extract terms from text (hiddeng mapM from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms
::
TermType
Lang
->
[
Text
]
->
IO
[[
Terms
]]
extractTerms
::
TermType
Lang
->
[
Text
]
->
IO
[[
Terms
]]
extractTerms
termTypeLang
=
mapM
(
terms
termTypeLang
)
extractTerms
(
Unsupervised
l
n
m
)
xs
=
mapM
(
terms
(
Unsupervised
l
n
m'
))
xs
where
m'
=
maybe
(
Just
$
newTries
n
(
Text
.
intercalate
" "
xs
))
Just
m
extractTerms
termTypeLang
xs
=
mapM
(
terms
termTypeLang
)
xs
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Terms from Text
-- | Terms from Text
-- Mono : mono terms
-- Mono : mono terms
...
@@ -78,9 +86,16 @@ terms :: TermType Lang -> Text -> IO [Terms]
...
@@ -78,9 +86,16 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms
(
Mono
lang
)
txt
=
pure
$
monoTerms
lang
txt
terms
(
Mono
lang
)
txt
=
pure
$
monoTerms
lang
txt
terms
(
Multi
lang
)
txt
=
multiterms
lang
txt
terms
(
Multi
lang
)
txt
=
multiterms
lang
txt
terms
(
MonoMulti
lang
)
txt
=
terms
(
Multi
lang
)
txt
terms
(
MonoMulti
lang
)
txt
=
terms
(
Multi
lang
)
txt
terms
(
Unsupervised
lang
n
m
)
txt
=
termsUnsupervised
m'
n
lang
txt
where
m'
=
maybe
(
newTries
n
txt
)
identity
m
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
------------------------------------------------------------------------
text2term
::
Lang
->
[
Text
]
->
Terms
text2term
_
[]
=
Terms
[]
Set
.
empty
text2term
lang
txt
=
Terms
txt
(
Set
.
fromList
$
map
(
stem
lang
)
txt
)
isPunctuation
::
Text
->
Bool
isPunctuation
::
Text
->
Bool
isPunctuation
x
=
List
.
elem
x
$
(
Text
.
pack
.
pure
)
isPunctuation
x
=
List
.
elem
x
$
(
Text
.
pack
.
pure
)
<$>
(
"!?(),;."
::
String
)
<$>
(
"!?(),;."
::
String
)
...
@@ -89,14 +104,22 @@ isPunctuation x = List.elem x $ (Text.pack . pure)
...
@@ -89,14 +104,22 @@ isPunctuation x = List.elem x $ (Text.pack . pure)
-- language agnostic extraction
-- language agnostic extraction
-- TODO: remove IO
-- TODO: remove IO
-- TODO: newtype BlockText
-- TODO: newtype BlockText
extractTermsUnsupervised
::
Int
->
Text
->
[[
Text
]]
termsUnsupervised
::
Tries
Token
()
->
Int
->
Lang
->
Text
->
IO
[
Terms
]
extractTermsUnsupervised
n
=
termsUnsupervised
m
n
l
=
List
.
nub
pure
.
(
List
.
filter
(
\
l
->
List
.
length
l
>
1
))
.
map
(
text2term
l
)
.
List
.
nub
.
(
List
.
filter
(
\
l'
->
List
.
length
l'
>
1
))
.
List
.
concat
.
List
.
concat
.
mainEleve
n
.
mainEleveWith
m
n
.
map
(
map
Text
.
toLower
)
.
uniText
.
map
(
List
.
filter
(
not
.
isPunctuation
))
.
map
tokenize
newTries
::
Int
->
Text
->
Tries
Token
()
.
sentences
newTries
n
t
=
buildTries
n
(
fmap
toToken
$
uniText
t
)
uniText
::
Text
->
[[
Text
]]
uniText
=
map
(
List
.
filter
(
not
.
isPunctuation
))
.
map
tokenize
.
sentences
-- | TODO get sentences according to lang
src/Gargantext/Text/Terms/Eleve.hs
View file @
a5bd188c
...
@@ -367,16 +367,19 @@ split inE t0 ts =
...
@@ -367,16 +367,19 @@ split inE t0 ts =
------------------------------------------------------------------------
------------------------------------------------------------------------
mainEleve
::
Int
->
[[
Text
]]
->
[[[
Text
]]]
mainEleve
::
Int
->
[[
Text
]]
->
[[[
Text
]]]
mainEleve
n
i
nput
=
split
n
info_autonomy
(
t
::
Tries
Token
(
I
Double
))
<$>
inp
mainEleve
n
i
=
mainEleveWith
m
n
i
where
where
inp
=
toToken
<$>
input
m
=
buildTries
n
(
fmap
toToken
i
)
t
=
normalizeEntropy
info_entropy_var
set_autonomy
.
evTrie
identity
set_entropy_var
.
entropyTrie
isTerminal
$
buildTries
n
inp
mainEleveWith
::
Tries
Token
()
->
Int
->
[[
Text
]]
->
[[[
Text
]]]
mainEleveWith
m
n
i
=
fmap
(
split
n
info_autonomy
t
)
(
fmap
toToken
i
)
where
t
::
Tries
Token
(
I
Double
)
t
=
normalizeEntropy
info_entropy_var
set_autonomy
$
evTrie
identity
set_entropy_var
$
entropyTrie
isTerminal
m
---------------------------------------------
---------------------------------------------
---------------------------
type
Checks
e
=
[(
Text
,
Int
,
e
,
e
,
e
,
e
,
e
,
e
,
e
,
e
,
e
)]
type
Checks
e
=
[(
Text
,
Int
,
e
,
e
,
e
,
e
,
e
,
e
,
e
,
e
,
e
)]
...
@@ -484,8 +487,8 @@ checks2 = []
...
@@ -484,8 +487,8 @@ checks2 = []
]
]
-}
-}
runTests
::
IO
()
runTests
Eleve
::
IO
()
runTests
=
runTests
Eleve
=
forM_
forM_
[(
"example0"
,
3
,
example0
,
checks0
)
[(
"example0"
,
3
,
example0
,
checks0
)
,(
"example0"
,
2
,
example0
,
[]
)
,(
"example0"
,
2
,
example0
,
[]
)
...
...
src/Gargantext/Text/Terms/Mono.hs
View file @
a5bd188c
...
@@ -47,6 +47,7 @@ monoTerms l txt = map (monoText2term l) $ monoTexts txt
...
@@ -47,6 +47,7 @@ monoTerms l txt = map (monoText2term l) $ monoTexts txt
monoTexts
::
Text
->
[
Text
]
monoTexts
::
Text
->
[
Text
]
monoTexts
=
L
.
concat
.
monoTextsBySentence
monoTexts
=
L
.
concat
.
monoTextsBySentence
-- | TODO use text2term only
monoText2term
::
Lang
->
Text
->
Terms
monoText2term
::
Lang
->
Text
->
Terms
monoText2term
lang
txt
=
Terms
[
txt
]
(
S
.
singleton
$
stem
lang
txt
)
monoText2term
lang
txt
=
Terms
[
txt
]
(
S
.
singleton
$
stem
lang
txt
)
...
@@ -54,6 +55,3 @@ monoTextsBySentence :: Text -> [[Text]]
...
@@ -54,6 +55,3 @@ monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence
=
map
T
.
words
monoTextsBySentence
=
map
T
.
words
.
T
.
split
isSep
.
T
.
split
isSep
.
T
.
toLower
.
T
.
toLower
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment