Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
157
Issues
157
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
a5bd188c
Commit
a5bd188c
authored
Jun 21, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[NGRAMS] Unsupervised extraction OK.
parent
3706928e
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
48 additions
and
26 deletions
+48
-26
Flow.hs
src/Gargantext/Database/Flow.hs
+0
-2
Terms.hs
src/Gargantext/Text/Terms.hs
+35
-12
Eleve.hs
src/Gargantext/Text/Terms/Eleve.hs
+12
-9
Mono.hs
src/Gargantext/Text/Terms/Mono.hs
+1
-3
No files found.
src/Gargantext/Database/Flow.hs
View file @
a5bd188c
...
...
@@ -302,7 +302,6 @@ instance ExtractNgramsT HyperdataContact
instance
ExtractNgramsT
HyperdataDocument
where
extractNgramsT
=
extractNgramsT'
...
...
@@ -362,7 +361,6 @@ documentIdWithNgrams f = mapM toDocumentIdWithNgrams
pure
$
DocumentIdWithNgrams
d
e
-- FLOW LIST
-- | TODO check optimization
mapNodeIdNgrams
::
[
DocumentIdWithNgrams
a
]
...
...
src/Gargantext/Text/Terms.hs
View file @
a5bd188c
...
...
@@ -44,18 +44,23 @@ import Gargantext.Core
import
Gargantext.Core.Types
import
Gargantext.Text.Terms.Multi
(
multiterms
)
import
Gargantext.Text.Terms.Mono
(
monoTerms
)
import
Gargantext.Text.Terms.Mono.Stem
(
stem
)
import
qualified
Data.Set
as
Set
import
qualified
Data.List
as
List
import
qualified
Data.Text
as
Text
import
Gargantext.Text
(
sentences
)
import
Gargantext.Text.Terms.Mono.Token.En
(
tokenize
)
import
Gargantext.Text.Terms.Eleve
(
mainEleve
)
import
Gargantext.Text.Terms.Eleve
(
mainEleve
With
,
Tries
,
Token
,
buildTries
,
toToken
)
data
TermType
lang
=
Mono
{
_tt_lang
::
lang
}
|
Multi
{
_tt_lang
::
lang
}
|
MonoMulti
{
_tt_lang
::
lang
}
|
Unsupervised
{
_tt_lang
::
lang
,
_tt_size
::
Int
,
_tt_model
::
Maybe
(
Tries
Token
()
)
}
makeLenses
''
T
ermType
--group :: [Text] -> [Text]
...
...
@@ -67,7 +72,10 @@ makeLenses ''TermType
-- | Sugar to extract terms from text (hiddeng mapM from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms
::
TermType
Lang
->
[
Text
]
->
IO
[[
Terms
]]
extractTerms
termTypeLang
=
mapM
(
terms
termTypeLang
)
extractTerms
(
Unsupervised
l
n
m
)
xs
=
mapM
(
terms
(
Unsupervised
l
n
m'
))
xs
where
m'
=
maybe
(
Just
$
newTries
n
(
Text
.
intercalate
" "
xs
))
Just
m
extractTerms
termTypeLang
xs
=
mapM
(
terms
termTypeLang
)
xs
------------------------------------------------------------------------
-- | Terms from Text
-- Mono : mono terms
...
...
@@ -78,9 +86,16 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms
(
Mono
lang
)
txt
=
pure
$
monoTerms
lang
txt
terms
(
Multi
lang
)
txt
=
multiterms
lang
txt
terms
(
MonoMulti
lang
)
txt
=
terms
(
Multi
lang
)
txt
terms
(
Unsupervised
lang
n
m
)
txt
=
termsUnsupervised
m'
n
lang
txt
where
m'
=
maybe
(
newTries
n
txt
)
identity
m
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
text2term
::
Lang
->
[
Text
]
->
Terms
text2term
_
[]
=
Terms
[]
Set
.
empty
text2term
lang
txt
=
Terms
txt
(
Set
.
fromList
$
map
(
stem
lang
)
txt
)
isPunctuation
::
Text
->
Bool
isPunctuation
x
=
List
.
elem
x
$
(
Text
.
pack
.
pure
)
<$>
(
"!?(),;."
::
String
)
...
...
@@ -89,14 +104,22 @@ isPunctuation x = List.elem x $ (Text.pack . pure)
-- language agnostic extraction
-- TODO: remove IO
-- TODO: newtype BlockText
extractTermsUnsupervised
::
Int
->
Text
->
[[
Text
]]
extractTermsUnsupervised
n
=
List
.
nub
.
(
List
.
filter
(
\
l
->
List
.
length
l
>
1
))
termsUnsupervised
::
Tries
Token
()
->
Int
->
Lang
->
Text
->
IO
[
Terms
]
termsUnsupervised
m
n
l
=
pure
.
map
(
text2term
l
)
.
List
.
nub
.
(
List
.
filter
(
\
l'
->
List
.
length
l'
>
1
))
.
List
.
concat
.
mainEleve
n
.
map
(
map
Text
.
toLower
)
.
map
(
List
.
filter
(
not
.
isPunctuation
))
.
mainEleveWith
m
n
.
uniText
newTries
::
Int
->
Text
->
Tries
Token
()
newTries
n
t
=
buildTries
n
(
fmap
toToken
$
uniText
t
)
uniText
::
Text
->
[[
Text
]]
uniText
=
map
(
List
.
filter
(
not
.
isPunctuation
))
.
map
tokenize
.
sentences
.
sentences
-- | TODO get sentences according to lang
src/Gargantext/Text/Terms/Eleve.hs
View file @
a5bd188c
...
...
@@ -367,16 +367,19 @@ split inE t0 ts =
------------------------------------------------------------------------
mainEleve
::
Int
->
[[
Text
]]
->
[[[
Text
]]]
mainEleve
n
i
nput
=
split
n
info_autonomy
(
t
::
Tries
Token
(
I
Double
))
<$>
inp
mainEleve
n
i
=
mainEleveWith
m
n
i
where
inp
=
toToken
<$>
input
t
=
normalizeEntropy
info_entropy_var
set_autonomy
.
evTrie
identity
set_entropy_var
.
entropyTrie
isTerminal
$
buildTries
n
inp
m
=
buildTries
n
(
fmap
toToken
i
)
mainEleveWith
::
Tries
Token
()
->
Int
->
[[
Text
]]
->
[[[
Text
]]]
mainEleveWith
m
n
i
=
fmap
(
split
n
info_autonomy
t
)
(
fmap
toToken
i
)
where
t
::
Tries
Token
(
I
Double
)
t
=
normalizeEntropy
info_entropy_var
set_autonomy
$
evTrie
identity
set_entropy_var
$
entropyTrie
isTerminal
m
---------------------------------------------
---------------------------------------------
---------------------------
type
Checks
e
=
[(
Text
,
Int
,
e
,
e
,
e
,
e
,
e
,
e
,
e
,
e
,
e
)]
...
...
@@ -484,8 +487,8 @@ checks2 = []
]
-}
runTests
::
IO
()
runTests
=
runTests
Eleve
::
IO
()
runTests
Eleve
=
forM_
[(
"example0"
,
3
,
example0
,
checks0
)
,(
"example0"
,
2
,
example0
,
[]
)
...
...
src/Gargantext/Text/Terms/Mono.hs
View file @
a5bd188c
...
...
@@ -47,6 +47,7 @@ monoTerms l txt = map (monoText2term l) $ monoTexts txt
monoTexts
::
Text
->
[
Text
]
monoTexts
=
L
.
concat
.
monoTextsBySentence
-- | TODO use text2term only
monoText2term
::
Lang
->
Text
->
Terms
monoText2term
lang
txt
=
Terms
[
txt
]
(
S
.
singleton
$
stem
lang
txt
)
...
...
@@ -54,6 +55,3 @@ monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence
=
map
T
.
words
.
T
.
split
isSep
.
T
.
toLower
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment