Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
150
Issues
150
List
Board
Labels
Milestones
Merge Requests
4
Merge Requests
4
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
16f3bbd8
Commit
16f3bbd8
authored
Jun 21, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[WORKFLOW] Unsupervised ngrams extraction implemented.
parent
a5bd188c
Pipeline
#485
failed with stage
Changes
2
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
73 additions
and
47 deletions
+73
-47
Flow.hs
src/Gargantext/Database/Flow.hs
+64
-45
Terms.hs
src/Gargantext/Text/Terms.hs
+9
-2
No files found.
src/Gargantext/Database/Flow.hs
View file @
16f3bbd8
...
...
@@ -20,10 +20,12 @@ Portability : POSIX
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE
FlexibleContexts
#-}
{-# LANGUAGE
ConstrainedClassMethods
#-}
module
Gargantext.Database.Flow
-- (flowDatabase, ngrams2list)
where
...
...
@@ -58,14 +60,14 @@ import Gargantext.Database.Utils (Cmd, CmdM)
import
Gargantext.Ext.IMT
(
toSchoolName
)
import
Gargantext.Ext.IMTUser
(
deserialiseImtUsersFromFile
)
import
Gargantext.Prelude
import
Gargantext.Text.Terms.Eleve
(
buildTries
,
toToken
)
import
Gargantext.Text.List
(
buildNgramsLists
,
StopSize
(
..
))
import
Gargantext.Text.Parsers
(
parseFile
,
FileFormat
)
import
Gargantext.Text.Terms
(
TermType
(
..
),
tt_lang
)
import
Gargantext.Text.Terms
(
extractTerms
)
import
Gargantext.Text.Terms
(
TermType
(
..
),
tt_lang
,
extractTerms
,
uniText
)
import
Gargantext.Text.Terms.Mono.Stem.En
(
stemIt
)
import
Servant
(
ServantErr
)
import
System.FilePath
(
FilePath
)
--
import qualified Data.List as List
import
qualified
Data.List
as
List
import
qualified
Data.Map
as
Map
import
qualified
Data.Text
as
Text
import
qualified
Gargantext.Database.Node.Document.Add
as
Doc
(
add
)
...
...
@@ -82,6 +84,7 @@ type FlowCorpus a = ( AddUniqId a
,
UniqId
a
,
InsertDb
a
,
ExtractNgramsT
a
,
HasText
a
)
------------------------------------------------------------------------
...
...
@@ -186,8 +189,17 @@ insertMasterDocs c lang hs = do
ids
<-
insertDb
masterUserId
masterCorpusId
hs'
let
documentsWithId
=
mergeData
(
toInserted
ids
)
(
Map
.
fromList
$
map
viewUniqId'
hs'
)
let
fixLang
(
Unsupervised
l
n
m
)
=
Unsupervised
l
n
m'
where
m'
=
case
m
of
Nothing
->
Just
$
buildTries
n
(
fmap
toToken
$
uniText
$
Text
.
intercalate
" "
$
List
.
concat
$
map
hasText
documentsWithId
)
m''
->
m''
fixLang
l
=
l
lang'
=
fixLang
lang
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
maps
<-
mapNodeIdNgrams
<$>
documentIdWithNgrams
(
extractNgramsT
lang
)
documentsWithId
maps
<-
mapNodeIdNgrams
<$>
documentIdWithNgrams
(
extractNgramsT
lang
'
)
documentsWithId
terms2id
<-
insertNgrams
$
Map
.
keys
maps
let
indexedNgrams
=
Map
.
mapKeys
(
indexNgrams
terms2id
)
maps
...
...
@@ -265,6 +277,10 @@ data DocumentWithId a = DocumentWithId
,
documentData
::
!
a
}
deriving
(
Show
)
instance
HasText
a
=>
HasText
(
DocumentWithId
a
)
where
hasText
(
DocumentWithId
_
a
)
=
hasText
a
mergeData
::
Map
HashId
ReturnId
->
Map
HashId
a
->
[
DocumentWithId
a
]
...
...
@@ -280,12 +296,18 @@ data DocumentIdWithNgrams a = DocumentIdWithNgrams
,
document_ngrams
::
!
(
Map
Ngrams
(
Map
NgramsType
Int
))
}
deriving
(
Show
)
-- TODO extractNgrams according to Type of Data
class
ExtractNgramsT
h
where
extractNgramsT
::
TermType
Lang
->
h
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT
::
HasText
h
=>
TermType
Lang
->
h
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
class
HasText
h
where
hasText
::
h
->
[
Text
]
instance
HasText
HyperdataContact
where
hasText
=
undefined
instance
ExtractNgramsT
HyperdataContact
where
...
...
@@ -300,19 +322,20 @@ instance ExtractNgramsT HyperdataContact
pure
$
Map
.
fromList
$
[(
a'
,
Map
.
singleton
Authors
1
)
|
a'
<-
authors
]
instance
HasText
HyperdataDocument
where
hasText
h
=
catMaybes
[
_hyperdataDocument_title
h
,
_hyperdataDocument_abstract
h
]
instance
ExtractNgramsT
HyperdataDocument
where
extractNgramsT
=
extractNgramsT'
extractNgramsT'
::
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT'
lang
hd
=
filterNgramsT
255
<$>
extractNgramsT''
lang
hd
extractNgramsT
::
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT
lang
hd
=
filterNgramsT
255
<$>
extractNgramsT'
lang
hd
where
extractNgramsT'
'
::
TermType
Lang
->
HyperdataDocument
extractNgramsT
'
::
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT'
'
lang'
doc
=
do
extractNgramsT
'
lang'
doc
=
do
let
source
=
text2ngrams
$
maybe
"Nothing"
identity
$
_hyperdataDocument_source
doc
...
...
@@ -325,14 +348,10 @@ extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
$
maybe
[
"Nothing"
]
(
splitOn
", "
)
$
_hyperdataDocument_authors
doc
leText
=
catMaybes
[
_hyperdataDocument_title
doc
,
_hyperdataDocument_abstract
doc
]
terms'
<-
map
text2ngrams
<$>
map
(
intercalate
" "
.
_terms_label
)
<$>
concat
<$>
liftIO
(
extractTerms
lang'
leText
)
<$>
liftIO
(
extractTerms
lang'
$
hasText
doc
)
pure
$
Map
.
fromList
$
[(
source
,
Map
.
singleton
Sources
1
)]
<>
[(
i'
,
Map
.
singleton
Institutes
1
)
|
i'
<-
institutes
]
...
...
src/Gargantext/Text/Terms.hs
View file @
16f3bbd8
...
...
@@ -72,10 +72,17 @@ makeLenses ''TermType
-- | Sugar to extract terms from text (hiddeng mapM from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms
::
TermType
Lang
->
[
Text
]
->
IO
[[
Terms
]]
extractTerms
(
Unsupervised
l
n
m
)
xs
=
mapM
(
terms
(
Unsupervised
l
n
m'
))
xs
extractTerms
(
Unsupervised
l
n
m
)
xs
=
mapM
(
terms
(
Unsupervised
l
n
(
Just
m'
)))
xs
where
m'
=
maybe
(
Just
$
newTries
n
(
Text
.
intercalate
" "
xs
))
Just
m
m'
=
case
m
of
Just
m''
->
m''
Nothing
->
newTries
n
(
Text
.
intercalate
" "
xs
)
extractTerms
termTypeLang
xs
=
mapM
(
terms
termTypeLang
)
xs
------------------------------------------------------------------------
-- | Terms from Text
-- Mono : mono terms
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment