Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
161
Issues
161
List
Board
Labels
Milestones
Merge Requests
8
Merge Requests
8
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
d2e8a845
Commit
d2e8a845
authored
Mar 20, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FLOW/FIX] extraction of ngrams ok for big corpora.
parent
38ebe154
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
66 additions
and
64 deletions
+66
-64
Main.hs
bin/gargantext-import/Main.hs
+6
-4
Flow.hs
src/Gargantext/Database/Flow.hs
+59
-60
GrandDebat.hs
src/Gargantext/Text/Parsers/GrandDebat.hs
+1
-0
No files found.
bin/gargantext-import/Main.hs
View file @
d2e8a845
...
...
@@ -49,9 +49,12 @@ main = do
--}
let
cmdCorpus
::
forall
m
.
FlowCmdM
DevEnv
ServantErr
m
=>
m
CorpusId
cmdCorpus
=
do
docs
<-
liftIO
(
splitEvery
500
<$>
readFile
corpusPath
::
IO
[[
GrandDebatReference
]])
ids
<-
flowCorpus
(
Text
.
pack
user
)
(
Text
.
pack
name
)
(
Mono
FR
)
docs
pure
ids
docs
<-
liftIO
(
splitEvery
500
<$>
take
10000
<$>
readFile
corpusPath
::
IO
[[
GrandDebatReference
]]
)
flowCorpus
(
Text
.
pack
user
)
(
Text
.
pack
name
)
(
Multi
FR
)
docs
-- cmd = {-createUsers >>-} cmdCorpus
...
...
@@ -63,4 +66,3 @@ main = do
_
<-
runCmdDev
env
cmdCorpus
pure
()
src/Gargantext/Database/Flow.hs
View file @
d2e8a845
...
...
@@ -32,6 +32,7 @@ module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
--import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
--import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
--import Gargantext.Text.Metrics.TFICF (Tficf(..))
--import Debug.Trace (trace)
import
Control.Monad
(
mapM_
)
import
Control.Monad.IO.Class
(
liftIO
)
import
Data.List
(
concat
)
...
...
@@ -58,7 +59,7 @@ import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), N
import
Gargantext.Database.Utils
(
Cmd
,
CmdM
)
import
Gargantext.Ext.IMT
(
toSchoolName
)
import
Gargantext.Prelude
--import Gargantext.Text.List
import
Gargantext.Text.List
(
buildNgramsLists
)
--import Gargantext.Text.Parsers (parseDocs, FileFormat)
import
Gargantext.Text.Terms
(
TermType
(
..
))
import
Gargantext.Text.Terms
(
extractTerms
)
...
...
@@ -89,8 +90,8 @@ flowCorpus''' u cn la docs = mapM (\doc -> flowCorpus' u cn la doc) docs
flowCorpus
::
(
FlowCmdM
env
ServantErr
m
,
ToHyperdataDocument
a
)
=>
Username
->
CorpusName
->
TermType
Lang
->
[[
a
]]
->
m
CorpusId
flowCorpus
u
cn
la
docs
=
do
ids
<-
mapM
(
\
doc
->
flowCorpusMaster
la
(
map
toHyperdataDocument
doc
))
docs
flowCorpusUser
u
cn
$
concat
ids
ids
<-
mapM
(
(
insertMasterDocs
la
)
.
(
map
toHyperdataDocument
))
docs
flowCorpusUser
u
cn
(
concat
ids
)
-- TODO query with complex query
...
...
@@ -102,23 +103,19 @@ flowCorpusSearchInDatabase u q = do
flowCorpusUser
u
q
ids
-- TODO uniformize language of corpus
flowCorpusMaster
::
FlowCmdM
env
ServantErr
m
=>
TermType
Lang
->
[
HyperdataDocument
]
->
m
[
NodeId
]
flowCorpusMaster
la
hd
=
insertMasterDocs
la
hd
flowCorpusUser
::
FlowCmdM
env
ServantErr
m
=>
Username
->
CorpusName
->
[
NodeId
]
->
m
CorpusId
flowCorpusUser
::
FlowCmdM
env
ServantErr
m
=>
Username
->
CorpusName
->
[
NodeId
]
->
m
CorpusId
flowCorpusUser
userName
corpusName
ids
=
do
-- User Flow
(
_
userId
,
_rootId
,
userCorpusId
)
<-
getOrMkRootWithCorpus
userName
corpusName
(
userId
,
_rootId
,
userCorpusId
)
<-
getOrMkRootWithCorpus
userName
corpusName
-- TODO: check if present already, ignore
_
<-
Doc
.
add
userCorpusId
ids
-- User List Flow
--
(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
--
ngs <- buildNgramsLists userCorpusId masterCorpusId
--
userListId <- flowList userId userCorpusId ngs
--
printDebug "userListId" userListId
(
_masterUserId
,
_masterRootId
,
masterCorpusId
)
<-
getOrMkRootWithCorpus
userMaster
""
ngs
<-
buildNgramsLists
userCorpusId
masterCorpusId
userListId
<-
flowList
userId
userCorpusId
ngs
printDebug
"userListId"
userListId
-- User Graph Flow
--
_
<-
mkGraph
userCorpusId
userId
...
...
@@ -134,20 +131,23 @@ flowCorpusUser userName corpusName ids = do
insertMasterDocs
::
FlowCmdM
env
ServantErr
m
=>
TermType
Lang
->
[
HyperdataDocument
]
->
m
[
DocId
]
insertMasterDocs
lang
hs
=
do
let
hs'
=
map
addUniqIdsDoc
hs
let
hyperdataDocuments'
=
map
(
\
h
->
ToDbDocument
h
)
hs'
(
masterUserId
,
_
,
masterCorpusId
)
<-
getOrMkRootWithCorpus
userMaster
corpusMasterName
ids
<-
insertDocuments
masterUserId
masterCorpusId
NodeDocument
hyperdataDocuments'
let
documentsWithId
=
mergeData
(
toInserted
ids
)
(
toInsert
hs
)
-- TODO Type NodeDocumentUnicised
let
hs'
=
map
addUniqIdsDoc
hs
ids
<-
insertDocuments
masterUserId
masterCorpusId
NodeDocument
$
map
ToDbDocument
hs'
-- ^ TODO Type class to insert Doc
-- ^ TODO Type Class AddUnicity where unicity = addUnicity
let
documentsWithId
=
mergeData
(
toInserted
ids
)
(
toInsert
hs'
)
docsWithNgrams
<-
documentIdWithNgrams
(
extractNgramsT
lang
)
documentsWithId
let
maps
=
mapNodeIdNgrams
docsWithNgrams
terms2id
<-
insertNgrams
$
DM
.
keys
maps
let
indexedNgrams
=
DM
.
mapKeys
(
indexNgrams
terms2id
)
maps
_
<-
insertToNodeNgrams
indexedNgrams
_
<-
insertToNodeNgrams
indexedNgrams
pure
$
map
reId
ids
...
...
@@ -220,51 +220,50 @@ data DocumentIdWithNgrams = DocumentIdWithNgrams
,
document_ngrams
::
!
(
Map
Ngrams
(
Map
NgramsType
Int
))
}
deriving
(
Show
)
-- TODO extractNgrams according to Type of Data
extractNgramsT
::
HasNodeError
err
=>
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT
lang
hd
=
filterNgramsT
255
<$>
extractNgramsT'
lang
hd
extractNgramsT'
::
HasNodeError
err
=>
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT'
lang
doc
=
do
let
source
=
text2ngrams
$
maybe
"Nothing"
identity
$
_hyperdataDocument_source
doc
institutes
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
map
toSchoolName
.
(
splitOn
", "
))
$
_hyperdataDocument_institutes
doc
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
splitOn
", "
)
$
_hyperdataDocument_authors
doc
leText
=
catMaybes
[
_hyperdataDocument_title
doc
,
_hyperdataDocument_abstract
doc
]
terms'
<-
map
text2ngrams
<$>
map
(
intercalate
" "
.
_terms_label
)
<$>
concat
<$>
liftIO
(
extractTerms
lang
leText
)
pure
$
DM
.
fromList
$
[(
source
,
DM
.
singleton
Sources
1
)]
<>
[(
i'
,
DM
.
singleton
Institutes
1
)
|
i'
<-
institutes
]
<>
[(
a'
,
DM
.
singleton
Authors
1
)
|
a'
<-
authors
]
<>
[(
t'
,
DM
.
singleton
NgramsTerms
1
)
|
t'
<-
terms'
]
filterNgramsT
::
Int
->
Map
Ngrams
(
Map
NgramsType
Int
)
->
Map
Ngrams
(
Map
NgramsType
Int
)
filterNgramsT
s
ms
=
DM
.
fromList
$
map
(
\
a
->
filter'
s
a
)
$
DM
.
toList
ms
where
filter'
s'
(
ng
@
(
Ngrams
t
n
),
y
)
=
case
(
Text
.
length
t
)
<
s'
of
True
->
(
ng
,
y
)
False
->
(
Ngrams
(
Text
.
take
s'
t
)
n
,
y
)
extractNgramsT'
::
HasNodeError
err
=>
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT'
lang'
doc
=
do
let
source
=
text2ngrams
$
maybe
"Nothing"
identity
$
_hyperdataDocument_source
doc
institutes
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
map
toSchoolName
.
(
splitOn
", "
))
$
_hyperdataDocument_institutes
doc
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
splitOn
", "
)
$
_hyperdataDocument_authors
doc
leText
=
catMaybes
[
_hyperdataDocument_title
doc
,
_hyperdataDocument_abstract
doc
]
terms'
<-
map
text2ngrams
<$>
map
(
intercalate
" "
.
_terms_label
)
<$>
concat
<$>
liftIO
(
extractTerms
lang'
leText
)
pure
$
DM
.
fromList
$
[(
source
,
DM
.
singleton
Sources
1
)]
<>
[(
i'
,
DM
.
singleton
Institutes
1
)
|
i'
<-
institutes
]
<>
[(
a'
,
DM
.
singleton
Authors
1
)
|
a'
<-
authors
]
<>
[(
t'
,
DM
.
singleton
NgramsTerms
1
)
|
t'
<-
terms'
]
filterNgramsT
::
Int
->
Map
Ngrams
(
Map
NgramsType
Int
)
->
Map
Ngrams
(
Map
NgramsType
Int
)
filterNgramsT
s
ms
=
DM
.
fromList
$
map
(
\
a
->
filter'
s
a
)
$
DM
.
toList
ms
where
filter'
s'
(
ng
@
(
Ngrams
t
n
),
y
)
=
case
(
Text
.
length
t
)
<
s'
of
True
->
(
ng
,
y
)
False
->
(
Ngrams
(
Text
.
take
s'
t
)
n
,
y
)
documentIdWithNgrams
::
HasNodeError
err
...
...
src/Gargantext/Text/Parsers/GrandDebat.hs
View file @
d2e8a845
...
...
@@ -97,6 +97,7 @@ class ReadFile a
instance
ReadFile
[
GrandDebatReference
]
where
-- | read json: 3 version below are working but with increased optimization
--readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
--readFile fp = either (panic . Text.pack) identity <$> P.eitherDecode <$> DBL.readFile fp
readFile
fp
=
P
.
parseLazyByteString
(
P
.
arrayOf
P
.
value
)
<$>
DBL
.
readFile
fp
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment