Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
199
Issues
199
List
Board
Labels
Milestones
Merge Requests
12
Merge Requests
12
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
d2e8a845
Commit
d2e8a845
authored
Mar 20, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FLOW/FIX] extraction of ngrams ok for big corpora.
parent
38ebe154
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
66 additions
and
64 deletions
+66
-64
Main.hs
bin/gargantext-import/Main.hs
+6
-4
Flow.hs
src/Gargantext/Database/Flow.hs
+59
-60
GrandDebat.hs
src/Gargantext/Text/Parsers/GrandDebat.hs
+1
-0
No files found.
bin/gargantext-import/Main.hs
View file @
d2e8a845
...
@@ -49,9 +49,12 @@ main = do
...
@@ -49,9 +49,12 @@ main = do
--}
--}
let
cmdCorpus
::
forall
m
.
FlowCmdM
DevEnv
ServantErr
m
=>
m
CorpusId
let
cmdCorpus
::
forall
m
.
FlowCmdM
DevEnv
ServantErr
m
=>
m
CorpusId
cmdCorpus
=
do
cmdCorpus
=
do
docs
<-
liftIO
(
splitEvery
500
<$>
readFile
corpusPath
::
IO
[[
GrandDebatReference
]])
docs
<-
liftIO
(
splitEvery
500
ids
<-
flowCorpus
(
Text
.
pack
user
)
(
Text
.
pack
name
)
(
Mono
FR
)
docs
<$>
take
10000
pure
ids
<$>
readFile
corpusPath
::
IO
[[
GrandDebatReference
]]
)
flowCorpus
(
Text
.
pack
user
)
(
Text
.
pack
name
)
(
Multi
FR
)
docs
-- cmd = {-createUsers >>-} cmdCorpus
-- cmd = {-createUsers >>-} cmdCorpus
...
@@ -63,4 +66,3 @@ main = do
...
@@ -63,4 +66,3 @@ main = do
_
<-
runCmdDev
env
cmdCorpus
_
<-
runCmdDev
env
cmdCorpus
pure
()
pure
()
src/Gargantext/Database/Flow.hs
View file @
d2e8a845
...
@@ -32,6 +32,7 @@ module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
...
@@ -32,6 +32,7 @@ module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
--import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
--import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
--import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
--import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
--import Gargantext.Text.Metrics.TFICF (Tficf(..))
--import Gargantext.Text.Metrics.TFICF (Tficf(..))
--import Debug.Trace (trace)
import
Control.Monad
(
mapM_
)
import
Control.Monad
(
mapM_
)
import
Control.Monad.IO.Class
(
liftIO
)
import
Control.Monad.IO.Class
(
liftIO
)
import
Data.List
(
concat
)
import
Data.List
(
concat
)
...
@@ -58,7 +59,7 @@ import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), N
...
@@ -58,7 +59,7 @@ import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), N
import
Gargantext.Database.Utils
(
Cmd
,
CmdM
)
import
Gargantext.Database.Utils
(
Cmd
,
CmdM
)
import
Gargantext.Ext.IMT
(
toSchoolName
)
import
Gargantext.Ext.IMT
(
toSchoolName
)
import
Gargantext.Prelude
import
Gargantext.Prelude
--import Gargantext.Text.List
import
Gargantext.Text.List
(
buildNgramsLists
)
--import Gargantext.Text.Parsers (parseDocs, FileFormat)
--import Gargantext.Text.Parsers (parseDocs, FileFormat)
import
Gargantext.Text.Terms
(
TermType
(
..
))
import
Gargantext.Text.Terms
(
TermType
(
..
))
import
Gargantext.Text.Terms
(
extractTerms
)
import
Gargantext.Text.Terms
(
extractTerms
)
...
@@ -89,8 +90,8 @@ flowCorpus''' u cn la docs = mapM (\doc -> flowCorpus' u cn la doc) docs
...
@@ -89,8 +90,8 @@ flowCorpus''' u cn la docs = mapM (\doc -> flowCorpus' u cn la doc) docs
flowCorpus
::
(
FlowCmdM
env
ServantErr
m
,
ToHyperdataDocument
a
)
flowCorpus
::
(
FlowCmdM
env
ServantErr
m
,
ToHyperdataDocument
a
)
=>
Username
->
CorpusName
->
TermType
Lang
->
[[
a
]]
->
m
CorpusId
=>
Username
->
CorpusName
->
TermType
Lang
->
[[
a
]]
->
m
CorpusId
flowCorpus
u
cn
la
docs
=
do
flowCorpus
u
cn
la
docs
=
do
ids
<-
mapM
(
\
doc
->
flowCorpusMaster
la
(
map
toHyperdataDocument
doc
))
docs
ids
<-
mapM
(
(
insertMasterDocs
la
)
.
(
map
toHyperdataDocument
))
docs
flowCorpusUser
u
cn
$
concat
ids
flowCorpusUser
u
cn
(
concat
ids
)
-- TODO query with complex query
-- TODO query with complex query
...
@@ -102,23 +103,19 @@ flowCorpusSearchInDatabase u q = do
...
@@ -102,23 +103,19 @@ flowCorpusSearchInDatabase u q = do
flowCorpusUser
u
q
ids
flowCorpusUser
u
q
ids
-- TODO uniformize language of corpus
flowCorpusUser
::
FlowCmdM
env
ServantErr
m
flowCorpusMaster
::
FlowCmdM
env
ServantErr
m
=>
TermType
Lang
->
[
HyperdataDocument
]
->
m
[
NodeId
]
=>
Username
->
CorpusName
->
[
NodeId
]
->
m
CorpusId
flowCorpusMaster
la
hd
=
insertMasterDocs
la
hd
flowCorpusUser
::
FlowCmdM
env
ServantErr
m
=>
Username
->
CorpusName
->
[
NodeId
]
->
m
CorpusId
flowCorpusUser
userName
corpusName
ids
=
do
flowCorpusUser
userName
corpusName
ids
=
do
-- User Flow
-- User Flow
(
_
userId
,
_rootId
,
userCorpusId
)
<-
getOrMkRootWithCorpus
userName
corpusName
(
userId
,
_rootId
,
userCorpusId
)
<-
getOrMkRootWithCorpus
userName
corpusName
-- TODO: check if present already, ignore
-- TODO: check if present already, ignore
_
<-
Doc
.
add
userCorpusId
ids
_
<-
Doc
.
add
userCorpusId
ids
-- User List Flow
-- User List Flow
--
(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
(
_masterUserId
,
_masterRootId
,
masterCorpusId
)
<-
getOrMkRootWithCorpus
userMaster
""
--
ngs <- buildNgramsLists userCorpusId masterCorpusId
ngs
<-
buildNgramsLists
userCorpusId
masterCorpusId
--
userListId <- flowList userId userCorpusId ngs
userListId
<-
flowList
userId
userCorpusId
ngs
--
printDebug "userListId" userListId
printDebug
"userListId"
userListId
-- User Graph Flow
-- User Graph Flow
--
_
<-
mkGraph
userCorpusId
userId
--
_
<-
mkGraph
userCorpusId
userId
...
@@ -134,20 +131,23 @@ flowCorpusUser userName corpusName ids = do
...
@@ -134,20 +131,23 @@ flowCorpusUser userName corpusName ids = do
insertMasterDocs
::
FlowCmdM
env
ServantErr
m
insertMasterDocs
::
FlowCmdM
env
ServantErr
m
=>
TermType
Lang
->
[
HyperdataDocument
]
->
m
[
DocId
]
=>
TermType
Lang
->
[
HyperdataDocument
]
->
m
[
DocId
]
insertMasterDocs
lang
hs
=
do
insertMasterDocs
lang
hs
=
do
let
hs'
=
map
addUniqIdsDoc
hs
let
hyperdataDocuments'
=
map
(
\
h
->
ToDbDocument
h
)
hs'
(
masterUserId
,
_
,
masterCorpusId
)
<-
getOrMkRootWithCorpus
userMaster
corpusMasterName
(
masterUserId
,
_
,
masterCorpusId
)
<-
getOrMkRootWithCorpus
userMaster
corpusMasterName
ids
<-
insertDocuments
masterUserId
masterCorpusId
NodeDocument
hyperdataDocuments'
let
documentsWithId
=
mergeData
(
toInserted
ids
)
(
toInsert
hs
)
-- TODO Type NodeDocumentUnicised
let
hs'
=
map
addUniqIdsDoc
hs
ids
<-
insertDocuments
masterUserId
masterCorpusId
NodeDocument
$
map
ToDbDocument
hs'
-- ^ TODO Type class to insert Doc
-- ^ TODO Type Class AddUnicity where unicity = addUnicity
let
documentsWithId
=
mergeData
(
toInserted
ids
)
(
toInsert
hs'
)
docsWithNgrams
<-
documentIdWithNgrams
(
extractNgramsT
lang
)
documentsWithId
docsWithNgrams
<-
documentIdWithNgrams
(
extractNgramsT
lang
)
documentsWithId
let
maps
=
mapNodeIdNgrams
docsWithNgrams
let
maps
=
mapNodeIdNgrams
docsWithNgrams
terms2id
<-
insertNgrams
$
DM
.
keys
maps
terms2id
<-
insertNgrams
$
DM
.
keys
maps
let
indexedNgrams
=
DM
.
mapKeys
(
indexNgrams
terms2id
)
maps
let
indexedNgrams
=
DM
.
mapKeys
(
indexNgrams
terms2id
)
maps
_
<-
insertToNodeNgrams
indexedNgrams
_
<-
insertToNodeNgrams
indexedNgrams
pure
$
map
reId
ids
pure
$
map
reId
ids
...
@@ -220,51 +220,50 @@ data DocumentIdWithNgrams = DocumentIdWithNgrams
...
@@ -220,51 +220,50 @@ data DocumentIdWithNgrams = DocumentIdWithNgrams
,
document_ngrams
::
!
(
Map
Ngrams
(
Map
NgramsType
Int
))
,
document_ngrams
::
!
(
Map
Ngrams
(
Map
NgramsType
Int
))
}
deriving
(
Show
)
}
deriving
(
Show
)
-- TODO extractNgrams according to Type of Data
extractNgramsT
::
HasNodeError
err
extractNgramsT
::
HasNodeError
err
=>
TermType
Lang
->
HyperdataDocument
=>
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT
lang
hd
=
filterNgramsT
255
<$>
extractNgramsT'
lang
hd
extractNgramsT
lang
hd
=
filterNgramsT
255
<$>
extractNgramsT'
lang
hd
extractNgramsT'
::
HasNodeError
err
=>
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT'
lang
doc
=
do
let
source
=
text2ngrams
$
maybe
"Nothing"
identity
$
_hyperdataDocument_source
doc
institutes
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
map
toSchoolName
.
(
splitOn
", "
))
$
_hyperdataDocument_institutes
doc
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
splitOn
", "
)
$
_hyperdataDocument_authors
doc
leText
=
catMaybes
[
_hyperdataDocument_title
doc
,
_hyperdataDocument_abstract
doc
]
terms'
<-
map
text2ngrams
<$>
map
(
intercalate
" "
.
_terms_label
)
<$>
concat
<$>
liftIO
(
extractTerms
lang
leText
)
pure
$
DM
.
fromList
$
[(
source
,
DM
.
singleton
Sources
1
)]
<>
[(
i'
,
DM
.
singleton
Institutes
1
)
|
i'
<-
institutes
]
<>
[(
a'
,
DM
.
singleton
Authors
1
)
|
a'
<-
authors
]
<>
[(
t'
,
DM
.
singleton
NgramsTerms
1
)
|
t'
<-
terms'
]
filterNgramsT
::
Int
->
Map
Ngrams
(
Map
NgramsType
Int
)
->
Map
Ngrams
(
Map
NgramsType
Int
)
filterNgramsT
s
ms
=
DM
.
fromList
$
map
(
\
a
->
filter'
s
a
)
$
DM
.
toList
ms
where
where
filter'
s'
(
ng
@
(
Ngrams
t
n
),
y
)
=
case
(
Text
.
length
t
)
<
s'
of
extractNgramsT'
::
HasNodeError
err
True
->
(
ng
,
y
)
=>
TermType
Lang
->
HyperdataDocument
False
->
(
Ngrams
(
Text
.
take
s'
t
)
n
,
y
)
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT'
lang'
doc
=
do
let
source
=
text2ngrams
$
maybe
"Nothing"
identity
$
_hyperdataDocument_source
doc
institutes
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
map
toSchoolName
.
(
splitOn
", "
))
$
_hyperdataDocument_institutes
doc
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
splitOn
", "
)
$
_hyperdataDocument_authors
doc
leText
=
catMaybes
[
_hyperdataDocument_title
doc
,
_hyperdataDocument_abstract
doc
]
terms'
<-
map
text2ngrams
<$>
map
(
intercalate
" "
.
_terms_label
)
<$>
concat
<$>
liftIO
(
extractTerms
lang'
leText
)
pure
$
DM
.
fromList
$
[(
source
,
DM
.
singleton
Sources
1
)]
<>
[(
i'
,
DM
.
singleton
Institutes
1
)
|
i'
<-
institutes
]
<>
[(
a'
,
DM
.
singleton
Authors
1
)
|
a'
<-
authors
]
<>
[(
t'
,
DM
.
singleton
NgramsTerms
1
)
|
t'
<-
terms'
]
filterNgramsT
::
Int
->
Map
Ngrams
(
Map
NgramsType
Int
)
->
Map
Ngrams
(
Map
NgramsType
Int
)
filterNgramsT
s
ms
=
DM
.
fromList
$
map
(
\
a
->
filter'
s
a
)
$
DM
.
toList
ms
where
filter'
s'
(
ng
@
(
Ngrams
t
n
),
y
)
=
case
(
Text
.
length
t
)
<
s'
of
True
->
(
ng
,
y
)
False
->
(
Ngrams
(
Text
.
take
s'
t
)
n
,
y
)
documentIdWithNgrams
::
HasNodeError
err
documentIdWithNgrams
::
HasNodeError
err
...
...
src/Gargantext/Text/Parsers/GrandDebat.hs
View file @
d2e8a845
...
@@ -97,6 +97,7 @@ class ReadFile a
...
@@ -97,6 +97,7 @@ class ReadFile a
instance
ReadFile
[
GrandDebatReference
]
instance
ReadFile
[
GrandDebatReference
]
where
where
-- | read json: 3 version below are working but with increased optimization
--readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
--readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
--readFile fp = either (panic . Text.pack) identity <$> P.eitherDecode <$> DBL.readFile fp
--readFile fp = either (panic . Text.pack) identity <$> P.eitherDecode <$> DBL.readFile fp
readFile
fp
=
P
.
parseLazyByteString
(
P
.
arrayOf
P
.
value
)
<$>
DBL
.
readFile
fp
readFile
fp
=
P
.
parseLazyByteString
(
P
.
arrayOf
P
.
value
)
<$>
DBL
.
readFile
fp
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment