Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Christian Merten
haskell-gargantext
Commits
45cddcc1
Unverified
Commit
45cddcc1
authored
Mar 21, 2019
by
Nicolas Pouillard
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'dev' of
https://gitlab.iscpif.fr/gargantext/haskell-gargantext
into dev
parents
863694e6
8f0b7cd4
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
130 additions
and
105 deletions
+130
-105
Main.hs
bin/gargantext-import/Main.hs
+25
-13
Flow.hs
src/Gargantext/Database/Flow.hs
+83
-71
NgramsByNode.hs
src/Gargantext/Database/Metrics/NgramsByNode.hs
+9
-11
List.hs
src/Gargantext/Text/List.hs
+12
-10
GrandDebat.hs
src/Gargantext/Text/Parsers/GrandDebat.hs
+1
-0
No files found.
bin/gargantext-import/Main.hs
View file @
45cddcc1
...
...
@@ -19,10 +19,11 @@ Import a corpus binary.
module
Main
where
import
Prelude
(
read
)
import
Control.Exception
(
finally
)
import
Servant
(
ServantErr
)
import
Gargantext.Prelude
import
Gargantext.Database.Flow
(
FlowCmdM
,
flowCorpus
''
)
import
Gargantext.Database.Flow
(
FlowCmdM
,
flowCorpus
)
import
Gargantext.Text.Parsers
(
FileFormat
(
CsvHalFormat
))
import
Gargantext.Database.Utils
(
Cmd
,
)
import
Gargantext.Database.Types.Node
(
CorpusId
)
...
...
@@ -38,27 +39,38 @@ import Control.Monad.IO.Class (liftIO)
main
::
IO
()
main
=
do
[
user
,
iniPath
,
name
,
corpusPath
]
<-
getArgs
[
user
Create
,
user
,
name
,
iniPath
,
limit
,
corpusPath
]
<-
getArgs
--{-
let
createUsers
::
Cmd
ServantErr
Int64
createUsers
=
insertUsersDemo
{-
let c
md
Corpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
c
mdCorpus = flowCorpus (cs user) (cs name) (Mono
EN) CsvHalFormat corpusPath
let c
sv
Corpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
c
svCorpus = flowCorpus (cs user) (cs name) (Multi
EN) CsvHalFormat corpusPath
--}
let
cmdCorpus
::
forall
m
.
FlowCmdM
DevEnv
ServantErr
m
=>
m
[
CorpusId
]
cmdCorpus
=
do
docs
<-
liftIO
(
splitEvery
3000
<$>
readFile
corpusPath
::
IO
[[
GrandDebatReference
]])
ids
<-
flowCorpus''
(
Text
.
pack
user
)
(
Text
.
pack
name
)
(
Mono
FR
)
docs
pure
ids
let
debatCorpus
::
forall
m
.
FlowCmdM
DevEnv
ServantErr
m
=>
m
CorpusId
debatCorpus
=
do
docs
<-
liftIO
(
splitEvery
500
<$>
take
(
read
limit
::
Int
)
<$>
readFile
corpusPath
::
IO
[[
GrandDebatReference
]]
)
flowCorpus
(
Text
.
pack
user
)
(
Text
.
pack
name
)
(
Multi
FR
)
docs
-- cmd = {-createUsers >>-} cmdCorpus
env
<-
newDevEnvWith
iniPath
-- Better if we keep only one call to runCmdDev.
_
<-
runCmdDev
env
createUsers
_
<-
runCmdDev
env
cmdCorpus
_
<-
if
userCreate
==
"true"
then
runCmdDev
env
createUsers
else
pure
0
--(cs "false")
_
<-
runCmdDev
env
debatCorpus
{-
_ <- if corpusType == "csv"
then runCmdDev env csvCorpus
else if corpusType == "debat"
then runCmdDev env debatCorpus
else panic "corpusType unknown: try \"csv\" or \"debat\""
-}
pure
()
src/Gargantext/Database/Flow.hs
View file @
45cddcc1
...
...
@@ -32,6 +32,7 @@ module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
--import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
--import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
--import Gargantext.Text.Metrics.TFICF (Tficf(..))
--import Debug.Trace (trace)
import
Control.Monad
(
mapM_
)
import
Control.Monad.IO.Class
(
liftIO
)
import
Data.List
(
concat
)
...
...
@@ -58,11 +59,12 @@ import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), N
import
Gargantext.Database.Utils
(
Cmd
,
CmdM
)
import
Gargantext.Ext.IMT
(
toSchoolName
)
import
Gargantext.Prelude
--import Gargantext.Text.List
import
Gargantext.Text.Parsers
(
parseDocs
,
FileFormat
)
import
Gargantext.Text.List
(
buildNgramsLists
)
--
import Gargantext.Text.Parsers (parseDocs, FileFormat)
import
Gargantext.Text.Terms
(
TermType
(
..
))
import
Gargantext.Text.Terms
(
extractTerms
)
import
Gargantext.Text.Terms.Mono.Stem.En
(
stemIt
)
import
qualified
Gargantext.Text.Parsers.GrandDebat
as
GD
import
Servant
(
ServantErr
)
import
System.FilePath
(
FilePath
)
import
qualified
Data.Map
as
DM
...
...
@@ -76,21 +78,33 @@ type FlowCmdM env err m =
,
HasRepoVar
env
)
flowCorpusDebat
::
FlowCmdM
env
ServantErr
m
=>
Username
->
CorpusName
->
Int
->
FilePath
->
m
CorpusId
flowCorpusDebat
u
n
l
fp
=
do
docs
<-
liftIO
(
splitEvery
500
<$>
take
l
<$>
GD
.
readFile
fp
::
IO
[[
GD
.
GrandDebatReference
]]
)
flowCorpus
u
n
(
Multi
FR
)
docs
{-
flowCorpus :: FlowCmdM env ServantErr m
=> Username -> CorpusName -> TermType Lang -> FileFormat -> FilePath -> m CorpusId
flowCorpus
u
cn
la
ff
fp
=
liftIO
(
parseDocs
ff
fp
)
>>=
\
docs
->
flowCorpus'
u
cn
la
docs
flowCorpus u cn la ff fp =
undefined --
liftIO (parseDocs ff fp) >>= \docs -> flowCorpus' u cn la docs
--{-
flowCorpus''
::
(
FlowCmdM
env
ServantErr
m
,
ToHyperdataDocument
a
)
flowCorpus''' :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
=> Username -> CorpusName -> TermType Lang -> [[a]] -> m [CorpusId]
flowCorpus''
u
cn
la
docs
=
mapM
(
\
doc
->
flowCorpus'
u
cn
la
doc
)
docs
flowCorpus''
'
u cn la docs = mapM (\doc -> flowCorpus' u cn la doc) docs
--}
flowCorpus
'
::
(
FlowCmdM
env
ServantErr
m
,
ToHyperdataDocument
a
)
=>
Username
->
CorpusName
->
TermType
Lang
->
[
a
]
->
m
CorpusId
flowCorpus
'
u
cn
la
docs
=
do
ids
<-
flowCorpusMaster
la
(
map
toHyperdataDocument
docs
)
flowCorpusUser
u
cn
ids
flowCorpus
::
(
FlowCmdM
env
ServantErr
m
,
ToHyperdataDocument
a
)
=>
Username
->
CorpusName
->
TermType
Lang
->
[
[
a
]
]
->
m
CorpusId
flowCorpus
u
cn
la
docs
=
do
ids
<-
mapM
((
insertMasterDocs
la
)
.
(
map
toHyperdataDocument
))
docs
flowCorpusUser
FR
u
cn
(
concat
ids
)
-- TODO query with complex query
...
...
@@ -99,29 +113,25 @@ flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
flowCorpusSearchInDatabase
u
q
=
do
(
_masterUserId
,
_masterRootId
,
cId
)
<-
getOrMkRootWithCorpus
userMaster
""
ids
<-
map
fst
<$>
searchInDatabase
cId
(
stemIt
q
)
flowCorpusUser
u
q
ids
-- TODO uniformize language of corpus
flowCorpusMaster
::
FlowCmdM
env
ServantErr
m
=>
TermType
Lang
->
[
HyperdataDocument
]
->
m
[
NodeId
]
flowCorpusMaster
la
hd
=
(
insertMasterDocs
la
)
$
(
map
addUniqIdsDoc
)
hd
flowCorpusUser
FR
u
q
ids
flowCorpusUser
::
FlowCmdM
env
ServantErr
m
=>
Username
->
CorpusName
->
[
NodeId
]
->
m
CorpusId
flowCorpusUser
userName
corpusName
ids
=
do
flowCorpusUser
::
FlowCmdM
env
ServantErr
m
=>
Lang
->
Username
->
CorpusName
->
[
NodeId
]
->
m
CorpusId
flowCorpusUser
l
userName
corpusName
ids
=
do
-- User Flow
(
_
userId
,
_rootId
,
userCorpusId
)
<-
getOrMkRootWithCorpus
userName
corpusName
(
userId
,
_rootId
,
userCorpusId
)
<-
getOrMkRootWithCorpus
userName
corpusName
-- TODO: check if present already, ignore
_
<-
Doc
.
add
userCorpusId
ids
-- User List Flow
--
(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
--ngs <- buildNgramsLists
userCorpusId masterCorpusId
--
userListId <- flowList userId userCorpusId ngs
--
printDebug "userListId" userListId
(
_masterUserId
,
_masterRootId
,
masterCorpusId
)
<-
getOrMkRootWithCorpus
userMaster
""
ngs
<-
buildNgramsLists
l
2
3
userCorpusId
masterCorpusId
userListId
<-
flowList
userId
userCorpusId
ngs
printDebug
"userListId"
userListId
-- User Graph Flow
--
_
<-
mkGraph
userCorpusId
userId
_
<-
mkGraph
userCorpusId
userId
-- User Dashboard Flow
-- _ <- mkDashboard userCorpusId userId
...
...
@@ -134,20 +144,23 @@ flowCorpusUser userName corpusName ids = do
insertMasterDocs
::
FlowCmdM
env
ServantErr
m
=>
TermType
Lang
->
[
HyperdataDocument
]
->
m
[
DocId
]
insertMasterDocs
lang
hs
=
do
let
hyperdataDocuments'
=
map
(
\
h
->
ToDbDocument
h
)
hs
(
masterUserId
,
_
,
masterCorpusId
)
<-
getOrMkRootWithCorpus
userMaster
corpusMasterName
ids
<-
insertDocuments
masterUserId
masterCorpusId
NodeDocument
hyperdataDocuments'
let
documentsWithId
=
mergeData
(
toInserted
ids
)
(
toInsert
hs
)
-- TODO Type NodeDocumentUnicised
let
hs'
=
map
addUniqIdsDoc
hs
ids
<-
insertDocuments
masterUserId
masterCorpusId
NodeDocument
$
map
ToDbDocument
hs'
-- ^ TODO Type class to insert Doc
-- ^ TODO Type Class AddUnicity where unicity = addUnicity
let
documentsWithId
=
mergeData
(
toInserted
ids
)
(
toInsert
hs'
)
docsWithNgrams
<-
documentIdWithNgrams
(
extractNgramsT
lang
)
documentsWithId
let
maps
=
mapNodeIdNgrams
docsWithNgrams
terms2id
<-
insertNgrams
$
DM
.
keys
maps
let
indexedNgrams
=
DM
.
mapKeys
(
indexNgrams
terms2id
)
maps
_
<-
insertToNodeNgrams
indexedNgrams
_
<-
insertToNodeNgrams
indexedNgrams
pure
$
map
reId
ids
...
...
@@ -220,51 +233,50 @@ data DocumentIdWithNgrams = DocumentIdWithNgrams
,
document_ngrams
::
!
(
Map
Ngrams
(
Map
NgramsType
Int
))
}
deriving
(
Show
)
-- TODO extractNgrams according to Type of Data
extractNgramsT
::
HasNodeError
err
=>
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT
lang
hd
=
filterNgramsT
255
<$>
extractNgramsT'
lang
hd
extractNgramsT'
::
HasNodeError
err
=>
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT'
lang
doc
=
do
let
source
=
text2ngrams
$
maybe
"Nothing"
identity
$
_hyperdataDocument_source
doc
institutes
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
map
toSchoolName
.
(
splitOn
", "
))
$
_hyperdataDocument_institutes
doc
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
splitOn
", "
)
$
_hyperdataDocument_authors
doc
leText
=
catMaybes
[
_hyperdataDocument_title
doc
,
_hyperdataDocument_abstract
doc
]
terms'
<-
map
text2ngrams
<$>
map
(
intercalate
" "
.
_terms_label
)
<$>
concat
<$>
liftIO
(
extractTerms
lang
leText
)
pure
$
DM
.
fromList
$
[(
source
,
DM
.
singleton
Sources
1
)]
<>
[(
i'
,
DM
.
singleton
Institutes
1
)
|
i'
<-
institutes
]
<>
[(
a'
,
DM
.
singleton
Authors
1
)
|
a'
<-
authors
]
<>
[(
t'
,
DM
.
singleton
NgramsTerms
1
)
|
t'
<-
terms'
]
filterNgramsT
::
Int
->
Map
Ngrams
(
Map
NgramsType
Int
)
->
Map
Ngrams
(
Map
NgramsType
Int
)
filterNgramsT
s
ms
=
DM
.
fromList
$
map
(
\
a
->
filter'
s
a
)
$
DM
.
toList
ms
where
filter'
s'
(
ng
@
(
Ngrams
t
n
),
y
)
=
case
(
Text
.
length
t
)
<
s'
of
True
->
(
ng
,
y
)
False
->
(
Ngrams
(
Text
.
take
s'
t
)
n
,
y
)
extractNgramsT'
::
HasNodeError
err
=>
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT'
lang'
doc
=
do
let
source
=
text2ngrams
$
maybe
"Nothing"
identity
$
_hyperdataDocument_source
doc
institutes
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
map
toSchoolName
.
(
splitOn
", "
))
$
_hyperdataDocument_institutes
doc
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
splitOn
", "
)
$
_hyperdataDocument_authors
doc
leText
=
catMaybes
[
_hyperdataDocument_title
doc
,
_hyperdataDocument_abstract
doc
]
terms'
<-
map
text2ngrams
<$>
map
(
intercalate
" "
.
_terms_label
)
<$>
concat
<$>
liftIO
(
extractTerms
lang'
leText
)
pure
$
DM
.
fromList
$
[(
source
,
DM
.
singleton
Sources
1
)]
<>
[(
i'
,
DM
.
singleton
Institutes
1
)
|
i'
<-
institutes
]
<>
[(
a'
,
DM
.
singleton
Authors
1
)
|
a'
<-
authors
]
<>
[(
t'
,
DM
.
singleton
NgramsTerms
1
)
|
t'
<-
terms'
]
filterNgramsT
::
Int
->
Map
Ngrams
(
Map
NgramsType
Int
)
->
Map
Ngrams
(
Map
NgramsType
Int
)
filterNgramsT
s
ms
=
DM
.
fromList
$
map
(
\
a
->
filter'
s
a
)
$
DM
.
toList
ms
where
filter'
s'
(
ng
@
(
Ngrams
t
n
),
y
)
=
case
(
Text
.
length
t
)
<
s'
of
True
->
(
ng
,
y
)
False
->
(
Ngrams
(
Text
.
take
s'
t
)
n
,
y
)
documentIdWithNgrams
::
HasNodeError
err
...
...
src/Gargantext/Database/Metrics/NgramsByNode.hs
View file @
45cddcc1
...
...
@@ -44,13 +44,13 @@ import qualified Database.PostgreSQL.Simple as DPS
-- a first grouping option to user and get some
-- enriched data to better learn and improve that algo
ngramsGroup
::
Lang
->
Int
->
Int
->
Text
->
Text
ngramsGroup
l
m
n
=
Text
.
intercalate
" "
.
map
(
stem
l
)
.
take
n
.
List
.
sort
.
(
List
.
filter
(
\
t
->
Text
.
length
t
>
m
))
.
Text
.
splitOn
" "
.
Text
.
replace
"-"
" "
ngramsGroup
l
_m
_
n
=
Text
.
intercalate
" "
.
map
(
stem
l
)
--
. take n
.
List
.
sort
--
. (List.filter (\t -> Text.length t > m))
.
Text
.
splitOn
" "
.
Text
.
replace
"-"
" "
sortTficf
::
(
Map
Text
(
Double
,
Set
Text
))
...
...
@@ -171,8 +171,6 @@ queryNgramsOnlyByNodeUser = [sql|
GROUP BY nng.node_id, ng.terms
|]
------------------------------------------------------------------------
-- | TODO filter by language, database, any social field
getNodesByNgramsMaster
::
UserCorpusId
->
MasterCorpusId
->
Cmd
err
(
Map
Text
(
Set
NodeId
))
...
...
@@ -183,10 +181,10 @@ selectNgramsByNodeMaster :: UserCorpusId -> MasterCorpusId -> Cmd err [(NodeId,
selectNgramsByNodeMaster
ucId
mcId
=
runPGSQuery
queryNgramsByNodeMaster
(
ucId
,
nodeTypeId
NodeDocument
,
nodeTypeId
NodeDocument
,
ngramsTypeId
NgramsTerms
,
mcId
,
nodeTypeId
NodeDocument
,
nodeTypeId
NodeDocument
,
ngramsTypeId
NgramsTerms
)
...
...
src/Gargantext/Text/List.hs
View file @
45cddcc1
...
...
@@ -26,6 +26,7 @@ import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGro
import
Gargantext.Database.Schema.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Database.Utils
(
Cmd
)
import
Gargantext.Prelude
--import Gargantext.Text.Terms (TermType(..))
import
qualified
Data.Char
as
Char
import
qualified
Data.List
as
List
import
qualified
Data.Map
as
Map
...
...
@@ -33,10 +34,10 @@ import qualified Data.Set as Set
import
qualified
Data.Text
as
Text
-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists
::
UserCorpusId
->
MasterCorpusId
buildNgramsLists
::
Lang
->
Int
->
Int
->
UserCorpusId
->
MasterCorpusId
->
Cmd
err
(
Map
NgramsType
[
NgramsElement
])
buildNgramsLists
uCid
mCid
=
do
ngTerms
<-
buildNgramsTermsList
uCid
mCid
buildNgramsLists
l
n
m
uCid
mCid
=
do
ngTerms
<-
buildNgramsTermsList
l
n
m
uCid
mCid
othersTerms
<-
mapM
(
buildNgramsOthersList
uCid
identity
)
[
Authors
,
Sources
,
Institutes
]
pure
$
Map
.
unions
$
othersTerms
<>
[
ngTerms
]
...
...
@@ -53,13 +54,14 @@ buildNgramsOthersList uCid groupIt nt = do
]
-- TODO remove hard coded parameters
buildNgramsTermsList
::
UserCorpusId
->
MasterCorpusId
buildNgramsTermsList
::
Lang
->
Int
->
Int
->
UserCorpusId
->
MasterCorpusId
->
Cmd
err
(
Map
NgramsType
[
NgramsElement
])
buildNgramsTermsList
uCid
mCid
=
do
candidates
<-
sortTficf
<$>
getTficf'
uCid
mCid
(
ngramsGroup
EN
4
2
)
buildNgramsTermsList
l
n
m
uCid
mCid
=
do
candidates
<-
sortTficf
<$>
getTficf'
uCid
mCid
(
ngramsGroup
l
n
m
)
--printDebug "candidate" (length candidates)
let
termList
=
toTermList
(
isStopTerm
.
fst
)
candidates
--let termList = toTermList (isStopTerm . fst) candidates
let
termList
=
toTermList
((
\
_
->
False
)
.
fst
)
candidates
--printDebug "termlist" (length termList)
let
ngs
=
List
.
concat
$
map
toNgramsElement
termList
...
...
@@ -98,14 +100,14 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs
ys
=
take
b
$
drop
a
ns
zs
=
drop
b
$
drop
a
ns
a
=
5
0
b
=
10
00
a
=
1
0
b
=
4
00
isStopTerm
::
Text
->
Bool
isStopTerm
x
=
Text
.
length
x
<
3
||
not
(
all
Char
.
isAlpha
(
Text
.
unpack
x'
))
where
x'
=
foldl
(
\
t
->
Text
.
replace
t
""
)
x'
=
foldl
(
\
t
->
Text
.
replace
t
"
a
"
)
x
[
"-"
,
" "
,
"/"
,
"("
,
")"
]
src/Gargantext/Text/Parsers/GrandDebat.hs
View file @
45cddcc1
...
...
@@ -97,6 +97,7 @@ class ReadFile a
instance
ReadFile
[
GrandDebatReference
]
where
-- | read json: 3 version below are working but with increased optimization
--readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
--readFile fp = either (panic . Text.pack) identity <$> P.eitherDecode <$> DBL.readFile fp
readFile
fp
=
P
.
parseLazyByteString
(
P
.
arrayOf
P
.
value
)
<$>
DBL
.
readFile
fp
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment