Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
158
Issues
158
List
Board
Labels
Milestones
Merge Requests
11
Merge Requests
11
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
30aae72b
Commit
30aae72b
authored
Nov 08, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FLOW][DB][Ngrams] group function.
parent
0ee7da5d
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
119 additions
and
79 deletions
+119
-79
API.hs
src/Gargantext/API.hs
+1
-2
Flow.hs
src/Gargantext/Database/Flow.hs
+96
-66
Ngrams.hs
src/Gargantext/Database/Ngrams.hs
+13
-2
Flow.hs
src/Gargantext/Text/Flow.hs
+9
-9
No files found.
src/Gargantext/API.hs
View file @
30aae72b
...
...
@@ -21,7 +21,6 @@ Thanks @yannEsposito for this.
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
...
...
@@ -317,7 +316,7 @@ type Desc t n = Description (AppendSymbol (TypeName t) (AppendSymbol " | " n))
swaggerDoc
::
Swagger
swaggerDoc
=
toSwagger
(
Proxy
::
Proxy
GargAPI
)
&
info
.
title
.~
"Gargantext"
&
info
.
version
.~
"
0.1.0"
&
info
.
version
.~
"
4.0.2"
-- TODO same version as Gargantext
-- & info.base_url ?~ (URL "http://gargantext.org/")
&
info
.
description
?~
"REST API specifications"
-- & tags .~ Set.fromList [Tag "Garg" (Just "Main perations") Nothing]
...
...
src/Gargantext/Database/Flow.hs
View file @
30aae72b
...
...
@@ -25,56 +25,70 @@ authors
module
Gargantext.Database.Flow
where
import
System.FilePath
(
FilePath
)
import
Data.Maybe
(
Maybe
(
..
))
import
Data.Maybe
(
Maybe
(
..
)
,
catMaybes
)
import
Data.Text
(
Text
)
import
Data.Map
(
Map
)
import
Data.Tuple.Extra
(
both
,
second
)
import
qualified
Data.Map
as
DM
import
Gargantext.Core.Types
(
NodePoly
(
..
),
ListType
(
..
),
listId
)
import
Gargantext.Prelude
import
Gargantext.Database.Bashql
(
runCmd'
,
del
)
import
Gargantext.Database.
Types.Node
(
HyperdataDocument
(
..
)
)
import
Gargantext.Database.
Ngrams
(
insertNgrams
,
Ngrams
(
..
),
NgramsT
(
..
),
NgramsIndexed
(
..
),
indexNgramsT
,
ngramsTypeId
,
NgramsType
(
..
),
text2ngrams
)
import
Gargantext.Database.Node
(
getRoot
,
mkRoot
,
mkCorpus
,
Cmd
(
..
),
mkList
)
import
Gargantext.Database.User
(
getUser
,
UserLight
(
..
),
Username
)
import
Gargantext.Database.Node.Document.Insert
(
insertDocuments
,
ReturnId
(
..
),
addUniqIds
)
import
Gargantext.Database.Node.Document.Add
(
add
)
import
Gargantext.Database.Node.Document.Insert
(
insertDocuments
,
ReturnId
(
..
),
addUniqIds
)
import
Gargantext.Database.NodeNgram
(
NodeNgramPoly
(
..
),
insertNodeNgrams
)
import
Gargantext.Database.NodeNgramsNgrams
(
NodeNgramsNgramsPoly
(
..
),
insertNodeNgramsNgramsNew
)
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Database.User
(
getUser
,
UserLight
(
..
),
Username
)
import
Gargantext.Prelude
import
Gargantext.Text.Parsers
(
parseDocs
,
FileFormat
(
WOS
))
import
Gargantext.Database.Ngrams
(
insertNgrams
,
Ngrams
(
..
),
NgramsT
(
..
),
NgramsIndexed
(
..
),
indexNgramsT
,
ngramsTypeId
)
type
UserId
=
Int
type
RootId
=
Int
type
CorpusId
=
Int
flow
::
FilePath
->
IO
Int
flow
fp
=
do
(
masterUserId
,
_
,
corpusId
)
<-
subFlow
"gargantua"
docs
<-
map
addUniqIds
<$>
parseDocs
WOS
fp
ids
<-
runCmd'
$
insertDocuments
masterUserId
corpusId
docs
printDebug
"Docs IDs : "
ids
idsRepeat
<-
runCmd'
$
insertDocuments
masterUserId
corpusId
docs
printDebug
"Docs IDs : "
idsRepeat
(
_
,
_
,
corpusId2
)
<-
subFlow
"alexandre"
flow
::
FilePath
->
CorpusName
->
IO
[
Int
]
flow
fp
cName
=
do
-- Corus Flow
(
masterUserId
,
_
,
corpusId
)
<-
subFlow
"gargantua"
"Big Corpus"
-- Documents Flow
hyperdataDocuments
<-
map
addUniqIds
<$>
parseDocs
WOS
fp
ids
<-
runCmd'
$
insertDocuments
masterUserId
corpusId
hyperdataDocuments
printDebug
"Docs IDs : "
(
length
ids
)
idsRepeat
<-
runCmd'
$
insertDocuments
masterUserId
corpusId
hyperdataDocuments
-- Ngrams Flow
let
documentsWithId
=
mergeData
(
toInserted
ids
)
(
toInsert
hyperdataDocuments
)
let
docsWithNgrams
=
documentIdWithNgrams
extractNgramsT
documentsWithId
let
maps
=
mapNodeIdNgrams
docsWithNgrams
indexedNgrams
<-
runCmd'
$
indexNgrams
maps
_
<-
runCmd'
$
insertToNodeNgrams
indexedNgrams
-- List Flow
listId
<-
runCmd'
$
listFlow
masterUserId
corpusId
indexedNgrams
printDebug
"list id:"
listId
printDebug
"Docs IDs : "
(
length
idsRepeat
)
(
_
,
_
,
corpusId2
)
<-
subFlow
"alexandre"
cName
inserted
<-
runCmd'
$
add
corpusId2
(
map
reId
ids
)
printDebug
"Inserted : "
inserted
printDebug
"Inserted : "
(
length
inserted
)
runCmd'
$
del
[
corpusId2
,
corpusId
]
pure
[
corpusId2
,
corpusId
]
--runCmd' $ del [corpusId2, corpusId]
subFlow
::
Username
->
IO
(
UserId
,
RootId
,
CorpusId
)
subFlow
username
=
do
type
CorpusName
=
Text
subFlow
::
Username
->
CorpusName
->
IO
(
UserId
,
RootId
,
CorpusId
)
subFlow
username
cName
=
do
maybeUserId
<-
runCmd'
(
getUser
username
)
let
userId
=
case
maybeUserId
of
Nothing
->
panic
"Error: User does not exist (yet)"
Nothing
->
panic
"Error: User does not exist (yet)"
-- mk NodeUser gargantua_id "Node Gargantua"
Just
user
->
userLight_id
user
...
...
@@ -82,24 +96,24 @@ subFlow username = do
rootId''
<-
case
rootId'
of
[]
->
runCmd'
(
mkRoot
userId
)
un
->
case
length
u
n
>=
2
of
True
->
panic
"Error: more than 1 userNode / user"
False
->
pure
rootId'
n
->
case
length
n
>=
2
of
True
->
panic
"Error: more than 1 userNode / user"
False
->
pure
rootId'
let
rootId
=
maybe
(
panic
"error rootId"
)
identity
(
head
rootId''
)
corpusId'
<-
runCmd'
$
mkCorpus
(
Just
"Corpus WOS"
)
Nothing
rootId
userId
corpusId'
<-
runCmd'
$
mkCorpus
(
Just
cName
)
Nothing
rootId
userId
let
corpusId
=
maybe
(
panic
"error corpusId"
)
identity
(
head
corpusId'
)
printDebug
"(username, userId, rootId, corpusId"
printDebug
"(username, userId, rootId, corpusId
)
"
(
username
,
userId
,
rootId
,
corpusId
)
pure
(
userId
,
rootId
,
corpusId
)
----------------------------------------------------------------
------------------------------------------------------------------------
type
HashId
=
Text
type
NodeId
=
Int
type
ListId
=
Int
type
ToInsert
=
Map
HashId
HyperdataDocument
type
Inserted
=
Map
HashId
ReturnId
toInsert
::
[
HyperdataDocument
]
->
Map
HashId
HyperdataDocument
toInsert
=
DM
.
fromList
.
map
(
\
d
->
(
hash
(
_hyperdataDocument_uniqIdBdd
d
),
d
))
...
...
@@ -111,24 +125,33 @@ toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
$
filter
(
\
r
->
reInserted
r
==
True
)
rs
data
DocumentWithId
=
DocumentWithId
{
documentId
::
NodeId
,
documentData
::
HyperdataDocument
}
DocumentWithId
{
documentId
::
NodeId
,
documentData
::
HyperdataDocument
}
mergeData
::
Map
HashId
ReturnId
->
Map
HashId
HyperdataDocument
->
[
DocumentWithId
]
mergeData
rs
hs
=
map
(
\
(
hash
,
r
)
->
DocumentWithId
(
reId
r
)
(
lookup'
hash
hs
))
$
DM
.
toList
rs
where
lookup'
h
xs
=
maybe
(
panic
$
"Error with "
<>
h
)
identity
(
DM
.
lookup
h
xs
)
------------------------------------------------------------------------
data
DocumentIdWithNgrams
=
DocumentIdWithNgrams
{
documentWithId
::
DocumentWithId
,
document_ngrams
::
Map
(
NgramsT
Ngrams
)
Int
}
DocumentIdWithNgrams
{
documentWithId
::
DocumentWithId
,
document_ngrams
::
Map
(
NgramsT
Ngrams
)
Int
}
-- TODO add Authors and Terms (Title + Abstract)
-- add f :: Text -> Text
-- newtype Ngrams = Ngrams Text
extractNgramsT
::
HyperdataDocument
->
Map
(
NgramsT
Ngrams
)
Int
extractNgramsT
doc
=
DM
.
fromList
$
[(
NgramsT
Sources
ngrams
,
1
)]
where
ngrams
=
text2ngrams
$
maybe
"Nothing"
identity
maybeNgrams
maybeNgrams
=
_hyperdataDocument_source
doc
documentIdWithNgrams
::
(
HyperdataDocument
->
Map
(
NgramsT
Ngrams
)
Int
)
documentIdWithNgrams
::
(
HyperdataDocument
->
Map
(
NgramsT
Ngrams
)
Int
)
->
[
DocumentWithId
]
->
[
DocumentIdWithNgrams
]
documentIdWithNgrams
f
=
map
(
\
d
->
DocumentIdWithNgrams
d
((
f
.
documentData
)
d
))
...
...
@@ -149,22 +172,41 @@ indexNgrams ng2nId = do
insertToNodeNgrams
::
Map
(
NgramsT
NgramsIndexed
)
(
Map
NodeId
Int
)
->
Cmd
Int
insertToNodeNgrams
m
=
insertNodeNgrams
[
NodeNgram
Nothing
nId
((
_ngramsId
.
_ngramsT
)
ng
)
(
fromIntegral
n
)
((
ngramsTypeId
.
_ngramsType
)
ng
)
|
(
ng
,
nId2int
)
<-
DM
.
toList
m
,
(
nId
,
n
)
<-
DM
.
toList
nId2int
|
(
ng
,
nId2int
)
<-
DM
.
toList
m
,
(
nId
,
n
)
<-
DM
.
toList
nId2int
]
------------------------------------------------------------------------
------------------------------------------------------------------------
listFlow
::
UserId
->
CorpusId
->
Map
(
NgramsT
NgramsIndexed
)
(
Map
NodeId
Int
)
->
Cmd
ListId
listFlow
uId
cId
ng
=
do
lId
<-
maybe
(
panic
"mkList error"
)
identity
<$>
head
<$>
mkList
cId
uId
-- TODO add stemming equivalence of 2 ngrams
let
groupEd
=
groupNgramsBy
(
\
(
NgramsT
t1
n1
)
(
NgramsT
t2
n2
)
->
if
(((
==
)
t1
t2
)
&&
((
==
)
n1
n2
))
then
(
Just
(
n1
,
n2
))
else
Nothing
)
ng
_
<-
insertGroups
lId
groupEd
-- compute Candidate / Map
let
lists
=
ngrams2list
ng
_
<-
insertLists
lId
lists
pure
lId
------------------------------------------------------------------------
groupNgramsBy
::
(
Ngrams
->
Ngrams
->
Bool
)
groupNgramsBy
::
(
NgramsT
NgramsIndexed
->
NgramsT
NgramsIndexed
->
Maybe
(
NgramsIndexed
,
NgramsIndexed
))
->
Map
(
NgramsT
NgramsIndexed
)
(
Map
NodeId
Int
)
->
Map
NgramsIndexed
NgramsIndexed
groupNgramsBy
=
undefined
groupNgramsBy
isEqual
cId
=
DM
.
fromList
$
catMaybes
[
isEqual
n1
n2
|
n1
<-
DM
.
keys
cId
,
n2
<-
DM
.
keys
cId
]
-- TODO check: do not insert duplicates
insertGroups
::
ListId
->
Map
NgramsIndexed
NgramsIndexed
->
Cmd
Int
insertGroups
lId
ngrs
=
insertNodeNgramsNgramsNew
$
[
NodeNgramsNgrams
lId
ng1
ng2
(
Just
1
)
|
(
ng1
,
ng2
)
<-
map
(
both
_ngramsId
)
$
DM
.
toList
ngrs
]
insertGroups
lId
ngrs
=
insertNodeNgramsNgramsNew
[
NodeNgramsNgrams
lId
ng1
ng2
(
Just
1
)
|
(
ng1
,
ng2
)
<-
map
(
both
_ngramsId
)
$
DM
.
toList
ngrs
]
------------------------------------------------------------------------
ngrams2list
::
Map
(
NgramsT
NgramsIndexed
)
(
Map
NodeId
Int
)
->
Map
ListType
NgramsIndexed
...
...
@@ -177,18 +219,6 @@ insertLists lId list2ngrams =
|
(
l
,
ngr
)
<-
map
(
second
_ngramsId
)
$
DM
.
toList
list2ngrams
]
listFlow
::
UserId
->
CorpusId
->
Map
(
NgramsT
NgramsIndexed
)
(
Map
NodeId
Int
)
->
Cmd
ListId
listFlow
uId
cId
ng
=
do
lId
<-
maybe
(
panic
"mkList error"
)
identity
<$>
head
<$>
mkList
cId
uId
-- TODO add stemming equivalence of 2 ngrams
let
groupEd
=
groupNgramsBy
(
==
)
ng
_
<-
insertGroups
lId
groupEd
-- compute Candidate / Map
let
lists
=
ngrams2list
ng
_
<-
insertLists
lId
lists
pure
lId
------------------------------------------------------------------------
------------------------------------------------------------------------
src/Gargantext/Database/Ngrams.hs
View file @
30aae72b
...
...
@@ -27,7 +27,7 @@ module Gargantext.Database.Ngrams where
import
Control.Lens
(
makeLenses
)
import
Data.ByteString.Internal
(
ByteString
)
import
Data.Map
(
Map
,
fromList
,
lookup
)
import
Data.Text
(
Text
)
import
Data.Text
(
Text
,
splitOn
)
import
Database.PostgreSQL.Simple.FromRow
(
fromRow
,
field
)
import
Database.PostgreSQL.Simple.SqlQQ
(
sql
)
import
Database.PostgreSQL.Simple.ToField
(
toField
)
...
...
@@ -77,6 +77,7 @@ import qualified Database.PostgreSQL.Simple as DPS
-- ngrams in authors field of document has Authors Type
-- ngrams in text (title or abstract) of documents has Terms Type
data
NgramsType
=
Sources
|
Authors
|
Terms
deriving
(
Eq
)
ngramsTypeId
::
NgramsType
->
Int
ngramsTypeId
Terms
=
1
...
...
@@ -100,6 +101,9 @@ makeLenses ''Ngrams
instance
DPS
.
ToRow
Ngrams
where
toRow
(
Ngrams
t
s
)
=
[
toField
t
,
toField
s
]
text2ngrams
::
Text
->
Ngrams
text2ngrams
txt
=
Ngrams
txt
$
length
$
splitOn
" "
txt
-------------------------------------------------------------------------
-- | TODO put it in Gargantext.Text.Ngrams
-- Named entity are typed ngrams of Terms Ngrams
...
...
@@ -108,9 +112,16 @@ data NgramsT a =
,
_ngramsT
::
a
}
deriving
(
Generic
)
instance
Eq
(
NgramsT
a
)
where
(
==
)
=
(
==
)
instance
Eq
(
NgramsT
a
)
where
(
==
)
=
(
==
)
-- where NgramsT
-- t1 == t2
-- n1 == n2
instance
Ord
(
NgramsT
a
)
where
compare
=
compare
makeLenses
''
N
gramsT
-----------------------------------------------------------------------
data
NgramsIndexed
=
NgramsIndexed
...
...
src/Gargantext/Text/Flow.hs
View file @
30aae72b
...
...
@@ -59,15 +59,15 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
contextText
::
[
T
.
Text
]
contextText
=
map
T
.
pack
[
"The dog is an animal."
,
"The bird is a animal."
,
"The dog is a animal."
,
"The bird is a
n
animal."
,
"The dog is a
n
animal."
,
"The animal is a bird or a dog ?"
,
"The table is a object."
,
"The pen is a object."
,
"The table is a
n
object."
,
"The pen is a
n
object."
,
"The object is a pen or a table ?"
,
"The girl is
human body
."
,
"The boy is
human body
."
,
"The boy or the girl are human
body
."
,
"The girl is
a human
."
,
"The boy is
a human
."
,
"The boy or the girl are human."
]
...
...
@@ -75,7 +75,7 @@ contextText = map T.pack ["The dog is an animal."
data
TextFlow
=
CSV
FilePath
|
FullText
FilePath
|
Contexts
[
T
.
Text
]
|
DB
Connection
CorpusId
|
DB
V3
Connection
CorpusId
|
Query
T
.
Text
...
...
@@ -85,7 +85,7 @@ textFlow termType workType = do
FullText
path
->
splitBy
(
Sentences
5
)
<$>
readFile
path
CSV
path
->
readCsvOn
[
csv_title
,
csv_abstract
]
path
Contexts
ctxt
->
pure
ctxt
DB
con
corpusId
->
catMaybes
<$>
map
(
\
n
->
hyperdataDocumentV3_title
(
_node_hyperdata
n
)
<>
hyperdataDocumentV3_abstract
(
_node_hyperdata
n
))
<$>
getDocumentsV3WithParentId
con
corpusId
DB
V3
con
corpusId
->
catMaybes
<$>
map
(
\
n
->
hyperdataDocumentV3_title
(
_node_hyperdata
n
)
<>
hyperdataDocumentV3_abstract
(
_node_hyperdata
n
))
<$>
getDocumentsV3WithParentId
con
corpusId
_
->
undefined
-- TODO Query not supported
textFlow'
termType
contexts
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment