Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
purescript-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
152
Issues
152
List
Board
Labels
Milestones
Merge Requests
2
Merge Requests
2
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
purescript-gargantext
Commits
30aae72b
Commit
30aae72b
authored
Nov 08, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FLOW][DB][Ngrams] group function.
parent
0ee7da5d
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
119 additions
and
79 deletions
+119
-79
API.hs
src/Gargantext/API.hs
+1
-2
Flow.hs
src/Gargantext/Database/Flow.hs
+96
-66
Ngrams.hs
src/Gargantext/Database/Ngrams.hs
+13
-2
Flow.hs
src/Gargantext/Text/Flow.hs
+9
-9
No files found.
src/Gargantext/API.hs
View file @
30aae72b
...
...
@@ -21,7 +21,6 @@ Thanks @yannEsposito for this.
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
...
...
@@ -317,7 +316,7 @@ type Desc t n = Description (AppendSymbol (TypeName t) (AppendSymbol " | " n))
swaggerDoc
::
Swagger
swaggerDoc
=
toSwagger
(
Proxy
::
Proxy
GargAPI
)
&
info
.
title
.~
"Gargantext"
&
info
.
version
.~
"
0.1.0"
&
info
.
version
.~
"
4.0.2"
-- TODO same version as Gargantext
-- & info.base_url ?~ (URL "http://gargantext.org/")
&
info
.
description
?~
"REST API specifications"
-- & tags .~ Set.fromList [Tag "Garg" (Just "Main perations") Nothing]
...
...
src/Gargantext/Database/Flow.hs
View file @
30aae72b
...
...
@@ -25,52 +25,66 @@ authors
module
Gargantext.Database.Flow
where
import
System.FilePath
(
FilePath
)
import
Data.Maybe
(
Maybe
(
..
))
import
Data.Maybe
(
Maybe
(
..
)
,
catMaybes
)
import
Data.Text
(
Text
)
import
Data.Map
(
Map
)
import
Data.Tuple.Extra
(
both
,
second
)
import
qualified
Data.Map
as
DM
import
Gargantext.Core.Types
(
NodePoly
(
..
),
ListType
(
..
),
listId
)
import
Gargantext.Prelude
import
Gargantext.Database.Bashql
(
runCmd'
,
del
)
import
Gargantext.Database.
Types.Node
(
HyperdataDocument
(
..
)
)
import
Gargantext.Database.
Ngrams
(
insertNgrams
,
Ngrams
(
..
),
NgramsT
(
..
),
NgramsIndexed
(
..
),
indexNgramsT
,
ngramsTypeId
,
NgramsType
(
..
),
text2ngrams
)
import
Gargantext.Database.Node
(
getRoot
,
mkRoot
,
mkCorpus
,
Cmd
(
..
),
mkList
)
import
Gargantext.Database.User
(
getUser
,
UserLight
(
..
),
Username
)
import
Gargantext.Database.Node.Document.Insert
(
insertDocuments
,
ReturnId
(
..
),
addUniqIds
)
import
Gargantext.Database.Node.Document.Add
(
add
)
import
Gargantext.Database.Node.Document.Insert
(
insertDocuments
,
ReturnId
(
..
),
addUniqIds
)
import
Gargantext.Database.NodeNgram
(
NodeNgramPoly
(
..
),
insertNodeNgrams
)
import
Gargantext.Database.NodeNgramsNgrams
(
NodeNgramsNgramsPoly
(
..
),
insertNodeNgramsNgramsNew
)
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Database.User
(
getUser
,
UserLight
(
..
),
Username
)
import
Gargantext.Prelude
import
Gargantext.Text.Parsers
(
parseDocs
,
FileFormat
(
WOS
))
import
Gargantext.Database.Ngrams
(
insertNgrams
,
Ngrams
(
..
),
NgramsT
(
..
),
NgramsIndexed
(
..
),
indexNgramsT
,
ngramsTypeId
)
type
UserId
=
Int
type
RootId
=
Int
type
CorpusId
=
Int
flow
::
FilePath
->
IO
Int
flow
fp
=
do
flow
::
FilePath
->
CorpusName
->
IO
[
Int
]
flow
fp
cName
=
do
-- Corus Flow
(
masterUserId
,
_
,
corpusId
)
<-
subFlow
"gargantua"
"Big Corpus"
(
masterUserId
,
_
,
corpusId
)
<-
subFlow
"gargantua"
-- Documents Flow
hyperdataDocuments
<-
map
addUniqIds
<$>
parseDocs
WOS
fp
ids
<-
runCmd'
$
insertDocuments
masterUserId
corpusId
hyperdataDocuments
printDebug
"Docs IDs : "
(
length
ids
)
idsRepeat
<-
runCmd'
$
insertDocuments
masterUserId
corpusId
hyperdataDocuments
docs
<-
map
addUniqIds
<$>
parseDocs
WOS
fp
ids
<-
runCmd'
$
insertDocuments
masterUserId
corpusId
docs
printDebug
"Docs IDs : "
ids
-- Ngrams Flow
let
documentsWithId
=
mergeData
(
toInserted
ids
)
(
toInsert
hyperdataDocuments
)
let
docsWithNgrams
=
documentIdWithNgrams
extractNgramsT
documentsWithId
let
maps
=
mapNodeIdNgrams
docsWithNgrams
indexedNgrams
<-
runCmd'
$
indexNgrams
maps
_
<-
runCmd'
$
insertToNodeNgrams
indexedNgrams
idsRepeat
<-
runCmd'
$
insertDocuments
masterUserId
corpusId
docs
printDebug
"Docs IDs : "
idsRepeat
-- List Flow
listId
<-
runCmd'
$
listFlow
masterUserId
corpusId
indexedNgrams
printDebug
"list id:"
listId
(
_
,
_
,
corpusId2
)
<-
subFlow
"alexandre"
printDebug
"Docs IDs : "
(
length
idsRepeat
)
(
_
,
_
,
corpusId2
)
<-
subFlow
"alexandre"
cName
inserted
<-
runCmd'
$
add
corpusId2
(
map
reId
ids
)
printDebug
"Inserted : "
inserted
printDebug
"Inserted : "
(
length
inserted
)
runCmd'
$
del
[
corpusId2
,
corpusId
]
pure
[
corpusId2
,
corpusId
]
--runCmd' $ del [corpusId2, corpusId]
subFlow
::
Username
->
IO
(
UserId
,
RootId
,
CorpusId
)
subFlow
username
=
do
type
CorpusName
=
Text
subFlow
::
Username
->
CorpusName
->
IO
(
UserId
,
RootId
,
CorpusId
)
subFlow
username
cName
=
do
maybeUserId
<-
runCmd'
(
getUser
username
)
let
userId
=
case
maybeUserId
of
...
...
@@ -82,24 +96,24 @@ subFlow username = do
rootId''
<-
case
rootId'
of
[]
->
runCmd'
(
mkRoot
userId
)
un
->
case
length
u
n
>=
2
of
n
->
case
length
n
>=
2
of
True
->
panic
"Error: more than 1 userNode / user"
False
->
pure
rootId'
let
rootId
=
maybe
(
panic
"error rootId"
)
identity
(
head
rootId''
)
corpusId'
<-
runCmd'
$
mkCorpus
(
Just
"Corpus WOS"
)
Nothing
rootId
userId
corpusId'
<-
runCmd'
$
mkCorpus
(
Just
cName
)
Nothing
rootId
userId
let
corpusId
=
maybe
(
panic
"error corpusId"
)
identity
(
head
corpusId'
)
printDebug
"(username, userId, rootId, corpusId"
printDebug
"(username, userId, rootId, corpusId
)
"
(
username
,
userId
,
rootId
,
corpusId
)
pure
(
userId
,
rootId
,
corpusId
)
----------------------------------------------------------------
------------------------------------------------------------------------
type
HashId
=
Text
type
NodeId
=
Int
type
ListId
=
Int
type
ToInsert
=
Map
HashId
HyperdataDocument
type
Inserted
=
Map
HashId
ReturnId
toInsert
::
[
HyperdataDocument
]
->
Map
HashId
HyperdataDocument
toInsert
=
DM
.
fromList
.
map
(
\
d
->
(
hash
(
_hyperdataDocument_uniqIdBdd
d
),
d
))
...
...
@@ -111,23 +125,32 @@ toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
$
filter
(
\
r
->
reInserted
r
==
True
)
rs
data
DocumentWithId
=
DocumentWithId
{
documentId
::
NodeId
DocumentWithId
{
documentId
::
NodeId
,
documentData
::
HyperdataDocument
}
mergeData
::
Map
HashId
ReturnId
->
Map
HashId
HyperdataDocument
->
[
DocumentWithId
]
mergeData
rs
hs
=
map
(
\
(
hash
,
r
)
->
DocumentWithId
(
reId
r
)
(
lookup'
hash
hs
))
$
DM
.
toList
rs
where
lookup'
h
xs
=
maybe
(
panic
$
"Error with "
<>
h
)
identity
(
DM
.
lookup
h
xs
)
------------------------------------------------------------------------
data
DocumentIdWithNgrams
=
DocumentIdWithNgrams
{
documentWithId
::
DocumentWithId
,
document_ngrams
::
Map
(
NgramsT
Ngrams
)
Int
}
-- TODO add Authors and Terms (Title + Abstract)
-- add f :: Text -> Text
-- newtype Ngrams = Ngrams Text
extractNgramsT
::
HyperdataDocument
->
Map
(
NgramsT
Ngrams
)
Int
extractNgramsT
doc
=
DM
.
fromList
$
[(
NgramsT
Sources
ngrams
,
1
)]
where
ngrams
=
text2ngrams
$
maybe
"Nothing"
identity
maybeNgrams
maybeNgrams
=
_hyperdataDocument_source
doc
documentIdWithNgrams
::
(
HyperdataDocument
->
Map
(
NgramsT
Ngrams
)
Int
)
->
[
DocumentWithId
]
->
[
DocumentIdWithNgrams
]
documentIdWithNgrams
f
=
map
(
\
d
->
DocumentIdWithNgrams
d
((
f
.
documentData
)
d
))
...
...
@@ -149,20 +172,39 @@ indexNgrams ng2nId = do
insertToNodeNgrams
::
Map
(
NgramsT
NgramsIndexed
)
(
Map
NodeId
Int
)
->
Cmd
Int
insertToNodeNgrams
m
=
insertNodeNgrams
[
NodeNgram
Nothing
nId
((
_ngramsId
.
_ngramsT
)
ng
)
(
fromIntegral
n
)
((
ngramsTypeId
.
_ngramsType
)
ng
)
|
(
ng
,
nId2int
)
<-
DM
.
toList
m
,
(
nId
,
n
)
<-
DM
.
toList
nId2int
]
------------------------------------------------------------------------
------------------------------------------------------------------------
groupNgramsBy
::
(
Ngrams
->
Ngrams
->
Bool
)
listFlow
::
UserId
->
CorpusId
->
Map
(
NgramsT
NgramsIndexed
)
(
Map
NodeId
Int
)
->
Cmd
ListId
listFlow
uId
cId
ng
=
do
lId
<-
maybe
(
panic
"mkList error"
)
identity
<$>
head
<$>
mkList
cId
uId
-- TODO add stemming equivalence of 2 ngrams
let
groupEd
=
groupNgramsBy
(
\
(
NgramsT
t1
n1
)
(
NgramsT
t2
n2
)
->
if
(((
==
)
t1
t2
)
&&
((
==
)
n1
n2
))
then
(
Just
(
n1
,
n2
))
else
Nothing
)
ng
_
<-
insertGroups
lId
groupEd
-- compute Candidate / Map
let
lists
=
ngrams2list
ng
_
<-
insertLists
lId
lists
pure
lId
------------------------------------------------------------------------
groupNgramsBy
::
(
NgramsT
NgramsIndexed
->
NgramsT
NgramsIndexed
->
Maybe
(
NgramsIndexed
,
NgramsIndexed
))
->
Map
(
NgramsT
NgramsIndexed
)
(
Map
NodeId
Int
)
->
Map
NgramsIndexed
NgramsIndexed
groupNgramsBy
=
undefined
groupNgramsBy
isEqual
cId
=
DM
.
fromList
$
catMaybes
[
isEqual
n1
n2
|
n1
<-
DM
.
keys
cId
,
n2
<-
DM
.
keys
cId
]
-- TODO check: do not insert duplicates
insertGroups
::
ListId
->
Map
NgramsIndexed
NgramsIndexed
->
Cmd
Int
insertGroups
lId
ngrs
=
insertNodeNgramsNgramsNew
$
[
NodeNgramsNgrams
lId
ng1
ng2
(
Just
1
)
insertNodeNgramsNgramsNew
[
NodeNgramsNgrams
lId
ng1
ng2
(
Just
1
)
|
(
ng1
,
ng2
)
<-
map
(
both
_ngramsId
)
$
DM
.
toList
ngrs
]
...
...
@@ -177,18 +219,6 @@ insertLists lId list2ngrams =
|
(
l
,
ngr
)
<-
map
(
second
_ngramsId
)
$
DM
.
toList
list2ngrams
]
listFlow
::
UserId
->
CorpusId
->
Map
(
NgramsT
NgramsIndexed
)
(
Map
NodeId
Int
)
->
Cmd
ListId
listFlow
uId
cId
ng
=
do
lId
<-
maybe
(
panic
"mkList error"
)
identity
<$>
head
<$>
mkList
cId
uId
-- TODO add stemming equivalence of 2 ngrams
let
groupEd
=
groupNgramsBy
(
==
)
ng
_
<-
insertGroups
lId
groupEd
-- compute Candidate / Map
let
lists
=
ngrams2list
ng
_
<-
insertLists
lId
lists
pure
lId
------------------------------------------------------------------------
------------------------------------------------------------------------
src/Gargantext/Database/Ngrams.hs
View file @
30aae72b
...
...
@@ -27,7 +27,7 @@ module Gargantext.Database.Ngrams where
import
Control.Lens
(
makeLenses
)
import
Data.ByteString.Internal
(
ByteString
)
import
Data.Map
(
Map
,
fromList
,
lookup
)
import
Data.Text
(
Text
)
import
Data.Text
(
Text
,
splitOn
)
import
Database.PostgreSQL.Simple.FromRow
(
fromRow
,
field
)
import
Database.PostgreSQL.Simple.SqlQQ
(
sql
)
import
Database.PostgreSQL.Simple.ToField
(
toField
)
...
...
@@ -77,6 +77,7 @@ import qualified Database.PostgreSQL.Simple as DPS
-- ngrams in authors field of document has Authors Type
-- ngrams in text (title or abstract) of documents has Terms Type
data
NgramsType
=
Sources
|
Authors
|
Terms
deriving
(
Eq
)
ngramsTypeId
::
NgramsType
->
Int
ngramsTypeId
Terms
=
1
...
...
@@ -100,6 +101,9 @@ makeLenses ''Ngrams
instance
DPS
.
ToRow
Ngrams
where
toRow
(
Ngrams
t
s
)
=
[
toField
t
,
toField
s
]
text2ngrams
::
Text
->
Ngrams
text2ngrams
txt
=
Ngrams
txt
$
length
$
splitOn
" "
txt
-------------------------------------------------------------------------
-- | TODO put it in Gargantext.Text.Ngrams
-- Named entity are typed ngrams of Terms Ngrams
...
...
@@ -108,9 +112,16 @@ data NgramsT a =
,
_ngramsT
::
a
}
deriving
(
Generic
)
instance
Eq
(
NgramsT
a
)
where
(
==
)
=
(
==
)
instance
Eq
(
NgramsT
a
)
where
(
==
)
=
(
==
)
-- where NgramsT
-- t1 == t2
-- n1 == n2
instance
Ord
(
NgramsT
a
)
where
compare
=
compare
makeLenses
''
N
gramsT
-----------------------------------------------------------------------
data
NgramsIndexed
=
NgramsIndexed
...
...
src/Gargantext/Text/Flow.hs
View file @
30aae72b
...
...
@@ -59,15 +59,15 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
contextText
::
[
T
.
Text
]
contextText
=
map
T
.
pack
[
"The dog is an animal."
,
"The bird is a animal."
,
"The dog is a animal."
,
"The bird is a
n
animal."
,
"The dog is a
n
animal."
,
"The animal is a bird or a dog ?"
,
"The table is a object."
,
"The pen is a object."
,
"The table is a
n
object."
,
"The pen is a
n
object."
,
"The object is a pen or a table ?"
,
"The girl is
human body
."
,
"The boy is
human body
."
,
"The boy or the girl are human
body
."
,
"The girl is
a human
."
,
"The boy is
a human
."
,
"The boy or the girl are human."
]
...
...
@@ -75,7 +75,7 @@ contextText = map T.pack ["The dog is an animal."
data
TextFlow
=
CSV
FilePath
|
FullText
FilePath
|
Contexts
[
T
.
Text
]
|
DB
Connection
CorpusId
|
DB
V3
Connection
CorpusId
|
Query
T
.
Text
...
...
@@ -85,7 +85,7 @@ textFlow termType workType = do
FullText
path
->
splitBy
(
Sentences
5
)
<$>
readFile
path
CSV
path
->
readCsvOn
[
csv_title
,
csv_abstract
]
path
Contexts
ctxt
->
pure
ctxt
DB
con
corpusId
->
catMaybes
<$>
map
(
\
n
->
hyperdataDocumentV3_title
(
_node_hyperdata
n
)
<>
hyperdataDocumentV3_abstract
(
_node_hyperdata
n
))
<$>
getDocumentsV3WithParentId
con
corpusId
DB
V3
con
corpusId
->
catMaybes
<$>
map
(
\
n
->
hyperdataDocumentV3_title
(
_node_hyperdata
n
)
<>
hyperdataDocumentV3_abstract
(
_node_hyperdata
n
))
<$>
getDocumentsV3WithParentId
con
corpusId
_
->
undefined
-- TODO Query not supported
textFlow'
termType
contexts
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment