Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Julien Moutinho
haskell-gargantext
Commits
ab68f83b
Verified
Commit
ab68f83b
authored
Feb 15, 2024
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[nodestory] some Flow rewrite
Split into Extract/Types/Utils modules. Found some functions that are unused.
parent
6b9588b1
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
537 additions
and
408 deletions
+537
-408
gargantext.cabal
gargantext.cabal
+2
-0
NodeStory.hs
src/Gargantext/Core/NodeStory.hs
+3
-148
DB.hs
src/Gargantext/Core/NodeStory/DB.hs
+188
-0
Flow.hs
src/Gargantext/Database/Action/Flow.hs
+44
-243
Extract.hs
src/Gargantext/Database/Action/Flow/Extract.hs
+111
-0
Types.hs
src/Gargantext/Database/Action/Flow/Types.hs
+41
-6
Utils.hs
src/Gargantext/Database/Action/Flow/Utils.hs
+145
-9
Metrics.hs
src/Gargantext/Database/Action/Metrics.hs
+3
-2
No files found.
gargantext.cabal
View file @
ab68f83b
...
...
@@ -122,6 +122,7 @@ library
Gargantext.Core.Methods.Similarities
Gargantext.Core.NLP
Gargantext.Core.NodeStory
Gargantext.Core.NodeStory.DB
Gargantext.Core.NodeStory.Types
Gargantext.Core.Text
Gargantext.Core.Text.Context
...
...
@@ -341,6 +342,7 @@ library
Gargantext.Database
Gargantext.Database.Action.Delete
Gargantext.Database.Action.Flow.Annuaire
Gargantext.Database.Action.Flow.Extract
Gargantext.Database.Action.Flow.List
Gargantext.Database.Action.Flow.Pairing
Gargantext.Database.Action.Flow.Utils
...
...
src/Gargantext/Core/NodeStory.hs
View file @
ab68f83b
{-|
Module : Gargantext.Core.NodeStory
Description : Node
API generation
Description : Node
Story
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
...
...
@@ -50,10 +50,6 @@ module Gargantext.Core.NodeStory
,
getNodesArchiveHistory
,
Archive
(
..
)
,
nodeExists
,
runPGSQuery
,
runPGSAdvisoryLock
,
runPGSAdvisoryUnlock
,
runPGSAdvisoryXactLock
,
getNodesIdWithType
,
fromDBNodeStoryEnv
,
upsertNodeStories
...
...
@@ -67,98 +63,23 @@ where
import
Control.Lens
((
^.
),
(
.~
),
(
%~
),
non
,
_Just
,
at
,
view
)
import
Control.Monad.Except
import
Data.HashMap.Strict
(
HashMap
)
import
Data.HashMap.Strict
qualified
as
HashMap
import
Data.Map.Strict
qualified
as
Map
import
Data.Map.Strict.Patch
qualified
as
PM
import
Data.Monoid
import
Data.Pool
(
Pool
,
withResource
)
import
Data.Set
qualified
as
Set
import
Database.PostgreSQL.Simple
qualified
as
PGS
import
Database.PostgreSQL.Simple.SqlQQ
(
sql
)
import
Database.PostgreSQL.Simple.ToField
qualified
as
PGS
import
Database.PostgreSQL.Simple.Types
(
Values
(
..
),
QualifiedIdentifier
(
..
))
import
Gargantext.API.Ngrams.Types
import
Gargantext.Core
(
toDBid
)
import
Gargantext.Core
.NodeStory.DB
import
Gargantext.Core.NodeStory.Types
import
Gargantext.Core.Types
(
ListId
,
NodeId
(
..
)
,
NodeType
)
import
Gargantext.Core.Types
(
ListId
,
NodeId
(
..
))
import
Gargantext.Database.Admin.Config
()
import
Gargantext.Database.Prelude
(
HasConnectionPool
(
..
))
import
Gargantext.Database.Query.Table.Ngrams
qualified
as
TableNgrams
import
Gargantext.Database.Schema.Ngrams
(
NgramsType
)
import
Gargantext.Prelude
hiding
(
to
)
import
Gargantext.Prelude.Database
-- DB stuff
nodeExists
::
PGS
.
Connection
->
NodeId
->
IO
Bool
nodeExists
c
nId
=
(
==
[
PGS
.
Only
True
])
<$>
runPGSQuery
c
[
sql
|
SELECT true FROM nodes WHERE id = ? LIMIT 1
|]
(
PGS
.
Only
nId
)
getNodesIdWithType
::
PGS
.
Connection
->
NodeType
->
IO
[
NodeId
]
getNodesIdWithType
c
nt
=
do
ns
<-
runPGSQuery
c
query
(
PGS
.
Only
$
toDBid
nt
)
pure
$
map
(
\
(
PGS
.
Only
nId
)
->
UnsafeMkNodeId
nId
)
ns
where
query
::
PGS
.
Query
query
=
[
sql
|
SELECT id FROM nodes WHERE typename = ?
|]
-- /!\ This function is using an hard coded parameter
-- which depends on the Ngrams List Flow
-- Version > 5 is hard coded because by default
-- first version of history of manual change is 6
getNodesArchiveHistory
::
PGS
.
Connection
->
[
NodeId
]
->
IO
[(
NodeId
,
(
Map
NgramsType
[
HashMap
NgramsTerm
NgramsPatch
]))]
getNodesArchiveHistory
c
nodesId
=
do
as
<-
runPGSQuery
c
query
(
PGS
.
Only
$
Values
fields
nodesId
)
::
IO
[(
Int
,
TableNgrams
.
NgramsType
,
NgramsTerm
,
NgramsPatch
)]
pure
$
map
(
\
(
nId
,
ngramsType
,
terms
,
patch
)
->
(
UnsafeMkNodeId
nId
,
Map
.
singleton
ngramsType
[
HashMap
.
singleton
terms
patch
]
)
)
as
where
fields
=
[
QualifiedIdentifier
Nothing
"int4"
]
query
::
PGS
.
Query
query
=
[
sql
|
WITH nodes_id(nid) as (?)
SELECT node_id, ngrams_type_id, terms, patch
FROM node_story_archive_history
JOIN ngrams ON ngrams.id = ngrams_id
JOIN nodes_id n ON node_id = n.nid
WHERE version > 5
ORDER BY (version, node_story_archive_history.id) DESC
|]
insertNodeArchiveHistory
::
PGS
.
Connection
->
NodeId
->
Version
->
[
NgramsStatePatch'
]
->
IO
()
insertNodeArchiveHistory
_
_
_
[]
=
pure
()
insertNodeArchiveHistory
c
nodeId
version
(
h
:
hs
)
=
do
let
tuples
=
mconcat
$
(
\
(
nType
,
NgramsTablePatch
patch
)
->
(
\
(
term
,
p
)
->
(
nodeId
,
nType
,
term
,
p
))
<$>
PM
.
toList
patch
)
<$>
PM
.
toList
h
::
[(
NodeId
,
TableNgrams
.
NgramsType
,
NgramsTerm
,
NgramsPatch
)]
tuplesM
<-
mapM
(
\
(
nId
,
nType
,
term
,
patch
)
->
do
[
PGS
.
Only
ngramsId
]
<-
runPGSReturning
c
qInsert
[
PGS
.
Only
term
]
::
IO
[
PGS
.
Only
Int
]
pure
(
nId
,
nType
,
ngramsId
,
term
,
patch
)
)
tuples
::
IO
[(
NodeId
,
TableNgrams
.
NgramsType
,
Int
,
NgramsTerm
,
NgramsPatch
)]
_
<-
runPGSExecuteMany
c
query
$
((
\
(
nId
,
nType
,
termId
,
_term
,
patch
)
->
(
nId
,
nType
,
termId
,
patch
,
version
))
<$>
tuplesM
)
_
<-
insertNodeArchiveHistory
c
nodeId
version
hs
pure
()
where
qInsert
::
PGS
.
Query
qInsert
=
[
sql
|
INSERT INTO ngrams (terms) VALUES (?)
ON CONFLICT (terms) DO UPDATE SET terms = excluded.terms
RETURNING id
|]
-- https://stackoverflow.com/questions/39224438/postgresql-insert-if-foreign-key-exists
query
::
PGS
.
Query
query
=
[
sql
|
INSERT INTO node_story_archive_history(node_id, ngrams_type_id, ngrams_id, patch, version)
VALUES (?, ?, ?, ?, ?)
|]
getNodeStory'
::
PGS
.
Connection
->
NodeId
->
IO
ArchiveList
getNodeStory'
c
nId
=
do
...
...
@@ -196,14 +117,6 @@ getNodeStory c nId = do
a
<-
getNodeStory'
c
nId
pure
$
NodeStory
$
Map
.
singleton
nId
a
nodeStoriesQuery
::
PGS
.
Query
nodeStoriesQuery
=
[
sql
|
SELECT version, ngrams_type_id, terms, ngrams_repo_element
FROM node_stories
JOIN ngrams ON ngrams.id = ngrams_id
WHERE node_id = ?
|]
-- |Functions to convert archive state (which is a `Map NgramsType
-- (Map NgramsTerm NgramsRepoElement`)) to/from a flat list
archiveStateToList
::
NgramsState'
->
ArchiveStateList
...
...
@@ -224,53 +137,6 @@ insertNodeStory :: PGS.Connection -> NodeId -> ArchiveList -> IO ()
insertNodeStory
c
nId
a
=
do
insertArchiveStateList
c
nId
(
a
^.
a_version
)
(
archiveStateToList
$
a
^.
a_state
)
insertArchiveStateList
::
PGS
.
Connection
->
NodeId
->
Version
->
ArchiveStateList
->
IO
()
insertArchiveStateList
c
nodeId
version
as
=
do
mapM_
performInsert
as
where
performInsert
(
ngramsType
,
ngrams
,
ngramsRepoElement
)
=
do
[
PGS
.
Only
ngramsId
]
<-
tryInsertTerms
ngrams
_
<-
case
ngramsRepoElement
^.
nre_root
of
Nothing
->
pure
[]
Just
r
->
tryInsertTerms
r
mapM_
tryInsertTerms
$
ngramsRepoElement
^.
nre_children
runPGSExecute
c
query
(
nodeId
,
ngramsId
,
version
,
ngramsType
,
ngramsRepoElement
)
tryInsertTerms
::
NgramsTerm
->
IO
[
PGS
.
Only
Int
]
tryInsertTerms
t
=
runPGSReturning
c
qInsert
[
PGS
.
Only
t
]
qInsert
::
PGS
.
Query
qInsert
=
[
sql
|
INSERT INTO ngrams (terms) VALUES (?)
ON CONFLICT (terms) DO UPDATE SET terms = excluded.terms
RETURNING id
|]
query
::
PGS
.
Query
query
=
[
sql
|
INSERT INTO node_stories(node_id, ngrams_id, version, ngrams_type_id, ngrams_repo_element)
VALUES (?, ?, ?, ?, ? :: jsonb)
|]
deleteArchiveStateList
::
PGS
.
Connection
->
NodeId
->
ArchiveStateList
->
IO
()
deleteArchiveStateList
c
nodeId
as
=
do
mapM_
(
\
(
nt
,
n
,
_
)
->
runPGSExecute
c
query
(
nodeId
,
nt
,
n
))
as
where
query
::
PGS
.
Query
query
=
[
sql
|
DELETE FROM node_stories
WHERE node_id = ? AND ngrams_type_id = ?
AND ngrams_id IN (SELECT id FROM ngrams WHERE terms = ?)
|]
updateArchiveStateList
::
PGS
.
Connection
->
NodeId
->
Version
->
ArchiveStateList
->
IO
()
updateArchiveStateList
c
nodeId
version
as
=
do
let
params
=
(
\
(
nt
,
n
,
nre
)
->
(
nre
,
version
,
nodeId
,
nt
,
n
))
<$>
as
mapM_
(
runPGSExecute
c
query
)
params
where
query
::
PGS
.
Query
query
=
[
sql
|
UPDATE node_stories
SET ngrams_repo_element = ?, version = ?
WHERE node_id = ? AND ngrams_type_id = ?
AND ngrams_id IN (SELECT id FROM ngrams WHERE terms = ?)
|]
-- | This function updates the node story and archive for given node_id.
updateNodeStory
::
PGS
.
Connection
->
NodeId
->
ArchiveList
->
ArchiveList
->
IO
()
updateNodeStory
c
nodeId
currentArchive
newArchive
=
do
...
...
@@ -342,17 +208,6 @@ upsertNodeStories c nodeId newArchive = do
-- printDebug "[upsertNodeStories] STOP nId" nId
updateNodeStoryVersion
::
PGS
.
Connection
->
NodeId
->
ArchiveList
->
IO
()
updateNodeStoryVersion
c
nodeId
newArchive
=
do
let
ngramsTypes
=
Map
.
keys
$
newArchive
^.
a_state
mapM_
(
\
nt
->
runPGSExecute
c
query
(
newArchive
^.
a_version
,
nodeId
,
nt
))
ngramsTypes
where
query
::
PGS
.
Query
query
=
[
sql
|
UPDATE node_stories
SET version = ?
WHERE node_id = ?
AND ngrams_type_id = ?
|]
-- | Returns a `NodeListStory`, updating the given one for given `NodeId`
nodeStoryInc
::
PGS
.
Connection
->
NodeListStory
->
NodeId
->
IO
NodeListStory
nodeStoryInc
c
ns
@
(
NodeStory
nls
)
nId
=
do
...
...
src/Gargantext/Core/NodeStory/DB.hs
0 → 100644
View file @
ab68f83b
{-|
Module : Gargantext.Core.NodeStory.DB
Description : NodeStory DB functions
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TemplateHaskell #-}
module
Gargantext.Core.NodeStory.DB
(
nodeExists
,
getNodesIdWithType
,
getNodesArchiveHistory
,
insertNodeArchiveHistory
,
nodeStoriesQuery
,
insertArchiveStateList
,
deleteArchiveStateList
,
updateArchiveStateList
,
updateNodeStoryVersion
)
where
import
Control.Lens
((
^.
))
import
Control.Monad.Except
import
Data.HashMap.Strict
(
HashMap
)
import
Data.HashMap.Strict
qualified
as
HashMap
import
Data.Map.Strict
qualified
as
Map
import
Data.Map.Strict.Patch
qualified
as
PM
import
Data.Monoid
import
Database.PostgreSQL.Simple
qualified
as
PGS
import
Database.PostgreSQL.Simple.SqlQQ
(
sql
)
import
Database.PostgreSQL.Simple.Types
(
Values
(
..
),
QualifiedIdentifier
(
..
))
import
Gargantext.API.Ngrams.Types
import
Gargantext.Core
(
toDBid
)
import
Gargantext.Core.NodeStory.Types
import
Gargantext.Core.Types
(
NodeId
(
..
),
NodeType
)
import
Gargantext.Database.Admin.Config
()
import
Gargantext.Database.Query.Table.Ngrams
qualified
as
TableNgrams
import
Gargantext.Database.Schema.Ngrams
(
NgramsType
)
import
Gargantext.Prelude
hiding
(
to
)
import
Gargantext.Prelude.Database
nodeExists
::
PGS
.
Connection
->
NodeId
->
IO
Bool
nodeExists
c
nId
=
(
==
[
PGS
.
Only
True
])
<$>
runPGSQuery
c
[
sql
|
SELECT true FROM nodes WHERE id = ? LIMIT 1
|]
(
PGS
.
Only
nId
)
getNodesIdWithType
::
PGS
.
Connection
->
NodeType
->
IO
[
NodeId
]
getNodesIdWithType
c
nt
=
do
ns
<-
runPGSQuery
c
query
(
PGS
.
Only
$
toDBid
nt
)
pure
$
map
(
\
(
PGS
.
Only
nId
)
->
UnsafeMkNodeId
nId
)
ns
where
query
::
PGS
.
Query
query
=
[
sql
|
SELECT id FROM nodes WHERE typename = ?
|]
-- /!\ This function is using an hard coded parameter
-- which depends on the Ngrams List Flow
-- Version > 5 is hard coded because by default
-- first version of history of manual change is 6
getNodesArchiveHistory
::
PGS
.
Connection
->
[
NodeId
]
->
IO
[(
NodeId
,
(
Map
NgramsType
[
HashMap
NgramsTerm
NgramsPatch
]))]
getNodesArchiveHistory
c
nodesId
=
do
as
<-
runPGSQuery
c
query
(
PGS
.
Only
$
Values
fields
nodesId
)
::
IO
[(
Int
,
TableNgrams
.
NgramsType
,
NgramsTerm
,
NgramsPatch
)]
pure
$
map
(
\
(
nId
,
ngramsType
,
terms
,
patch
)
->
(
UnsafeMkNodeId
nId
,
Map
.
singleton
ngramsType
[
HashMap
.
singleton
terms
patch
]
)
)
as
where
fields
=
[
QualifiedIdentifier
Nothing
"int4"
]
query
::
PGS
.
Query
query
=
[
sql
|
WITH nodes_id(nid) as (?)
SELECT node_id, ngrams_type_id, terms, patch
FROM node_story_archive_history
JOIN ngrams ON ngrams.id = ngrams_id
JOIN nodes_id n ON node_id = n.nid
WHERE version > 5
ORDER BY (version, node_story_archive_history.id) DESC
|]
insertNodeArchiveHistory
::
PGS
.
Connection
->
NodeId
->
Version
->
[
NgramsStatePatch'
]
->
IO
()
insertNodeArchiveHistory
_
_
_
[]
=
pure
()
insertNodeArchiveHistory
c
nodeId
version
(
h
:
hs
)
=
do
let
tuples
=
mconcat
$
(
\
(
nType
,
NgramsTablePatch
patch
)
->
(
\
(
term
,
p
)
->
(
nodeId
,
nType
,
term
,
p
))
<$>
PM
.
toList
patch
)
<$>
PM
.
toList
h
::
[(
NodeId
,
TableNgrams
.
NgramsType
,
NgramsTerm
,
NgramsPatch
)]
tuplesM
<-
mapM
(
\
(
nId
,
nType
,
term
,
patch
)
->
do
[
PGS
.
Only
ngramsId
]
<-
runPGSReturning
c
qInsert
[
PGS
.
Only
term
]
::
IO
[
PGS
.
Only
Int
]
pure
(
nId
,
nType
,
ngramsId
,
term
,
patch
)
)
tuples
::
IO
[(
NodeId
,
TableNgrams
.
NgramsType
,
Int
,
NgramsTerm
,
NgramsPatch
)]
_
<-
runPGSExecuteMany
c
query
$
((
\
(
nId
,
nType
,
termId
,
_term
,
patch
)
->
(
nId
,
nType
,
termId
,
patch
,
version
))
<$>
tuplesM
)
_
<-
insertNodeArchiveHistory
c
nodeId
version
hs
pure
()
where
qInsert
::
PGS
.
Query
qInsert
=
[
sql
|
INSERT INTO ngrams (terms) VALUES (?)
ON CONFLICT (terms) DO UPDATE SET terms = excluded.terms
RETURNING id
|]
-- https://stackoverflow.com/questions/39224438/postgresql-insert-if-foreign-key-exists
query
::
PGS
.
Query
query
=
[
sql
|
INSERT INTO node_story_archive_history(node_id, ngrams_type_id, ngrams_id, patch, version)
VALUES (?, ?, ?, ?, ?)
|]
nodeStoriesQuery
::
PGS
.
Query
nodeStoriesQuery
=
[
sql
|
SELECT version, ngrams_type_id, terms, ngrams_repo_element
FROM node_stories
JOIN ngrams ON ngrams.id = ngrams_id
WHERE node_id = ?
|]
-- Archive
insertArchiveStateList
::
PGS
.
Connection
->
NodeId
->
Version
->
ArchiveStateList
->
IO
()
insertArchiveStateList
c
nodeId
version
as
=
do
mapM_
performInsert
as
where
performInsert
(
ngramsType
,
ngrams
,
ngramsRepoElement
)
=
do
[
PGS
.
Only
ngramsId
]
<-
tryInsertTerms
ngrams
_
<-
case
ngramsRepoElement
^.
nre_root
of
Nothing
->
pure
[]
Just
r
->
tryInsertTerms
r
mapM_
tryInsertTerms
$
ngramsRepoElement
^.
nre_children
runPGSExecute
c
query
(
nodeId
,
ngramsId
,
version
,
ngramsType
,
ngramsRepoElement
)
tryInsertTerms
::
NgramsTerm
->
IO
[
PGS
.
Only
Int
]
tryInsertTerms
t
=
runPGSReturning
c
qInsert
[
PGS
.
Only
t
]
qInsert
::
PGS
.
Query
qInsert
=
[
sql
|
INSERT INTO ngrams (terms) VALUES (?)
ON CONFLICT (terms) DO UPDATE SET terms = excluded.terms
RETURNING id
|]
query
::
PGS
.
Query
query
=
[
sql
|
INSERT INTO node_stories(node_id, ngrams_id, version, ngrams_type_id, ngrams_repo_element)
VALUES (?, ?, ?, ?, ? :: jsonb)
|]
deleteArchiveStateList
::
PGS
.
Connection
->
NodeId
->
ArchiveStateList
->
IO
()
deleteArchiveStateList
c
nodeId
as
=
do
mapM_
(
\
(
nt
,
n
,
_
)
->
runPGSExecute
c
query
(
nodeId
,
nt
,
n
))
as
where
query
::
PGS
.
Query
query
=
[
sql
|
DELETE FROM node_stories
WHERE node_id = ? AND ngrams_type_id = ?
AND ngrams_id IN (SELECT id FROM ngrams WHERE terms = ?)
|]
updateArchiveStateList
::
PGS
.
Connection
->
NodeId
->
Version
->
ArchiveStateList
->
IO
()
updateArchiveStateList
c
nodeId
version
as
=
do
let
params
=
(
\
(
nt
,
n
,
nre
)
->
(
nre
,
version
,
nodeId
,
nt
,
n
))
<$>
as
mapM_
(
runPGSExecute
c
query
)
params
where
query
::
PGS
.
Query
query
=
[
sql
|
UPDATE node_stories
SET ngrams_repo_element = ?, version = ?
WHERE node_id = ? AND ngrams_type_id = ?
AND ngrams_id IN (SELECT id FROM ngrams WHERE terms = ?)
|]
updateNodeStoryVersion
::
PGS
.
Connection
->
NodeId
->
ArchiveList
->
IO
()
updateNodeStoryVersion
c
nodeId
newArchive
=
do
let
ngramsTypes
=
Map
.
keys
$
newArchive
^.
a_state
mapM_
(
\
nt
->
runPGSExecute
c
query
(
newArchive
^.
a_version
,
nodeId
,
nt
))
ngramsTypes
where
query
::
PGS
.
Query
query
=
[
sql
|
UPDATE node_stories
SET version = ?
WHERE node_id = ?
AND ngrams_type_id = ?
|]
src/Gargantext/Database/Action/Flow.hs
View file @
ab68f83b
...
...
@@ -15,8 +15,6 @@ Portability : POSIX
-- TODO-EVENTS: InsertedNodes
-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE InstanceSigs #-}
...
...
@@ -41,7 +39,6 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
,
addDocumentsToHyperCorpus
,
reIndexWith
,
docNgrams
,
getOrMkRoot
,
getOrMk_RootWithCorpus
...
...
@@ -50,53 +47,44 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
,
allDataOrigins
,
do_api
,
indexAllDocumentsWithPosTag
)
where
import
Conduit
import
Control.Lens
hiding
(
elements
,
Indexed
)
import
Data.
Aeson.TH
(
deriveJSON
)
import
Data.
Bifunctor
qualified
as
B
import
Data.Conduit
qualified
as
C
import
Data.Conduit.Internal
(
zipSources
)
import
Data.Conduit.List
qualified
as
CL
import
Data.Conduit.List
qualified
as
CList
import
Data.HashMap.Strict
qualified
as
HashMap
import
Data.List
qualified
as
List
import
Data.Map.Strict
(
lookup
)
import
Data.Map.Strict
qualified
as
Map
import
Data.Proxy
import
Data.Set
qualified
as
Set
import
Data.Swagger
import
Data.Text
qualified
as
T
import
EPO.API.Client.Types
qualified
as
EPO
import
Gargantext.API.Ngrams.Tools
(
getTermsWith
)
import
Gargantext.API.Ngrams.Types
qualified
as
NT
import
Gargantext.Core
(
Lang
(
..
),
PosTagAlgo
(
..
),
NLPServerConfig
)
import
Gargantext.Core
(
Lang
(
..
),
NLPServerConfig
)
import
Gargantext.Core
(
withDefaultLanguage
)
import
Gargantext.Core.Ext.IMTUser
(
readFile_Annuaire
)
import
Gargantext.Core.Flow.Types
import
Gargantext.Core.NLP
(
HasNLPServer
,
nlpServerGet
)
import
Gargantext.Core.NodeStory
(
HasNodeStory
)
import
Gargantext.Core.Text
import
Gargantext.Core.Text.Corpus.API
qualified
as
API
import
Gargantext.Core.Text.Corpus.Parsers
(
parseFile
,
FileFormat
,
FileType
,
splitOn
)
import
Gargantext.Core.Text.Corpus.Parsers
(
parseFile
,
FileFormat
,
FileType
)
import
Gargantext.Core.Text.List
(
buildNgramsLists
)
import
Gargantext.Core.Text.List.Group.WithStem
(
{-StopSize(..),-}
GroupParams
(
..
))
import
Gargantext.Core.Text.List.Social
(
FlowSocialListWith
(
..
))
import
Gargantext.Core.Text.Terms
import
Gargantext.Core.Text.Terms.Mono.Stem.En
(
stemIt
)
import
Gargantext.Core.Text.Terms.WithList
(
MatchedText
,
buildPatternsWith
,
termsInText
)
import
Gargantext.Core.Types
(
HasValidationError
,
POS
(
NP
),
TermsCount
)
import
Gargantext.Core.Types
(
HasValidationError
,
TermsCount
)
import
Gargantext.Core.Types.Individu
(
User
(
..
))
import
Gargantext.Core.Types.Main
import
Gargantext.Core.Types.Query
(
Limit
)
import
Gargantext.Core.Utils
(
addTuples
)
import
Gargantext.Core.Utils.Prefix
(
unPrefix
,
unPrefixSwagger
)
import
Gargantext.Data.HashMap.Strict.Utils
qualified
as
HashMap
import
Gargantext.Database.Action.Flow.Extract
()
-- ExtractNgramsT instances
import
Gargantext.Database.Action.Flow.List
import
Gargantext.Database.Action.Flow.Utils
(
docNgrams
,
documentIdWithNgrams
,
insertDocNgrams
,
insertDocs
,
mapNodeIdNgrams
)
import
Gargantext.Database.Action.Flow.Types
import
Gargantext.Database.Action.Flow.Utils
(
insertDocNgrams
,
DocumentIdWithNgrams
(
..
))
import
Gargantext.Database.Action.Metrics
(
updateNgramsOccurrences
,
updateContextScore
)
import
Gargantext.Database.Action.Search
(
searchDocInDatabase
)
import
Gargantext.Database.Admin.Config
(
userMaster
,
corpusMasterName
)
...
...
@@ -112,42 +100,23 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import
Gargantext.Database.Query.Table.NodeContext
(
selectDocNodes
)
import
Gargantext.Database.Query.Table.NodeNgrams
(
listInsertDb
,
getCgramsId
)
import
Gargantext.Database.Query.Tree.Root
(
getOrMkRoot
,
getOrMk_RootWithCorpus
)
import
Gargantext.Database.Schema.Context
import
Gargantext.Database.Schema.Node
(
NodePoly
(
..
),
node_id
)
import
Gargantext.Database.Schema.Node
(
node_hyperdata
)
import
Gargantext.Database.Types
import
Gargantext.Prelude
hiding
(
to
)
import
Gargantext.Prelude.Config
(
GargConfig
(
..
))
import
Gargantext.Prelude.Crypto.Hash
(
Hash
)
import
Gargantext.System.Logging
import
Gargantext.Utils.Jobs
(
JobHandle
,
MonadJobStatus
(
..
))
import
PUBMED.Types
qualified
as
PUBMED
import
qualified
Data.Bifunctor
as
B
------------------------------------------------------------------------
-- Imports for upgrade function
import
Gargantext.Database.Query.Tree.Root
(
getRootId
)
import
Gargantext.Database.Query.Tree
(
findNodesId
,
HasTreeError
)
import
Gargantext.Database.Query.Tree
(
HasTreeError
)
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data
DataOrigin
=
InternalOrigin
{
_do_api
::
API
.
ExternalAPIs
}
|
ExternalOrigin
{
_do_api
::
API
.
ExternalAPIs
}
-- TODO Web
deriving
(
Generic
,
Eq
)
makeLenses
''
D
ataOrigin
deriveJSON
(
unPrefix
"_do_"
)
''
D
ataOrigin
instance
ToSchema
DataOrigin
where
declareNamedSchema
=
genericDeclareNamedSchema
(
unPrefixSwagger
"_do_"
)
allDataOrigins
::
[
DataOrigin
]
allDataOrigins
=
map
InternalOrigin
API
.
externalAPIs
<>
map
ExternalOrigin
API
.
externalAPIs
---------------
data
DataText
=
DataOld
!
[
NodeId
]
|
DataNew
!
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
IO
()
)
--- | DataNew ![[HyperdataDocument]]
-- Show instance is not possible because of IO
printDataText
::
DataText
->
IO
()
...
...
@@ -311,9 +280,9 @@ flow c u cn la mfslw (count, docsC) jobHandle = do
-- TODO if public insertMasterDocs else insertUserDocs
nlpServer
<-
view
$
nlpServerGet
(
_tt_lang
la
)
runConduit
$
zipSources
(
yieldMany
([
1
..
]
::
[
Int
]))
docsC
.|
CList
.
chunksOf
100
.|
mapM_C
(
addDocumentsWithProgress
nlpServer
userCorpusId
)
.|
sinkNull
.|
CList
.
chunksOf
100
.|
mapM_C
(
addDocumentsWithProgress
nlpServer
userCorpusId
)
.|
sinkNull
$
(
logLocM
)
DEBUG
"Calling flowCorpusUser"
flowCorpusUser
(
la
^.
tt_lang
)
u
userCorpusId
listId
c
mfslw
...
...
@@ -384,26 +353,8 @@ flowCorpusUser :: ( HasNodeError err
->
Maybe
FlowSocialListWith
->
m
CorpusId
flowCorpusUser
l
user
userCorpusId
listId
ctype
mfslw
=
do
server
<-
view
(
nlpServerGet
l
)
-- User List Flow
(
masterUserId
,
_masterRootId
,
masterCorpusId
)
<-
getOrMk_RootWithCorpus
(
UserName
userMaster
)
(
Left
""
)
ctype
buildSocialList
l
user
userCorpusId
listId
ctype
mfslw
--let gp = (GroupParams l 2 3 (StopSize 3))
-- Here the PosTagAlgo should be chosen according to the Lang
_
<-
case
mfslw
of
(
Just
(
NoList
_
))
->
do
-- printDebug "Do not build list" mfslw
pure
()
_
->
do
ngs
<-
buildNgramsLists
user
userCorpusId
masterCorpusId
mfslw
$
GroupWithPosTag
l
server
HashMap
.
empty
-- printDebug "flowCorpusUser:ngs" ngs
_userListId
<-
flowList_DbRepo
listId
ngs
_mastListId
<-
getOrMkList
masterCorpusId
masterUserId
pure
()
-- _ <- insertOccsUpdates userCorpusId mastListId
--
_
<-
mkPhylo
userCorpusId
userId
-- Annuaire Flow
...
...
@@ -415,6 +366,39 @@ flowCorpusUser l user userCorpusId listId ctype mfslw = do
pure
userCorpusId
buildSocialList
::
(
HasNodeError
err
,
HasValidationError
err
,
HasNLPServer
env
,
HasTreeError
err
,
HasNodeStory
env
err
m
,
MkCorpus
c
)
=>
Lang
->
User
->
CorpusId
->
ListId
->
Maybe
c
->
Maybe
FlowSocialListWith
->
m
()
buildSocialList
_l
_user
_userCorpusId
_listId
_ctype
(
Just
(
NoList
_
))
=
pure
()
buildSocialList
l
user
userCorpusId
listId
ctype
mfslw
=
do
-- User List Flow
(
masterUserId
,
_masterRootId
,
masterCorpusId
)
<-
getOrMk_RootWithCorpus
(
UserName
userMaster
)
(
Left
""
)
ctype
nlpServer
<-
view
(
nlpServerGet
l
)
--let gp = (GroupParams l 2 3 (StopSize 3))
-- Here the PosTagAlgo should be chosen according to the Lang
ngs
<-
buildNgramsLists
user
userCorpusId
masterCorpusId
mfslw
$
GroupWithPosTag
l
nlpServer
HashMap
.
empty
-- printDebug "flowCorpusUser:ngs" ngs
_userListId
<-
flowList_DbRepo
listId
ngs
_mastListId
<-
getOrMkList
masterCorpusId
masterUserId
pure
()
insertMasterDocs
::
(
DbCmd'
env
err
m
,
HasNodeError
err
,
FlowCorpus
a
...
...
@@ -483,170 +467,6 @@ saveDocNgramsWith lId mapNgramsDocs' = do
------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
insertDocs
::
(
DbCmd'
env
err
m
-- , FlowCorpus a
,
FlowInsertDB
a
,
HasNodeError
err
)
=>
UserId
->
CorpusId
->
[
a
]
->
m
([
ContextId
],
[
Indexed
ContextId
a
])
insertDocs
uId
cId
hs
=
do
let
docs
=
map
addUniqId
hs
newIds
<-
insertDb
uId
Nothing
docs
-- printDebug "newIds" newIds
let
newIds'
=
map
(
nodeId2ContextId
.
reId
)
newIds
documentsWithId
=
mergeData
(
toInserted
newIds
)
(
Map
.
fromList
$
map
viewUniqId'
docs
)
_
<-
Doc
.
add
cId
newIds'
pure
(
newIds'
,
map
(
B
.
first
nodeId2ContextId
)
documentsWithId
)
------------------------------------------------------------------------
viewUniqId'
::
UniqId
a
=>
a
->
(
Hash
,
a
)
viewUniqId'
d
=
maybe
err
(
\
h
->
(
h
,
d
))
(
view
uniqId
d
)
where
err
=
panicTrace
"[ERROR] Database.Flow.toInsert"
toInserted
::
[
ReturnId
]
->
Map
Hash
ReturnId
toInserted
=
Map
.
fromList
.
map
(
\
r
->
(
reUniqId
r
,
r
)
)
.
filter
(
\
r
->
reInserted
r
==
True
)
mergeData
::
Map
Hash
ReturnId
->
Map
Hash
a
->
[
Indexed
NodeId
a
]
mergeData
rs
=
catMaybes
.
map
toDocumentWithId
.
Map
.
toList
where
toDocumentWithId
(
sha
,
hpd
)
=
Indexed
<$>
fmap
reId
(
lookup
sha
rs
)
<*>
Just
hpd
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
documentIdWithNgrams
::
HasNodeError
err
=>
(
a
->
DBCmd
err
(
HashMap
.
HashMap
b
(
Map
NgramsType
Int
,
TermsCount
)))
->
[
Indexed
NodeId
a
]
->
DBCmd
err
[
DocumentIdWithNgrams
a
b
]
documentIdWithNgrams
f
=
traverse
toDocumentIdWithNgrams
where
toDocumentIdWithNgrams
d
=
do
e
<-
f
$
_unIndex
d
pure
$
DocumentIdWithNgrams
d
e
-- | TODO check optimization
mapNodeIdNgrams
::
(
Ord
b
,
Hashable
b
)
=>
[
DocumentIdWithNgrams
a
b
]
->
HashMap
.
HashMap
b
(
Map
NgramsType
(
Map
NodeId
(
Int
,
TermsCount
))
)
mapNodeIdNgrams
=
HashMap
.
unionsWith
(
Map
.
unionWith
(
Map
.
unionWith
addTuples
))
.
fmap
f
where
-- | NOTE We are somehow multiplying 'TermsCount' here: If the
-- same ngrams term has different ngrams types, the 'TermsCount'
-- for it (which is the number of times the terms appears in a
-- document) is copied over to all its types.
f
::
DocumentIdWithNgrams
a
b
->
HashMap
.
HashMap
b
(
Map
NgramsType
(
Map
NodeId
(
Int
,
TermsCount
)))
f
d
=
fmap
(
\
(
ngramsTypeMap
,
cnt
)
->
fmap
(
\
i
->
Map
.
singleton
nId
(
i
,
cnt
))
ngramsTypeMap
)
$
documentNgrams
d
where
nId
=
_index
$
documentWithId
d
------------------------------------------------------------------------
instance
ExtractNgramsT
HyperdataContact
where
extractNgramsT
_ncs
l
hc
=
HashMap
.
mapKeys
(
cleanExtractedNgrams
255
)
<$>
extract
l
hc
where
extract
::
TermType
Lang
->
HyperdataContact
->
DBCmd
err
(
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
,
TermsCount
))
extract
_l
hc'
=
do
let
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
\
a
->
[
a
])
$
view
(
hc_who
.
_Just
.
cw_lastName
)
hc'
pure
$
HashMap
.
fromList
$
[(
SimpleNgrams
a'
,
(
Map
.
singleton
Authors
1
,
1
))
|
a'
<-
authors
]
instance
ExtractNgramsT
HyperdataDocument
where
extractNgramsT
::
NLPServerConfig
->
TermType
Lang
->
HyperdataDocument
->
DBCmd
err
(
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
,
TermsCount
))
extractNgramsT
ncs
lang
hd
=
HashMap
.
mapKeys
(
cleanExtractedNgrams
255
)
<$>
extractNgramsT'
hd
where
extractNgramsT'
::
HyperdataDocument
->
DBCmd
err
(
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
,
TermsCount
))
extractNgramsT'
doc
=
do
let
source
=
text2ngrams
$
maybe
"Nothing"
identity
$
_hd_source
doc
institutes
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
splitOn
Institutes
(
doc
^.
hd_bdd
))
$
_hd_institutes
doc
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
splitOn
Authors
(
doc
^.
hd_bdd
))
$
_hd_authors
doc
termsWithCounts'
<-
map
(
\
(
t
,
cnt
)
->
(
enrichedTerms
(
lang
^.
tt_lang
)
CoreNLP
NP
t
,
cnt
))
<$>
concat
<$>
liftBase
(
extractTerms
ncs
lang
$
hasText
doc
)
pure
$
HashMap
.
fromList
$
[(
SimpleNgrams
source
,
(
Map
.
singleton
Sources
1
,
1
))
]
<>
[(
SimpleNgrams
i'
,
(
Map
.
singleton
Institutes
1
,
1
))
|
i'
<-
institutes
]
<>
[(
SimpleNgrams
a'
,
(
Map
.
singleton
Authors
1
,
1
))
|
a'
<-
authors
]
<>
[(
EnrichedNgrams
t'
,
(
Map
.
singleton
NgramsTerms
1
,
cnt'
))
|
(
t'
,
cnt'
)
<-
termsWithCounts'
]
instance
(
ExtractNgramsT
a
,
HasText
a
)
=>
ExtractNgramsT
(
Node
a
)
where
extractNgramsT
ncs
l
(
Node
{
_node_hyperdata
=
h
})
=
extractNgramsT
ncs
l
h
instance
HasText
a
=>
HasText
(
Node
a
)
where
hasText
(
Node
{
_node_hyperdata
=
h
})
=
hasText
h
-- | TODO putelsewhere
-- | Upgrade function
-- Suppose all documents are English (this is the case actually)
indexAllDocumentsWithPosTag
::
(
HasNodeStory
env
err
m
,
HasNLPServer
env
)
=>
m
()
indexAllDocumentsWithPosTag
=
do
rootId
<-
getRootId
(
UserName
userMaster
)
corpusIds
<-
findNodesId
rootId
[
NodeCorpus
]
docs
<-
List
.
concat
<$>
mapM
getDocumentsWithParentId
corpusIds
_
<-
mapM
extractInsert
(
splitEvery
1000
docs
)
pure
()
extractInsert
::
(
HasNodeStory
env
err
m
,
HasNLPServer
env
)
=>
[
Node
HyperdataDocument
]
->
m
()
extractInsert
docs
=
do
let
documentsWithId
=
map
(
\
doc
->
Indexed
(
doc
^.
node_id
)
doc
)
docs
let
lang
=
EN
ncs
<-
view
$
nlpServerGet
lang
mapNgramsDocs'
<-
mapNodeIdNgrams
<$>
documentIdWithNgrams
(
extractNgramsT
ncs
$
withLang
(
Multi
lang
)
documentsWithId
)
documentsWithId
_
<-
insertExtractedNgrams
$
HashMap
.
keys
mapNgramsDocs'
pure
()
...
...
@@ -680,22 +500,3 @@ reIndexWith cId lId nt lts = do
-- Saving the indexation in database
_
<-
mapM
(
saveDocNgramsWith
lId
)
ngramsByDoc
pure
()
docNgrams
::
Lang
->
NgramsType
->
[
NT
.
NgramsTerm
]
->
Gargantext
.
Database
.
Admin
.
Types
.
Node
.
Context
HyperdataDocument
->
[((
MatchedText
,
TermsCount
),
Map
NgramsType
(
Map
NodeId
Int
))]
docNgrams
lang
nt
ts
doc
=
List
.
zip
(
termsInText
lang
(
buildPatternsWith
lang
ts
)
$
T
.
unlines
$
catMaybes
[
doc
^.
context_hyperdata
.
hd_title
,
doc
^.
context_hyperdata
.
hd_abstract
]
)
(
List
.
cycle
[
Map
.
fromList
$
[(
nt
,
Map
.
singleton
(
doc
^.
context_id
)
1
)]])
src/Gargantext/Database/Action/Flow/Extract.hs
0 → 100644
View file @
ab68f83b
{-|
Module : Gargantext.Database.Flow.Extract
Description : Database Flow
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE InstanceSigs #-}
module
Gargantext.Database.Action.Flow.Extract
where
import
Control.Lens
((
^.
),
_Just
,
view
)
import
Data.HashMap.Strict
qualified
as
HashMap
import
Data.Map.Strict
qualified
as
DM
import
Gargantext.Core
(
Lang
,
NLPServerConfig
,
PosTagAlgo
(
CoreNLP
))
import
Gargantext.Core.Text
(
HasText
(
..
))
import
Gargantext.Core.Text.Corpus.Parsers
(
splitOn
)
import
Gargantext.Core.Text.Terms
(
ExtractNgramsT
,
ExtractedNgrams
(
..
),
TermType
,
cleanExtractedNgrams
,
enrichedTerms
,
extractNgramsT
,
extractTerms
,
tt_lang
)
import
Gargantext.Core.Types
(
POS
(
NP
),
TermsCount
)
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataContact
,
HyperdataDocument
,
cw_lastName
,
hc_who
,
hd_authors
,
hd_bdd
,
hd_institutes
,
hd_source
)
import
Gargantext.Database.Admin.Types.Node
import
Gargantext.Database.Prelude
(
DBCmd
)
import
Gargantext.Database.Schema.Ngrams
import
Gargantext.Database.Schema.Node
(
NodePoly
(
..
))
import
Gargantext.Prelude
------------------------------------------------------------------------
instance
ExtractNgramsT
HyperdataContact
where
extractNgramsT
_ncs
l
hc
=
HashMap
.
mapKeys
(
cleanExtractedNgrams
255
)
<$>
extract
l
hc
where
extract
::
TermType
Lang
->
HyperdataContact
->
DBCmd
err
(
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
,
TermsCount
))
extract
_l
hc'
=
do
let
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
\
a
->
[
a
])
$
view
(
hc_who
.
_Just
.
cw_lastName
)
hc'
pure
$
HashMap
.
fromList
$
[(
SimpleNgrams
a'
,
(
DM
.
singleton
Authors
1
,
1
))
|
a'
<-
authors
]
instance
ExtractNgramsT
HyperdataDocument
where
extractNgramsT
::
NLPServerConfig
->
TermType
Lang
->
HyperdataDocument
->
DBCmd
err
(
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
,
TermsCount
))
extractNgramsT
ncs
lang
hd
=
HashMap
.
mapKeys
(
cleanExtractedNgrams
255
)
<$>
extractNgramsT'
hd
where
extractNgramsT'
::
HyperdataDocument
->
DBCmd
err
(
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
Int
,
TermsCount
))
extractNgramsT'
doc
=
do
let
source
=
text2ngrams
$
maybe
"Nothing"
identity
$
doc
^.
hd_source
institutes
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
splitOn
Institutes
(
doc
^.
hd_bdd
))
$
doc
^.
hd_institutes
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
splitOn
Authors
(
doc
^.
hd_bdd
))
$
doc
^.
hd_authors
termsWithCounts'
<-
map
(
\
(
t
,
cnt
)
->
(
enrichedTerms
(
lang
^.
tt_lang
)
CoreNLP
NP
t
,
cnt
))
<$>
concat
<$>
liftBase
(
extractTerms
ncs
lang
$
hasText
doc
)
pure
$
HashMap
.
fromList
$
[(
SimpleNgrams
source
,
(
DM
.
singleton
Sources
1
,
1
))
]
<>
[(
SimpleNgrams
i'
,
(
DM
.
singleton
Institutes
1
,
1
))
|
i'
<-
institutes
]
<>
[(
SimpleNgrams
a'
,
(
DM
.
singleton
Authors
1
,
1
))
|
a'
<-
authors
]
<>
[(
EnrichedNgrams
t'
,
(
DM
.
singleton
NgramsTerms
1
,
cnt'
))
|
(
t'
,
cnt'
)
<-
termsWithCounts'
]
instance
(
ExtractNgramsT
a
,
HasText
a
)
=>
ExtractNgramsT
(
Node
a
)
where
extractNgramsT
ncs
l
(
Node
{
_node_hyperdata
=
h
})
=
extractNgramsT
ncs
l
h
instance
HasText
a
=>
HasText
(
Node
a
)
where
hasText
(
Node
{
_node_hyperdata
=
h
})
=
hasText
h
-- Apparently unused functions
-- extractInsert :: ( HasNodeStory env err m
-- , HasNLPServer env )
-- => [Node HyperdataDocument] -> m ()
-- extractInsert docs = do
-- let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
-- let lang = EN
-- ncs <- view $ nlpServerGet lang
-- mapNgramsDocs' <- mapNodeIdNgrams
-- <$> documentIdWithNgrams
-- (extractNgramsT ncs $ withLang (Multi lang) documentsWithId)
-- documentsWithId
-- _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
-- pure ()
src/Gargantext/Database/Action/Flow/Types.hs
View file @
ab68f83b
...
...
@@ -9,29 +9,40 @@ Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE TemplateHaskell #-}
module
Gargantext.Database.Action.Flow.Types
where
import
Conduit
(
ConduitT
)
import
Control.Lens
(
makeLenses
)
import
Data.Aeson
(
ToJSON
)
import
Gargantext.Core.Types
(
HasValidationError
)
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.HashMap.Strict
(
HashMap
)
import
Data.Swagger
(
ToSchema
(
..
),
genericDeclareNamedSchema
)
import
Gargantext.Core.Flow.Types
import
Gargantext.Core.Text
import
Gargantext.Core.NodeStory
import
Gargantext.Core.Text
import
Gargantext.Core.Text.Corpus.API
qualified
as
API
import
Gargantext.Core.Text.Terms
import
Gargantext.Database.Query.Table.Node.Error
(
HasNodeError
)
import
Gargantext.Core.Types
(
HasValidationError
,
TermsCount
)
import
Gargantext.Core.Utils.Prefix
(
unPrefix
,
unPrefixSwagger
)
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
)
import
Gargantext.Database.Admin.Types.Node
(
NodeId
)
import
Gargantext.Database.Prelude
(
CmdM
)
import
Gargantext.Database.Query.Table.Node.Document.Insert
import
Gargantext.Database.Query.Table.Node.Error
(
HasNodeError
)
import
Gargantext.Database.Query.Tree.Error
(
HasTreeError
)
import
Gargantext.Database.Schema.Ngrams
(
NgramsType
(
..
))
import
Gargantext.Database.Types
(
Indexed
)
import
Gargantext.Prelude
import
Gargantext.System.Logging
type
FlowCmdM
env
err
m
=
(
CmdM
env
err
m
,
HasNodeStory
env
err
m
...
...
@@ -56,3 +67,27 @@ type FlowInsertDB a = ( AddUniqId a
,
UniqParameters
a
,
InsertDb
a
)
data
DocumentIdWithNgrams
a
b
=
DocumentIdWithNgrams
{
documentWithId
::
Indexed
NodeId
a
,
documentNgrams
::
HashMap
b
(
Map
NgramsType
Int
,
TermsCount
)
}
deriving
(
Show
)
-- TODO use internal with API name (could be old data)
data
DataOrigin
=
InternalOrigin
{
_do_api
::
API
.
ExternalAPIs
}
|
ExternalOrigin
{
_do_api
::
API
.
ExternalAPIs
}
-- TODO Web
deriving
(
Generic
,
Eq
)
makeLenses
''
D
ataOrigin
deriveJSON
(
unPrefix
"_do_"
)
''
D
ataOrigin
instance
ToSchema
DataOrigin
where
declareNamedSchema
=
genericDeclareNamedSchema
(
unPrefixSwagger
"_do_"
)
data
DataText
=
DataOld
!
[
NodeId
]
|
DataNew
!
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
IO
()
)
--- | DataNew ![[HyperdataDocument]]
src/Gargantext/Database/Action/Flow/Utils.hs
View file @
ab68f83b
...
...
@@ -9,30 +9,47 @@ Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE InstanceSigs #-}
module
Gargantext.Database.Action.Flow.Utils
where
(
docNgrams
,
documentIdWithNgrams
,
insertDocNgrams
,
insertDocs
,
mapNodeIdNgrams
)
where
import
Control.Lens
((
^.
))
import
Data.HashMap.Strict
(
HashMap
)
import
Data.HashMap.Strict
qualified
as
HashMap
import
Data.List
qualified
as
List
import
Data.Map.Strict
qualified
as
DM
import
Data.Text
qualified
as
T
import
Gargantext.API.Ngrams.Types
qualified
as
NT
import
Gargantext.Core
(
Lang
,
toDBid
)
import
Gargantext.Core.Flow.Types
(
UniqId
,
uniqId
)
import
Gargantext.Core.Text.Terms.WithList
(
MatchedText
,
buildPatternsWith
,
termsInText
)
import
Gargantext.Core.Types
(
TermsCount
)
import
Gargantext.Core.Utils
(
addTuples
)
import
Gargantext.Data.HashMap.Strict.Utils
qualified
as
HashMap
import
Gargantext.Database.Action.Flow.Types
(
DocumentIdWithNgrams
(
..
),
FlowInsertDB
)
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
,
hd_abstract
,
hd_title
)
import
Gargantext.Database.Admin.Types.Node
import
Gargantext.Database.Prelude
(
DBCmd
)
import
Gargantext.Database.Prelude
(
DBCmd
,
DbCmd
'
)
import
Gargantext.Database.Query.Table.ContextNodeNgrams
import
Gargantext.Database.Query.Table.Node.Document.Add
qualified
as
Doc
(
add
)
import
Gargantext.Database.Query.Table.Node.Document.Insert
(
ReturnId
,
addUniqId
,
insertDb
,
reId
,
reInserted
,
reUniqId
)
import
Gargantext.Database.Query.Table.Node.Error
(
HasNodeError
(
..
))
import
Gargantext.Database.Schema.Context
(
context_hyperdata
,
context_id
)
import
Gargantext.Database.Schema.Ngrams
import
Gargantext.Database.Types
import
Gargantext.Prelude
import
Gargantext.
Core
(
toDBid
)
import
Gargantext.
Prelude.Crypto.Hash
(
Hash
)
data
DocumentIdWithNgrams
a
b
=
DocumentIdWithNgrams
{
documentWithId
::
Indexed
NodeId
a
,
documentNgrams
::
HashMap
b
(
Map
NgramsType
Int
,
TermsCount
)
}
deriving
(
Show
)
insertDocNgrams
::
ListId
->
HashMap
(
Indexed
NgramsId
Ngrams
)
(
Map
NgramsType
(
Map
DocId
(
Int
,
TermsCount
)))
->
DBCmd
err
Int
...
...
@@ -52,3 +69,122 @@ insertDocNgrams lId m = do
-- [(NodeId, {Ngrams: ({NgramsType: Int}, TermsCount)})]
-- {Ngrams: {NgramsType: {NodeId: (Int, TermsCount)}}}
docNgrams
::
Lang
->
NgramsType
->
[
NT
.
NgramsTerm
]
->
Gargantext
.
Database
.
Admin
.
Types
.
Node
.
Context
HyperdataDocument
->
[((
MatchedText
,
TermsCount
),
Map
NgramsType
(
Map
NodeId
Int
))]
docNgrams
lang
nt
ts
doc
=
List
.
zip
(
termsInText
lang
(
buildPatternsWith
lang
ts
)
$
T
.
unlines
$
catMaybes
[
doc
^.
context_hyperdata
.
hd_title
,
doc
^.
context_hyperdata
.
hd_abstract
]
)
(
List
.
cycle
[
DM
.
fromList
$
[(
nt
,
DM
.
singleton
(
doc
^.
context_id
)
1
)]])
documentIdWithNgrams
::
HasNodeError
err
=>
(
a
->
DBCmd
err
(
HashMap
.
HashMap
b
(
Map
NgramsType
Int
,
TermsCount
)))
->
[
Indexed
NodeId
a
]
->
DBCmd
err
[
DocumentIdWithNgrams
a
b
]
documentIdWithNgrams
f
=
traverse
toDocumentIdWithNgrams
where
toDocumentIdWithNgrams
d
=
do
e
<-
f
$
_unIndex
d
pure
$
DocumentIdWithNgrams
d
e
-- | TODO check optimization
mapNodeIdNgrams
::
(
Ord
b
,
Hashable
b
)
=>
[
DocumentIdWithNgrams
a
b
]
->
HashMap
.
HashMap
b
(
Map
NgramsType
(
Map
NodeId
(
Int
,
TermsCount
))
)
mapNodeIdNgrams
=
HashMap
.
unionsWith
(
DM
.
unionWith
(
DM
.
unionWith
addTuples
))
.
fmap
f
where
-- | NOTE We are somehow multiplying 'TermsCount' here: If the
-- same ngrams term has different ngrams types, the 'TermsCount'
-- for it (which is the number of times the terms appears in a
-- document) is copied over to all its types.
f
::
DocumentIdWithNgrams
a
b
->
HashMap
.
HashMap
b
(
Map
NgramsType
(
Map
NodeId
(
Int
,
TermsCount
)))
f
d
=
fmap
(
\
(
ngramsTypeMap
,
cnt
)
->
fmap
(
\
i
->
DM
.
singleton
nId
(
i
,
cnt
))
ngramsTypeMap
)
$
documentNgrams
d
where
nId
=
_index
$
documentWithId
d
-- TODO Type NodeDocumentUnicised
insertDocs
::
(
DbCmd'
env
err
m
-- , FlowCorpus a
,
FlowInsertDB
a
,
HasNodeError
err
)
=>
UserId
->
CorpusId
->
[
a
]
->
m
([
ContextId
],
[
Indexed
ContextId
a
])
insertDocs
uId
cId
hs
=
do
let
docs
=
map
addUniqId
hs
newIds
<-
insertDb
uId
Nothing
docs
-- printDebug "newIds" newIds
let
newIds'
=
map
(
nodeId2ContextId
.
reId
)
newIds
documentsWithId
=
mergeData
(
toInserted
newIds
)
(
DM
.
fromList
$
map
viewUniqId'
docs
)
_
<-
Doc
.
add
cId
newIds'
pure
(
newIds'
,
map
(
first
nodeId2ContextId
)
documentsWithId
)
------------------------------------------------------------------------
viewUniqId'
::
UniqId
a
=>
a
->
(
Hash
,
a
)
viewUniqId'
d
=
maybe
err
(
\
h
->
(
h
,
d
))
(
d
^.
uniqId
)
where
err
=
panicTrace
"[ERROR] Database.Flow.toInsert"
mergeData
::
Map
Hash
ReturnId
->
Map
Hash
a
->
[
Indexed
NodeId
a
]
mergeData
rs
=
catMaybes
.
map
toDocumentWithId
.
DM
.
toList
where
toDocumentWithId
(
sha
,
hpd
)
=
Indexed
<$>
fmap
reId
(
DM
.
lookup
sha
rs
)
<*>
Just
hpd
toInserted
::
[
ReturnId
]
->
Map
Hash
ReturnId
toInserted
=
DM
.
fromList
.
map
(
\
r
->
(
reUniqId
r
,
r
)
)
.
filter
(
\
r
->
reInserted
r
==
True
)
-- Apparently unused functions
-- | TODO putelsewhere
-- | Upgrade function
-- Suppose all documents are English (this is the case actually)
-- indexAllDocumentsWithPosTag :: ( HasNodeStory env err m
-- , HasNLPServer env )
-- => m ()
-- indexAllDocumentsWithPosTag = do
-- rootId <- getRootId (UserName userMaster)
-- corpusIds <- findNodesId rootId [NodeCorpus]
-- docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
-- _ <- mapM extractInsert (splitEvery 1000 docs)
-- pure ()
src/Gargantext/Database/Action/Metrics.hs
View file @
ab68f83b
...
...
@@ -30,7 +30,7 @@ import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import
Gargantext.API.Ngrams.Tools
(
filterListWithRoot
,
groupNodesByNgrams
,
Diagonal
(
..
),
getCoocByNgrams
,
mapTermListRoot
,
RootTerm
,
getRepo
)
import
Gargantext.API.Ngrams.Types
(
TabType
(
..
),
ngramsTypeFromTabType
,
NgramsTerm
(
..
))
import
Gargantext.Core
(
HasDBid
(
toDBid
))
import
Gargantext.Core.NodeStory
hiding
(
runPGSQuery
)
import
Gargantext.Core.NodeStory
import
Gargantext.Core.Text.Metrics
(
scored
,
Scored
(
..
),
{-localMetrics, toScored-}
)
import
Gargantext.Core.Types
(
ListType
(
..
),
NodeType
(
..
),
ContextId
,
contextId2NodeId
)
import
Gargantext.Core.Types.Query
(
Limit
(
..
))
...
...
@@ -74,7 +74,8 @@ getNgramsCooc cId lId tabType maybeLimit = do
------------------------------------------------------------------------
------------------------------------------------------------------------
updateNgramsOccurrences
::
(
HasNodeStory
env
err
m
)
=>
CorpusId
->
ListId
=>
CorpusId
->
ListId
->
m
()
updateNgramsOccurrences
cId
lId
=
do
_
<-
mapM
(
updateNgramsOccurrences'
cId
lId
Nothing
)
[
Terms
,
Sources
,
Authors
,
Institutes
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment