[nodestory] some Flow rewrite

Split into Extract/Types/Utils modules.

Found some functions that are unused.
parent 6b9588b1
Pipeline #5652 canceled with stages
......@@ -122,6 +122,7 @@ library
Gargantext.Core.Methods.Similarities
Gargantext.Core.NLP
Gargantext.Core.NodeStory
Gargantext.Core.NodeStory.DB
Gargantext.Core.NodeStory.Types
Gargantext.Core.Text
Gargantext.Core.Text.Context
......@@ -341,6 +342,7 @@ library
Gargantext.Database
Gargantext.Database.Action.Delete
Gargantext.Database.Action.Flow.Annuaire
Gargantext.Database.Action.Flow.Extract
Gargantext.Database.Action.Flow.List
Gargantext.Database.Action.Flow.Pairing
Gargantext.Database.Action.Flow.Utils
......
{-|
Module : Gargantext.Core.NodeStory
Description : Node API generation
Description : NodeStory
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -50,10 +50,6 @@ module Gargantext.Core.NodeStory
, getNodesArchiveHistory
, Archive(..)
, nodeExists
, runPGSQuery
, runPGSAdvisoryLock
, runPGSAdvisoryUnlock
, runPGSAdvisoryXactLock
, getNodesIdWithType
, fromDBNodeStoryEnv
, upsertNodeStories
......@@ -67,98 +63,23 @@ where
import Control.Lens ((^.), (.~), (%~), non, _Just, at, view)
import Control.Monad.Except
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict qualified as Map
import Data.Map.Strict.Patch qualified as PM
import Data.Monoid
import Data.Pool (Pool, withResource)
import Data.Set qualified as Set
import Database.PostgreSQL.Simple qualified as PGS
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Database.PostgreSQL.Simple.ToField qualified as PGS
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Gargantext.API.Ngrams.Types
import Gargantext.Core (toDBid)
import Gargantext.Core.NodeStory.DB
import Gargantext.Core.NodeStory.Types
import Gargantext.Core.Types (ListId, NodeId(..), NodeType)
import Gargantext.Core.Types (ListId, NodeId(..))
import Gargantext.Database.Admin.Config ()
import Gargantext.Database.Prelude (HasConnectionPool(..))
import Gargantext.Database.Query.Table.Ngrams qualified as TableNgrams
import Gargantext.Database.Schema.Ngrams (NgramsType)
import Gargantext.Prelude hiding (to)
import Gargantext.Prelude.Database
-- DB stuff
nodeExists :: PGS.Connection -> NodeId -> IO Bool
nodeExists c nId = (== [PGS.Only True])
<$> runPGSQuery c [sql| SELECT true FROM nodes WHERE id = ? LIMIT 1 |]
(PGS.Only nId)
getNodesIdWithType :: PGS.Connection -> NodeType -> IO [NodeId]
getNodesIdWithType c nt = do
ns <- runPGSQuery c query (PGS.Only $ toDBid nt)
pure $ map (\(PGS.Only nId) -> UnsafeMkNodeId nId) ns
where
query :: PGS.Query
query = [sql| SELECT id FROM nodes WHERE typename = ? |]
-- /!\ This function is using an hard coded parameter
-- which depends on the Ngrams List Flow
-- Version > 5 is hard coded because by default
-- first version of history of manual change is 6
getNodesArchiveHistory :: PGS.Connection
-> [NodeId]
-> IO [(NodeId, (Map NgramsType [HashMap NgramsTerm NgramsPatch]))]
getNodesArchiveHistory c nodesId = do
as <- runPGSQuery c query (PGS.Only $ Values fields nodesId)
:: IO [(Int, TableNgrams.NgramsType, NgramsTerm, NgramsPatch)]
pure $ map (\(nId, ngramsType, terms, patch)
-> ( UnsafeMkNodeId nId
, Map.singleton ngramsType [HashMap.singleton terms patch]
)
) as
where
fields = [QualifiedIdentifier Nothing "int4"]
query :: PGS.Query
query = [sql| WITH nodes_id(nid) as (?)
SELECT node_id, ngrams_type_id, terms, patch
FROM node_story_archive_history
JOIN ngrams ON ngrams.id = ngrams_id
JOIN nodes_id n ON node_id = n.nid
WHERE version > 5
ORDER BY (version, node_story_archive_history.id) DESC
|]
insertNodeArchiveHistory :: PGS.Connection -> NodeId -> Version -> [NgramsStatePatch'] -> IO ()
insertNodeArchiveHistory _ _ _ [] = pure ()
insertNodeArchiveHistory c nodeId version (h:hs) = do
let tuples = mconcat $ (\(nType, NgramsTablePatch patch) ->
(\(term, p) ->
(nodeId, nType, term, p)) <$> PM.toList patch) <$> PM.toList h :: [(NodeId, TableNgrams.NgramsType, NgramsTerm, NgramsPatch)]
tuplesM <- mapM (\(nId, nType, term, patch) -> do
[PGS.Only ngramsId] <- runPGSReturning c qInsert [PGS.Only term] :: IO [PGS.Only Int]
pure (nId, nType, ngramsId, term, patch)
) tuples :: IO [(NodeId, TableNgrams.NgramsType, Int, NgramsTerm, NgramsPatch)]
_ <- runPGSExecuteMany c query $ ((\(nId, nType, termId, _term, patch) -> (nId, nType, termId, patch, version)) <$> tuplesM)
_ <- insertNodeArchiveHistory c nodeId version hs
pure ()
where
qInsert :: PGS.Query
qInsert = [sql|INSERT INTO ngrams (terms) VALUES (?)
ON CONFLICT (terms) DO UPDATE SET terms = excluded.terms
RETURNING id|]
-- https://stackoverflow.com/questions/39224438/postgresql-insert-if-foreign-key-exists
query :: PGS.Query
query = [sql| INSERT INTO node_story_archive_history(node_id, ngrams_type_id, ngrams_id, patch, version)
VALUES (?, ?, ?, ?, ?)
|]
getNodeStory' :: PGS.Connection -> NodeId -> IO ArchiveList
getNodeStory' c nId = do
......@@ -196,14 +117,6 @@ getNodeStory c nId = do
a <- getNodeStory' c nId
pure $ NodeStory $ Map.singleton nId a
nodeStoriesQuery :: PGS.Query
nodeStoriesQuery = [sql| SELECT version, ngrams_type_id, terms, ngrams_repo_element
FROM node_stories
JOIN ngrams ON ngrams.id = ngrams_id
WHERE node_id = ?
|]
-- |Functions to convert archive state (which is a `Map NgramsType
-- (Map NgramsTerm NgramsRepoElement`)) to/from a flat list
archiveStateToList :: NgramsState' -> ArchiveStateList
......@@ -224,53 +137,6 @@ insertNodeStory :: PGS.Connection -> NodeId -> ArchiveList -> IO ()
insertNodeStory c nId a = do
insertArchiveStateList c nId (a ^. a_version) (archiveStateToList $ a ^. a_state)
insertArchiveStateList :: PGS.Connection -> NodeId -> Version -> ArchiveStateList -> IO ()
insertArchiveStateList c nodeId version as = do
mapM_ performInsert as
where
performInsert (ngramsType, ngrams, ngramsRepoElement) = do
[PGS.Only ngramsId] <- tryInsertTerms ngrams
_ <- case ngramsRepoElement ^. nre_root of
Nothing -> pure []
Just r -> tryInsertTerms r
mapM_ tryInsertTerms $ ngramsRepoElement ^. nre_children
runPGSExecute c query (nodeId, ngramsId, version, ngramsType, ngramsRepoElement)
tryInsertTerms :: NgramsTerm -> IO [PGS.Only Int]
tryInsertTerms t = runPGSReturning c qInsert [PGS.Only t]
qInsert :: PGS.Query
qInsert = [sql|INSERT INTO ngrams (terms) VALUES (?)
ON CONFLICT (terms) DO UPDATE SET terms = excluded.terms
RETURNING id|]
query :: PGS.Query
query = [sql|INSERT INTO node_stories(node_id, ngrams_id, version, ngrams_type_id, ngrams_repo_element)
VALUES (?, ?, ?, ?, ? :: jsonb)
|]
deleteArchiveStateList :: PGS.Connection -> NodeId -> ArchiveStateList -> IO ()
deleteArchiveStateList c nodeId as = do
mapM_ (\(nt, n, _) -> runPGSExecute c query (nodeId, nt, n)) as
where
query :: PGS.Query
query = [sql| DELETE FROM node_stories
WHERE node_id = ? AND ngrams_type_id = ?
AND ngrams_id IN (SELECT id FROM ngrams WHERE terms = ?)
|]
updateArchiveStateList :: PGS.Connection -> NodeId -> Version -> ArchiveStateList -> IO ()
updateArchiveStateList c nodeId version as = do
let params = (\(nt, n, nre) -> (nre, version, nodeId, nt, n)) <$> as
mapM_ (runPGSExecute c query) params
where
query :: PGS.Query
query = [sql| UPDATE node_stories
SET ngrams_repo_element = ?, version = ?
WHERE node_id = ? AND ngrams_type_id = ?
AND ngrams_id IN (SELECT id FROM ngrams WHERE terms = ?)
|]
-- | This function updates the node story and archive for given node_id.
updateNodeStory :: PGS.Connection -> NodeId -> ArchiveList -> ArchiveList -> IO ()
updateNodeStory c nodeId currentArchive newArchive = do
......@@ -342,17 +208,6 @@ upsertNodeStories c nodeId newArchive = do
-- printDebug "[upsertNodeStories] STOP nId" nId
updateNodeStoryVersion :: PGS.Connection -> NodeId -> ArchiveList -> IO ()
updateNodeStoryVersion c nodeId newArchive = do
let ngramsTypes = Map.keys $ newArchive ^. a_state
mapM_ (\nt -> runPGSExecute c query (newArchive ^. a_version, nodeId, nt)) ngramsTypes
where
query :: PGS.Query
query = [sql|UPDATE node_stories
SET version = ?
WHERE node_id = ?
AND ngrams_type_id = ?|]
-- | Returns a `NodeListStory`, updating the given one for given `NodeId`
nodeStoryInc :: PGS.Connection -> NodeListStory -> NodeId -> IO NodeListStory
nodeStoryInc c ns@(NodeStory nls) nId = do
......
{-|
Module : Gargantext.Core.NodeStory.DB
Description : NodeStory DB functions
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.NodeStory.DB
( nodeExists
, getNodesIdWithType
, getNodesArchiveHistory
, insertNodeArchiveHistory
, nodeStoriesQuery
, insertArchiveStateList
, deleteArchiveStateList
, updateArchiveStateList
, updateNodeStoryVersion )
where
import Control.Lens ((^.))
import Control.Monad.Except
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict qualified as Map
import Data.Map.Strict.Patch qualified as PM
import Data.Monoid
import Database.PostgreSQL.Simple qualified as PGS
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Gargantext.API.Ngrams.Types
import Gargantext.Core (toDBid)
import Gargantext.Core.NodeStory.Types
import Gargantext.Core.Types (NodeId(..), NodeType)
import Gargantext.Database.Admin.Config ()
import Gargantext.Database.Query.Table.Ngrams qualified as TableNgrams
import Gargantext.Database.Schema.Ngrams (NgramsType)
import Gargantext.Prelude hiding (to)
import Gargantext.Prelude.Database
nodeExists :: PGS.Connection -> NodeId -> IO Bool
nodeExists c nId = (== [PGS.Only True])
<$> runPGSQuery c [sql| SELECT true FROM nodes WHERE id = ? LIMIT 1 |]
(PGS.Only nId)
getNodesIdWithType :: PGS.Connection -> NodeType -> IO [NodeId]
getNodesIdWithType c nt = do
ns <- runPGSQuery c query (PGS.Only $ toDBid nt)
pure $ map (\(PGS.Only nId) -> UnsafeMkNodeId nId) ns
where
query :: PGS.Query
query = [sql| SELECT id FROM nodes WHERE typename = ? |]
-- /!\ This function is using an hard coded parameter
-- which depends on the Ngrams List Flow
-- Version > 5 is hard coded because by default
-- first version of history of manual change is 6
getNodesArchiveHistory :: PGS.Connection
-> [NodeId]
-> IO [(NodeId, (Map NgramsType [HashMap NgramsTerm NgramsPatch]))]
getNodesArchiveHistory c nodesId = do
as <- runPGSQuery c query (PGS.Only $ Values fields nodesId)
:: IO [(Int, TableNgrams.NgramsType, NgramsTerm, NgramsPatch)]
pure $ map (\(nId, ngramsType, terms, patch)
-> ( UnsafeMkNodeId nId
, Map.singleton ngramsType [HashMap.singleton terms patch]
)
) as
where
fields = [QualifiedIdentifier Nothing "int4"]
query :: PGS.Query
query = [sql| WITH nodes_id(nid) as (?)
SELECT node_id, ngrams_type_id, terms, patch
FROM node_story_archive_history
JOIN ngrams ON ngrams.id = ngrams_id
JOIN nodes_id n ON node_id = n.nid
WHERE version > 5
ORDER BY (version, node_story_archive_history.id) DESC
|]
insertNodeArchiveHistory :: PGS.Connection -> NodeId -> Version -> [NgramsStatePatch'] -> IO ()
insertNodeArchiveHistory _ _ _ [] = pure ()
insertNodeArchiveHistory c nodeId version (h:hs) = do
let tuples = mconcat $ (\(nType, NgramsTablePatch patch) ->
(\(term, p) ->
(nodeId, nType, term, p)) <$> PM.toList patch) <$> PM.toList h :: [(NodeId, TableNgrams.NgramsType, NgramsTerm, NgramsPatch)]
tuplesM <- mapM (\(nId, nType, term, patch) -> do
[PGS.Only ngramsId] <- runPGSReturning c qInsert [PGS.Only term] :: IO [PGS.Only Int]
pure (nId, nType, ngramsId, term, patch)
) tuples :: IO [(NodeId, TableNgrams.NgramsType, Int, NgramsTerm, NgramsPatch)]
_ <- runPGSExecuteMany c query $ ((\(nId, nType, termId, _term, patch) -> (nId, nType, termId, patch, version)) <$> tuplesM)
_ <- insertNodeArchiveHistory c nodeId version hs
pure ()
where
qInsert :: PGS.Query
qInsert = [sql|INSERT INTO ngrams (terms) VALUES (?)
ON CONFLICT (terms) DO UPDATE SET terms = excluded.terms
RETURNING id|]
-- https://stackoverflow.com/questions/39224438/postgresql-insert-if-foreign-key-exists
query :: PGS.Query
query = [sql| INSERT INTO node_story_archive_history(node_id, ngrams_type_id, ngrams_id, patch, version)
VALUES (?, ?, ?, ?, ?)
|]
nodeStoriesQuery :: PGS.Query
nodeStoriesQuery = [sql| SELECT version, ngrams_type_id, terms, ngrams_repo_element
FROM node_stories
JOIN ngrams ON ngrams.id = ngrams_id
WHERE node_id = ?
|]
-- Archive
insertArchiveStateList :: PGS.Connection -> NodeId -> Version -> ArchiveStateList -> IO ()
insertArchiveStateList c nodeId version as = do
mapM_ performInsert as
where
performInsert (ngramsType, ngrams, ngramsRepoElement) = do
[PGS.Only ngramsId] <- tryInsertTerms ngrams
_ <- case ngramsRepoElement ^. nre_root of
Nothing -> pure []
Just r -> tryInsertTerms r
mapM_ tryInsertTerms $ ngramsRepoElement ^. nre_children
runPGSExecute c query (nodeId, ngramsId, version, ngramsType, ngramsRepoElement)
tryInsertTerms :: NgramsTerm -> IO [PGS.Only Int]
tryInsertTerms t = runPGSReturning c qInsert [PGS.Only t]
qInsert :: PGS.Query
qInsert = [sql|INSERT INTO ngrams (terms) VALUES (?)
ON CONFLICT (terms) DO UPDATE SET terms = excluded.terms
RETURNING id|]
query :: PGS.Query
query = [sql|INSERT INTO node_stories(node_id, ngrams_id, version, ngrams_type_id, ngrams_repo_element)
VALUES (?, ?, ?, ?, ? :: jsonb)
|]
deleteArchiveStateList :: PGS.Connection -> NodeId -> ArchiveStateList -> IO ()
deleteArchiveStateList c nodeId as = do
mapM_ (\(nt, n, _) -> runPGSExecute c query (nodeId, nt, n)) as
where
query :: PGS.Query
query = [sql| DELETE FROM node_stories
WHERE node_id = ? AND ngrams_type_id = ?
AND ngrams_id IN (SELECT id FROM ngrams WHERE terms = ?)
|]
updateArchiveStateList :: PGS.Connection -> NodeId -> Version -> ArchiveStateList -> IO ()
updateArchiveStateList c nodeId version as = do
let params = (\(nt, n, nre) -> (nre, version, nodeId, nt, n)) <$> as
mapM_ (runPGSExecute c query) params
where
query :: PGS.Query
query = [sql| UPDATE node_stories
SET ngrams_repo_element = ?, version = ?
WHERE node_id = ? AND ngrams_type_id = ?
AND ngrams_id IN (SELECT id FROM ngrams WHERE terms = ?)
|]
updateNodeStoryVersion :: PGS.Connection -> NodeId -> ArchiveList -> IO ()
updateNodeStoryVersion c nodeId newArchive = do
let ngramsTypes = Map.keys $ newArchive ^. a_state
mapM_ (\nt -> runPGSExecute c query (newArchive ^. a_version, nodeId, nt)) ngramsTypes
where
query :: PGS.Query
query = [sql|UPDATE node_stories
SET version = ?
WHERE node_id = ?
AND ngrams_type_id = ?|]
This diff is collapsed.
{-|
Module : Gargantext.Database.Flow.Extract
Description : Database Flow
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE InstanceSigs #-}
module Gargantext.Database.Action.Flow.Extract
where
import Control.Lens ((^.), _Just, view)
import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict qualified as DM
import Gargantext.Core (Lang, NLPServerConfig, PosTagAlgo(CoreNLP))
import Gargantext.Core.Text (HasText(..))
import Gargantext.Core.Text.Corpus.Parsers (splitOn)
import Gargantext.Core.Text.Terms (ExtractNgramsT, ExtractedNgrams(..), TermType, cleanExtractedNgrams, enrichedTerms, extractNgramsT, extractTerms, tt_lang)
import Gargantext.Core.Types (POS(NP), TermsCount)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataContact, HyperdataDocument, cw_lastName, hc_who, hd_authors, hd_bdd, hd_institutes, hd_source)
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Prelude (DBCmd)
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Prelude
------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
where
extractNgramsT _ncs l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
where
extract :: TermType Lang -> HyperdataContact
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extract _l hc' = do
let authors = map text2ngrams
$ maybe ["Nothing"] (\a -> [a])
$ view (hc_who . _Just . cw_lastName) hc'
pure $ HashMap.fromList $ [(SimpleNgrams a', (DM.singleton Authors 1, 1)) | a' <- authors ]
instance ExtractNgramsT HyperdataDocument
where
extractNgramsT :: NLPServerConfig
-> TermType Lang
-> HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extractNgramsT ncs lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' hd
where
extractNgramsT' :: HyperdataDocument
-> DBCmd err (HashMap.HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
extractNgramsT' doc = do
let source = text2ngrams
$ maybe "Nothing" identity
$ doc ^. hd_source
institutes = map text2ngrams
$ maybe ["Nothing"] (splitOn Institutes (doc^. hd_bdd))
$ doc ^. hd_institutes
authors = map text2ngrams
$ maybe ["Nothing"] (splitOn Authors (doc^. hd_bdd))
$ doc ^. hd_authors
termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang ^. tt_lang) CoreNLP NP t, cnt))
<$> concat
<$> liftBase (extractTerms ncs lang $ hasText doc)
pure $ HashMap.fromList
$ [(SimpleNgrams source, (DM.singleton Sources 1, 1)) ]
<> [(SimpleNgrams i', (DM.singleton Institutes 1, 1)) | i' <- institutes ]
<> [(SimpleNgrams a', (DM.singleton Authors 1, 1)) | a' <- authors ]
<> [(EnrichedNgrams t', (DM.singleton NgramsTerms 1, cnt')) | (t', cnt') <- termsWithCounts' ]
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
where
extractNgramsT ncs l (Node { _node_hyperdata = h }) = extractNgramsT ncs l h
instance HasText a => HasText (Node a)
where
hasText (Node { _node_hyperdata = h }) = hasText h
-- Apparently unused functions
-- extractInsert :: ( HasNodeStory env err m
-- , HasNLPServer env )
-- => [Node HyperdataDocument] -> m ()
-- extractInsert docs = do
-- let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
-- let lang = EN
-- ncs <- view $ nlpServerGet lang
-- mapNgramsDocs' <- mapNodeIdNgrams
-- <$> documentIdWithNgrams
-- (extractNgramsT ncs $ withLang (Multi lang) documentsWithId)
-- documentsWithId
-- _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
-- pure ()
......@@ -9,29 +9,40 @@ Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Action.Flow.Types
where
import Conduit (ConduitT)
import Control.Lens (makeLenses)
import Data.Aeson (ToJSON)
import Gargantext.Core.Types (HasValidationError)
import Data.Aeson.TH (deriveJSON)
import Data.HashMap.Strict (HashMap)
import Data.Swagger (ToSchema(..), genericDeclareNamedSchema)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
import Gargantext.Core.NodeStory
import Gargantext.Core.Text
import Gargantext.Core.Text.Corpus.API qualified as API
import Gargantext.Core.Text.Terms
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Core.Types (HasValidationError, TermsCount)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Node.Document.Insert
import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Types (Indexed)
import Gargantext.Prelude
import Gargantext.System.Logging
type FlowCmdM env err m =
( CmdM env err m
, HasNodeStory env err m
......@@ -56,3 +67,27 @@ type FlowInsertDB a = ( AddUniqId a
, UniqParameters a
, InsertDb a
)
data DocumentIdWithNgrams a b =
DocumentIdWithNgrams
{ documentWithId :: Indexed NodeId a
, documentNgrams :: HashMap b (Map NgramsType Int, TermsCount)
} deriving (Show)
-- TODO use internal with API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
| ExternalOrigin { _do_api :: API.ExternalAPIs }
-- TODO Web
deriving (Generic, Eq)
makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
data DataText = DataOld ![NodeId]
| DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
--- | DataNew ![[HyperdataDocument]]
......@@ -9,30 +9,47 @@ Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE InstanceSigs #-}
module Gargantext.Database.Action.Flow.Utils
where
( docNgrams
, documentIdWithNgrams
, insertDocNgrams
, insertDocs
, mapNodeIdNgrams )
where
import Control.Lens ((^.))
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.List qualified as List
import Data.Map.Strict qualified as DM
import Data.Text qualified as T
import Gargantext.API.Ngrams.Types qualified as NT
import Gargantext.Core (Lang, toDBid)
import Gargantext.Core.Flow.Types (UniqId, uniqId)
import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatternsWith, termsInText)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (addTuples)
import Gargantext.Data.HashMap.Strict.Utils qualified as HashMap
import Gargantext.Database.Action.Flow.Types (DocumentIdWithNgrams(..), FlowInsertDB)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument, hd_abstract, hd_title)
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Prelude (DBCmd)
import Gargantext.Database.Prelude (DBCmd, DbCmd')
import Gargantext.Database.Query.Table.ContextNodeNgrams
import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add)
import Gargantext.Database.Query.Table.Node.Document.Insert (ReturnId, addUniqId, insertDb, reId, reInserted, reUniqId)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Schema.Context (context_hyperdata, context_id)
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Types
import Gargantext.Prelude
import Gargantext.Core (toDBid)
import Gargantext.Prelude.Crypto.Hash (Hash)
data DocumentIdWithNgrams a b =
DocumentIdWithNgrams
{ documentWithId :: Indexed NodeId a
, documentNgrams :: HashMap b (Map NgramsType Int, TermsCount)
} deriving (Show)
insertDocNgrams :: ListId
-> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId (Int, TermsCount)))
-> DBCmd err Int
......@@ -52,3 +69,122 @@ insertDocNgrams lId m = do
-- [(NodeId, {Ngrams: ({NgramsType: Int}, TermsCount)})]
-- {Ngrams: {NgramsType: {NodeId: (Int, TermsCount)}}}
docNgrams :: Lang
-> NgramsType
-> [NT.NgramsTerm]
-> Gargantext.Database.Admin.Types.Node.Context HyperdataDocument
-> [((MatchedText, TermsCount),
Map NgramsType (Map NodeId Int))]
docNgrams lang nt ts doc =
List.zip
(termsInText lang (buildPatternsWith lang ts)
$ T.unlines $ catMaybes
[ doc ^. context_hyperdata . hd_title
, doc ^. context_hyperdata . hd_abstract
]
)
(List.cycle [DM.fromList $ [(nt, DM.singleton (doc ^. context_id) 1 )]])
documentIdWithNgrams :: HasNodeError err
=> (a
-> DBCmd err (HashMap.HashMap b (Map NgramsType Int, TermsCount)))
-> [Indexed NodeId a]
-> DBCmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
e <- f $ _unIndex d
pure $ DocumentIdWithNgrams d e
-- | TODO check optimization
mapNodeIdNgrams :: (Ord b, Hashable b)
=> [DocumentIdWithNgrams a b]
-> HashMap.HashMap b
(Map NgramsType
(Map NodeId (Int, TermsCount))
)
mapNodeIdNgrams = HashMap.unionsWith (DM.unionWith (DM.unionWith addTuples)) . fmap f
where
-- | NOTE We are somehow multiplying 'TermsCount' here: If the
-- same ngrams term has different ngrams types, the 'TermsCount'
-- for it (which is the number of times the terms appears in a
-- document) is copied over to all its types.
f :: DocumentIdWithNgrams a b
-> HashMap.HashMap b (Map NgramsType (Map NodeId (Int, TermsCount)))
f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\i -> DM.singleton nId (i, cnt)) ngramsTypeMap) $ documentNgrams d
where
nId = _index $ documentWithId d
-- TODO Type NodeDocumentUnicised
insertDocs :: ( DbCmd' env err m
-- , FlowCorpus a
, FlowInsertDB a
, HasNodeError err
)
=> UserId
-> CorpusId
-> [a]
-> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
let docs = map addUniqId hs
newIds <- insertDb uId Nothing docs
-- printDebug "newIds" newIds
let
newIds' = map (nodeId2ContextId . reId) newIds
documentsWithId = mergeData (toInserted newIds) (DM.fromList $ map viewUniqId' docs)
_ <- Doc.add cId newIds'
pure (newIds', map (first nodeId2ContextId) documentsWithId)
------------------------------------------------------------------------
viewUniqId' :: UniqId a
=> a
-> (Hash, a)
viewUniqId' d = maybe err (\h -> (h,d)) (d ^. uniqId)
where
err = panicTrace "[ERROR] Database.Flow.toInsert"
mergeData :: Map Hash ReturnId
-> Map Hash a
-> [Indexed NodeId a]
mergeData rs = catMaybes . map toDocumentWithId . DM.toList
where
toDocumentWithId (sha,hpd) =
Indexed <$> fmap reId (DM.lookup sha rs)
<*> Just hpd
toInserted :: [ReturnId]
-> Map Hash ReturnId
toInserted =
DM.fromList . map (\r -> (reUniqId r, r) )
. filter (\r -> reInserted r == True)
-- Apparently unused functions
-- | TODO putelsewhere
-- | Upgrade function
-- Suppose all documents are English (this is the case actually)
-- indexAllDocumentsWithPosTag :: ( HasNodeStory env err m
-- , HasNLPServer env )
-- => m ()
-- indexAllDocumentsWithPosTag = do
-- rootId <- getRootId (UserName userMaster)
-- corpusIds <- findNodesId rootId [NodeCorpus]
-- docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
-- _ <- mapM extractInsert (splitEvery 1000 docs)
-- pure ()
......@@ -30,7 +30,7 @@ import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Gargantext.API.Ngrams.Tools (filterListWithRoot, groupNodesByNgrams, Diagonal(..), getCoocByNgrams, mapTermListRoot, RootTerm, getRepo)
import Gargantext.API.Ngrams.Types (TabType(..), ngramsTypeFromTabType, NgramsTerm(..))
import Gargantext.Core (HasDBid(toDBid))
import Gargantext.Core.NodeStory hiding (runPGSQuery)
import Gargantext.Core.NodeStory
import Gargantext.Core.Text.Metrics (scored, Scored(..), {-localMetrics, toScored-})
import Gargantext.Core.Types (ListType(..), NodeType(..), ContextId, contextId2NodeId)
import Gargantext.Core.Types.Query (Limit(..))
......@@ -74,7 +74,8 @@ getNgramsCooc cId lId tabType maybeLimit = do
------------------------------------------------------------------------
------------------------------------------------------------------------
updateNgramsOccurrences :: (HasNodeStory env err m)
=> CorpusId -> ListId
=> CorpusId
-> ListId
-> m ()
updateNgramsOccurrences cId lId = do
_ <- mapM (updateNgramsOccurrences' cId lId Nothing) [Terms, Sources, Authors, Institutes]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment