Commit f42da79b authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch 'dev-250-docsOccurrences' into dev-merge

parents 1cb71785 bfd0f6b8
...@@ -16,39 +16,29 @@ Portability : POSIX ...@@ -16,39 +16,29 @@ Portability : POSIX
module Gargantext.API.Ngrams.List module Gargantext.API.Ngrams.List
where where
import Control.Lens hiding (elements, Indexed)
import Data.Either (Either(..)) import Data.Either (Either(..))
import Data.HashMap.Strict (HashMap) import Data.HashMap.Strict (HashMap)
import Data.Map.Strict (Map, toList) import Data.Map.Strict (Map, toList)
import Data.Maybe (catMaybes, fromMaybe) import Data.Maybe (fromMaybe)
import Data.Set (Set)
import Data.Text (Text, concat, pack, splitOn) import Data.Text (Text, concat, pack, splitOn)
import Data.Vector (Vector) import Data.Vector (Vector)
import Gargantext.API.Admin.EnvTypes (Env, GargJob(..)) import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
import Gargantext.Core (Lang, withDefaultLanguage)
import Gargantext.API.Admin.Orchestrator.Types import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.API.Ngrams (setListNgrams) import Gargantext.API.Ngrams (setListNgrams)
import Gargantext.API.Ngrams.List.Types import Gargantext.API.Ngrams.List.Types
import Gargantext.API.Ngrams.Prelude (getNgramsList) import Gargantext.API.Ngrams.Prelude (getNgramsList)
import Gargantext.API.Ngrams.Tools (getTermsWith)
import Gargantext.API.Ngrams.Types import Gargantext.API.Ngrams.Types
import Gargantext.API.Prelude (GargServer, GargM, GargError) import Gargantext.API.Prelude (GargServer, GargM, GargError)
import Gargantext.API.Types import Gargantext.API.Types
import Gargantext.Core.NodeStory import Gargantext.Core.NodeStory
import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatternsWith, termsInText)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Types.Main (ListType(..)) import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Database.Action.Flow (saveDocNgramsWith) import Gargantext.Database.Action.Flow (reIndexWith)
import Gargantext.Database.Action.Flow.Types (FlowCmdM) import Gargantext.Database.Action.Flow.Types (FlowCmdM)
-- import Gargantext.Database.Action.Metrics.NgramsByContext (refreshNgramsMaterialized) -- import Gargantext.Database.Action.Metrics.NgramsByContext (refreshNgramsMaterialized)
import Gargantext.Database.Admin.Types.Hyperdata.Document
import Gargantext.Database.Admin.Types.Node import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getNode, getNodeWith) import Gargantext.Database.Query.Table.Node (getNode)
import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
import Gargantext.Database.Schema.Context
import Gargantext.Database.Schema.Ngrams import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Schema.Node (_node_parent_id, node_hyperdata) import Gargantext.Database.Schema.Node (_node_parent_id)
import Gargantext.Database.Types (Indexed(..)) import Gargantext.Database.Types (Indexed(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Utils.Jobs (serveJobsAPI, MonadJobStatus(..)) import Gargantext.Utils.Jobs (serveJobsAPI, MonadJobStatus(..))
...@@ -56,16 +46,13 @@ import Servant ...@@ -56,16 +46,13 @@ import Servant
import qualified Data.ByteString.Lazy as BSL import qualified Data.ByteString.Lazy as BSL
import qualified Data.Csv as Csv import qualified Data.Csv as Csv
import qualified Data.HashMap.Strict as HashMap import qualified Data.HashMap.Strict as HashMap
import qualified Data.List as List
import qualified Data.Map.Strict as Map import qualified Data.Map.Strict as Map
import qualified Data.Set as Set import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified Data.Vector as Vec import qualified Data.Vector as Vec
import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
import qualified Gargantext.Utils.Servant as GUS import qualified Gargantext.Utils.Servant as GUS
import qualified Prelude import qualified Prelude
import qualified Protolude as P import qualified Protolude as P
import Gargantext.Database.Admin.Types.Hyperdata.Corpus
------------------------------------------------------------------------ ------------------------------------------------------------------------
type GETAPI = Summary "Get List" type GETAPI = Summary "Get List"
:> "lists" :> "lists"
...@@ -145,56 +132,6 @@ setList l m = do ...@@ -145,56 +132,6 @@ setList l m = do
pure True pure True
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Re-index documents of a corpus with ngrams in the list
reIndexWith :: ( HasNodeStory env err m
, FlowCmdM env err m
)
=> CorpusId
-> ListId
-> NgramsType
-> Set ListType
-> m ()
reIndexWith cId lId nt lts = do
-- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
corpus_node <- getNodeWith cId (Proxy @ HyperdataCorpus)
let corpusLang = withDefaultLanguage $ view (node_hyperdata . to _hc_lang) corpus_node
-- Getting [NgramsTerm]
ts <- List.concat
<$> map (\(k,vs) -> k:vs)
<$> HashMap.toList
<$> getTermsWith identity [lId] nt lts
-- Get all documents of the corpus
docs <- selectDocNodes cId
let
-- fromListWith (<>)
ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
$ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
$ map (docNgrams corpusLang nt ts) docs
-- Saving the indexation in database
_ <- mapM (saveDocNgramsWith lId) ngramsByDoc
pure ()
docNgrams :: Lang
-> NgramsType
-> [NgramsTerm]
-> Gargantext.Database.Admin.Types.Node.Context HyperdataDocument
-> [((MatchedText, TermsCount),
Map NgramsType (Map NodeId Int))]
docNgrams lang nt ts doc =
List.zip
(termsInText lang (buildPatternsWith lang ts)
$ Text.unlines $ catMaybes
[ doc ^. context_hyperdata . hd_title
, doc ^. context_hyperdata . hd_abstract
]
)
(List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams) toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
toIndexedNgrams m t = Indexed <$> i <*> n toIndexedNgrams m t = Indexed <$> i <*> n
......
...@@ -25,7 +25,6 @@ import GHC.Generics (Generic) ...@@ -25,7 +25,6 @@ import GHC.Generics (Generic)
import Gargantext.API.Admin.EnvTypes (GargJob(..), Env) import Gargantext.API.Admin.EnvTypes (GargJob(..), Env)
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs) import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings) import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Ngrams.List (reIndexWith)
--import Gargantext.API.Ngrams.Types (TabType(..)) --import Gargantext.API.Ngrams.Types (TabType(..))
import Gargantext.API.Prelude (GargM, GargError, simuLogs) import Gargantext.API.Prelude (GargM, GargError, simuLogs)
import Gargantext.Core.Methods.Similarities (GraphMetric(..)) import Gargantext.Core.Methods.Similarities (GraphMetric(..))
...@@ -36,6 +35,7 @@ import Gargantext.Core.Viz.Graph.Types (Strength) ...@@ -36,6 +35,7 @@ import Gargantext.Core.Viz.Graph.Types (Strength)
import Gargantext.Core.Viz.Phylo (PhyloSubConfigAPI(..), subConfigAPI2config) import Gargantext.Core.Viz.Phylo (PhyloSubConfigAPI(..), subConfigAPI2config)
import Gargantext.Core.Viz.Phylo.API.Tools (flowPhyloAPI) import Gargantext.Core.Viz.Phylo.API.Tools (flowPhyloAPI)
-- import Gargantext.Database.Action.Mail (sendMail) -- import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Flow (reIndexWith)
import Gargantext.Database.Action.Flow.Pairing (pairing) import Gargantext.Database.Action.Flow.Pairing (pairing)
import Gargantext.Database.Action.Flow.Types (FlowCmdM) import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Metrics (updateNgramsOccurrences, updateContextScore) import Gargantext.Database.Action.Metrics (updateNgramsOccurrences, updateContextScore)
......
...@@ -22,6 +22,8 @@ Portability : POSIX ...@@ -22,6 +22,8 @@ Portability : POSIX
{-# LANGUAGE InstanceSigs #-} {-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list) module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
( DataText(..) ( DataText(..)
...@@ -37,6 +39,9 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list) ...@@ -37,6 +39,9 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
, insertMasterDocs , insertMasterDocs
, saveDocNgramsWith , saveDocNgramsWith
, reIndexWith
, docNgrams
, getOrMkRoot , getOrMkRoot
, getOrMk_RootWithCorpus , getOrMk_RootWithCorpus
, TermType(..) , TermType(..)
...@@ -49,10 +54,9 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list) ...@@ -49,10 +54,9 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
where where
import Conduit import Conduit
import Control.Lens ((^.), view, _Just, makeLenses, over, traverse) import Control.Lens hiding (elements, Indexed)
import Data.Aeson.TH (deriveJSON) import Data.Aeson.TH (deriveJSON)
import Data.Conduit.Internal (zipSources) import Data.Conduit.Internal (zipSources)
import qualified Data.Conduit.List as CList
import Data.Either import Data.Either
import Data.HashMap.Strict (HashMap) import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable) import Data.Hashable (Hashable)
...@@ -60,22 +64,18 @@ import Data.List (concat) ...@@ -60,22 +64,18 @@ import Data.List (concat)
import Data.Map.Strict (Map, lookup) import Data.Map.Strict (Map, lookup)
import Data.Maybe (catMaybes) import Data.Maybe (catMaybes)
import Data.Monoid import Data.Monoid
import Data.Proxy
import Data.Set (Set)
import Data.Swagger import Data.Swagger
import qualified Data.Text as T
import Data.Tuple.Extra (first, second) import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic) import GHC.Generics (Generic)
import System.FilePath (FilePath) import Gargantext.API.Ngrams.Tools (getTermsWith)
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map.Strict as Map
import qualified Data.Conduit.List as CL
import qualified Data.Conduit as C
import Gargantext.Core (Lang(..), PosTagAlgo(..)) import Gargantext.Core (Lang(..), PosTagAlgo(..))
-- import Gargantext.Core.Ext.IMT (toSchoolName) import Gargantext.Core (withDefaultLanguage)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire) import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types import Gargantext.Core.Flow.Types
import Gargantext.Core.NLP (nlpServerGet) import Gargantext.Core.NLP (nlpServerGet)
import Gargantext.Core.NodeStory (HasNodeStory)
import Gargantext.Core.Text import Gargantext.Core.Text
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType, splitOn) import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType, splitOn)
import Gargantext.Core.Text.List (buildNgramsLists) import Gargantext.Core.Text.List (buildNgramsLists)
...@@ -83,6 +83,7 @@ import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(. ...@@ -83,6 +83,7 @@ import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(.
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..)) import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Text.Terms import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt) import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatternsWith, termsInText)
import Gargantext.Core.Types (POS(NP), TermsCount) import Gargantext.Core.Types (POS(NP), TermsCount)
import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main import Gargantext.Core.Types.Main
...@@ -92,9 +93,9 @@ import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger) ...@@ -92,9 +93,9 @@ import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..)) import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
import Gargantext.Database.Action.Metrics (updateNgramsOccurrences, updateContextScore)
import Gargantext.Database.Action.Search (searchDocInDatabase) import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName) import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Action.Metrics (updateNgramsOccurrences)
import Gargantext.Database.Admin.Types.Hyperdata import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId) import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude import Gargantext.Database.Prelude
...@@ -103,17 +104,29 @@ import Gargantext.Database.Query.Table.Ngrams ...@@ -103,17 +104,29 @@ import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.Node import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..)) import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..)) import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId) import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus) import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Schema.Context
import Gargantext.Database.Schema.Node (NodePoly(..), node_id) import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Database.Types import Gargantext.Database.Types
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash) import Gargantext.Prelude.Crypto.Hash (Hash)
import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..)) import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
import System.FilePath (FilePath)
import qualified Data.Conduit as C
import qualified Data.Conduit.List as CL
import qualified Data.Conduit.List as CList
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
import qualified Data.Text as T
import qualified Gargantext.API.Ngrams.Types as NT
import qualified Gargantext.Core.Text.Corpus.API as API import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add) import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
import qualified PUBMED.Types as PUBMED import qualified PUBMED.Types as PUBMED
--import qualified Prelude
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Imports for upgrade function -- Imports for upgrade function
...@@ -339,7 +352,7 @@ createNodes user corpusName ctype = do ...@@ -339,7 +352,7 @@ createNodes user corpusName ctype = do
-- User Graph Flow -- User Graph Flow
_ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
_ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId -- _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId
pure (userId, userCorpusId, listId) pure (userId, userCorpusId, listId)
...@@ -376,10 +389,11 @@ flowCorpusUser l user userCorpusId listId ctype mfslw = do ...@@ -376,10 +389,11 @@ flowCorpusUser l user userCorpusId listId ctype mfslw = do
_mastListId <- getOrMkList masterCorpusId masterUserId _mastListId <- getOrMkList masterCorpusId masterUserId
pure () pure ()
-- _ <- insertOccsUpdates userCorpusId mastListId -- _ <- insertOccsUpdates userCorpusId mastListId
-- printDebug "userListId" userListId
--_ <- mkPhylo userCorpusId userId --_ <- mkPhylo userCorpusId userId
-- Annuaire Flow -- Annuaire Flow
-- _ <- mkAnnuaire rootUserId userId -- _ <- mkAnnuaire rootUserId userId
_ <- reIndexWith userCorpusId listId NgramsTerms (Set.singleton MapTerm)
_ <- updateContextScore userCorpusId (Just listId)
_ <- updateNgramsOccurrences userCorpusId (Just listId) _ <- updateNgramsOccurrences userCorpusId (Just listId)
pure userCorpusId pure userCorpusId
...@@ -612,3 +626,57 @@ extractInsert docs = do ...@@ -612,3 +626,57 @@ extractInsert docs = do
documentsWithId documentsWithId
_ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs' _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
pure () pure ()
-- | Re-index documents of a corpus with ngrams in the list
reIndexWith :: ( HasNodeStory env err m
, FlowCmdM env err m
)
=> CorpusId
-> ListId
-> NgramsType
-> Set ListType
-> m ()
reIndexWith cId lId nt lts = do
-- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
corpus_node <- getNodeWith cId (Proxy @ HyperdataCorpus)
let corpusLang = withDefaultLanguage $ view (node_hyperdata . to _hc_lang) corpus_node
-- Getting [NgramsTerm]
ts <- List.concat
<$> map (\(k,vs) -> k:vs)
<$> HashMap.toList
<$> getTermsWith identity [lId] nt lts
-- Get all documents of the corpus
docs <- selectDocNodes cId
let
-- fromListWith (<>)
ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
$ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
$ map (docNgrams corpusLang nt ts) docs
-- Saving the indexation in database
_ <- mapM (saveDocNgramsWith lId) ngramsByDoc
pure ()
docNgrams :: Lang
-> NgramsType
-> [NT.NgramsTerm]
-> Gargantext.Database.Admin.Types.Node.Context HyperdataDocument
-> [((MatchedText, TermsCount),
Map NgramsType (Map NodeId Int))]
docNgrams lang nt ts doc =
List.zip
(termsInText lang (buildPatternsWith lang ts)
$ T.unlines $ catMaybes
[ doc ^. context_hyperdata . hd_title
, doc ^. context_hyperdata . hd_abstract
]
)
(List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment