{-|
Module      : Gargantext.Database.Action.Flow.Utils
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}

module Gargantext.Database.Action.Flow.Utils
  ( docNgrams
  , documentIdWithNgrams
  , insertDocNgrams
  , insertDocs
  , mapNodeIdNgrams
  , ngramsByDoc )
  where

import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict qualified as DM
import Data.Text qualified as T
import Gargantext.API.Ngrams.Types qualified as NT
import Gargantext.Core (Lang, toDBid)
import Gargantext.Core.Flow.Types (UniqId, uniqId)
import Gargantext.Core.Text.Ngrams ( Ngrams, NgramsType )
import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatternsWith, termsInText)
import Gargantext.Core.Types (TermsCount, TermsWeight(..))
import Gargantext.Core.Utils (addTuples)
import Gargantext.Data.HashMap.Strict.Utils qualified as HashMap
import Gargantext.Database.Action.Flow.Types (DocumentIdWithNgrams(..), FlowInsertDB)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument, hd_abstract, hd_title )
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.ContextNodeNgrams ( ContextNodeNgramsPoly(..), insertContextNodeNgrams )
import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add)
import Gargantext.Database.Query.Table.Node.Document.Insert (ReturnId, addUniqId, insertDb, reId, reInserted, reUniqId)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Schema.Context (context_oid_hyperdata, context_oid_id)
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTypeId(..), text2ngrams)
import Gargantext.Database.Types ( Indexed(..), index )
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)


-- | Flatten the nested @ngrams -> ngrams type -> doc -> (weight, count)@
-- map into one row per (ngrams, type, doc) triple and insert those rows
-- via 'insertContextNodeNgrams', attached to the given list.
-- Returns the number of inserted rows.
insertDocNgrams :: ListId
                -> HashMap (Indexed NgramsId Ngrams) (Map NgramsType (Map DocId (TermsWeight, TermsCount)))
                -> DBUpdate err Int
insertDocNgrams lId m = insertContextNodeNgrams ns
  where
    ns = [ ContextNodeNgrams (nodeId2ContextId docId)
                             lId
                             (ng ^. index)
                             (NgramsTypeId $ toDBid t)
                             (fromIntegral $ unTermsWeight w)
                             cnt
         | (ng, t2n2i)       <- HashMap.toList m
         , (t, n2i)          <- DM.toList t2n2i
         , (docId, (w, cnt)) <- DM.toList n2i ]

-- | Given a language and a list of terms, return the terms found in the
-- document's title and abstract, with their occurrence counts.
-- This is a pure function (doesn't use corenlp nor PostgreSQL FTS).
docNgrams :: Lang
          -> [NT.NgramsTerm]
          -> ContextOnlyId HyperdataDocument
          -> [(MatchedText, TermsCount)]
docNgrams lang ts doc =
  termsInText lang (buildPatternsWith lang ts)
    $ T.unlines
    $ catMaybes [ doc ^. context_oid_hyperdata . hd_title
                , doc ^. context_oid_hyperdata . hd_abstract ]

-- | Run the ngrams-extraction action @f@ on the payload of each indexed
-- document, pairing every document with its extracted ngrams.
documentIdWithNgrams :: HasNodeError err
                     => (a -> DBCmd err (HashMap.HashMap b (Map NgramsType TermsWeight, TermsCount)))
                     -> [Indexed NodeId a]
                     -> DBCmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ _unIndex d
      pure $ DocumentIdWithNgrams d e

-- | Regroup per-document ngrams into a single ngrams-keyed map.
--
-- Input  : [(NodeId, {Ngrams: ({NgramsType: Int}, TermsCount)})]
-- Output : {Ngrams: {NgramsType: {NodeId: (Int, TermsCount)}}}
--
-- TODO check optimization
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                -> HashMap.HashMap b (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
mapNodeIdNgrams = HashMap.unionsWith (DM.unionWith (DM.unionWith addTuples)) . fmap f
  where
    -- NOTE We are somehow multiplying 'TermsCount' here: If the same
    -- ngrams term has different ngrams types, the 'TermsCount' for it
    -- (which is the number of times the terms appears in a document)
    -- is copied over to all its types.
    f :: DocumentIdWithNgrams a b
      -> HashMap.HashMap b (Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
    f d = fmap (\(ngramsTypeMap, cnt) -> fmap (\w -> DM.singleton nId (w, cnt)) ngramsTypeMap)
        $ documentNgrams d
      where
        nId = _index $ documentWithId d

-- | Insert documents under the given corpus: tag each document with a
-- unique id ('addUniqId'), insert them in the database, attach the new
-- contexts to the corpus ('Doc.add'), and return the new context ids
-- together with the inserted documents indexed by those ids.
-- TODO Type NodeDocumentUnicised
insertDocs :: FlowInsertDB a
           => UserId
           -> CorpusId
           -> [a]
           -> DBUpdate err ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId Nothing docs
  let newIds'         = map (nodeId2ContextId . reId) newIds
      documentsWithId = mergeData (toInserted newIds) (DM.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', map (first nodeId2ContextId) documentsWithId)

------------------------------------------------------------------------

-- | Pair a value with its unique hash; panics if the hash is missing
-- (it should always have been set by 'addUniqId' beforehand).
viewUniqId' :: UniqId a => a -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h, d)) (d ^. uniqId)
  where
    err = panicTrace "[ERROR] Database.Flow.toInsert"

-- | Join the insertion results with the original documents by unique
-- hash, keeping only documents whose hash is present in the results.
mergeData :: Map Hash ReturnId -> Map Hash a -> [Indexed NodeId a]
mergeData rs = mapMaybe toDocumentWithId . DM.toList
  where
    toDocumentWithId (sha, hpd) =
      Indexed <$> fmap reId (DM.lookup sha rs) <*> Just hpd

-- | Index the freshly inserted rows by their unique hash, dropping rows
-- that were not actually inserted.
toInserted :: [ReturnId] -> Map Hash ReturnId
toInserted = DM.fromList
           . map (\r -> (reUniqId r, r))
           . filter reInserted

-- Apparently unused functions

-- | TODO putelsewhere
-- | Upgrade function
-- Suppose all documents are English (this is the case actually)
-- indexAllDocumentsWithPosTag :: ( HasNodeStory env err m
--                                , HasNLPServer env )
--                             => m ()
-- indexAllDocumentsWithPosTag = do
--   rootId <- getRootId (UserName userMaster)
--   corpusIds <- findNodesId rootId [NodeCorpus]
--   docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
--   _ <- mapM extractInsert (splitEvery 1000 docs)
--   pure ()

-- | Given list of terms and a document, produce a map for this doc's
-- terms count and weights. Notice that the weight is always 1 here.
ngramsByDoc :: Lang
            -> NgramsType
            -> [NT.NgramsTerm]
            -> ContextOnlyId HyperdataDocument
            -> HashMap.HashMap ExtractedNgrams (DM.Map NgramsType (Map NodeId (TermsWeight, TermsCount)))
ngramsByDoc l nt ts doc = fmap mkEntry counts
  where
    -- Context id of the document the terms were matched in.
    docId :: NodeId
    docId = doc ^. context_oid_id
    -- Total occurrences per extracted ngram in this document's text.
    counts :: HashMap.HashMap ExtractedNgrams TermsCount
    counts = HashMap.fromListWith (+)
               [ (SimpleNgrams (text2ngrams t), c)
               | (t, c) <- docNgrams l ts doc ]
    -- Build the nested singleton map; the weight is fixed at 1.
    mkEntry :: TermsCount -> DM.Map NgramsType (Map NodeId (TermsWeight, TermsCount))
    mkEntry c = DM.singleton nt (DM.singleton docId (1, c))