[ngrams] small simplification to docNgrams function

parent 161ac077
Pipeline #7171 passed with stages
in 47 minutes and 11 seconds
......@@ -52,7 +52,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
where
import Conduit
import Control.Lens ( to, view, over )
import Control.Lens ( to, view )
import Data.Bifunctor qualified as B
import Data.Conduit qualified as C
import Data.Conduit.Internal (zipSources)
......@@ -105,7 +105,8 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeContext (selectDocNodesOnlyId)
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
import Gargantext.Database.Query.Tree.Root (MkCorpusUser(..), getOrMkRoot, getOrMkRootWithCorpus, userFromMkCorpusUser)
import Gargantext.Database.Schema.Ngrams ( indexNgrams, text2ngrams )
import Gargantext.Database.Schema.Context (context_oid_id)
import Gargantext.Database.Schema.Ngrams ( indexNgrams, text2ngrams, NgramsId )
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude hiding (to)
import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG), MonadLogger )
......@@ -451,10 +452,12 @@ saveDocNgramsWith :: (IsDBCmd env err m)
-> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
--printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount
-- let mapNgramsDocsNoCount :: HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
-- mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
(terms2id :: HashMap.HashMap Text NgramsId) <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
let mapNgramsDocs :: HashMap.HashMap Ngrams (Map NgramsType (Map NodeId (Int, TermsCount)))
mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
-- new
mapCgramsId <- listInsertDb lId toNodeNgramsW'
......@@ -505,15 +508,18 @@ reIndexWith cId lId nt lts = do
(docs :: [ContextOnlyId HyperdataDocument]) <- selectDocNodesOnlyId cId
let
-- fromListWith (<>)
docNgramsMap :: [[((MatchedText, TermsCount), Map NgramsType (Map NodeId Int))]]
docNgramsMap = map (docNgrams corpusLang nt ts) docs
docNgrams' :: [([(MatchedText, TermsCount)], NodeId)]
docNgrams' = map (\doc -> (docNgrams corpusLang ts doc, doc ^. context_oid_id)) docs
withExtractedNgrams :: [[(ExtractedNgrams, Map NgramsType (Map NodeId (Int, TermsCount)))]]
withExtractedNgrams =
map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
$ docNgramsMap
map (\(matched, nId) ->
map (\(matchedText, cnt) ->
( SimpleNgrams (text2ngrams matchedText)
, Map.singleton nt $ Map.singleton nId (1, cnt) ) ) matched)
$ docNgrams'
-- TODO Is this weight always equal to 1?
ngramsByDoc :: [HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))]
ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
$ withExtractedNgrams
......
......@@ -19,7 +19,6 @@ where
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.List qualified as List
import Data.Map.Strict qualified as DM
import Data.Text qualified as T
import Gargantext.API.Ngrams.Types qualified as NT
......@@ -38,7 +37,7 @@ import Gargantext.Database.Query.Table.ContextNodeNgrams ( ContextNodeNgramsPoly
import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add)
import Gargantext.Database.Query.Table.Node.Document.Insert (ReturnId, addUniqId, insertDb, reId, reInserted, reUniqId)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Schema.Context (context_oid_hyperdata, context_oid_id)
import Gargantext.Database.Schema.Context (context_oid_hyperdata)
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTypeId(..))
import Gargantext.Database.Types ( Indexed(..), index )
import Gargantext.Prelude
......@@ -73,20 +72,17 @@ insertDocNgrams lId m = do
-- HyperdataDocument, return ngrams that are in this text, with counts.
-- This is a pure function (doesn't use corenlp nor PostgreSQL FTS).
docNgrams :: Lang
-> NgramsType
-> [NT.NgramsTerm]
-> ContextOnlyId HyperdataDocument
-> [((MatchedText, TermsCount),
Map NgramsType (Map NodeId Int))]
docNgrams lang nt ts doc =
List.zip
(termsInText lang (buildPatternsWith lang ts)
$ T.unlines $ catMaybes
[ doc ^. context_oid_hyperdata . hd_title
, doc ^. context_oid_hyperdata . hd_abstract
]
-> [(MatchedText, TermsCount)]
docNgrams lang ts doc =
(
termsInText lang (buildPatternsWith lang ts)
$ T.unlines $ catMaybes
[ doc ^. context_oid_hyperdata . hd_title
, doc ^. context_oid_hyperdata . hd_abstract
]
)
(List.cycle [DM.fromList $ [(nt, DM.singleton (doc ^. context_oid_id) 1 )]])
documentIdWithNgrams :: HasNodeError err
......
......@@ -6,7 +6,6 @@ module Test.Ngrams.Count (tests) where
import Gargantext.API.Ngrams
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms.WithList (buildPatternsWith, termsInText, Pattern(..))
import Gargantext.Database.Action.Flow.Utils (docNgrams)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..), emptyHyperdataDocument )
......@@ -105,5 +104,5 @@ testDocNgrams01 = do
let hd = emptyHyperdataDocument { _hd_title = Just "hello world"
, _hd_abstract = Nothing }
let ctx = ContextOnlyId 1 hd
let dNgrams = docNgrams EN NgramsTerms terms ctx
let dNgrams = docNgrams EN terms ctx
length dNgrams @?= 2
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment