[ngrams] small simplification to docNgrams function

parent 161ac077
Pipeline #7171 passed with stages
in 47 minutes and 11 seconds
...@@ -52,7 +52,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list) ...@@ -52,7 +52,7 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
where where
import Conduit import Conduit
import Control.Lens ( to, view, over ) import Control.Lens ( to, view )
import Data.Bifunctor qualified as B import Data.Bifunctor qualified as B
import Data.Conduit qualified as C import Data.Conduit qualified as C
import Data.Conduit.Internal (zipSources) import Data.Conduit.Internal (zipSources)
...@@ -105,7 +105,8 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..)) ...@@ -105,7 +105,8 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeContext (selectDocNodesOnlyId) import Gargantext.Database.Query.Table.NodeContext (selectDocNodesOnlyId)
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId) import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
import Gargantext.Database.Query.Tree.Root (MkCorpusUser(..), getOrMkRoot, getOrMkRootWithCorpus, userFromMkCorpusUser) import Gargantext.Database.Query.Tree.Root (MkCorpusUser(..), getOrMkRoot, getOrMkRootWithCorpus, userFromMkCorpusUser)
import Gargantext.Database.Schema.Ngrams ( indexNgrams, text2ngrams ) import Gargantext.Database.Schema.Context (context_oid_id)
import Gargantext.Database.Schema.Ngrams ( indexNgrams, text2ngrams, NgramsId )
import Gargantext.Database.Schema.Node (node_hyperdata) import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude hiding (to) import Gargantext.Prelude hiding (to)
import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG), MonadLogger ) import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG), MonadLogger )
...@@ -451,10 +452,12 @@ saveDocNgramsWith :: (IsDBCmd env err m) ...@@ -451,10 +452,12 @@ saveDocNgramsWith :: (IsDBCmd env err m)
-> m () -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do saveDocNgramsWith lId mapNgramsDocs' = do
--printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs' --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs' -- let mapNgramsDocsNoCount :: HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount -- mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
(terms2id :: HashMap.HashMap Text NgramsId) <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs' let mapNgramsDocs :: HashMap.HashMap Ngrams (Map NgramsType (Map NodeId (Int, TermsCount)))
mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
-- new -- new
mapCgramsId <- listInsertDb lId toNodeNgramsW' mapCgramsId <- listInsertDb lId toNodeNgramsW'
...@@ -505,15 +508,18 @@ reIndexWith cId lId nt lts = do ...@@ -505,15 +508,18 @@ reIndexWith cId lId nt lts = do
(docs :: [ContextOnlyId HyperdataDocument]) <- selectDocNodesOnlyId cId (docs :: [ContextOnlyId HyperdataDocument]) <- selectDocNodesOnlyId cId
let let
-- fromListWith (<>) docNgrams' :: [([(MatchedText, TermsCount)], NodeId)]
docNgramsMap :: [[((MatchedText, TermsCount), Map NgramsType (Map NodeId Int))]] docNgrams' = map (\doc -> (docNgrams corpusLang ts doc, doc ^. context_oid_id)) docs
docNgramsMap = map (docNgrams corpusLang nt ts) docs
withExtractedNgrams :: [[(ExtractedNgrams, Map NgramsType (Map NodeId (Int, TermsCount)))]] withExtractedNgrams :: [[(ExtractedNgrams, Map NgramsType (Map NodeId (Int, TermsCount)))]]
withExtractedNgrams = withExtractedNgrams =
map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v))) map (\(matched, nId) ->
$ docNgramsMap map (\(matchedText, cnt) ->
( SimpleNgrams (text2ngrams matchedText)
, Map.singleton nt $ Map.singleton nId (1, cnt) ) ) matched)
$ docNgrams'
-- TODO Is this weight always equal to 1?
ngramsByDoc :: [HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))] ngramsByDoc :: [HashMap.HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))]
ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b'))))) ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
$ withExtractedNgrams $ withExtractedNgrams
......
...@@ -19,7 +19,6 @@ where ...@@ -19,7 +19,6 @@ where
import Data.HashMap.Strict (HashMap) import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap import Data.HashMap.Strict qualified as HashMap
import Data.List qualified as List
import Data.Map.Strict qualified as DM import Data.Map.Strict qualified as DM
import Data.Text qualified as T import Data.Text qualified as T
import Gargantext.API.Ngrams.Types qualified as NT import Gargantext.API.Ngrams.Types qualified as NT
...@@ -38,7 +37,7 @@ import Gargantext.Database.Query.Table.ContextNodeNgrams ( ContextNodeNgramsPoly ...@@ -38,7 +37,7 @@ import Gargantext.Database.Query.Table.ContextNodeNgrams ( ContextNodeNgramsPoly
import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add) import Gargantext.Database.Query.Table.Node.Document.Add qualified as Doc (add)
import Gargantext.Database.Query.Table.Node.Document.Insert (ReturnId, addUniqId, insertDb, reId, reInserted, reUniqId) import Gargantext.Database.Query.Table.Node.Document.Insert (ReturnId, addUniqId, insertDb, reId, reInserted, reUniqId)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..)) import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Schema.Context (context_oid_hyperdata, context_oid_id) import Gargantext.Database.Schema.Context (context_oid_hyperdata)
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTypeId(..)) import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTypeId(..))
import Gargantext.Database.Types ( Indexed(..), index ) import Gargantext.Database.Types ( Indexed(..), index )
import Gargantext.Prelude import Gargantext.Prelude
...@@ -73,20 +72,17 @@ insertDocNgrams lId m = do ...@@ -73,20 +72,17 @@ insertDocNgrams lId m = do
-- HyperdataDocument, return ngrams that are in this text, with counts. -- HyperdataDocument, return ngrams that are in this text, with counts.
-- This is a pure function (doesn't use corenlp nor PostgreSQL FTS). -- This is a pure function (doesn't use corenlp nor PostgreSQL FTS).
docNgrams :: Lang docNgrams :: Lang
-> NgramsType
-> [NT.NgramsTerm] -> [NT.NgramsTerm]
-> ContextOnlyId HyperdataDocument -> ContextOnlyId HyperdataDocument
-> [((MatchedText, TermsCount), -> [(MatchedText, TermsCount)]
Map NgramsType (Map NodeId Int))] docNgrams lang ts doc =
docNgrams lang nt ts doc = (
List.zip termsInText lang (buildPatternsWith lang ts)
(termsInText lang (buildPatternsWith lang ts)
$ T.unlines $ catMaybes $ T.unlines $ catMaybes
[ doc ^. context_oid_hyperdata . hd_title [ doc ^. context_oid_hyperdata . hd_title
, doc ^. context_oid_hyperdata . hd_abstract , doc ^. context_oid_hyperdata . hd_abstract
] ]
) )
(List.cycle [DM.fromList $ [(nt, DM.singleton (doc ^. context_oid_id) 1 )]])
documentIdWithNgrams :: HasNodeError err documentIdWithNgrams :: HasNodeError err
......
...@@ -6,7 +6,6 @@ module Test.Ngrams.Count (tests) where ...@@ -6,7 +6,6 @@ module Test.Ngrams.Count (tests) where
import Gargantext.API.Ngrams import Gargantext.API.Ngrams
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms.WithList (buildPatternsWith, termsInText, Pattern(..)) import Gargantext.Core.Text.Terms.WithList (buildPatternsWith, termsInText, Pattern(..))
import Gargantext.Database.Action.Flow.Utils (docNgrams) import Gargantext.Database.Action.Flow.Utils (docNgrams)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..), emptyHyperdataDocument ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..), emptyHyperdataDocument )
...@@ -105,5 +104,5 @@ testDocNgrams01 = do ...@@ -105,5 +104,5 @@ testDocNgrams01 = do
let hd = emptyHyperdataDocument { _hd_title = Just "hello world" let hd = emptyHyperdataDocument { _hd_title = Just "hello world"
, _hd_abstract = Nothing } , _hd_abstract = Nothing }
let ctx = ContextOnlyId 1 hd let ctx = ContextOnlyId 1 hd
let dNgrams = docNgrams EN NgramsTerms terms ctx let dNgrams = docNgrams EN terms ctx
length dNgrams @?= 2 length dNgrams @?= 2
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment