[OPT] Proposal for faster occ computation (not tested)

parent a16aa3d3
Pipeline #299 failed with stage
...@@ -137,37 +137,75 @@ queryNgramsByNodeUser = [sql| ...@@ -137,37 +137,75 @@ queryNgramsByNodeUser = [sql|
-- TODO add groups -- TODO add groups
getOccByNgramsOnly :: CorpusId -> NgramsType -> [Text] getOccByNgramsOnly :: CorpusId -> NgramsType -> [Text]
-> Cmd err (Map Text Int) -> Cmd err (Map Text Int)
getOccByNgramsOnly cId nt ngs = Map.map Set.size getOccByNgramsOnly cId nt ngs =
<$> getNodesByNgramsOnlyUser cId nt ngs fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
-- | Occurrences of each given term, obtained by first collecting the set of
-- nodes per term with 'getNodesByNgramsOnlyUser' and then taking the size of
-- each set.
--
-- Just slower than 'getOccByNgramsOnly'.
getOccByNgramsOnly' :: CorpusId -> NgramsType -> [Text]
                    -> Cmd err (Map Text Int)
getOccByNgramsOnly' cId nt ngs = countNodes <$> nodesByTerm
  where
    -- Map from term to the set of node ids returned for it.
    nodesByTerm = getNodesByNgramsOnlyUser cId nt ngs
    -- Replace every node set by its cardinality.
    countNodes  = Map.map Set.size
-- | Run 'queryNgramsOccurrencesOnlyByNodeUser' for the given corpus, ngrams
-- type and list of terms.
--
-- The query receives four parameters, in this order: the corpus id, the node
-- type id of 'NodeDocument', the ngrams type id, and the terms packed as a
-- one-column @VALUES@ list typed as @text@.
--
-- Each returned row is a @(term, count)@ pair; 'getOccByNgramsOnly' merges
-- the rows with @fromListWith (+)@.
selectNgramsOccurrencesOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
                                      -> Cmd err [(Text, Int)]
selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
  runPGSQuery queryNgramsOccurrencesOnlyByNodeUser params
  where
    params    = (cId, docTypeId, ngramsTypeId nt, termRows)
    docTypeId = nodeTypeId NodeDocument
    -- One single-column row per requested term.
    termRows  = Values [QualifiedIdentifier Nothing "text"] (DPS.Only <$> tms)
-- | Same joins and filters as 'queryNgramsOnlyByNodeUser', but the counting of
-- node ids is done by the database: one row @(terms, count)@ per term.
--
-- Positional parameters, in order of appearance:
--
--   1. corpus id        (compared with @nodes_nodes.node1_id@)
--   2. node type id     (compared with @nodes.typename@)
--   3. ngrams type id   (compared with @nodes_ngrams.ngrams_type@)
--   4. a @VALUES@ list with a single @text@ column holding the terms
--
-- The three scalar parameters are bound once in the @params@ CTE so that the
-- placeholder order stays (corpus, node type, ngrams type, terms). A
-- PostgreSQL statement accepts a single @WITH@ keyword followed by
-- comma-separated @name AS (query)@ entries, hence the two CTEs below.
--
-- @COUNT(DISTINCT nng.node_id)@ grouped by term alone counts each node once
-- per term, which corresponds to taking the size of the set of node ids.
queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
queryNgramsOccurrencesOnlyByNodeUser = [sql|
  WITH params(corpus_id, doc_type, ngrams_type) AS (SELECT ?, ?, ?),
       input_rows(terms) AS (?)
  SELECT ng.terms, COUNT(DISTINCT nng.node_id) FROM nodes_ngrams nng
    JOIN ngrams      ng ON nng.ngrams_id = ng.id
    JOIN input_rows  ir ON ir.terms      = ng.terms
    JOIN nodes_nodes nn ON nn.node2_id   = nng.node_id
    JOIN nodes       n  ON nn.node2_id   = n.id
    CROSS JOIN params p
    WHERE nn.node1_id     = p.corpus_id    -- CorpusId
      AND n.typename      = p.doc_type     -- NodeTypeId
      AND nng.ngrams_type = p.ngrams_type  -- NgramsTypeId
      AND nn.delete       = False
    GROUP BY ng.terms
  |]
getNodesByNgramsOnlyUser :: CorpusId -> NgramsType -> [Text] getNodesByNgramsOnlyUser :: CorpusId -> NgramsType -> [Text]
-> Cmd err (Map Text (Set NodeId)) -> Cmd err (Map Text (Set NodeId))
getNodesByNgramsOnlyUser cId nt ngs = getNodesByNgramsOnlyUser cId nt ngs =
fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n)) fromListWith (<>) <$> map (second Set.singleton)
<$> selectNgramsOnlyByNodeUser cId nt ngs <$> selectNgramsOnlyByNodeUser cId nt ngs
selectNgramsOnlyByNodeUser :: CorpusId -> NgramsType -> [Text] selectNgramsOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
-> Cmd err [(NodeId, Text)] -> Cmd err [(Text, NodeId)]
selectNgramsOnlyByNodeUser cId nt tms = selectNgramsOnlyByNodeUser cId nt tms =
runPGSQuery queryNgramsOnlyByNodeUser (DPS.Only $ Values fields tms' ) runPGSQuery queryNgramsOnlyByNodeUser
(cId, nodeTypeId NodeDocument,
ngramsTypeId nt, Values fields (DPS.Only <$> tms))
where where
fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4", "int4", "int4"] fields = [QualifiedIdentifier Nothing "text"]
tms' = map (\t -> (t,cId,nodeTypeId NodeDocument, ngramsTypeId nt)) tms
queryNgramsOnlyByNodeUser :: DPS.Query queryNgramsOnlyByNodeUser :: DPS.Query
queryNgramsOnlyByNodeUser = [sql| queryNgramsOnlyByNodeUser = [sql|
WITH input_rows(terms,corpus_id,docType,ngramsType) AS (?) WITH corpus_id AS ?
SELECT nng.node_id, ng.terms FROM nodes_ngrams nng WITH docType AS ?
WITH ngramsType AS ?
WITH input_rows(terms) AS (?)
SELECT ng.terms, nng.node_id FROM nodes_ngrams nng
JOIN ngrams ng ON nng.ngrams_id = ng.id JOIN ngrams ng ON nng.ngrams_id = ng.id
JOIN input_rows ir ON ir.terms = ng.terms JOIN input_rows ir ON ir.terms = ng.terms
JOIN nodes_nodes nn ON nn.node2_id = nng.node_id JOIN nodes_nodes nn ON nn.node2_id = nng.node_id
JOIN nodes n ON nn.node2_id = n.id JOIN nodes n ON nn.node2_id = n.id
WHERE nn.node1_id = ir.corpus_id -- CorpusId WHERE nn.node1_id = corpus_id -- CorpusId
AND n.typename = ir.docType -- NodeTypeId AND n.typename = docType -- NodeTypeId
AND nng.ngrams_type = ir.ngramsType -- NgramsTypeId AND nng.ngrams_type = ngramsType -- NgramsTypeId
AND nn.delete = False AND nn.delete = False
GROUP BY nng.node_id, ng.terms GROUP BY nng.node_id, ng.terms
|] |]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment