Commit 6019e088 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[OPTIM] Flow List / Metrics TFICF with sample

parent f35d84d9
Pipeline #1743 passed with stage
in 36 minutes and 21 seconds
......@@ -35,7 +35,7 @@ import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf)
import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
......@@ -156,8 +156,11 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
-- Filter 0 With Double
-- Computing global speGen score
allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
printDebug "[buldNgramsTermsList: Sample List] / start" nt
allTerms :: HashMap NgramsTerm Double <- getTficf_withSample uCid mCid nt
printDebug "[buldNgramsTermsList: Sample List / end]" nt
printDebug "[buldNgramsTermsList: Flow Social List / start]" nt
-- PrivateFirst for first developments since Public NodeMode is not implemented yet
socialLists :: FlowCont NgramsTerm FlowListScores
<- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
......@@ -165,6 +168,8 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
$ List.zip (HashMap.keys allTerms)
(List.cycle [mempty])
)
printDebug "[buldNgramsTermsList: Flow Social List / end]" nt
let ngramsKeys = HashMap.keysSet allTerms
groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
......
......@@ -105,6 +105,18 @@ getOccByNgramsOnlyFast cId nt ngs =
HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
=> CorpusId
-> Int
-> NgramsType
-> [NgramsTerm]
-> Cmd err (HashMap NgramsTerm Int)
getOccByNgramsOnlyFast_withSample cId int nt ngs =
HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser_withSample cId int nt ngs
getOccByNgramsOnlyFast' :: CorpusId
-> ListId
-> NgramsType
......@@ -190,6 +202,8 @@ selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
where
fields = [QualifiedIdentifier Nothing "text"]
-- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
-- Question: with the grouping is the result exactly the same (since Set NodeId for
-- equivalent ngrams intersections are not empty)
......@@ -208,6 +222,46 @@ queryNgramsOccurrencesOnlyByNodeUser = [sql|
GROUP BY nng.node2_id, ng.terms
|]
selectNgramsOccurrencesOnlyByNodeUser_withSample :: HasDBid NodeType
=> CorpusId
-> Int
-> NgramsType
-> [NgramsTerm]
-> Cmd err [(NgramsTerm, Int)]
selectNgramsOccurrencesOnlyByNodeUser_withSample cId int nt tms =
fmap (first NgramsTerm) <$>
runPGSQuery queryNgramsOccurrencesOnlyByNodeUser_withSample
( int
, toDBid NodeDocument
, cId
, Values fields ((DPS.Only . unNgramsTerm) <$> tms)
, cId
, ngramsTypeId nt
)
where
fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOccurrencesOnlyByNodeUser_withSample :: DPS.Query
queryNgramsOccurrencesOnlyByNodeUser_withSample = [sql|
WITH nodes_sample AS (SELECT id FROM nodes n TABLESAMPLE SYSTEM_ROWS (?)
JOIN nodes_nodes nn ON n.id = nn.node2_id
WHERE n.typename = ?
AND nn.node1_id = ?),
input_rows(terms) AS (?)
SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
JOIN ngrams ng ON nng.ngrams_id = ng.id
JOIN input_rows ir ON ir.terms = ng.terms
JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
JOIN nodes_sample n ON nn.node2_id = n.id
WHERE nn.node1_id = ? -- CorpusId
AND nng.ngrams_type = ? -- NgramsTypeId
AND nn.category > 0
GROUP BY nng.node2_id, ng.terms
|]
queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
queryNgramsOccurrencesOnlyByNodeUser' = [sql|
WITH input_rows(terms) AS (?)
......
......@@ -21,7 +21,7 @@ import qualified Data.HashMap.Strict as HM
import Data.Maybe (fromMaybe)
import Gargantext.Core
import Gargantext.Core.Text.Metrics.TFICF
import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getOccByNgramsOnlyFast)
import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getOccByNgramsOnlyFast, getOccByNgramsOnlyFast_withSample)
import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Query.Table.NodeNode (selectCountDocs)
......@@ -52,3 +52,29 @@ getTficf cId mId nt = do
(TficfSupra (Count $ fromMaybe 0 $ HM.lookup t mapTextDoubleGlobal)
(Total $ fromIntegral countGlobal))
) mapTextDoubleLocal
getTficf_withSample :: HasDBid NodeType
=> UserCorpusId
-> MasterCorpusId
-> NgramsType
-> Cmd err (HashMap NgramsTerm Double)
getTficf_withSample cId mId nt = do
mapTextDoubleLocal <- HM.filter (> 1)
<$> HM.map (fromIntegral . Set.size)
<$> getNodesByNgramsUser cId nt
countLocal <- selectCountDocs cId
let countGlobal = countLocal * 10
mapTextDoubleGlobal <- HM.map fromIntegral
<$> getOccByNgramsOnlyFast_withSample mId countGlobal nt
(HM.keys mapTextDoubleLocal)
pure $ HM.mapWithKey (\t n ->
tficf (TficfInfra (Count n )
(Total $ fromIntegral countLocal))
(TficfSupra (Count $ fromMaybe 0 $ HM.lookup t mapTextDoubleGlobal)
(Total $ fromIntegral countGlobal))
) mapTextDoubleLocal
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment