Commit 6019e088 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[OPTIM] Flow List / Metrics TFICF with sample

parent f35d84d9
Pipeline #1743 passed with stage
in 36 minutes and 21 seconds
...@@ -35,7 +35,7 @@ import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_ ...@@ -35,7 +35,7 @@ import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId) import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser) import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf) import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
import Gargantext.Database.Admin.Types.Node (NodeId) import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM) import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams) import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
...@@ -156,8 +156,11 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do ...@@ -156,8 +156,11 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
-- Filter 0 With Double -- Filter 0 With Double
-- Computing global speGen score -- Computing global speGen score
allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt printDebug "[buldNgramsTermsList: Sample List] / start" nt
allTerms :: HashMap NgramsTerm Double <- getTficf_withSample uCid mCid nt
printDebug "[buldNgramsTermsList: Sample List / end]" nt
printDebug "[buldNgramsTermsList: Flow Social List / start]" nt
-- PrivateFirst for first developments since Public NodeMode is not implemented yet -- PrivateFirst for first developments since Public NodeMode is not implemented yet
socialLists :: FlowCont NgramsTerm FlowListScores socialLists :: FlowCont NgramsTerm FlowListScores
<- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
...@@ -165,6 +168,8 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do ...@@ -165,6 +168,8 @@ buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
$ List.zip (HashMap.keys allTerms) $ List.zip (HashMap.keys allTerms)
(List.cycle [mempty]) (List.cycle [mempty])
) )
printDebug "[buldNgramsTermsList: Flow Social List / end]" nt
let ngramsKeys = HashMap.keysSet allTerms let ngramsKeys = HashMap.keysSet allTerms
groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys) groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
......
...@@ -105,6 +105,18 @@ getOccByNgramsOnlyFast cId nt ngs = ...@@ -105,6 +105,18 @@ getOccByNgramsOnlyFast cId nt ngs =
HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
-- | Occurrence counts of the given terms, computed on a sample of documents.
--
-- The rows returned by 'selectNgramsOccurrencesOnlyByNodeUser_withSample'
-- are merged into a map; when a term appears in several rows its counts
-- are summed.
--
-- The 'Int' argument is forwarded unchanged to the select function, which
-- uses it as the number of rows to sample.
getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
                                  => CorpusId
                                  -> Int
                                  -> NgramsType
                                  -> [NgramsTerm]
                                  -> Cmd err (HashMap NgramsTerm Int)
getOccByNgramsOnlyFast_withSample corpusId sampleSize ngramsType terms =
  fmap (HM.fromListWith (+))
       (selectNgramsOccurrencesOnlyByNodeUser_withSample corpusId sampleSize ngramsType terms)
getOccByNgramsOnlyFast' :: CorpusId getOccByNgramsOnlyFast' :: CorpusId
-> ListId -> ListId
-> NgramsType -> NgramsType
...@@ -190,6 +202,8 @@ selectNgramsOccurrencesOnlyByNodeUser cId nt tms = ...@@ -190,6 +202,8 @@ selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
where where
fields = [QualifiedIdentifier Nothing "text"] fields = [QualifiedIdentifier Nothing "text"]
-- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids. -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
-- Question: with the grouping is the result exactly the same (since Set NodeId for -- Question: with the grouping is the result exactly the same (since Set NodeId for
-- equivalent ngrams intersections are not empty) -- equivalent ngrams intersections are not empty)
...@@ -208,6 +222,46 @@ queryNgramsOccurrencesOnlyByNodeUser = [sql| ...@@ -208,6 +222,46 @@ queryNgramsOccurrencesOnlyByNodeUser = [sql|
GROUP BY nng.node2_id, ng.terms GROUP BY nng.node2_id, ng.terms
|] |]
-- | Run 'queryNgramsOccurrencesOnlyByNodeUser_withSample' and wrap the
-- returned term texts back into 'NgramsTerm'.
--
-- The result may contain several rows for the same term (the query groups
-- by document and term); callers are expected to merge them.
selectNgramsOccurrencesOnlyByNodeUser_withSample :: HasDBid NodeType
                                                 => CorpusId
                                                 -> Int
                                                 -> NgramsType
                                                 -> [NgramsTerm]
                                                 -> Cmd err [(NgramsTerm, Int)]
selectNgramsOccurrencesOnlyByNodeUser_withSample corpusId sampleSize ngramsType terms =
  fmap (fmap (first NgramsTerm))
       (runPGSQuery queryNgramsOccurrencesOnlyByNodeUser_withSample params)
  where
    -- One single-column row per requested term, bound in SQL as input_rows(terms).
    termRows = Values [QualifiedIdentifier Nothing "text"]
                      (fmap (DPS.Only . unNgramsTerm) terms)

    -- Order must follow the '?' placeholders of the query.
    params = ( sampleSize            -- TABLESAMPLE SYSTEM_ROWS (?)
             , toDBid NodeDocument   -- n.typename = ?
             , corpusId              -- nn.node1_id = ? (inside the sample CTE)
             , termRows              -- input_rows(terms) AS (?)
             , corpusId              -- nn.node1_id = ? (main query)
             , ngramsTypeId ngramsType
             )
-- | SQL counting, per document and per term, the @node_node_ngrams@ rows of
-- the requested terms, restricted to a sample of nodes.
--
-- Placeholders, in order:
--
--   1. number of rows requested from @TABLESAMPLE SYSTEM_ROWS@ on @nodes@
--   2. node typename id used to filter the sampled nodes
--   3. corpus id used to filter the sampled nodes (@nn.node1_id@)
--   4. the terms, as a @VALUES@ list bound to @input_rows(terms)@
--   5. corpus id again, for the main query
--   6. ngrams type id
--
-- Only documents with @nn.category > 0@ are counted. Because the result is
-- grouped by @(nng.node2_id, ng.terms)@, a term can appear in several rows.
--
-- NOTE(review): per the PostgreSQL documentation, @TABLESAMPLE@ is applied
-- to the table before other filters, so the number of sampled nodes that
-- also satisfy the typename/corpus conditions can be smaller than the
-- requested row count. @SYSTEM_ROWS@ is provided by the @tsm_system_rows@
-- extension, which presumably must be installed in the database -- confirm.
queryNgramsOccurrencesOnlyByNodeUser_withSample :: DPS.Query
queryNgramsOccurrencesOnlyByNodeUser_withSample = [sql|
WITH nodes_sample AS (SELECT id FROM nodes n TABLESAMPLE SYSTEM_ROWS (?)
JOIN nodes_nodes nn ON n.id = nn.node2_id
WHERE n.typename = ?
AND nn.node1_id = ?),
input_rows(terms) AS (?)
SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
JOIN ngrams ng ON nng.ngrams_id = ng.id
JOIN input_rows ir ON ir.terms = ng.terms
JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
JOIN nodes_sample n ON nn.node2_id = n.id
WHERE nn.node1_id = ? -- CorpusId
AND nng.ngrams_type = ? -- NgramsTypeId
AND nn.category > 0
GROUP BY nng.node2_id, ng.terms
|]
queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
queryNgramsOccurrencesOnlyByNodeUser' = [sql| queryNgramsOccurrencesOnlyByNodeUser' = [sql|
WITH input_rows(terms) AS (?) WITH input_rows(terms) AS (?)
......
...@@ -21,7 +21,7 @@ import qualified Data.HashMap.Strict as HM ...@@ -21,7 +21,7 @@ import qualified Data.HashMap.Strict as HM
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import Gargantext.Core import Gargantext.Core
import Gargantext.Core.Text.Metrics.TFICF import Gargantext.Core.Text.Metrics.TFICF
import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getOccByNgramsOnlyFast) import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getOccByNgramsOnlyFast, getOccByNgramsOnlyFast_withSample)
import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId) import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
import Gargantext.Database.Prelude (Cmd) import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Query.Table.NodeNode (selectCountDocs) import Gargantext.Database.Query.Table.NodeNode (selectCountDocs)
...@@ -52,3 +52,29 @@ getTficf cId mId nt = do ...@@ -52,3 +52,29 @@ getTficf cId mId nt = do
(TficfSupra (Count $ fromMaybe 0 $ HM.lookup t mapTextDoubleGlobal) (TficfSupra (Count $ fromMaybe 0 $ HM.lookup t mapTextDoubleGlobal)
(Total $ fromIntegral countGlobal)) (Total $ fromIntegral countGlobal))
) mapTextDoubleLocal ) mapTextDoubleLocal
-- | TF-ICF score of every term of the user corpus, using a sample of the
-- master corpus as the "supra" context.
--
-- Steps:
--
--   * local count of a term = size of the node set returned for it by
--     'getNodesByNgramsUser'; terms whose count is not greater than 1 are
--     dropped;
--   * the sample size requested on the master corpus is ten times the
--     number of documents of the user corpus ('selectCountDocs');
--   * global count of a term = its occurrences in that sample, 0 when the
--     term is absent from the sample.
--
-- NOTE(review): the supra 'Total' is the /requested/ sample size, not the
-- number of documents actually sampled -- confirm this is intended.
getTficf_withSample :: HasDBid NodeType
                    => UserCorpusId
                    -> MasterCorpusId
                    -> NgramsType
                    -> Cmd err (HashMap NgramsTerm Double)
getTficf_withSample cId mId nt = do
  localCounts <- HM.filter (> 1) . HM.map (fromIntegral . Set.size)
             <$> getNodesByNgramsUser cId nt

  nbLocalDocs <- selectCountDocs cId
  let sampleSize = nbLocalDocs * 10

  globalCounts <- fmap (HM.map fromIntegral)
                       (getOccByNgramsOnlyFast_withSample mId sampleSize nt (HM.keys localCounts))

  pure (HM.mapWithKey (score nbLocalDocs sampleSize globalCounts) localCounts)
  where
    -- Score one term from its local count and the sampled global counts.
    score nbLocal nbGlobal globals term localCount =
      tficf (TficfInfra (Count localCount)
                        (Total (fromIntegral nbLocal)))
            (TficfSupra (Count (fromMaybe 0 (HM.lookup term globals)))
                        (Total (fromIntegral nbGlobal)))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment