Commit 2b0c0c9b authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] Score by Doc or Corpus.

parent 7643b2ea
...@@ -72,7 +72,7 @@ import GHC.Generics (Generic) ...@@ -72,7 +72,7 @@ import GHC.Generics (Generic)
import Gargantext.Core.Utils.Prefix (unPrefix) import Gargantext.Core.Utils.Prefix (unPrefix)
-- import Gargantext.Database.Schema.Ngrams (NgramsTypeId, ngramsTypeId, NgramsTableData(..)) -- import Gargantext.Database.Schema.Ngrams (NgramsTypeId, ngramsTypeId, NgramsTableData(..))
import Gargantext.Database.Config (userMaster) import Gargantext.Database.Config (userMaster)
import Gargantext.Database.Metrics.NgramsByNode (getOccByNgramsOnlySafe) import Gargantext.Database.Metrics.NgramsByNode (getOccByNgramsOnlySlow)
import Gargantext.Database.Schema.Ngrams (NgramsType) import Gargantext.Database.Schema.Ngrams (NgramsType)
import Gargantext.Database.Types.Node (NodeType(..)) import Gargantext.Database.Types.Node (NodeType(..))
import Gargantext.Database.Utils (fromField', HasConnection) import Gargantext.Database.Utils (fromField', HasConnection)
...@@ -236,7 +236,7 @@ ngramsElementFromRepo ...@@ -236,7 +236,7 @@ ngramsElementFromRepo
, _ne_parent = p , _ne_parent = p
, _ne_children = c , _ne_children = c
, _ne_ngrams = ngrams , _ne_ngrams = ngrams
, _ne_occurrences = panic "API.Ngrams._ne_occurrences" , _ne_occurrences = 0 -- panic "API.Ngrams._ne_occurrences"
{- {-
-- Here we could use 0 if we want to avoid any `panic`. -- Here we could use 0 if we want to avoid any `panic`.
-- It will not happen using getTableNgrams if -- It will not happen using getTableNgrams if
...@@ -875,14 +875,14 @@ type MaxSize = Int ...@@ -875,14 +875,14 @@ type MaxSize = Int
getTableNgrams :: (RepoCmdM env err m, HasNodeError err, HasConnection env) getTableNgrams :: (RepoCmdM env err m, HasNodeError err, HasConnection env)
=> NodeId -> TabType => NodeType -> NodeId -> TabType
-> ListId -> Limit -> Maybe Offset -> ListId -> Limit -> Maybe Offset
-> Maybe ListType -> Maybe ListType
-> Maybe MinSize -> Maybe MaxSize -> Maybe MinSize -> Maybe MaxSize
-> Maybe OrderBy -> Maybe OrderBy
-> (NgramsTerm -> Bool) -> (NgramsTerm -> Bool)
-> m (Versioned NgramsTable) -> m (Versioned NgramsTable)
getTableNgrams nId tabType listId limit_ offset getTableNgrams nType nId tabType listId limit_ offset
listType minSize maxSize orderBy searchQuery = do listType minSize maxSize orderBy searchQuery = do
lIds <- selectNodesWithUsername NodeList userMaster lIds <- selectNodesWithUsername NodeList userMaster
...@@ -902,12 +902,14 @@ getTableNgrams nId tabType listId limit_ offset ...@@ -902,12 +902,14 @@ getTableNgrams nId tabType listId limit_ offset
selected_inner roots n = maybe False (`Set.member` roots) (n ^. ne_root) selected_inner roots n = maybe False (`Set.member` roots) (n ^. ne_root)
---------------------------------------
sortOnOrder Nothing = identity sortOnOrder Nothing = identity
sortOnOrder (Just TermAsc) = List.sortOn $ view ne_ngrams sortOnOrder (Just TermAsc) = List.sortOn $ view ne_ngrams
sortOnOrder (Just TermDesc) = List.sortOn $ Down . view ne_ngrams sortOnOrder (Just TermDesc) = List.sortOn $ Down . view ne_ngrams
sortOnOrder (Just ScoreAsc) = List.sortOn $ view ne_occurrences sortOnOrder (Just ScoreAsc) = List.sortOn $ view ne_occurrences
sortOnOrder (Just ScoreDesc) = List.sortOn $ Down . view ne_occurrences sortOnOrder (Just ScoreDesc) = List.sortOn $ Down . view ne_occurrences
---------------------------------------
selectAndPaginate tableMap (NgramsTable list) = NgramsTable $ roots <> inners selectAndPaginate tableMap (NgramsTable list) = NgramsTable $ roots <> inners
where where
rootOf ne = maybe ne (\r -> ngramsElementFromRepo (r, fromMaybe (panic "getTableNgrams: invalid root") (tableMap ^. v_data . at r))) rootOf ne = maybe ne (\r -> ngramsElementFromRepo (r, fromMaybe (panic "getTableNgrams: invalid root") (tableMap ^. v_data . at r)))
...@@ -920,14 +922,20 @@ getTableNgrams nId tabType listId limit_ offset ...@@ -920,14 +922,20 @@ getTableNgrams nId tabType listId limit_ offset
rootsSet = Set.fromList (_ne_ngrams <$> roots) rootsSet = Set.fromList (_ne_ngrams <$> roots)
inners = list & filter (selected_inner rootsSet) inners = list & filter (selected_inner rootsSet)
---------------------------------------
setScores False table = pure table setScores False table = pure table
setScores True table = do setScores True table = do
occurrences <- getOccByNgramsOnlySafe nId (lIds <> [listId]) ngramsType (table ^.. v_data . _NgramsTable . each . ne_ngrams) let ngrams_terms = (table ^.. v_data . _NgramsTable . each . ne_ngrams)
occurrences <- getOccByNgramsOnlySlow nType nId
(lIds <> [listId])
ngramsType
ngrams_terms
let let
setOcc ne = ne & ne_occurrences .~ sumOf (at (ne ^. ne_ngrams) . _Just) occurrences setOcc ne = ne & ne_occurrences .~ sumOf (at (ne ^. ne_ngrams) . _Just) occurrences
pure $ table & v_data . _NgramsTable . each %~ setOcc pure $ table & v_data . _NgramsTable . each %~ setOcc
---------------------------------------
-- lists <- catMaybes <$> listsWith userMaster -- lists <- catMaybes <$> listsWith userMaster
-- trace (show lists) $ -- trace (show lists) $
...@@ -1003,7 +1011,7 @@ getTableNgramsCorpus :: (RepoCmdM env err m, HasNodeError err, HasConnection env ...@@ -1003,7 +1011,7 @@ getTableNgramsCorpus :: (RepoCmdM env err m, HasNodeError err, HasConnection env
-> Maybe Text -- full text search -> Maybe Text -- full text search
-> m (Versioned NgramsTable) -> m (Versioned NgramsTable)
getTableNgramsCorpus nId tabType listId limit_ offset listType minSize maxSize orderBy mt = getTableNgramsCorpus nId tabType listId limit_ offset listType minSize maxSize orderBy mt =
getTableNgrams nId tabType listId limit_ offset listType minSize maxSize orderBy searchQuery getTableNgrams NodeCorpus nId tabType listId limit_ offset listType minSize maxSize orderBy searchQuery
where where
searchQuery = maybe (const True) isInfixOf mt searchQuery = maybe (const True) isInfixOf mt
...@@ -1021,7 +1029,10 @@ getTableNgramsDoc dId tabType listId limit_ offset listType minSize maxSize orde ...@@ -1021,7 +1029,10 @@ getTableNgramsDoc dId tabType listId limit_ offset listType minSize maxSize orde
let ngramsType = ngramsTypeFromTabType tabType let ngramsType = ngramsTypeFromTabType tabType
ngs <- selectNgramsByDoc (ns <> [listId]) dId ngramsType ngs <- selectNgramsByDoc (ns <> [listId]) dId ngramsType
let searchQuery = flip S.member (S.fromList ngs) let searchQuery = flip S.member (S.fromList ngs)
getTableNgrams dId tabType listId limit_ offset listType minSize maxSize orderBy searchQuery getTableNgrams NodeDocument dId tabType listId limit_ offset listType minSize maxSize orderBy searchQuery
......
...@@ -28,6 +28,7 @@ Portability : POSIX ...@@ -28,6 +28,7 @@ Portability : POSIX
module Gargantext.Database.Flow -- (flowDatabase, ngrams2list) module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
where where
--import Debug.Trace (trace)
import Control.Lens ((^.), view, Lens', _Just) import Control.Lens ((^.), view, Lens', _Just)
import Control.Monad (mapM_) import Control.Monad (mapM_)
import Control.Monad.IO.Class (liftIO) import Control.Monad.IO.Class (liftIO)
...@@ -126,7 +127,6 @@ flowCorpusSearchInDatabase u la q = do ...@@ -126,7 +127,6 @@ flowCorpusSearchInDatabase u la q = do
------------------------------------------------------------------------ ------------------------------------------------------------------------
flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c) flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
=> Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
flow c u cn la docs = do flow c u cn la docs = do
...@@ -177,10 +177,7 @@ insertMasterDocs c lang hs = do ...@@ -177,10 +177,7 @@ insertMasterDocs c lang hs = do
ids <- insertDb masterUserId masterCorpusId hs' ids <- insertDb masterUserId masterCorpusId hs'
let documentsWithId = mergeData (toInserted ids) (DM.fromList $ map viewUniqId' hs') let documentsWithId = mergeData (toInserted ids) (DM.fromList $ map viewUniqId' hs')
docsWithNgrams <- documentIdWithNgrams (extractNgramsT lang) documentsWithId maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang) documentsWithId
let maps = mapNodeIdNgrams docsWithNgrams
terms2id <- insertNgrams $ DM.keys maps terms2id <- insertNgrams $ DM.keys maps
let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
......
...@@ -162,17 +162,21 @@ getOccByNgramsOnlyFast cId nt ngs = ...@@ -162,17 +162,21 @@ getOccByNgramsOnlyFast cId nt ngs =
fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
-- just slower than getOccByNgramsOnlyFast -- just slower than getOccByNgramsOnlyFast
getOccByNgramsOnlySlow :: CorpusId -> [ListId] -> NgramsType -> [Text] getOccByNgramsOnlySlow :: NodeType -> CorpusId -> [ListId] -> NgramsType -> [Text]
-> Cmd err (Map Text Int) -> Cmd err (Map Text Int)
getOccByNgramsOnlySlow cId ls nt ngs = getOccByNgramsOnlySlow t cId ls nt ngs =
Map.map Set.size <$> getNodesByNgramsOnlyUser cId ls nt ngs Map.map Set.size <$> getScore' t cId ls nt ngs
where
getScore' NodeCorpus = getNodesByNgramsOnlyUser
getScore' NodeDocument = getNgramsByDocOnlyUser
getScore' _ = getNodesByNgramsOnlyUser
getOccByNgramsOnlySafe :: CorpusId -> [ListId] -> NgramsType -> [Text] getOccByNgramsOnlySafe :: CorpusId -> [ListId] -> NgramsType -> [Text]
-> Cmd err (Map Text Int) -> Cmd err (Map Text Int)
getOccByNgramsOnlySafe cId ls nt ngs = do getOccByNgramsOnlySafe cId ls nt ngs = do
printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs) printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
fast <- getOccByNgramsOnlyFast cId nt ngs fast <- getOccByNgramsOnlyFast cId nt ngs
slow <- getOccByNgramsOnlySlow cId ls nt ngs slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
when (fast /= slow) $ when (fast /= slow) $
printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int))) printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
pure slow pure slow
...@@ -209,7 +213,7 @@ queryNgramsOccurrencesOnlyByNodeUser = [sql| ...@@ -209,7 +213,7 @@ queryNgramsOccurrencesOnlyByNodeUser = [sql|
GROUP BY nng.node2_id, ng.terms GROUP BY nng.node2_id, ng.terms
|] |]
getNodesByNgramsOnlyUser :: CorpusId -> [ListId] -> NgramsType -> [Text] getNodesByNgramsOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
-> Cmd err (Map Text (Set NodeId)) -> Cmd err (Map Text (Set NodeId))
getNodesByNgramsOnlyUser cId ls nt ngs = Map.unionsWith (<>) getNodesByNgramsOnlyUser cId ls nt ngs = Map.unionsWith (<>)
. map (fromListWith (<>) . map (second Set.singleton)) . map (fromListWith (<>) . map (second Set.singleton))
...@@ -248,6 +252,39 @@ queryNgramsOnlyByNodeUser = [sql| ...@@ -248,6 +252,39 @@ queryNgramsOnlyByNodeUser = [sql|
getNgramsByDocOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
-> Cmd err (Map Text (Set NodeId))
getNgramsByDocOnlyUser cId ls nt ngs = Map.unionsWith (<>)
. map (fromListWith (<>) . map (second Set.singleton))
<$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
selectNgramsOnlyByDocUser :: DocId -> [ListId] -> NgramsType -> [Text]
-> Cmd err [(Text, NodeId)]
selectNgramsOnlyByDocUser dId ls nt tms =
runPGSQuery queryNgramsOnlyByDocUser
( Values fields (DPS.Only <$> tms)
, Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
, dId
, ngramsTypeId nt
)
where
fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOnlyByDocUser :: DPS.Query
queryNgramsOnlyByDocUser = [sql|
WITH input_rows(terms) AS (?),
input_list(id) AS (?)
SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
JOIN ngrams ng ON nng.ngrams_id = ng.id
JOIN input_rows ir ON ir.terms = ng.terms
JOIN input_list il ON il.id = nng.node1_id
WHERE nng.node2_id = ? -- DocId
AND nng.ngrams_type = ? -- NgramsTypeId
GROUP BY ng.terms, nng.node2_id
|]
------------------------------------------------------------------------ ------------------------------------------------------------------------
......
...@@ -60,23 +60,21 @@ buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType ...@@ -60,23 +60,21 @@ buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
buildNgramsOthersList uCid groupIt nt = do buildNgramsOthersList uCid groupIt nt = do
ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
pure $ Map.fromList [(nt, [ mkNgramsElement t CandidateTerm Nothing (mSetFromList []) let
| (t,_ns) <- Map.toList ngs all' = Map.toList ngs
pure $ (toElements GraphTerm $ take 10 all') <> (toElements CandidateTerm $ drop 10 all')
where
toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
| (t,_ns) <- x
] ]
) )
] ]
-- TODO remove hard coded parameters
buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList l n m s uCid mCid = do buildNgramsTermsList l n m s uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m) candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
--printDebug "candidate" (length candidates)
let termList = toTermList ((isStopTerm s) . fst) candidates let termList = toTermList ((isStopTerm s) . fst) candidates
--let termList = toTermList ((\_ -> False) . fst) candidates
--printDebug "termlist" (length termList)
let ngs = List.concat $ map toNgramsElement termList let ngs = List.concat $ map toNgramsElement termList
pure $ Map.fromList [(NgramsTerms, ngs)] pure $ Map.fromList [(NgramsTerms, ngs)]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment