Commit b0b6a491 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[ngrams] implemented doc_count column

parent 092d482b
Pipeline #3368 passed with stage
in 92 minutes and 7 seconds
...@@ -70,7 +70,7 @@ api nId (SearchQuery q SearchContact) o l order = do ...@@ -70,7 +70,7 @@ api nId (SearchQuery q SearchContact) o l order = do
<$> map (toRow aId) <$> map (toRow aId)
<$> searchInCorpusWithContacts nId aId (concat q) o l order <$> searchInCorpusWithContacts nId aId (concat q) o l order
api nId (SearchQuery q SearchDocWithNgrams) o l order = undefined api _nId (SearchQuery _q SearchDocWithNgrams) _o _l _order = undefined
----------------------------------------------------------------------- -----------------------------------------------------------------------
----------------------------------------------------------------------- -----------------------------------------------------------------------
......
...@@ -171,17 +171,10 @@ isSimpleNgrams _ = False ...@@ -171,17 +171,10 @@ isSimpleNgrams _ = False
-- 'MonoMulti' : mono and multi -- 'MonoMulti' : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet) -- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType Lang -> Text -> IO [TermsWithCount] terms :: TermType Lang -> Text -> IO [TermsWithCount]
terms tt txt = do terms (Mono lang) txt = pure $ monoTerms lang txt
printDebug "[terms] tt" tt terms (Multi lang) txt = multiterms lang txt
printDebug "[terms] txt" txt terms (MonoMulti lang) txt = terms (Multi lang) txt
out <- termsNoLog tt txt terms (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
printDebug "[terms] out" out
pure out
termsNoLog :: TermType Lang -> Text -> IO [TermsWithCount]
termsNoLog (Mono lang) txt = pure $ monoTerms lang txt
termsNoLog (Multi lang) txt = multiterms lang txt
termsNoLog (MonoMulti lang) txt = terms (Multi lang) txt
termsNoLog (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
where where
m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
......
...@@ -375,7 +375,7 @@ saveDocNgramsWith :: (FlowCmdM env err m) ...@@ -375,7 +375,7 @@ saveDocNgramsWith :: (FlowCmdM env err m)
-> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount))) -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
-> m () -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do saveDocNgramsWith lId mapNgramsDocs' = do
printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs' --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs' let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount
...@@ -524,8 +524,6 @@ instance ExtractNgramsT HyperdataDocument ...@@ -524,8 +524,6 @@ instance ExtractNgramsT HyperdataDocument
termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt)) termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
<$> concat <$> concat
<$> liftBase (extractTerms lang' $ hasText doc) <$> liftBase (extractTerms lang' $ hasText doc)
printDebug "[extractNgramsT HyperdataDocument] termsWithCounts'" termsWithCounts'
printDebug "[extractNgramsT HyperdataDocument] termsWithLargerCounts" $ filter (\(_, cnt) -> cnt > 1) termsWithCounts'
pure $ HashMap.fromList pure $ HashMap.fromList
$ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ] $ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ]
......
...@@ -14,7 +14,10 @@ module Gargantext.Database.Action.Search where ...@@ -14,7 +14,10 @@ module Gargantext.Database.Action.Search where
import Control.Arrow (returnA) import Control.Arrow (returnA)
import Control.Lens ((^.)) import Control.Lens ((^.))
import qualified Data.List as List
import qualified Data.Map as Map
import Data.Maybe import Data.Maybe
import qualified Data.Set as Set
import Data.Text (Text, unpack, intercalate) import Data.Text (Text, unpack, intercalate)
import Data.Time (UTCTime) import Data.Time (UTCTime)
import Gargantext.Core import Gargantext.Core
...@@ -26,9 +29,11 @@ import Gargantext.Database.Query.Filter ...@@ -26,9 +29,11 @@ import Gargantext.Database.Query.Filter
import Gargantext.Database.Query.Join (leftJoin5) import Gargantext.Database.Query.Join (leftJoin5)
import Gargantext.Database.Query.Table.Node import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Context import Gargantext.Database.Query.Table.Context
import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
import Gargantext.Database.Query.Table.NodeContext import Gargantext.Database.Query.Table.NodeContext
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Query.Table.NodeContext_NodeContext import Gargantext.Database.Query.Table.NodeContext_NodeContext
import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Schema.Node import Gargantext.Database.Schema.Node
import Gargantext.Database.Schema.Context import Gargantext.Database.Schema.Context
import Gargantext.Prelude import Gargantext.Prelude
...@@ -54,6 +59,8 @@ searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t) ...@@ -54,6 +59,8 @@ searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
-- search only to map/candidate terms.
searchInCorpusWithNgrams :: HasDBid NodeType searchInCorpusWithNgrams :: HasDBid NodeType
=> CorpusId => CorpusId
-> ListId -> ListId
...@@ -64,7 +71,59 @@ searchInCorpusWithNgrams :: HasDBid NodeType ...@@ -64,7 +71,59 @@ searchInCorpusWithNgrams :: HasDBid NodeType
-> Maybe Limit -> Maybe Limit
-> Maybe OrderBy -> Maybe OrderBy
-> Cmd err [FacetDoc] -> Cmd err [FacetDoc]
searchInCorpusWithNgrams cId lId t ngt q o l order = undefined searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
-- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
-- case only the "TF" part makes sense and so we only compute the
-- ratio of "number of times our terms appear in given document" and
-- "number of all terms in document" and return a sorted list of
-- document ids
tfidfAll :: CorpusId -> [Int] -> Cmd err [Int]
tfidfAll cId ngramIds = do
let ngramIdsSet = Set.fromList ngramIds
docsWithNgrams <- runOpaQuery (queryCorpusWithNgrams cId ngramIds) :: Cmd err [(Int, Int, Int)]
-- NOTE The query returned docs with ANY ngramIds. We need to further
-- restrict to ALL ngramIds.
let docsNgramsM =
Map.fromListWith (Set.union)
[ (ctxId, Set.singleton ngrams_id)
| (ctxId, ngrams_id, _) <- docsWithNgrams]
let docsWithAllNgramsS = Set.fromList $ List.map fst $
List.filter (\(_, docNgrams) ->
ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
let docsWithAllNgrams =
List.filter (\(ctxId, _, _) ->
Set.member ctxId docsWithAllNgramsS) docsWithNgrams
printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
| (ctxId, _, doc_count) <- docsWithAllNgrams]
printDebug "[tfidfAll] docsWithCounts" docsWithCounts
let totals = [ ( ctxId
, ngrams_id
, fromIntegral doc_count :: Double
, fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
| (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
| (ctxId, _, doc_count, s) <- totals]
pure $ List.map fst $ List.reverse tfidf_sorted
-- | Query for searching the 'context_node_ngrams' table so that we
-- find docs with ANY given 'ngramIds'.
queryCorpusWithNgrams :: CorpusId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
queryCorpusWithNgrams cId ngramIds = proc () -> do
row <- queryContextNodeNgramsTable -< ()
restrict -< (_cnng_node_id row) .== (pgNodeId cId)
restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
returnA -< ( _cnng_context_id row
, _cnng_ngrams_id row
, _cnng_doc_count row)
--returnA -< row
-- returnA -< ( _cnng_context_id row
-- , _cnng_node_id row
-- , _cnng_ngrams_id row
-- , _cnng_ngramsType row
-- , _cnng_weight row
-- , _cnng_doc_count row)
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -225,4 +284,3 @@ queryContactViaDoc = ...@@ -225,4 +284,3 @@ queryContactViaDoc =
) )
) -> Column SqlBool ) -> Column SqlBool
cond45 (doc, (corpus, (_,(_,_)))) = doc^.cs_id .== corpus^.nc_context_id cond45 (doc, (corpus, (_,(_,_)))) = doc^.cs_id .== corpus^.nc_context_id
...@@ -375,4 +375,3 @@ leftJoin9 q1 q2 q3 q4 q5 q6 q7 q8 q9 ...@@ -375,4 +375,3 @@ leftJoin9 q1 q2 q3 q4 q5 q6 q7 q8 q9
) cond67 ) cond67
) cond78 ) cond78
) cond89 ) cond89
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment