[ngrams] implemented doc_count column

b0b6a491 · Przemyslaw Kaminski · 092d482b · b0b6a491 · b0b6a491 · b0b6a491
Commit b0b6a491 authored Nov 07, 2022 by Przemyslaw Kaminski
6 changed files
--- a/src/Gargantext/API/Search.hs
+++ b/src/Gargantext/API/Search.hs
@@ -70,7 +70,7 @@ api nId (SearchQuery q SearchContact) o l order = do
            <$> map (toRow aId)
            <$> searchInCorpusWithContacts nId aId (concat q) o l order

-api nId (SearchQuery q SearchDocWithNgrams) o l order = undefined
+api _nId (SearchQuery _q SearchDocWithNgrams) _o _l _order = undefined

 -----------------------------------------------------------------------
 -----------------------------------------------------------------------
@@ -135,7 +135,7 @@ instance FromJSON SearchResultTypes where
  parseJSON = genericParseJSON (defaultOptions { sumEncoding = defaultTaggedObject })
 instance ToJSON SearchResultTypes where
  toJSON = genericToJSON (defaultOptions { sumEncoding = defaultTaggedObject })
- 
+
 instance Arbitrary SearchResultTypes where
  arbitrary = do
    srd <- SearchResultDoc     <$> arbitrary
@@ -166,7 +166,7 @@ data Row =
  deriving (Generic)
 instance FromJSON  Row
  where
-    parseJSON = genericParseJSON 
+    parseJSON = genericParseJSON
                 ( defaultOptions { sumEncoding = defaultTaggedObject } )
 instance ToJSON  Row
  where

--- a/src/Gargantext/Core/Text/Terms.hs
+++ b/src/Gargantext/Core/Text/Terms.hs
@@ -171,17 +171,10 @@ isSimpleNgrams _                = False
 -- 'MonoMulti' : mono and multi
 -- TODO : multi terms should exclude mono (intersection is not empty yet)
 terms :: TermType Lang -> Text -> IO [TermsWithCount]
-terms tt txt = do
-  printDebug "[terms] tt" tt
-  printDebug "[terms] txt" txt
-  out <- termsNoLog tt txt
-  printDebug "[terms] out" out
-  pure out
-termsNoLog :: TermType Lang -> Text -> IO [TermsWithCount]
-termsNoLog (Mono      lang) txt = pure $ monoTerms lang txt
-termsNoLog (Multi     lang) txt = multiterms lang txt
-termsNoLog (MonoMulti lang) txt = terms (Multi lang) txt
-termsNoLog (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
+terms (Mono      lang) txt = pure $ monoTerms lang txt
+terms (Multi     lang) txt = multiterms lang txt
+terms (MonoMulti lang) txt = terms (Multi lang) txt
+terms (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
  where
    m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
 -- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt

--- a/src/Gargantext/Database/Action/Flow.hs
+++ b/src/Gargantext/Database/Action/Flow.hs
@@ -375,7 +375,7 @@ saveDocNgramsWith :: (FlowCmdM env err m)
                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
                  -> m ()
 saveDocNgramsWith lId mapNgramsDocs' = do
-  printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
+  --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
  let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount

@@ -524,8 +524,6 @@ instance ExtractNgramsT HyperdataDocument
          termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
                              <$> concat
                              <$> liftBase (extractTerms lang' $ hasText doc)
-          printDebug "[extractNgramsT HyperdataDocument] termsWithCounts'" termsWithCounts'
-          printDebug "[extractNgramsT HyperdataDocument] termsWithLargerCounts" $ filter (\(_, cnt) -> cnt > 1) termsWithCounts'

          pure $ HashMap.fromList
               $  [(SimpleNgrams source, (Map.singleton Sources     1, 1))                    ]

--- a/src/Gargantext/Database/Action/Search.hs
+++ b/src/Gargantext/Database/Action/Search.hs
@@ -14,7 +14,10 @@ module Gargantext.Database.Action.Search where

 import Control.Arrow (returnA)
 import Control.Lens ((^.))
+import qualified Data.List as List
+import qualified Data.Map as Map
 import Data.Maybe
+import qualified Data.Set as Set
 import Data.Text (Text, unpack, intercalate)
 import Data.Time (UTCTime)
 import Gargantext.Core
@@ -26,9 +29,11 @@ import Gargantext.Database.Query.Filter
 import Gargantext.Database.Query.Join (leftJoin5)
 import Gargantext.Database.Query.Table.Node
 import Gargantext.Database.Query.Table.Context
+import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
 import Gargantext.Database.Query.Table.NodeContext
-import Gargantext.Database.Schema.Ngrams (NgramsType(..))
 import Gargantext.Database.Query.Table.NodeContext_NodeContext
+import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
+import Gargantext.Database.Schema.Ngrams (NgramsType(..))
 import Gargantext.Database.Schema.Node
 import Gargantext.Database.Schema.Context
 import Gargantext.Prelude
@@ -44,7 +49,7 @@ searchDocInDatabase :: HasDBid NodeType
                    -> Cmd err [(NodeId, HyperdataDocument)]
 searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
  where
-    -- | Global search query where ParentId is Master Node Corpus Id 
+    -- | Global search query where ParentId is Master Node Corpus Id
    queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
    queryDocInDatabase _p q = proc () -> do
        row <- queryNodeSearchTable -< ()
@@ -54,6 +59,8 @@ searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)

 ------------------------------------------------------------------------

+-- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
+-- search only to map/candidate terms.
 searchInCorpusWithNgrams :: HasDBid NodeType
               => CorpusId
               -> ListId
@@ -64,7 +71,59 @@ searchInCorpusWithNgrams :: HasDBid NodeType
               -> Maybe Limit
               -> Maybe OrderBy
               -> Cmd err [FacetDoc]
-searchInCorpusWithNgrams cId lId t ngt q o l order = undefined
+searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
+
+-- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
+-- case only the "TF" part makes sense and so we only compute the
+-- ratio of "number of times our terms appear in given document" and
+-- "number of all terms in document" and return a sorted list of
+-- document ids
+tfidfAll :: CorpusId -> [Int] -> Cmd err [Int]
+tfidfAll cId ngramIds = do
+  let ngramIdsSet = Set.fromList ngramIds
+  docsWithNgrams <- runOpaQuery (queryCorpusWithNgrams cId ngramIds) :: Cmd err [(Int, Int, Int)]
+  -- NOTE The query returned docs with ANY ngramIds. We need to further
+  -- restrict to ALL ngramIds.
+  let docsNgramsM =
+        Map.fromListWith (Set.union)
+            [ (ctxId, Set.singleton ngrams_id)
+            | (ctxId, ngrams_id, _) <- docsWithNgrams]
+  let docsWithAllNgramsS = Set.fromList $ List.map fst $
+        List.filter (\(_, docNgrams) ->
+                        ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
+  let docsWithAllNgrams =
+        List.filter (\(ctxId, _, _) ->
+                       Set.member ctxId docsWithAllNgramsS) docsWithNgrams
+  printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
+  let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
+                                            | (ctxId, _, doc_count) <- docsWithAllNgrams]
+  printDebug "[tfidfAll] docsWithCounts" docsWithCounts
+  let totals = [ ( ctxId
+                 , ngrams_id
+                 , fromIntegral doc_count :: Double
+                 , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
+               | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
+  let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
+                                     | (ctxId, _, doc_count, s) <- totals]
+  pure $ List.map fst $ List.reverse tfidf_sorted
+
+-- | Query for searching the 'context_node_ngrams' table so that we
+-- find docs with ANY given 'ngramIds'.
+queryCorpusWithNgrams :: CorpusId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
+queryCorpusWithNgrams cId ngramIds = proc () -> do
+  row <- queryContextNodeNgramsTable -< ()
+  restrict -< (_cnng_node_id row) .== (pgNodeId cId)
+  restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
+  returnA -< ( _cnng_context_id row
+             , _cnng_ngrams_id row
+             , _cnng_doc_count row)
+  --returnA -< row
+  -- returnA -< ( _cnng_context_id row
+  --            , _cnng_node_id row
+  --            , _cnng_ngrams_id row
+  --            , _cnng_ngramsType row
+  --            , _cnng_weight row
+  --            , _cnng_doc_count row)


 ------------------------------------------------------------------------
@@ -225,4 +284,3 @@ queryContactViaDoc =
                  )
                ) -> Column SqlBool
      cond45 (doc, (corpus, (_,(_,_)))) = doc^.cs_id .== corpus^.nc_context_id
-
--- a/src/Gargantext/Database/Query/Facet.hs
+++ b/src/Gargantext/Database/Query/Facet.hs
@@ -103,7 +103,7 @@ data Facet id created title hyperdata category ngramCount score =
              , facetDoc_score      :: score
              } deriving (Show, Generic)
 {- | TODO after demo
-data Facet id date hyperdata score = 
+data Facet id date hyperdata score =
     FacetDoc { facetDoc_id        :: id
              , facetDoc_date      :: date
              , facetDoc_hyperdata :: hyperdata
@@ -163,7 +163,7 @@ type FacetPairedReadNull = FacetPaired (Column (Nullable SqlInt4)       )
                                       (Column (Nullable SqlInt4)       )

 type FacetPairedReadNullAgg = FacetPaired (Aggregator (Column (Nullable SqlInt4)       )
-                                                      (Column (Nullable SqlInt4)       ) 
+                                                      (Column (Nullable SqlInt4)       )
                                          )
                                          (Aggregator (Column (Nullable SqlTimestamptz))
                                                      (Column (Nullable SqlTimestamptz))

--- a/src/Gargantext/Database/Query/Join.hs
+++ b/src/Gargantext/Database/Query/Join.hs
@@ -43,7 +43,7 @@ import Opaleye.Internal.Join (NullMaker(..))
 import qualified Opaleye.Internal.Unpackspec()


-keepWhen :: (a -> Field SqlBool) -> SelectArr a a  
+keepWhen :: (a -> Field SqlBool) -> SelectArr a a
 keepWhen p = proc a -> do
  restrict  -< p a
  returnA -< a
@@ -61,7 +61,7 @@ leftJoin2 = leftJoin
 ------------------------------------------------------------------------
 -- | LeftJoin3 in two ways to write it
 leftJoin3 :: Select columnsA -> Select columnsB -> Select columnsC
-      -> ((columnsA, columnsB, columnsC) -> Column SqlBool) 
+      -> ((columnsA, columnsB, columnsC) -> Column SqlBool)
      -> Select (columnsA, columnsB, columnsC)
 leftJoin3 q1 q2 q3 cond = ((,,) <$> q1 <*> q2 <*> q3) >>> keepWhen cond

@@ -82,7 +82,7 @@ leftJoin4' :: Select columnsA
           -> Select columnsB
           -> Select columnsC
           -> Select columnsD
-      -> ((columnsA, columnsB, columnsC, columnsD) -> Column SqlBool) 
+      -> ((columnsA, columnsB, columnsC, columnsD) -> Column SqlBool)
      -> Select (columnsA, columnsB, columnsC, columnsD)
 leftJoin4' q1 q2 q3 q4 cond = ((,,,) <$> q1 <*> q2 <*> q3 <*> q4) >>> keepWhen cond

@@ -375,4 +375,3 @@ leftJoin9   q1 q2    q3     q4     q5     q6      q7    q8     q9
                  ) cond67
                ) cond78
              ) cond89
-