Commit b0b6a491 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[ngrams] implemented doc_count column

parent 092d482b
Pipeline #3368 passed with stage
in 92 minutes and 7 seconds
......@@ -70,7 +70,7 @@ api nId (SearchQuery q SearchContact) o l order = do
<$> map (toRow aId)
<$> searchInCorpusWithContacts nId aId (concat q) o l order
api nId (SearchQuery q SearchDocWithNgrams) o l order = undefined
api _nId (SearchQuery _q SearchDocWithNgrams) _o _l _order = undefined
-----------------------------------------------------------------------
-----------------------------------------------------------------------
......@@ -135,7 +135,7 @@ instance FromJSON SearchResultTypes where
parseJSON = genericParseJSON (defaultOptions { sumEncoding = defaultTaggedObject })
instance ToJSON SearchResultTypes where
toJSON = genericToJSON (defaultOptions { sumEncoding = defaultTaggedObject })
instance Arbitrary SearchResultTypes where
arbitrary = do
srd <- SearchResultDoc <$> arbitrary
......@@ -166,7 +166,7 @@ data Row =
deriving (Generic)
instance FromJSON Row
where
parseJSON = genericParseJSON
parseJSON = genericParseJSON
( defaultOptions { sumEncoding = defaultTaggedObject } )
instance ToJSON Row
where
......
......@@ -171,17 +171,10 @@ isSimpleNgrams _ = False
-- 'MonoMulti' : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType Lang -> Text -> IO [TermsWithCount]
terms tt txt = do
printDebug "[terms] tt" tt
printDebug "[terms] txt" txt
out <- termsNoLog tt txt
printDebug "[terms] out" out
pure out
termsNoLog :: TermType Lang -> Text -> IO [TermsWithCount]
termsNoLog (Mono lang) txt = pure $ monoTerms lang txt
termsNoLog (Multi lang) txt = multiterms lang txt
termsNoLog (MonoMulti lang) txt = terms (Multi lang) txt
termsNoLog (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
where
m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
......
......@@ -375,7 +375,7 @@ saveDocNgramsWith :: (FlowCmdM env err m)
-> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
-> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
--printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount
......@@ -524,8 +524,6 @@ instance ExtractNgramsT HyperdataDocument
termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
<$> concat
<$> liftBase (extractTerms lang' $ hasText doc)
printDebug "[extractNgramsT HyperdataDocument] termsWithCounts'" termsWithCounts'
printDebug "[extractNgramsT HyperdataDocument] termsWithLargerCounts" $ filter (\(_, cnt) -> cnt > 1) termsWithCounts'
pure $ HashMap.fromList
$ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ]
......
......@@ -14,7 +14,10 @@ module Gargantext.Database.Action.Search where
import Control.Arrow (returnA)
import Control.Lens ((^.))
import qualified Data.List as List
import qualified Data.Map as Map
import Data.Maybe
import qualified Data.Set as Set
import Data.Text (Text, unpack, intercalate)
import Data.Time (UTCTime)
import Gargantext.Core
......@@ -26,9 +29,11 @@ import Gargantext.Database.Query.Filter
import Gargantext.Database.Query.Join (leftJoin5)
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Context
import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
import Gargantext.Database.Query.Table.NodeContext
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Query.Table.NodeContext_NodeContext
import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Schema.Node
import Gargantext.Database.Schema.Context
import Gargantext.Prelude
......@@ -44,7 +49,7 @@ searchDocInDatabase :: HasDBid NodeType
-> Cmd err [(NodeId, HyperdataDocument)]
searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
where
-- | Global search query where ParentId is Master Node Corpus Id
-- | Global search query where ParentId is Master Node Corpus Id
queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
queryDocInDatabase _p q = proc () -> do
row <- queryNodeSearchTable -< ()
......@@ -54,6 +59,8 @@ searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
------------------------------------------------------------------------
-- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
-- search only to map/candidate terms.
searchInCorpusWithNgrams :: HasDBid NodeType
=> CorpusId
-> ListId
......@@ -64,7 +71,59 @@ searchInCorpusWithNgrams :: HasDBid NodeType
-> Maybe Limit
-> Maybe OrderBy
-> Cmd err [FacetDoc]
searchInCorpusWithNgrams cId lId t ngt q o l order = undefined
searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
-- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
-- case only the "TF" part makes sense and so we only compute the
-- ratio of "number of times our terms appear in given document" and
-- "number of all terms in document" and return a sorted list of
-- document ids
tfidfAll :: CorpusId -> [Int] -> Cmd err [Int]
tfidfAll cId ngramIds = do
let ngramIdsSet = Set.fromList ngramIds
docsWithNgrams <- runOpaQuery (queryCorpusWithNgrams cId ngramIds) :: Cmd err [(Int, Int, Int)]
-- NOTE The query returned docs with ANY ngramIds. We need to further
-- restrict to ALL ngramIds.
let docsNgramsM =
Map.fromListWith (Set.union)
[ (ctxId, Set.singleton ngrams_id)
| (ctxId, ngrams_id, _) <- docsWithNgrams]
let docsWithAllNgramsS = Set.fromList $ List.map fst $
List.filter (\(_, docNgrams) ->
ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
let docsWithAllNgrams =
List.filter (\(ctxId, _, _) ->
Set.member ctxId docsWithAllNgramsS) docsWithNgrams
printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
| (ctxId, _, doc_count) <- docsWithAllNgrams]
printDebug "[tfidfAll] docsWithCounts" docsWithCounts
let totals = [ ( ctxId
, ngrams_id
, fromIntegral doc_count :: Double
, fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
| (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
| (ctxId, _, doc_count, s) <- totals]
pure $ List.map fst $ List.reverse tfidf_sorted
-- | Query for searching the 'context_node_ngrams' table so that we
-- find docs with ANY given 'ngramIds'.
queryCorpusWithNgrams :: CorpusId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
queryCorpusWithNgrams cId ngramIds = proc () -> do
row <- queryContextNodeNgramsTable -< ()
restrict -< (_cnng_node_id row) .== (pgNodeId cId)
restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
returnA -< ( _cnng_context_id row
, _cnng_ngrams_id row
, _cnng_doc_count row)
--returnA -< row
-- returnA -< ( _cnng_context_id row
-- , _cnng_node_id row
-- , _cnng_ngrams_id row
-- , _cnng_ngramsType row
-- , _cnng_weight row
-- , _cnng_doc_count row)
------------------------------------------------------------------------
......@@ -225,4 +284,3 @@ queryContactViaDoc =
)
) -> Column SqlBool
cond45 (doc, (corpus, (_,(_,_)))) = doc^.cs_id .== corpus^.nc_context_id
......@@ -103,7 +103,7 @@ data Facet id created title hyperdata category ngramCount score =
, facetDoc_score :: score
} deriving (Show, Generic)
{- | TODO after demo
data Facet id date hyperdata score =
data Facet id date hyperdata score =
FacetDoc { facetDoc_id :: id
, facetDoc_date :: date
, facetDoc_hyperdata :: hyperdata
......@@ -163,7 +163,7 @@ type FacetPairedReadNull = FacetPaired (Column (Nullable SqlInt4) )
(Column (Nullable SqlInt4) )
type FacetPairedReadNullAgg = FacetPaired (Aggregator (Column (Nullable SqlInt4) )
(Column (Nullable SqlInt4) )
(Column (Nullable SqlInt4) )
)
(Aggregator (Column (Nullable SqlTimestamptz))
(Column (Nullable SqlTimestamptz))
......
......@@ -43,7 +43,7 @@ import Opaleye.Internal.Join (NullMaker(..))
import qualified Opaleye.Internal.Unpackspec()
keepWhen :: (a -> Field SqlBool) -> SelectArr a a
keepWhen :: (a -> Field SqlBool) -> SelectArr a a
keepWhen p = proc a -> do
restrict -< p a
returnA -< a
......@@ -61,7 +61,7 @@ leftJoin2 = leftJoin
------------------------------------------------------------------------
-- | LeftJoin3 in two ways to write it
leftJoin3 :: Select columnsA -> Select columnsB -> Select columnsC
-> ((columnsA, columnsB, columnsC) -> Column SqlBool)
-> ((columnsA, columnsB, columnsC) -> Column SqlBool)
-> Select (columnsA, columnsB, columnsC)
leftJoin3 q1 q2 q3 cond = ((,,) <$> q1 <*> q2 <*> q3) >>> keepWhen cond
......@@ -82,7 +82,7 @@ leftJoin4' :: Select columnsA
-> Select columnsB
-> Select columnsC
-> Select columnsD
-> ((columnsA, columnsB, columnsC, columnsD) -> Column SqlBool)
-> ((columnsA, columnsB, columnsC, columnsD) -> Column SqlBool)
-> Select (columnsA, columnsB, columnsC, columnsD)
leftJoin4' q1 q2 q3 q4 cond = ((,,,) <$> q1 <*> q2 <*> q3 <*> q4) >>> keepWhen cond
......@@ -375,4 +375,3 @@ leftJoin9 q1 q2 q3 q4 q5 q6 q7 q8 q9
) cond67
) cond78
) cond89
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment