Commit b0b6a491 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[ngrams] implemented doc_count column

parent 092d482b
...@@ -70,7 +70,7 @@ api nId (SearchQuery q SearchContact) o l order = do ...@@ -70,7 +70,7 @@ api nId (SearchQuery q SearchContact) o l order = do
<$> map (toRow aId) <$> map (toRow aId)
<$> searchInCorpusWithContacts nId aId (concat q) o l order <$> searchInCorpusWithContacts nId aId (concat q) o l order
api nId (SearchQuery q SearchDocWithNgrams) o l order = undefined api _nId (SearchQuery _q SearchDocWithNgrams) _o _l _order = undefined
----------------------------------------------------------------------- -----------------------------------------------------------------------
----------------------------------------------------------------------- -----------------------------------------------------------------------
...@@ -135,7 +135,7 @@ instance FromJSON SearchResultTypes where ...@@ -135,7 +135,7 @@ instance FromJSON SearchResultTypes where
parseJSON = genericParseJSON (defaultOptions { sumEncoding = defaultTaggedObject }) parseJSON = genericParseJSON (defaultOptions { sumEncoding = defaultTaggedObject })
instance ToJSON SearchResultTypes where instance ToJSON SearchResultTypes where
toJSON = genericToJSON (defaultOptions { sumEncoding = defaultTaggedObject }) toJSON = genericToJSON (defaultOptions { sumEncoding = defaultTaggedObject })
instance Arbitrary SearchResultTypes where instance Arbitrary SearchResultTypes where
arbitrary = do arbitrary = do
srd <- SearchResultDoc <$> arbitrary srd <- SearchResultDoc <$> arbitrary
...@@ -166,7 +166,7 @@ data Row = ...@@ -166,7 +166,7 @@ data Row =
deriving (Generic) deriving (Generic)
instance FromJSON Row instance FromJSON Row
where where
parseJSON = genericParseJSON parseJSON = genericParseJSON
( defaultOptions { sumEncoding = defaultTaggedObject } ) ( defaultOptions { sumEncoding = defaultTaggedObject } )
instance ToJSON Row instance ToJSON Row
where where
......
...@@ -171,17 +171,10 @@ isSimpleNgrams _ = False ...@@ -171,17 +171,10 @@ isSimpleNgrams _ = False
-- 'MonoMulti' : mono and multi -- 'MonoMulti' : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet) -- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType Lang -> Text -> IO [TermsWithCount] terms :: TermType Lang -> Text -> IO [TermsWithCount]
terms tt txt = do terms (Mono lang) txt = pure $ monoTerms lang txt
printDebug "[terms] tt" tt terms (Multi lang) txt = multiterms lang txt
printDebug "[terms] txt" txt terms (MonoMulti lang) txt = terms (Multi lang) txt
out <- termsNoLog tt txt terms (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
printDebug "[terms] out" out
pure out
termsNoLog :: TermType Lang -> Text -> IO [TermsWithCount]
termsNoLog (Mono lang) txt = pure $ monoTerms lang txt
termsNoLog (Multi lang) txt = multiterms lang txt
termsNoLog (MonoMulti lang) txt = terms (Multi lang) txt
termsNoLog (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
where where
m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
......
...@@ -375,7 +375,7 @@ saveDocNgramsWith :: (FlowCmdM env err m) ...@@ -375,7 +375,7 @@ saveDocNgramsWith :: (FlowCmdM env err m)
-> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount))) -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount)))
-> m () -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do saveDocNgramsWith lId mapNgramsDocs' = do
printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs' --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs'
let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs' let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs'
terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount
...@@ -524,8 +524,6 @@ instance ExtractNgramsT HyperdataDocument ...@@ -524,8 +524,6 @@ instance ExtractNgramsT HyperdataDocument
termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt)) termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt))
<$> concat <$> concat
<$> liftBase (extractTerms lang' $ hasText doc) <$> liftBase (extractTerms lang' $ hasText doc)
printDebug "[extractNgramsT HyperdataDocument] termsWithCounts'" termsWithCounts'
printDebug "[extractNgramsT HyperdataDocument] termsWithLargerCounts" $ filter (\(_, cnt) -> cnt > 1) termsWithCounts'
pure $ HashMap.fromList pure $ HashMap.fromList
$ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ] $ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ]
......
...@@ -14,7 +14,10 @@ module Gargantext.Database.Action.Search where ...@@ -14,7 +14,10 @@ module Gargantext.Database.Action.Search where
import Control.Arrow (returnA) import Control.Arrow (returnA)
import Control.Lens ((^.)) import Control.Lens ((^.))
import qualified Data.List as List
import qualified Data.Map as Map
import Data.Maybe import Data.Maybe
import qualified Data.Set as Set
import Data.Text (Text, unpack, intercalate) import Data.Text (Text, unpack, intercalate)
import Data.Time (UTCTime) import Data.Time (UTCTime)
import Gargantext.Core import Gargantext.Core
...@@ -26,9 +29,11 @@ import Gargantext.Database.Query.Filter ...@@ -26,9 +29,11 @@ import Gargantext.Database.Query.Filter
import Gargantext.Database.Query.Join (leftJoin5) import Gargantext.Database.Query.Join (leftJoin5)
import Gargantext.Database.Query.Table.Node import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Context import Gargantext.Database.Query.Table.Context
import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable)
import Gargantext.Database.Query.Table.NodeContext import Gargantext.Database.Query.Table.NodeContext
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Query.Table.NodeContext_NodeContext import Gargantext.Database.Query.Table.NodeContext_NodeContext
import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..))
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Schema.Node import Gargantext.Database.Schema.Node
import Gargantext.Database.Schema.Context import Gargantext.Database.Schema.Context
import Gargantext.Prelude import Gargantext.Prelude
...@@ -44,7 +49,7 @@ searchDocInDatabase :: HasDBid NodeType ...@@ -44,7 +49,7 @@ searchDocInDatabase :: HasDBid NodeType
-> Cmd err [(NodeId, HyperdataDocument)] -> Cmd err [(NodeId, HyperdataDocument)]
searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t) searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
where where
-- | Global search query where ParentId is Master Node Corpus Id -- | Global search query where ParentId is Master Node Corpus Id
queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb) queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb)
queryDocInDatabase _p q = proc () -> do queryDocInDatabase _p q = proc () -> do
row <- queryNodeSearchTable -< () row <- queryNodeSearchTable -< ()
...@@ -54,6 +59,8 @@ searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t) ...@@ -54,6 +59,8 @@ searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Search ngrams in documents, ranking them by TF-IDF. We narrow our
-- search only to map/candidate terms.
searchInCorpusWithNgrams :: HasDBid NodeType searchInCorpusWithNgrams :: HasDBid NodeType
=> CorpusId => CorpusId
-> ListId -> ListId
...@@ -64,7 +71,59 @@ searchInCorpusWithNgrams :: HasDBid NodeType ...@@ -64,7 +71,59 @@ searchInCorpusWithNgrams :: HasDBid NodeType
-> Maybe Limit -> Maybe Limit
-> Maybe OrderBy -> Maybe OrderBy
-> Cmd err [FacetDoc] -> Cmd err [FacetDoc]
searchInCorpusWithNgrams cId lId t ngt q o l order = undefined searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined
-- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this
-- case only the "TF" part makes sense and so we only compute the
-- ratio of "number of times our terms appear in given document" and
-- "number of all terms in document" and return a sorted list of
-- document ids
tfidfAll :: CorpusId -> [Int] -> Cmd err [Int]
tfidfAll cId ngramIds = do
let ngramIdsSet = Set.fromList ngramIds
docsWithNgrams <- runOpaQuery (queryCorpusWithNgrams cId ngramIds) :: Cmd err [(Int, Int, Int)]
-- NOTE The query returned docs with ANY ngramIds. We need to further
-- restrict to ALL ngramIds.
let docsNgramsM =
Map.fromListWith (Set.union)
[ (ctxId, Set.singleton ngrams_id)
| (ctxId, ngrams_id, _) <- docsWithNgrams]
let docsWithAllNgramsS = Set.fromList $ List.map fst $
List.filter (\(_, docNgrams) ->
ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM
let docsWithAllNgrams =
List.filter (\(ctxId, _, _) ->
Set.member ctxId docsWithAllNgramsS) docsWithNgrams
printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams
let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count)
| (ctxId, _, doc_count) <- docsWithAllNgrams]
printDebug "[tfidfAll] docsWithCounts" docsWithCounts
let totals = [ ( ctxId
, ngrams_id
, fromIntegral doc_count :: Double
, fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double)
| (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams]
let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s)
| (ctxId, _, doc_count, s) <- totals]
pure $ List.map fst $ List.reverse tfidf_sorted
-- | Query for searching the 'context_node_ngrams' table so that we
-- find docs with ANY given 'ngramIds'.
queryCorpusWithNgrams :: CorpusId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4)
queryCorpusWithNgrams cId ngramIds = proc () -> do
row <- queryContextNodeNgramsTable -< ()
restrict -< (_cnng_node_id row) .== (pgNodeId cId)
restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row)
returnA -< ( _cnng_context_id row
, _cnng_ngrams_id row
, _cnng_doc_count row)
--returnA -< row
-- returnA -< ( _cnng_context_id row
-- , _cnng_node_id row
-- , _cnng_ngrams_id row
-- , _cnng_ngramsType row
-- , _cnng_weight row
-- , _cnng_doc_count row)
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -225,4 +284,3 @@ queryContactViaDoc = ...@@ -225,4 +284,3 @@ queryContactViaDoc =
) )
) -> Column SqlBool ) -> Column SqlBool
cond45 (doc, (corpus, (_,(_,_)))) = doc^.cs_id .== corpus^.nc_context_id cond45 (doc, (corpus, (_,(_,_)))) = doc^.cs_id .== corpus^.nc_context_id
...@@ -103,7 +103,7 @@ data Facet id created title hyperdata category ngramCount score = ...@@ -103,7 +103,7 @@ data Facet id created title hyperdata category ngramCount score =
, facetDoc_score :: score , facetDoc_score :: score
} deriving (Show, Generic) } deriving (Show, Generic)
{- | TODO after demo {- | TODO after demo
data Facet id date hyperdata score = data Facet id date hyperdata score =
FacetDoc { facetDoc_id :: id FacetDoc { facetDoc_id :: id
, facetDoc_date :: date , facetDoc_date :: date
, facetDoc_hyperdata :: hyperdata , facetDoc_hyperdata :: hyperdata
...@@ -163,7 +163,7 @@ type FacetPairedReadNull = FacetPaired (Column (Nullable SqlInt4) ) ...@@ -163,7 +163,7 @@ type FacetPairedReadNull = FacetPaired (Column (Nullable SqlInt4) )
(Column (Nullable SqlInt4) ) (Column (Nullable SqlInt4) )
type FacetPairedReadNullAgg = FacetPaired (Aggregator (Column (Nullable SqlInt4) ) type FacetPairedReadNullAgg = FacetPaired (Aggregator (Column (Nullable SqlInt4) )
(Column (Nullable SqlInt4) ) (Column (Nullable SqlInt4) )
) )
(Aggregator (Column (Nullable SqlTimestamptz)) (Aggregator (Column (Nullable SqlTimestamptz))
(Column (Nullable SqlTimestamptz)) (Column (Nullable SqlTimestamptz))
......
...@@ -43,7 +43,7 @@ import Opaleye.Internal.Join (NullMaker(..)) ...@@ -43,7 +43,7 @@ import Opaleye.Internal.Join (NullMaker(..))
import qualified Opaleye.Internal.Unpackspec() import qualified Opaleye.Internal.Unpackspec()
keepWhen :: (a -> Field SqlBool) -> SelectArr a a keepWhen :: (a -> Field SqlBool) -> SelectArr a a
keepWhen p = proc a -> do keepWhen p = proc a -> do
restrict -< p a restrict -< p a
returnA -< a returnA -< a
...@@ -61,7 +61,7 @@ leftJoin2 = leftJoin ...@@ -61,7 +61,7 @@ leftJoin2 = leftJoin
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | LeftJoin3 in two ways to write it -- | LeftJoin3 in two ways to write it
leftJoin3 :: Select columnsA -> Select columnsB -> Select columnsC leftJoin3 :: Select columnsA -> Select columnsB -> Select columnsC
-> ((columnsA, columnsB, columnsC) -> Column SqlBool) -> ((columnsA, columnsB, columnsC) -> Column SqlBool)
-> Select (columnsA, columnsB, columnsC) -> Select (columnsA, columnsB, columnsC)
leftJoin3 q1 q2 q3 cond = ((,,) <$> q1 <*> q2 <*> q3) >>> keepWhen cond leftJoin3 q1 q2 q3 cond = ((,,) <$> q1 <*> q2 <*> q3) >>> keepWhen cond
...@@ -82,7 +82,7 @@ leftJoin4' :: Select columnsA ...@@ -82,7 +82,7 @@ leftJoin4' :: Select columnsA
-> Select columnsB -> Select columnsB
-> Select columnsC -> Select columnsC
-> Select columnsD -> Select columnsD
-> ((columnsA, columnsB, columnsC, columnsD) -> Column SqlBool) -> ((columnsA, columnsB, columnsC, columnsD) -> Column SqlBool)
-> Select (columnsA, columnsB, columnsC, columnsD) -> Select (columnsA, columnsB, columnsC, columnsD)
leftJoin4' q1 q2 q3 q4 cond = ((,,,) <$> q1 <*> q2 <*> q3 <*> q4) >>> keepWhen cond leftJoin4' q1 q2 q3 q4 cond = ((,,,) <$> q1 <*> q2 <*> q3 <*> q4) >>> keepWhen cond
...@@ -375,4 +375,3 @@ leftJoin9 q1 q2 q3 q4 q5 q6 q7 q8 q9 ...@@ -375,4 +375,3 @@ leftJoin9 q1 q2 q3 q4 q5 q6 q7 q8 q9
) cond67 ) cond67
) cond78 ) cond78
) cond89 ) cond89
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment