diff --git a/src/Gargantext/API/Search.hs b/src/Gargantext/API/Search.hs index f6222d8c7f27f860714010db7f23e266750b0373..92cd8f4ec18b4df91cad8459e848b3df5a02bf22 100644 --- a/src/Gargantext/API/Search.hs +++ b/src/Gargantext/API/Search.hs @@ -70,7 +70,7 @@ api nId (SearchQuery q SearchContact) o l order = do <$> map (toRow aId) <$> searchInCorpusWithContacts nId aId (concat q) o l order -api nId (SearchQuery q SearchDocWithNgrams) o l order = undefined +api _nId (SearchQuery _q SearchDocWithNgrams) _o _l _order = undefined ----------------------------------------------------------------------- ----------------------------------------------------------------------- @@ -135,7 +135,7 @@ instance FromJSON SearchResultTypes where parseJSON = genericParseJSON (defaultOptions { sumEncoding = defaultTaggedObject }) instance ToJSON SearchResultTypes where toJSON = genericToJSON (defaultOptions { sumEncoding = defaultTaggedObject }) - + instance Arbitrary SearchResultTypes where arbitrary = do srd <- SearchResultDoc <$> arbitrary @@ -166,7 +166,7 @@ data Row = deriving (Generic) instance FromJSON Row where - parseJSON = genericParseJSON + parseJSON = genericParseJSON ( defaultOptions { sumEncoding = defaultTaggedObject } ) instance ToJSON Row where diff --git a/src/Gargantext/Core/Text/Terms.hs b/src/Gargantext/Core/Text/Terms.hs index 72d0c3c906b764be52a97b7fcb4361fae8ae1d5d..78f414ffb7f4cb7722438c59da51fe2abb3f98d5 100644 --- a/src/Gargantext/Core/Text/Terms.hs +++ b/src/Gargantext/Core/Text/Terms.hs @@ -171,17 +171,10 @@ isSimpleNgrams _ = False -- 'MonoMulti' : mono and multi -- TODO : multi terms should exclude mono (intersection is not empty yet) terms :: TermType Lang -> Text -> IO [TermsWithCount] -terms tt txt = do - printDebug "[terms] tt" tt - printDebug "[terms] txt" txt - out <- termsNoLog tt txt - printDebug "[terms] out" out - pure out -termsNoLog :: TermType Lang -> Text -> IO [TermsWithCount] -termsNoLog (Mono lang) txt = pure $ monoTerms lang txt -termsNoLog (Multi lang) txt = multiterms lang txt -termsNoLog (MonoMulti lang) txt = terms (Multi lang) txt -termsNoLog (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt +terms (Mono lang) txt = pure $ monoTerms lang txt +terms (Multi lang) txt = multiterms lang txt +terms (MonoMulti lang) txt = terms (Multi lang) txt +terms (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt where m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt diff --git a/src/Gargantext/Database/Action/Flow.hs b/src/Gargantext/Database/Action/Flow.hs index 82b2a716a3ef6638a580ecdca89ce3ef6a15c315..5613984ed90a51ddc29a665d7b845b565fa9d268 100644 --- a/src/Gargantext/Database/Action/Flow.hs +++ b/src/Gargantext/Database/Action/Flow.hs @@ -375,7 +375,7 @@ saveDocNgramsWith :: (FlowCmdM env err m) -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId (Int, TermsCount))) -> m () saveDocNgramsWith lId mapNgramsDocs' = do - printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs' + --printDebug "[saveDocNgramsWith] mapNgramsDocs'" mapNgramsDocs' let mapNgramsDocsNoCount = over (traverse . traverse . traverse) fst mapNgramsDocs' terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocsNoCount @@ -524,8 +524,6 @@ instance ExtractNgramsT HyperdataDocument termsWithCounts' <- map (\(t, cnt) -> (enrichedTerms (lang' ^. tt_lang) CoreNLP NP t, cnt)) <$> concat <$> liftBase (extractTerms lang' $ hasText doc) - printDebug "[extractNgramsT HyperdataDocument] termsWithCounts'" termsWithCounts' - printDebug "[extractNgramsT HyperdataDocument] termsWithLargerCounts" $ filter (\(_, cnt) -> cnt > 1) termsWithCounts' pure $ HashMap.fromList $ [(SimpleNgrams source, (Map.singleton Sources 1, 1)) ] diff --git a/src/Gargantext/Database/Action/Search.hs b/src/Gargantext/Database/Action/Search.hs index 7181aa1ffc87bfc4c10c048c79bf99d033a77bb4..656ab6c92dea775568b1b5c97f0e699b6e5f040c 100644 --- a/src/Gargantext/Database/Action/Search.hs +++ b/src/Gargantext/Database/Action/Search.hs @@ -14,7 +14,10 @@ module Gargantext.Database.Action.Search where import Control.Arrow (returnA) import Control.Lens ((^.)) +import qualified Data.List as List +import qualified Data.Map as Map import Data.Maybe +import qualified Data.Set as Set import Data.Text (Text, unpack, intercalate) import Data.Time (UTCTime) import Gargantext.Core @@ -26,9 +29,11 @@ import Gargantext.Database.Query.Filter import Gargantext.Database.Query.Join (leftJoin5) import Gargantext.Database.Query.Table.Node import Gargantext.Database.Query.Table.Context +import Gargantext.Database.Query.Table.ContextNodeNgrams (queryContextNodeNgramsTable) import Gargantext.Database.Query.Table.NodeContext -import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Database.Query.Table.NodeContext_NodeContext +import Gargantext.Database.Schema.ContextNodeNgrams (ContextNodeNgramsPoly(..)) +import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Database.Schema.Node import Gargantext.Database.Schema.Context import Gargantext.Prelude @@ -44,7 +49,7 @@ searchDocInDatabase :: HasDBid NodeType -> Cmd err [(NodeId, HyperdataDocument)] searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t) where - -- | Global search query where ParentId is Master Node Corpus Id + -- | Global search query where ParentId is Master Node Corpus Id queryDocInDatabase :: ParentId -> Text -> O.Select (Column SqlInt4, Column SqlJsonb) queryDocInDatabase _p q = proc () -> do row <- queryNodeSearchTable -< () @@ -54,6 +59,8 @@ searchDocInDatabase p t = runOpaQuery (queryDocInDatabase p t) ------------------------------------------------------------------------ +-- | Search ngrams in documents, ranking them by TF-IDF. We narrow our +-- search only to map/candidate terms. searchInCorpusWithNgrams :: HasDBid NodeType => CorpusId -> ListId @@ -64,7 +71,59 @@ searchInCorpusWithNgrams :: HasDBid NodeType -> Maybe Limit -> Maybe OrderBy -> Cmd err [FacetDoc] -searchInCorpusWithNgrams cId lId t ngt q o l order = undefined +searchInCorpusWithNgrams _cId _lId _t _ngt _q _o _l _order = undefined + +-- | Compute TF-IDF for all 'ngramIds' in given 'CorpusId'. In this +-- case only the "TF" part makes sense and so we only compute the +-- ratio of "number of times our terms appear in given document" and +-- "number of all terms in document" and return a sorted list of +-- document ids +tfidfAll :: CorpusId -> [Int] -> Cmd err [Int] +tfidfAll cId ngramIds = do + let ngramIdsSet = Set.fromList ngramIds + docsWithNgrams <- runOpaQuery (queryCorpusWithNgrams cId ngramIds) :: Cmd err [(Int, Int, Int)] + -- NOTE The query returned docs with ANY ngramIds. We need to further + -- restrict to ALL ngramIds. + let docsNgramsM = + Map.fromListWith (Set.union) + [ (ctxId, Set.singleton ngrams_id) + | (ctxId, ngrams_id, _) <- docsWithNgrams] + let docsWithAllNgramsS = Set.fromList $ List.map fst $ + List.filter (\(_, docNgrams) -> + ngramIdsSet == Set.intersection ngramIdsSet docNgrams) $ Map.toList docsNgramsM + let docsWithAllNgrams = + List.filter (\(ctxId, _, _) -> + Set.member ctxId docsWithAllNgramsS) docsWithNgrams + printDebug "[tfidfAll] docsWithAllNgrams" docsWithAllNgrams + let docsWithCounts = Map.fromListWith (+) [ (ctxId, doc_count) + | (ctxId, _, doc_count) <- docsWithAllNgrams] + printDebug "[tfidfAll] docsWithCounts" docsWithCounts + let totals = [ ( ctxId + , ngrams_id + , fromIntegral doc_count :: Double + , fromIntegral (fromMaybe 0 $ Map.lookup ctxId docsWithCounts) :: Double) + | (ctxId, ngrams_id, doc_count) <- docsWithAllNgrams] + let tfidf_sorted = List.sortOn snd [(ctxId, doc_count/s) + | (ctxId, _, doc_count, s) <- totals] + pure $ List.map fst $ List.reverse tfidf_sorted + +-- | Query for searching the 'context_node_ngrams' table so that we +-- find docs with ANY given 'ngramIds'. +queryCorpusWithNgrams :: CorpusId -> [Int] -> Select (Column SqlInt4, Column SqlInt4, Column SqlInt4) +queryCorpusWithNgrams cId ngramIds = proc () -> do + row <- queryContextNodeNgramsTable -< () + restrict -< (_cnng_node_id row) .== (pgNodeId cId) + restrict -< in_ (sqlInt4 <$> ngramIds) (_cnng_ngrams_id row) + returnA -< ( _cnng_context_id row + , _cnng_ngrams_id row + , _cnng_doc_count row) + --returnA -< row + -- returnA -< ( _cnng_context_id row + -- , _cnng_node_id row + -- , _cnng_ngrams_id row + -- , _cnng_ngramsType row + -- , _cnng_weight row + -- , _cnng_doc_count row) ------------------------------------------------------------------------ @@ -225,4 +284,3 @@ queryContactViaDoc = ) ) -> Column SqlBool cond45 (doc, (corpus, (_,(_,_)))) = doc^.cs_id .== corpus^.nc_context_id - diff --git a/src/Gargantext/Database/Query/Facet.hs b/src/Gargantext/Database/Query/Facet.hs index 49ee2d4da8eba741af6a557a6c964f6bbefe3266..11f05b380f56adc42b975514bcf83da4d0b01f48 100644 --- a/src/Gargantext/Database/Query/Facet.hs +++ b/src/Gargantext/Database/Query/Facet.hs @@ -103,7 +103,7 @@ data Facet id created title hyperdata category ngramCount score = , facetDoc_score :: score } deriving (Show, Generic) {- | TODO after demo -data Facet id date hyperdata score = +data Facet id date hyperdata score = FacetDoc { facetDoc_id :: id , facetDoc_date :: date , facetDoc_hyperdata :: hyperdata @@ -163,7 +163,7 @@ type FacetPairedReadNull = FacetPaired (Column (Nullable SqlInt4) ) (Column (Nullable SqlInt4) ) type FacetPairedReadNullAgg = FacetPaired (Aggregator (Column (Nullable SqlInt4) ) - (Column (Nullable SqlInt4) ) + (Column (Nullable SqlInt4) ) ) (Aggregator (Column (Nullable SqlTimestamptz)) (Column (Nullable SqlTimestamptz)) diff --git a/src/Gargantext/Database/Query/Join.hs b/src/Gargantext/Database/Query/Join.hs index 6a609ffe47a713ba02ee0591018b7a75019356e1..3486a9a6dc856688f5c3d562cfc7523db73ac25e 100644 --- a/src/Gargantext/Database/Query/Join.hs +++ b/src/Gargantext/Database/Query/Join.hs @@ -43,7 +43,7 @@ import Opaleye.Internal.Join (NullMaker(..)) import qualified Opaleye.Internal.Unpackspec() -keepWhen :: (a -> Field SqlBool) -> SelectArr a a +keepWhen :: (a -> Field SqlBool) -> SelectArr a a keepWhen p = proc a -> do restrict -< p a returnA -< a @@ -61,7 +61,7 @@ leftJoin2 = leftJoin ------------------------------------------------------------------------ -- | LeftJoin3 in two ways to write it leftJoin3 :: Select columnsA -> Select columnsB -> Select columnsC - -> ((columnsA, columnsB, columnsC) -> Column SqlBool) + -> ((columnsA, columnsB, columnsC) -> Column SqlBool) -> Select (columnsA, columnsB, columnsC) leftJoin3 q1 q2 q3 cond = ((,,) <$> q1 <*> q2 <*> q3) >>> keepWhen cond @@ -82,7 +82,7 @@ leftJoin4' :: Select columnsA -> Select columnsB -> Select columnsC -> Select columnsD - -> ((columnsA, columnsB, columnsC, columnsD) -> Column SqlBool) + -> ((columnsA, columnsB, columnsC, columnsD) -> Column SqlBool) -> Select (columnsA, columnsB, columnsC, columnsD) leftJoin4' q1 q2 q3 q4 cond = ((,,,) <$> q1 <*> q2 <*> q3 <*> q4) >>> keepWhen cond @@ -375,4 +375,3 @@ leftJoin9 q1 q2 q3 q4 q5 q6 q7 q8 q9 ) cond67 ) cond78 ) cond89 -