NgramsByContext.hs 17.3 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436
{-|
Module      : Gargantext.Database.Metrics.NgramsByContext
Description : Ngrams by Node user and master
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Ngrams by node enable contextual metrics.

-}

{-# LANGUAGE QuasiQuotes       #-}

module Gargantext.Database.Action.Metrics.NgramsByContext
  where

-- import Debug.Trace (trace)
--import Data.Map.Strict.Patch (PatchMap, Replace, diff)
-- import Control.Monad (void)
import Data.HashMap.Strict (HashMap)
import Data.Map.Strict (Map)
import Data.Set (Set)
import Data.Text (Text)
import Data.Tuple.Extra (first, second, swap)
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core
import Gargantext.Data.HashMap.Strict.Utils as HM
import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
import Gargantext.Database.Prelude (Cmd, runPGSQuery)  -- , execPGSQuery)
import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict              as HM
import qualified Data.List                        as List
import qualified Data.Map.Strict                  as Map
import qualified Data.Set                         as Set
import qualified Database.PostgreSQL.Simple       as DPS
import qualified Database.PostgreSQL.Simple.Types as DPST

-- | fst is size of Supra Corpus
--   snd is Texts and size of Occurrences (different docs)

countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
                       -> HashMap NgramsTerm (Set ContextId)
                       -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
countContextsByNgramsWith f m = (total, m')
  where
    total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
    m'    = HM.map ( swap . second (fromIntegral . Set.size))
          $ groupContextsByNgramsWith f m


    groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
                              -> HashMap NgramsTerm (Set NodeId)
                              -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
    groupContextsByNgramsWith f' m'' =
      HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
                           $ HM.toList m''

------------------------------------------------------------------------
getContextsByNgramsUser ::  HasDBid NodeType
                        => CorpusId
                        -> NgramsType
                        -> Cmd err (HashMap NgramsTerm (Set ContextId))
getContextsByNgramsUser cId nt =
  HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
                    <$> selectNgramsByContextUser cId nt
    where

      selectNgramsByContextUser :: HasDBid NodeType
                                => CorpusId
                                -> NgramsType
                                -> Cmd err [(NodeId, Text)]
      selectNgramsByContextUser cId' nt' =
        runPGSQuery queryNgramsByContextUser
                    ( cId'
                    , toDBid NodeDocument
                    , ngramsTypeId nt'
           --         , 100 :: Int -- limit
           --         , 0   :: Int -- offset
                    )

      queryNgramsByContextUser :: DPS.Query
      queryNgramsByContextUser = [sql|
        SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
          JOIN ngrams         ng ON cng.ngrams_id = ng.id
          JOIN nodes_contexts nc ON nc.context_id   = cng.context_id
          JOIN contexts        c ON nc.context_id   = c.id
          WHERE nc.node_id      = ? -- CorpusId
            AND c.typename      = ? -- toDBid
            AND cng.ngrams_type = ? -- NgramsTypeId
            AND nc.category     > 0 -- is not in Trash
            GROUP BY cng.context_id, ng.terms
        |]


------------------------------------------------------------------------
getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
                       => CorpusId
                       -> Int
                       -> NgramsType
                       -> [NgramsTerm]
                       -> Cmd err (HashMap NgramsTerm Int)
getOccByNgramsOnlyFast_withSample cId int nt ngs =
  HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs


getOccByNgramsOnlyFast :: CorpusId
                       -> ListId
                       -> NgramsType
                       -> Cmd err (HashMap NgramsTerm [ContextId])
getOccByNgramsOnlyFast cId lId nt = do
    --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
    HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
    where

      run :: CorpusId
           -> ListId
           -> NgramsType
           -> Cmd err [(Text, DPST.PGArray Int)]
      run cId' lId' nt' = runPGSQuery query
                ( cId'
                , lId'
                , ngramsTypeId nt'
                )

      query :: DPS.Query
      query = [sql|
                WITH cnnv AS
                ( SELECT DISTINCT context_node_ngrams.context_id,
                    context_node_ngrams.ngrams_id,
                    nodes_contexts.node_id
                  FROM nodes_contexts
                  JOIN context_node_ngrams ON context_node_ngrams.context_id = nodes_contexts.context_id
                ),
                node_context_ids AS
                  (SELECT context_id, ngrams_id, terms
                  FROM cnnv
                  JOIN ngrams ON cnnv.ngrams_id = ngrams.id
                  WHERE node_id = ?
                  ),
                ncids_agg AS
                  (SELECT ngrams_id, terms, array_agg(DISTINCT context_id) AS agg
                    FROM node_context_ids
                    GROUP BY (ngrams_id, terms)),
                ns AS
                  (SELECT ngrams_id, terms
                    FROM node_stories
                    JOIN ngrams ON ngrams_id = ngrams.id
                    WHERE node_id = ? AND ngrams_type_id = ?
                  )

                SELECT ns.terms, CASE WHEN agg IS NULL THEN '{}' ELSE agg END
                FROM ns
                LEFT JOIN ncids_agg ON ns.ngrams_id = ncids_agg.ngrams_id
        |]
      -- query = [sql|
      --           WITH node_context_ids AS
      --             (select context_id, ngrams_id
      --             FROM context_node_ngrams_view
      --             WHERE node_id = ?
      --             ), ns AS
      --           (select ngrams_id FROM node_stories
      --             WHERE node_id = ? AND ngrams_type_id = ?
      --           )

      --           SELECT ng.terms,
      --           ARRAY ( SELECT DISTINCT context_id
      --                     FROM node_context_ids
      --                     WHERE ns.ngrams_id = node_context_ids.ngrams_id
      --                 )
      --           AS context_ids
      --           FROM ngrams ng
      --           JOIN ns ON ng.id = ns.ngrams_id
      --   |]


selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
                                      => CorpusId
                                      -> Int
                                      -> NgramsType
                                      -> [NgramsTerm]
                                      -> Cmd err [(NgramsTerm, Int)]
selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
  fmap (first NgramsTerm) <$>
  runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
                ( int
                , toDBid NodeDocument
                , cId
                , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
                , cId
                , ngramsTypeId nt
                )
    where
      fields = [QualifiedIdentifier Nothing "text"]

queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
  WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
                          JOIN nodes_contexts nn ON n.id = nn.context_id
                            WHERE n.typename  = ?
                            AND nn.node_id = ?),
       input_rows(terms) AS (?)
  SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
    JOIN ngrams ng      ON cng.ngrams_id = ng.id
    JOIN input_rows  ir ON ir.terms      = ng.terms
    JOIN nodes_contexts nn ON nn.context_id   = cng.context_id
    JOIN nodes_sample n ON nn.context_id   = n.id
    WHERE nn.node_id      = ? -- CorpusId
      AND cng.ngrams_type = ? -- NgramsTypeId
      AND nn.category     > 0
      GROUP BY cng.node_id, ng.terms
  |]

selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
                                      => CorpusId
                                      -> Int
                                      -> NgramsType
                                      -> Cmd err [(NgramsTerm, Int)]
selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
  fmap (first NgramsTerm) <$>
  runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
                ( int
                , toDBid NodeDocument
                , cId
                , cId
                , ngramsTypeId nt
                )

queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
  WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
                          JOIN nodes_contexts nc ON c.id = nc.context_id
                            WHERE c.typename  = ?
                            AND nc.node_id = ?)
  SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
    JOIN ngrams ng      ON cng.ngrams_id = ng.id
    JOIN node_stories ns ON ns.ngrams_id = ng.id
    JOIN nodes_contexts nc ON nc.context_id   = cng.context_id
    JOIN contexts_sample c ON nc.context_id   = c.id
    WHERE nc.node_id      = ? -- CorpusId
      AND cng.ngrams_type = ? -- NgramsTypeId
      AND nc.category     > 0
      GROUP BY ng.id
  |]

------------------------------------------------------------------------
getContextsByNgramsOnlyUser :: HasDBid NodeType
                         => CorpusId
                         -> [ListId]
                         -> NgramsType
                         -> [NgramsTerm]
                         -> Cmd err (HashMap NgramsTerm (Set NodeId))
getContextsByNgramsOnlyUser cId ls nt ngs =
     HM.unionsWith        (<>)
   . map (HM.fromListWith (<>)
   . map (second Set.singleton))
  <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
           (splitEvery 1000 ngs)

getNgramsByContextOnlyUser :: HasDBid NodeType
                        => NodeId
                        -> [ListId]
                        -> NgramsType
                        -> [NgramsTerm]
                        -> Cmd err (Map NodeId (Set NgramsTerm))
getNgramsByContextOnlyUser cId ls nt ngs =
     Map.unionsWith         (<>)
   . map ( Map.fromListWith (<>)
         . map (second Set.singleton)
         )
   . map (map swap)
  <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
           (splitEvery 1000 ngs)

------------------------------------------------------------------------
selectNgramsOnlyByContextUser :: HasDBid NodeType
                           => CorpusId
                           -> [ListId]
                           -> NgramsType
                           -> [NgramsTerm]
                           -> Cmd err [(NgramsTerm, ContextId)]
selectNgramsOnlyByContextUser cId ls nt tms =
  fmap (first NgramsTerm) <$>
  runPGSQuery queryNgramsOnlyByContextUser
                ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
                , Values [QualifiedIdentifier Nothing "int4"]
                         (DPS.Only <$> (map (\(NodeId n) -> n) ls))
                , cId
                , toDBid NodeDocument
                , ngramsTypeId nt
                )
    where
      fields = [QualifiedIdentifier Nothing "text"]

queryNgramsOnlyByContextUser :: DPS.Query
queryNgramsOnlyByContextUser = [sql|
  WITH input_rows(terms) AS (?),
       input_list(id)    AS (?)
  SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
    JOIN ngrams         ng ON cng.ngrams_id = ng.id
    JOIN input_rows     ir ON ir.terms      = ng.terms
    JOIN input_list     il ON il.id         = cng.node_id
    JOIN nodes_contexts nc ON nc.context_id   = cng.context_id
    JOIN contexts        c ON nc.context_id   = c.id
    WHERE nc.node_id      = ? -- CorpusId
      AND c.typename      = ? -- toDBid (maybe not useful with context table)
      AND cng.ngrams_type = ? -- NgramsTypeId
      AND nc.category     > 0
      GROUP BY ng.terms, cng.context_id
  |]

getNgramsByDocOnlyUser :: DocId
                       -> [ListId]
                       -> NgramsType
                       -> [NgramsTerm]
                       -> Cmd err (HashMap NgramsTerm (Set NodeId))
getNgramsByDocOnlyUser cId ls nt ngs =
  HM.unionsWith (<>)
  . map (HM.fromListWith (<>) . map (second Set.singleton))
  <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)


selectNgramsOnlyByDocUser :: DocId
                          -> [ListId]
                          -> NgramsType
                          -> [NgramsTerm]
                          -> Cmd err [(NgramsTerm, NodeId)]
selectNgramsOnlyByDocUser dId ls nt tms =
  fmap (first NgramsTerm) <$>
  runPGSQuery queryNgramsOnlyByDocUser
                ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
                , Values [QualifiedIdentifier Nothing "int4"]
                         (DPS.Only <$> (map (\(NodeId n) -> n) ls))
                , dId
                , ngramsTypeId nt
                )
    where
      fields = [QualifiedIdentifier Nothing "text"]


queryNgramsOnlyByDocUser :: DPS.Query
queryNgramsOnlyByDocUser = [sql|
  WITH input_rows(terms) AS (?),
       input_list(id)    AS (?)
  SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
    JOIN ngrams ng      ON cng.ngrams_id = ng.id
    JOIN input_rows  ir ON ir.terms      = ng.terms
    JOIN input_list  il ON il.id         = cng.context_id
    WHERE cng.node_id     = ? -- DocId
      AND cng.ngrams_type = ? -- NgramsTypeId
      GROUP BY ng.terms, cng.node_id
  |]

------------------------------------------------------------------------
-- | TODO filter by language, database, any social field
getContextsByNgramsMaster :: HasDBid NodeType
                          =>  UserCorpusId
                          -> MasterCorpusId
                          -> Cmd err (HashMap Text (Set NodeId))
getContextsByNgramsMaster ucId mcId = unionsWith (<>)
                                 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
                                 -- . takeWhile (not . List.null)
                                 -- . takeWhile (\l -> List.length l > 3)
                                <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]

selectNgramsByContextMaster :: HasDBid NodeType
                         => Int
                         -> UserCorpusId
                         -> MasterCorpusId
                         -> Int
                         -> Cmd err [(NodeId, Text)]
selectNgramsByContextMaster n ucId mcId p = runPGSQuery
                               queryNgramsByContextMaster'
                                 ( ucId
                                 , ngramsTypeId NgramsTerms
                                 , toDBid   NodeDocument
                                 , p
                                 , toDBid   NodeDocument
                                 , p
                                 , n
                                 , mcId
                                 , toDBid   NodeDocument
                                 , ngramsTypeId NgramsTerms
                                 )

-- | TODO fix context_node_ngrams relation
queryNgramsByContextMaster' :: DPS.Query
queryNgramsByContextMaster' = [sql|
  WITH contextsByNgramsUser AS (

  SELECT n.id, ng.terms FROM contexts n
    JOIN nodes_contexts  nn  ON n.id = nn.context_id
    JOIN context_node_ngrams cng ON cng.context_id   = n.id
    JOIN ngrams       ng  ON cng.ngrams_id = ng.id
    WHERE nn.node_id      = ?   -- UserCorpusId
      -- AND n.typename   = ?  -- toDBid
      AND cng.ngrams_type = ? -- NgramsTypeId
      AND nn.category > 0
      AND node_pos(n.id,?) >= ?
      AND node_pos(n.id,?) <  ?
    GROUP BY n.id, ng.terms

    ),

  contextsByNgramsMaster AS (

  SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
    JOIN context_node_ngrams cng  ON n.id  = cng.context_id
    JOIN ngrams       ng   ON ng.id = cng.ngrams_id

    WHERE n.parent_id  = ?     -- Master Corpus toDBid
      AND n.typename   = ?     -- toDBid
      AND cng.ngrams_type = ? -- NgramsTypeId
    GROUP BY n.id, ng.terms
    )

  SELECT m.id, m.terms FROM nodesByNgramsMaster m
    RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
  |]

-- | Refreshes the \"context_node_ngrams_view\" materialized view.
-- This function will be run :
--  - periodically
--  - at reindex stage
--  - at the end of each text flow

-- refreshNgramsMaterialized :: Cmd err ()
-- refreshNgramsMaterialized = void $ execPGSQuery refreshNgramsMaterializedQuery ()
--   where
--     refreshNgramsMaterializedQuery :: DPS.Query
--     refreshNgramsMaterializedQuery =
--       [sql| REFRESH MATERIALIZED VIEW CONCURRENTLY context_node_ngrams_view; |]