1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
{-|
Module : Gargantext.Database.Metrics.NgramsByContext
Description : Ngrams by Node user and master
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Ngrams by node enable contextual metrics.
-}
{-# LANGUAGE QuasiQuotes #-}
module Gargantext.Database.Action.Metrics.NgramsByContext
where
-- import Debug.Trace (trace)
--import Data.Map.Strict.Patch (PatchMap, Replace, diff)
-- import Control.Monad (void)
import Data.HashMap.Strict (HashMap)
import Data.Map.Strict (Map)
import Data.Set (Set)
import Data.Text (Text)
import Data.Tuple.Extra (first, second, swap)
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core
import Gargantext.Data.HashMap.Strict.Utils as HM
import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
import Gargantext.Database.Prelude (Cmd, runPGSQuery) -- , execPGSQuery)
import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HM
import qualified Data.List as List
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
import qualified Database.PostgreSQL.Simple as DPS
import qualified Database.PostgreSQL.Simple.Types as DPST
-- | fst is size of Supra Corpus
-- snd is Texts and size of Occurrences (different docs)
countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
-> HashMap NgramsTerm (Set ContextId)
-> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
countContextsByNgramsWith f m = (total, m')
where
total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
m' = HM.map ( swap . second (fromIntegral . Set.size))
$ groupContextsByNgramsWith f m
groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
-> HashMap NgramsTerm (Set NodeId)
-> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
groupContextsByNgramsWith f' m'' =
HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
$ HM.toList m''
------------------------------------------------------------------------
getContextsByNgramsUser :: HasDBid NodeType
=> CorpusId
-> NgramsType
-> Cmd err (HashMap NgramsTerm (Set ContextId))
getContextsByNgramsUser cId nt =
HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
<$> selectNgramsByContextUser cId nt
where
selectNgramsByContextUser :: HasDBid NodeType
=> CorpusId
-> NgramsType
-> Cmd err [(NodeId, Text)]
selectNgramsByContextUser cId' nt' =
runPGSQuery queryNgramsByContextUser
( cId'
, toDBid NodeDocument
, ngramsTypeId nt'
-- , 100 :: Int -- limit
-- , 0 :: Int -- offset
)
queryNgramsByContextUser :: DPS.Query
queryNgramsByContextUser = [sql|
SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
JOIN ngrams ng ON cng.ngrams_id = ng.id
JOIN nodes_contexts nc ON nc.context_id = cng.context_id
JOIN contexts c ON nc.context_id = c.id
WHERE nc.node_id = ? -- CorpusId
AND c.typename = ? -- toDBid
AND cng.ngrams_type = ? -- NgramsTypeId
AND nc.category > 0 -- is not in Trash
GROUP BY cng.context_id, ng.terms
|]
------------------------------------------------------------------------
getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
=> CorpusId
-> Int
-> NgramsType
-> [NgramsTerm]
-> Cmd err (HashMap NgramsTerm Int)
getOccByNgramsOnlyFast_withSample cId int nt ngs =
HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
getOccByNgramsOnlyFast :: CorpusId
-> ListId
-> NgramsType
-> Cmd err (HashMap NgramsTerm [ContextId])
getOccByNgramsOnlyFast cId lId nt = do
--HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
where
run :: CorpusId
-> ListId
-> NgramsType
-> Cmd err [(Text, DPST.PGArray Int)]
run cId' lId' nt' = runPGSQuery query
( cId'
, lId'
, ngramsTypeId nt'
)
query :: DPS.Query
query = [sql|
WITH cnnv AS
( SELECT DISTINCT context_node_ngrams.context_id,
context_node_ngrams.ngrams_id,
nodes_contexts.node_id
FROM nodes_contexts
JOIN context_node_ngrams ON context_node_ngrams.context_id = nodes_contexts.context_id
),
node_context_ids AS
(SELECT context_id, ngrams_id, terms
FROM cnnv
JOIN ngrams ON cnnv.ngrams_id = ngrams.id
WHERE node_id = ?
),
ncids_agg AS
(SELECT ngrams_id, terms, array_agg(DISTINCT context_id) AS agg
FROM node_context_ids
GROUP BY (ngrams_id, terms)),
ns AS
(SELECT ngrams_id, terms
FROM node_stories
JOIN ngrams ON ngrams_id = ngrams.id
WHERE node_id = ? AND ngrams_type_id = ?
)
SELECT ns.terms, CASE WHEN agg IS NULL THEN '{}' ELSE agg END
FROM ns
LEFT JOIN ncids_agg ON ns.ngrams_id = ncids_agg.ngrams_id
|]
-- query = [sql|
-- WITH node_context_ids AS
-- (select context_id, ngrams_id
-- FROM context_node_ngrams_view
-- WHERE node_id = ?
-- ), ns AS
-- (select ngrams_id FROM node_stories
-- WHERE node_id = ? AND ngrams_type_id = ?
-- )
-- SELECT ng.terms,
-- ARRAY ( SELECT DISTINCT context_id
-- FROM node_context_ids
-- WHERE ns.ngrams_id = node_context_ids.ngrams_id
-- )
-- AS context_ids
-- FROM ngrams ng
-- JOIN ns ON ng.id = ns.ngrams_id
-- |]
selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
=> CorpusId
-> Int
-> NgramsType
-> [NgramsTerm]
-> Cmd err [(NgramsTerm, Int)]
selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
fmap (first NgramsTerm) <$>
runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
( int
, toDBid NodeDocument
, cId
, Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
, cId
, ngramsTypeId nt
)
where
fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
JOIN nodes_contexts nn ON n.id = nn.context_id
WHERE n.typename = ?
AND nn.node_id = ?),
input_rows(terms) AS (?)
SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
JOIN ngrams ng ON cng.ngrams_id = ng.id
JOIN input_rows ir ON ir.terms = ng.terms
JOIN nodes_contexts nn ON nn.context_id = cng.context_id
JOIN nodes_sample n ON nn.context_id = n.id
WHERE nn.node_id = ? -- CorpusId
AND cng.ngrams_type = ? -- NgramsTypeId
AND nn.category > 0
GROUP BY cng.node_id, ng.terms
|]
selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
=> CorpusId
-> Int
-> NgramsType
-> Cmd err [(NgramsTerm, Int)]
selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
fmap (first NgramsTerm) <$>
runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
( int
, toDBid NodeDocument
, cId
, cId
, ngramsTypeId nt
)
queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
JOIN nodes_contexts nc ON c.id = nc.context_id
WHERE c.typename = ?
AND nc.node_id = ?)
SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
JOIN ngrams ng ON cng.ngrams_id = ng.id
JOIN node_stories ns ON ns.ngrams_id = ng.id
JOIN nodes_contexts nc ON nc.context_id = cng.context_id
JOIN contexts_sample c ON nc.context_id = c.id
WHERE nc.node_id = ? -- CorpusId
AND cng.ngrams_type = ? -- NgramsTypeId
AND nc.category > 0
GROUP BY ng.id
|]
------------------------------------------------------------------------
getContextsByNgramsOnlyUser :: HasDBid NodeType
=> CorpusId
-> [ListId]
-> NgramsType
-> [NgramsTerm]
-> Cmd err (HashMap NgramsTerm (Set NodeId))
getContextsByNgramsOnlyUser cId ls nt ngs =
HM.unionsWith (<>)
. map (HM.fromListWith (<>)
. map (second Set.singleton))
<$> mapM (selectNgramsOnlyByContextUser cId ls nt)
(splitEvery 1000 ngs)
getNgramsByContextOnlyUser :: HasDBid NodeType
=> NodeId
-> [ListId]
-> NgramsType
-> [NgramsTerm]
-> Cmd err (Map NodeId (Set NgramsTerm))
getNgramsByContextOnlyUser cId ls nt ngs =
Map.unionsWith (<>)
. map ( Map.fromListWith (<>)
. map (second Set.singleton)
)
. map (map swap)
<$> mapM (selectNgramsOnlyByContextUser cId ls nt)
(splitEvery 1000 ngs)
------------------------------------------------------------------------
selectNgramsOnlyByContextUser :: HasDBid NodeType
=> CorpusId
-> [ListId]
-> NgramsType
-> [NgramsTerm]
-> Cmd err [(NgramsTerm, ContextId)]
selectNgramsOnlyByContextUser cId ls nt tms =
fmap (first NgramsTerm) <$>
runPGSQuery queryNgramsOnlyByContextUser
( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
, Values [QualifiedIdentifier Nothing "int4"]
(DPS.Only <$> (map (\(NodeId n) -> n) ls))
, cId
, toDBid NodeDocument
, ngramsTypeId nt
)
where
fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOnlyByContextUser :: DPS.Query
queryNgramsOnlyByContextUser = [sql|
WITH input_rows(terms) AS (?),
input_list(id) AS (?)
SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
JOIN ngrams ng ON cng.ngrams_id = ng.id
JOIN input_rows ir ON ir.terms = ng.terms
JOIN input_list il ON il.id = cng.node_id
JOIN nodes_contexts nc ON nc.context_id = cng.context_id
JOIN contexts c ON nc.context_id = c.id
WHERE nc.node_id = ? -- CorpusId
AND c.typename = ? -- toDBid (maybe not useful with context table)
AND cng.ngrams_type = ? -- NgramsTypeId
AND nc.category > 0
GROUP BY ng.terms, cng.context_id
|]
getNgramsByDocOnlyUser :: DocId
-> [ListId]
-> NgramsType
-> [NgramsTerm]
-> Cmd err (HashMap NgramsTerm (Set NodeId))
getNgramsByDocOnlyUser cId ls nt ngs =
HM.unionsWith (<>)
. map (HM.fromListWith (<>) . map (second Set.singleton))
<$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
selectNgramsOnlyByDocUser :: DocId
-> [ListId]
-> NgramsType
-> [NgramsTerm]
-> Cmd err [(NgramsTerm, NodeId)]
selectNgramsOnlyByDocUser dId ls nt tms =
fmap (first NgramsTerm) <$>
runPGSQuery queryNgramsOnlyByDocUser
( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
, Values [QualifiedIdentifier Nothing "int4"]
(DPS.Only <$> (map (\(NodeId n) -> n) ls))
, dId
, ngramsTypeId nt
)
where
fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOnlyByDocUser :: DPS.Query
queryNgramsOnlyByDocUser = [sql|
WITH input_rows(terms) AS (?),
input_list(id) AS (?)
SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
JOIN ngrams ng ON cng.ngrams_id = ng.id
JOIN input_rows ir ON ir.terms = ng.terms
JOIN input_list il ON il.id = cng.context_id
WHERE cng.node_id = ? -- DocId
AND cng.ngrams_type = ? -- NgramsTypeId
GROUP BY ng.terms, cng.node_id
|]
------------------------------------------------------------------------
-- | TODO filter by language, database, any social field
getContextsByNgramsMaster :: HasDBid NodeType
=> UserCorpusId
-> MasterCorpusId
-> Cmd err (HashMap Text (Set NodeId))
getContextsByNgramsMaster ucId mcId = unionsWith (<>)
. map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
-- . takeWhile (not . List.null)
-- . takeWhile (\l -> List.length l > 3)
<$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
selectNgramsByContextMaster :: HasDBid NodeType
=> Int
-> UserCorpusId
-> MasterCorpusId
-> Int
-> Cmd err [(NodeId, Text)]
selectNgramsByContextMaster n ucId mcId p = runPGSQuery
queryNgramsByContextMaster'
( ucId
, ngramsTypeId NgramsTerms
, toDBid NodeDocument
, p
, toDBid NodeDocument
, p
, n
, mcId
, toDBid NodeDocument
, ngramsTypeId NgramsTerms
)
-- | TODO fix context_node_ngrams relation
queryNgramsByContextMaster' :: DPS.Query
queryNgramsByContextMaster' = [sql|
WITH contextsByNgramsUser AS (
SELECT n.id, ng.terms FROM contexts n
JOIN nodes_contexts nn ON n.id = nn.context_id
JOIN context_node_ngrams cng ON cng.context_id = n.id
JOIN ngrams ng ON cng.ngrams_id = ng.id
WHERE nn.node_id = ? -- UserCorpusId
-- AND n.typename = ? -- toDBid
AND cng.ngrams_type = ? -- NgramsTypeId
AND nn.category > 0
AND node_pos(n.id,?) >= ?
AND node_pos(n.id,?) < ?
GROUP BY n.id, ng.terms
),
contextsByNgramsMaster AS (
SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
JOIN context_node_ngrams cng ON n.id = cng.context_id
JOIN ngrams ng ON ng.id = cng.ngrams_id
WHERE n.parent_id = ? -- Master Corpus toDBid
AND n.typename = ? -- toDBid
AND cng.ngrams_type = ? -- NgramsTypeId
GROUP BY n.id, ng.terms
)
SELECT m.id, m.terms FROM nodesByNgramsMaster m
RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
|]
-- | Refreshes the \"context_node_ngrams_view\" materialized view.
-- This function will be run :
-- - periodically
-- - at reindex stage
-- - at the end of each text flow
-- refreshNgramsMaterialized :: Cmd err ()
-- refreshNgramsMaterialized = void $ execPGSQuery refreshNgramsMaterializedQuery ()
-- where
-- refreshNgramsMaterializedQuery :: DPS.Query
-- refreshNgramsMaterializedQuery =
-- [sql| REFRESH MATERIALIZED VIEW CONCURRENTLY context_node_ngrams_view; |]