Cooc.hs 3.4 KB
Newer Older
1
{-|
Alexandre Delanoë's avatar
Alexandre Delanoë committed
2
Module      : Gargantext.Database.Cooc
3 4 5 6 7 8 9 10 11
Description : 
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

12
{-# LANGUAGE NoImplicitPrelude #-}
13 14 15 16 17 18
--{-# LANGUAGE OverloadedStrings #-}

{-# LANGUAGE QuasiQuotes #-}

module Gargantext.Database.Cooc where

19
import Control.Monad ((>>=))
20 21 22
import Database.PostgreSQL.Simple
import Database.PostgreSQL.Simple.SqlQQ

23
import Gargantext.Prelude
24 25 26 27 28 29 30 31
import Gargantext (connectGargandb)

type CorpusId    = Int
type MainListId  = Int
type GroupListId = Int

coocTest :: IO [(Int, Int, Int)]
coocTest = connectGargandb "gargantext.ini"
32
  >>= \conn -> dBcooc conn 421968 446602 446599 
33

34 35
dBcooc :: Connection -> CorpusId -> MainListId -> GroupListId -> IO [(Int, Int, Int)]
dBcooc conn corpus mainList groupList = query conn [sql|
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
  set work_mem='1GB';

  --EXPLAIN ANALYZE
      WITH COOC as (
      SELECT
      COALESCE(grA.ngram1_id, wlA.ngram_id) as ngA,
      COALESCE(grB.ngram1_id, wlB.ngram_id) as ngB,
      COUNT(*) AS score
      FROM
      nodes AS n
      --      /     --     X   Y
      -- SQL graph for getting the cooccurrences

          -- STEP 1: X axis of the matrix
          INNER JOIN nodes_ngrams
                  AS ngA  ON ngA.node_id  = n.id
          -- \--> get the occurrences node/ngram of the corpus

              INNER JOIN nodes_ngrams
                      AS wlA  ON ngA.ngram_id = wlA.ngram_id
                             AND wlA.node_id  = ?
              -- \--> filter with white/main list (typename 7)

          LEFT JOIN  nodes_ngrams_ngrams
                 AS grA  ON wlA.ngram_id = grA.ngram1_id
                        AND grA.node_id  = ?
          -- \--> adding (joining) ngrams that are grouped (typename 6)
          LEFT JOIN  nodes_ngrams
                 AS wlAA ON grA.ngram2_id = wlAA.ngram_id
                        AND wlAA.node_id  = wlA.node_id
          -- \--> adding (joining) ngrams that are not grouped
          --LEFT JOIN  ngrams        AS wlAA ON grA.ngram2_id = wlAA.id
          -- \--> for joining all synonyms even if they are not in the main list (white list)


          -- STEP 2: Y axi of the matrix
          INNER JOIN nodes_ngrams
                  AS ngB  ON ngB.node_id  = n.id
          -- \--> get the occurrences node/ngram of the corpus

              INNER JOIN nodes_ngrams
                      AS wlB  ON ngB.ngram_id = wlB.ngram_id
                             AND wlB.node_id  = ?
              -- \--> filter with white/main list

          LEFT JOIN  nodes_ngrams_ngrams
                 AS grB  ON wlB.ngram_id = grB.ngram1_id
                        AND grB.node_id  = ?
          -- \--> adding (joining) ngrams that are grouped
          LEFT JOIN  nodes_ngrams
                 AS wlBB ON grB.ngram2_id = wlBB.ngram_id
                        AND wlBB.node_id   = wlB.node_id
          -- \--> adding (joining) ngrams that are not grouped

          -- LEFT JOIN  ngrams        AS wlBB ON grB.ngram2_id = wlBB.id
          -- \--> for joining all synonyms even if they are not in the main list (white list)

          WHERE
              n.typename  = 4
          AND n.parent_id = ?
          GROUP BY 1,2
          --    ==
          -- GROUP BY ngA, ngB
          )

      SELECT ngA, ngB, score
          FROM COOC    --> from the query above

   WHERE score >= 3
  AND 
  ngA <= ngB
  |] (mainList, groupList, mainList, groupList, corpus)