TFICF.hs 4.35 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
{-|
Module      : Gargantext.Database.Metrics.TFICF
Description : Ngram connection to the Database
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

TFICF, generalization of TFIDF

-}

{-# LANGUAGE NoImplicitPrelude      #-}
{-# LANGUAGE OverloadedStrings      #-}
{-# LANGUAGE QuasiQuotes            #-}
17
{-# LANGUAGE RankNTypes        #-}
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145

module Gargantext.Database.Metrics.TFICF where

import Database.PostgreSQL.Simple.SqlQQ (sql)
import qualified Database.PostgreSQL.Simple as DPS

import Safe (headMay)
import Gargantext.Text.Metrics.TFICF -- (tficf)
import Gargantext.Prelude
import Gargantext.Core.Types.Individu (UsernameMaster)
import Gargantext.Database.Utils (Cmd, runPGSQuery)
import Gargantext.Database.Types.Node (ListId, CorpusId, NodeType(..))
import Gargantext.Database.Config (nodeTypeId)
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTerms, NgramsType, ngramsTypeId)

type OccGlobal = Double
type OccCorpus = Double


getTficf :: UsernameMaster -> CorpusId -> ListId -> NgramsType
          -> Cmd err [Tficf]
getTficf u cId lId ngType = do
  g   <- getTficfGlobal u
  c   <- getTficfCorpus cId
  ngs <- getTficfNgrams u cId lId ngType

  pure $ map (\(nId, nTerms, wm, wn)
        -> Tficf nId nTerms
                 (tficf (TficfCorpus   wn (fromIntegral c))
                        (TficfLanguage wm (fromIntegral g))
                  )
              ) ngs

getTficfGlobal :: UsernameMaster -> Cmd err Int
getTficfGlobal u = maybe 0 identity <$> headMay
                <$> map (\(DPS.Only n) -> n )
                <$> runPGSQuery q p
  where
    p = (u, nodeTypeId NodeDocument)
    q = [sql| SELECT count(*) from nodes n
              JOIN auth_user a ON a.id = n.user_id
              WHERE
              a.username = ?
              AND n.typename = ?
  |]

getTficfCorpus :: CorpusId -> Cmd err Int
getTficfCorpus cId = maybe 0 identity <$> headMay
                  <$> map (\(DPS.Only n) -> n )
                  <$> runPGSQuery q p
  where
    p = (cId, nodeTypeId NodeDocument)
    q = [sql| WITH input(corpusId, typename) AS ((VALUES(?::"int4",?::"int4")))
              SELECT count(*) from nodes_nodes AS nn
              JOIN nodes AS n ON n.id = nn.node2_id
              JOIN input ON nn.node1_id = input.corpusId
              WHERE n.typename = input.typename;
  |]



getTficfNgrams :: UsernameMaster -> CorpusId -> ListId -> NgramsType
             -> Cmd err [(NgramsId, NgramsTerms, OccGlobal, OccCorpus)]
getTficfNgrams u cId lId ngType = runPGSQuery queryTficf p
  where
    p = (u, nodeTypeId NodeList, nodeTypeId NodeDocument, ngramsTypeId ngType, cId, lId)


queryTficf :: DPS.Query
queryTficf = [sql|
-- TODO add CTE for groups
WITH input(masterUsername,typenameList,typenameDoc,ngramsTypeId,corpusId,listId) 
  AS ((VALUES(?::"text", ? :: "int4", ?::"int4", ?::"int4",?::"int4",?::"int4"))),
  -- AS ((VALUES('gargantua'::"text", 5 :: "int4", 4::"int4", 4::"int4",1018::"int4",1019::"int4"))),

list_master AS (
SELECT n.id,n.name,n.user_id from nodes n
JOIN input       ON n.typename = input.typenameList
JOIN auth_user a ON a.id       = n.user_id
WHERE
a.username = input.masterUsername
),

ngrams_master AS (
SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight FROM nodes_ngrams nng
JOIN list_master       ON list_master.id       = nng.node_id
JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
JOIN nodes        n   ON n.id          = nng2.node_id
JOIN input            ON input.typenameDoc = n.typename
JOIN ngrams       ng  ON ng.id         = nng2.ngrams_id
WHERE
 nng.ngrams_type = input.ngramsTypeId
-- AND n.hyperdata -> 'lang' = 'en'
GROUP BY ng.id,ng.terms
),

ngrams_user AS (
SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight
FROM nodes_ngrams nng
JOIN list_master       ON list_master.id        = nng.node_id

JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
JOIN nodes_nodes  nn   ON nn.node2_id    = nng2.node_id

JOIN ngrams       ng   ON ng.id          = nng2.ngrams_id
JOIN input             ON nn.node1_id    = input.corpusId

WHERE
     nng.ngrams_type = input.ngramsTypeId
-- AND n.hyperdata -> 'lang' = 'en'
GROUP BY ng.id,ng.terms
)


SELECT nu.id,nu.terms,SUM(nm.weight) wm,SUM(nu.weight) wu
FROM ngrams_user nu
JOIN ngrams_master nm ON nm.id = nu.id
WHERE
nm.weight > 1
AND
nu.weight > 1
GROUP BY nu.id,nu.terms
--ORDER BY wm DESC
--LIMIT 1000

  |]