Commit 3b8711b9 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[Graph][WIP] Cooc 2 graph and missing file.

parent 0f2abe5f
{-|
Module : Gargantext.Database.Metrics.TFICF
Description : Ngram connection to the Database
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
TFICF, generalization of TFIDF
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
module Gargantext.Database.Metrics.TFICF where
import Database.PostgreSQL.Simple.SqlQQ (sql)
import qualified Database.PostgreSQL.Simple as DPS
import Safe (headMay)
import Gargantext.Text.Metrics.TFICF -- (tficf)
import Gargantext.Prelude
import Gargantext.Core.Types.Individu (UsernameMaster)
import Gargantext.Database.Utils (Cmd, runPGSQuery)
import Gargantext.Database.Types.Node (ListId, CorpusId, NodeType(..))
import Gargantext.Database.Config (nodeTypeId)
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTerms, NgramsType, ngramsTypeId)
type OccGlobal = Double
type OccCorpus = Double
getTficf :: UsernameMaster -> CorpusId -> ListId -> NgramsType
-> Cmd err [Tficf]
getTficf u cId lId ngType = do
g <- getTficfGlobal u
c <- getTficfCorpus cId
ngs <- getTficfNgrams u cId lId ngType
pure $ map (\(nId, nTerms, wm, wn)
-> Tficf nId nTerms
(tficf (TficfCorpus wn (fromIntegral c))
(TficfLanguage wm (fromIntegral g))
)
) ngs
getTficfGlobal :: UsernameMaster -> Cmd err Int
getTficfGlobal u = maybe 0 identity <$> headMay
<$> map (\(DPS.Only n) -> n )
<$> runPGSQuery q p
where
p = (u, nodeTypeId NodeDocument)
q = [sql| SELECT count(*) from nodes n
JOIN auth_user a ON a.id = n.user_id
WHERE
a.username = ?
AND n.typename = ?
|]
getTficfCorpus :: CorpusId -> Cmd err Int
getTficfCorpus cId = maybe 0 identity <$> headMay
<$> map (\(DPS.Only n) -> n )
<$> runPGSQuery q p
where
p = (cId, nodeTypeId NodeDocument)
q = [sql| WITH input(corpusId, typename) AS ((VALUES(?::"int4",?::"int4")))
SELECT count(*) from nodes_nodes AS nn
JOIN nodes AS n ON n.id = nn.node2_id
JOIN input ON nn.node1_id = input.corpusId
WHERE n.typename = input.typename;
|]
getTficfNgrams :: UsernameMaster -> CorpusId -> ListId -> NgramsType
-> Cmd err [(NgramsId, NgramsTerms, OccGlobal, OccCorpus)]
getTficfNgrams u cId lId ngType = runPGSQuery queryTficf p
where
p = (u, nodeTypeId NodeList, nodeTypeId NodeDocument, ngramsTypeId ngType, cId, lId)
queryTficf :: DPS.Query
queryTficf = [sql|
-- TODO add CTE for groups
WITH input(masterUsername,typenameList,typenameDoc,ngramsTypeId,corpusId,listId)
AS ((VALUES(?::"text", ? :: "int4", ?::"int4", ?::"int4",?::"int4",?::"int4"))),
-- AS ((VALUES('gargantua'::"text", 5 :: "int4", 4::"int4", 4::"int4",1018::"int4",1019::"int4"))),
list_master AS (
SELECT n.id,n.name,n.user_id from nodes n
JOIN input ON n.typename = input.typenameList
JOIN auth_user a ON a.id = n.user_id
WHERE
a.username = input.masterUsername
),
ngrams_master AS (
SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight FROM nodes_ngrams nng
JOIN list_master ON list_master.id = nng.node_id
JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
JOIN nodes n ON n.id = nng2.node_id
JOIN input ON input.typenameDoc = n.typename
JOIN ngrams ng ON ng.id = nng2.ngrams_id
WHERE
nng.ngrams_type = input.ngramsTypeId
-- AND n.hyperdata -> 'lang' = 'en'
GROUP BY ng.id,ng.terms
),
ngrams_user AS (
SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight
FROM nodes_ngrams nng
JOIN list_master ON list_master.id = nng.node_id
JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
JOIN nodes_nodes nn ON nn.node2_id = nng2.node_id
JOIN ngrams ng ON ng.id = nng2.ngrams_id
JOIN input ON nn.node1_id = input.corpusId
WHERE
nng.ngrams_type = input.ngramsTypeId
-- AND n.hyperdata -> 'lang' = 'en'
GROUP BY ng.id,ng.terms
)
SELECT nu.id,nu.terms,SUM(nm.weight) wm,SUM(nu.weight) wu
FROM ngrams_user nu
JOIN ngrams_master nm ON nm.id = nu.id
WHERE
nm.weight > 1
AND
nu.weight > 1
GROUP BY nu.id,nu.terms
--ORDER BY wm DESC
--LIMIT 1000
|]
......@@ -23,8 +23,10 @@ import GHC.IO (FilePath)
import qualified Data.Text as T
import Data.Text.IO (readFile)
import Data.Map.Strict (Map)
import Data.Maybe (catMaybes)
import qualified Data.Set as DS
import Data.Text (Text)
import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M
......@@ -116,7 +118,12 @@ textFlow' termType contexts = do
let myCooc2 = M.filter (>0) myCooc1
printDebug "myCooc2 size" (M.size myCooc2)
printDebug "myCooc2" myCooc2
g <- cooc2graph myCooc2
pure g
-- TODO use Text only here instead of [Text]
cooc2graph :: (Map ([Text], [Text]) Int) -> IO Graph
cooc2graph myCooc = do
-- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
......@@ -124,7 +131,7 @@ textFlow' termType contexts = do
(SampleBins 10 )
(Clusters 3 )
(DefaultValue 0 )
) myCooc2
) myCooc
printDebug "myCooc3 size" $ M.size myCooc3
printDebug "myCooc3" myCooc3
......@@ -146,7 +153,7 @@ textFlow' termType contexts = do
--let distanceMat = distributional matCooc
printDebug "distanceMat shape" $ A.arrayShape distanceMat
printDebug "distanceMat" distanceMat
--
--let distanceMap = M.filter (>0) $ mat2map distanceMat
let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
printDebug "distanceMap size" $ M.size distanceMap
......
......@@ -193,7 +193,7 @@ graphV3ToGraph (GraphV3 links nodes) = Graph (map nodeV32node nodes) (zipWith li
nodeV32node :: NodeV3 -> Node
nodeV32node (NodeV3 no_id' (AttributesV3 cl') no_s' no_lb')
= Node no_s' Terms (cs $ show no_id') no_lb' (Attributes cl')
linkV32edge :: Int -> EdgeV3 -> Edge
linkV32edge n (EdgeV3 eo_s' eo_t' eo_w') = Edge (cs $ show eo_s') (cs $ show eo_t') ((T.read $ T.unpack eo_w') :: Double) (cs $ show n)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment