Commit 3b8711b9 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[Graph][WIP] Cooc 2 graph and missing file.

parent 0f2abe5f
{-|
Module : Gargantext.Database.Metrics.TFICF
Description : Ngram connection to the Database
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
TFICF, generalization of TFIDF
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
module Gargantext.Database.Metrics.TFICF where
import Database.PostgreSQL.Simple.SqlQQ (sql)
import qualified Database.PostgreSQL.Simple as DPS
import Safe (headMay)
import Gargantext.Text.Metrics.TFICF -- (tficf)
import Gargantext.Prelude
import Gargantext.Core.Types.Individu (UsernameMaster)
import Gargantext.Database.Utils (Cmd, runPGSQuery)
import Gargantext.Database.Types.Node (ListId, CorpusId, NodeType(..))
import Gargantext.Database.Config (nodeTypeId)
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTerms, NgramsType, ngramsTypeId)
type OccGlobal = Double
type OccCorpus = Double
getTficf :: UsernameMaster -> CorpusId -> ListId -> NgramsType
-> Cmd err [Tficf]
getTficf u cId lId ngType = do
g <- getTficfGlobal u
c <- getTficfCorpus cId
ngs <- getTficfNgrams u cId lId ngType
pure $ map (\(nId, nTerms, wm, wn)
-> Tficf nId nTerms
(tficf (TficfCorpus wn (fromIntegral c))
(TficfLanguage wm (fromIntegral g))
)
) ngs
getTficfGlobal :: UsernameMaster -> Cmd err Int
getTficfGlobal u = maybe 0 identity <$> headMay
<$> map (\(DPS.Only n) -> n )
<$> runPGSQuery q p
where
p = (u, nodeTypeId NodeDocument)
q = [sql| SELECT count(*) from nodes n
JOIN auth_user a ON a.id = n.user_id
WHERE
a.username = ?
AND n.typename = ?
|]
getTficfCorpus :: CorpusId -> Cmd err Int
getTficfCorpus cId = maybe 0 identity <$> headMay
<$> map (\(DPS.Only n) -> n )
<$> runPGSQuery q p
where
p = (cId, nodeTypeId NodeDocument)
q = [sql| WITH input(corpusId, typename) AS ((VALUES(?::"int4",?::"int4")))
SELECT count(*) from nodes_nodes AS nn
JOIN nodes AS n ON n.id = nn.node2_id
JOIN input ON nn.node1_id = input.corpusId
WHERE n.typename = input.typename;
|]
getTficfNgrams :: UsernameMaster -> CorpusId -> ListId -> NgramsType
-> Cmd err [(NgramsId, NgramsTerms, OccGlobal, OccCorpus)]
getTficfNgrams u cId lId ngType = runPGSQuery queryTficf p
where
p = (u, nodeTypeId NodeList, nodeTypeId NodeDocument, ngramsTypeId ngType, cId, lId)
queryTficf :: DPS.Query
queryTficf = [sql|
-- TODO add CTE for groups
WITH input(masterUsername,typenameList,typenameDoc,ngramsTypeId,corpusId,listId)
AS ((VALUES(?::"text", ? :: "int4", ?::"int4", ?::"int4",?::"int4",?::"int4"))),
-- AS ((VALUES('gargantua'::"text", 5 :: "int4", 4::"int4", 4::"int4",1018::"int4",1019::"int4"))),
list_master AS (
SELECT n.id,n.name,n.user_id from nodes n
JOIN input ON n.typename = input.typenameList
JOIN auth_user a ON a.id = n.user_id
WHERE
a.username = input.masterUsername
),
ngrams_master AS (
SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight FROM nodes_ngrams nng
JOIN list_master ON list_master.id = nng.node_id
JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
JOIN nodes n ON n.id = nng2.node_id
JOIN input ON input.typenameDoc = n.typename
JOIN ngrams ng ON ng.id = nng2.ngrams_id
WHERE
nng.ngrams_type = input.ngramsTypeId
-- AND n.hyperdata -> 'lang' = 'en'
GROUP BY ng.id,ng.terms
),
ngrams_user AS (
SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight
FROM nodes_ngrams nng
JOIN list_master ON list_master.id = nng.node_id
JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
JOIN nodes_nodes nn ON nn.node2_id = nng2.node_id
JOIN ngrams ng ON ng.id = nng2.ngrams_id
JOIN input ON nn.node1_id = input.corpusId
WHERE
nng.ngrams_type = input.ngramsTypeId
-- AND n.hyperdata -> 'lang' = 'en'
GROUP BY ng.id,ng.terms
)
SELECT nu.id,nu.terms,SUM(nm.weight) wm,SUM(nu.weight) wu
FROM ngrams_user nu
JOIN ngrams_master nm ON nm.id = nu.id
WHERE
nm.weight > 1
AND
nu.weight > 1
GROUP BY nu.id,nu.terms
--ORDER BY wm DESC
--LIMIT 1000
|]
...@@ -23,8 +23,10 @@ import GHC.IO (FilePath) ...@@ -23,8 +23,10 @@ import GHC.IO (FilePath)
import qualified Data.Text as T import qualified Data.Text as T
import Data.Text.IO (readFile) import Data.Text.IO (readFile)
import Data.Map.Strict (Map)
import Data.Maybe (catMaybes) import Data.Maybe (catMaybes)
import qualified Data.Set as DS import qualified Data.Set as DS
import Data.Text (Text)
import qualified Data.Array.Accelerate as A import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M import qualified Data.Map.Strict as M
...@@ -116,7 +118,12 @@ textFlow' termType contexts = do ...@@ -116,7 +118,12 @@ textFlow' termType contexts = do
let myCooc2 = M.filter (>0) myCooc1 let myCooc2 = M.filter (>0) myCooc1
printDebug "myCooc2 size" (M.size myCooc2) printDebug "myCooc2 size" (M.size myCooc2)
printDebug "myCooc2" myCooc2 printDebug "myCooc2" myCooc2
g <- cooc2graph myCooc2
pure g
-- TODO use Text only here instead of [Text]
cooc2graph :: (Map ([Text], [Text]) Int) -> IO Graph
cooc2graph myCooc = do
-- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 ) let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
...@@ -124,7 +131,7 @@ textFlow' termType contexts = do ...@@ -124,7 +131,7 @@ textFlow' termType contexts = do
(SampleBins 10 ) (SampleBins 10 )
(Clusters 3 ) (Clusters 3 )
(DefaultValue 0 ) (DefaultValue 0 )
) myCooc2 ) myCooc
printDebug "myCooc3 size" $ M.size myCooc3 printDebug "myCooc3 size" $ M.size myCooc3
printDebug "myCooc3" myCooc3 printDebug "myCooc3" myCooc3
...@@ -146,7 +153,7 @@ textFlow' termType contexts = do ...@@ -146,7 +153,7 @@ textFlow' termType contexts = do
--let distanceMat = distributional matCooc --let distanceMat = distributional matCooc
printDebug "distanceMat shape" $ A.arrayShape distanceMat printDebug "distanceMat shape" $ A.arrayShape distanceMat
printDebug "distanceMat" distanceMat printDebug "distanceMat" distanceMat
--
--let distanceMap = M.filter (>0) $ mat2map distanceMat --let distanceMap = M.filter (>0) $ mat2map distanceMat
let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
printDebug "distanceMap size" $ M.size distanceMap printDebug "distanceMap size" $ M.size distanceMap
......
...@@ -193,7 +193,7 @@ graphV3ToGraph (GraphV3 links nodes) = Graph (map nodeV32node nodes) (zipWith li ...@@ -193,7 +193,7 @@ graphV3ToGraph (GraphV3 links nodes) = Graph (map nodeV32node nodes) (zipWith li
nodeV32node :: NodeV3 -> Node nodeV32node :: NodeV3 -> Node
nodeV32node (NodeV3 no_id' (AttributesV3 cl') no_s' no_lb') nodeV32node (NodeV3 no_id' (AttributesV3 cl') no_s' no_lb')
= Node no_s' Terms (cs $ show no_id') no_lb' (Attributes cl') = Node no_s' Terms (cs $ show no_id') no_lb' (Attributes cl')
linkV32edge :: Int -> EdgeV3 -> Edge linkV32edge :: Int -> EdgeV3 -> Edge
linkV32edge n (EdgeV3 eo_s' eo_t' eo_w') = Edge (cs $ show eo_s') (cs $ show eo_t') ((T.read $ T.unpack eo_w') :: Double) (cs $ show n) linkV32edge n (EdgeV3 eo_s' eo_t' eo_w') = Edge (cs $ show eo_s') (cs $ show eo_t') ((T.read $ T.unpack eo_w') :: Double) (cs $ show n)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment