Commit 545bb1a3 authored by Alexandre Delanoë

[FLOW] new TFICF function (full Haskell).
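getTficf computes TFICF scores in Haskell: ngram document counts from the user corpus (infra context) are paired with counts from the master corpus (supra context) by toTficfData. The master-corpus query is reworked as a CTE that restricts master occurrences to ngrams also present in the user corpus.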

parent a364ea38
src/Gargantext/Database/Metrics/NgramsByNode.hs
@@ -19,38 +19,65 @@ Ngrams by node enable special metrics.
module Gargantext.Database.Metrics.NgramsByNode
where
import Data.Map.Strict (Map, fromListWith, {-elems,-} toList)
import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
import Data.Set (Set)
import Data.Text (Text)
import Data.Tuple.Extra (second)
import Data.Tuple.Extra (second, swap)
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Gargantext.Database.Config (nodeTypeId)
import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
import Gargantext.Database.Utils (Cmd, runPGSQuery)
import Gargantext.Prelude
import Gargantext.Text.Metrics.TFICF -- (tficf)
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
import qualified Database.PostgreSQL.Simple as DPS
type GlobalNodeId = NodeId
type LocalNodeId = NodeId
joinNodesByNgrams :: Map Text (Set NodeId)
                  -> Map Text (Set NodeId)
                  -> Map Text (Set GlobalNodeId, Set LocalNodeId)
joinNodesByNgrams = undefined
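-- | getTficf: u' counts, for each (normalised) ngram, the user-corpus
--   documents it occurs in (infra context); m' does the same against the
--   master corpus (supra context); toTficfData combines both into a score.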
getTficf :: UserCorpusId -> MasterCorpusId -> (Text -> Text)
         -> Cmd err (Map Text (Double, Set Text))
getTficf u m f = do
  u' <- getNodesByNgramsUser u
  m' <- getNodesByNgramsMaster u m
  pure $ toTficfData (countNodesByNgramsWith f u')
                     (countNodesByNgramsWith f m')
countNodesByNgramsWith :: (Text -> Text) -> Map Text (Set NodeId)
                       -> Map Text (Set Text, Int)
countNodesByNgramsWith f m = Map.map (second Set.size)
                           $ groupNodesByNgramsWith f m
groupNodesByNgramsWith :: (Text -> Text) -> Map Text (Set NodeId)
                       -> Map Text (Set Text, Set NodeId)
type Context = (Double, Map Text (Double, Set Text))
type Supra = Context
type Infra = Context
toTficfData :: Infra -> Supra
            -> Map Text (Double, Set Text)
toTficfData (ti, mi) (ts, ms) =
  fromList [ (t, ( tficf (TficfInfra ti n)
                         (TficfSupra ts $ maybe 0 fst $ Map.lookup t ms)
                 , ns
                 )
             )
           | (t, (n,ns)) <- toList mi
           ]
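-- For each term of the infra (user) context, look up the matching supra
-- (master) count, defaulting to 0 when the term is absent from the master
-- corpus, and keep the set of surface forms grouped under the term.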
-- | fst is the size of the corpus (number of distinct documents);
--   snd maps each term to its occurrence count (distinct docs)
--   paired with the texts grouped under it.
countNodesByNgramsWith :: (Text -> Text)
                       -> Map Text (Set NodeId)
                       -> (Double, Map Text (Double, Set Text))
countNodesByNgramsWith f m = (total, m')
  where
    total = fromIntegral $ Set.size $ Set.unions $ elems m
    m'    = Map.map (swap . second (fromIntegral . Set.size))
          $ groupNodesByNgramsWith f m
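-- total is the number of distinct documents carrying at least one ngram
-- (the union of all node sets); m' pairs each grouped term with its
-- document count and the surface forms it groups.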
groupNodesByNgramsWith :: (Text -> Text)
                       -> Map Text (Set NodeId)
                       -> Map Text (Set Text, Set NodeId)
groupNodesByNgramsWith f m =
  fromListWith (\a b -> (fst a <> fst b, snd a <> snd b))
    $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
    $ toList m
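-- Example with a hypothetical case-folding normaliser f:
--   "Cat" -> {n1}, "cat" -> {n2}  ==>  "cat" -> ({"Cat","cat"}, {n1, n2})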
@@ -84,15 +111,18 @@ queryNgramsByNodeUser = [sql|
|]
------------------------------------------------------------------------
-- | TODO filter by language, database, any social field
getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
getNodesByNgramsMaster ucId mcId = fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
<$> selectNgramsByNodeMaster ucId mcId
getNodesByNgramsMaster :: CorpusId -> Cmd err (Map Text (Set NodeId))
getNodesByNgramsMaster cId = fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
<$> selectNgramsByNodeMaster cId
selectNgramsByNodeMaster :: CorpusId -> Cmd err [(NodeId, Text)]
selectNgramsByNodeMaster cId = runPGSQuery
selectNgramsByNodeMaster :: UserCorpusId -> MasterCorpusId -> Cmd err [(NodeId, Text)]
selectNgramsByNodeMaster ucId mcId = runPGSQuery
queryNgramsByNodeMaster
( cId
( ucId
, nodeTypeId NodeDocument
, ngramsTypeId NgramsTerms
, mcId
, nodeTypeId NodeDocument
, ngramsTypeId NgramsTerms
)
@@ -100,16 +130,33 @@ selectNgramsByNodeMaster cId = runPGSQuery
queryNgramsByNodeMaster :: DPS.Query
queryNgramsByNodeMaster = [sql|
SELECT nng.node_id, ng.terms FROM nodes_ngrams nng
  JOIN ngrams ng ON ng.id = nng.ngrams_id
  JOIN nodes  n  ON n.id  = nng.node_id
  WHERE n.parent_id     = ? -- Master Corpus NodeId
    AND n.typename      = ? -- NodeTypeId
    AND nng.ngrams_type = ? -- NgramsTypeId
  GROUP BY nng.node_id, ng.terms
  LIMIT 10000 -- TODO remove the hard limit and limit with corpus only
WITH nodesByNgramsUser AS (
  SELECT nng.node_id, ng.id, ng.terms FROM nodes_ngrams nng
    JOIN ngrams ng      ON nng.ngrams_id = ng.id
    JOIN nodes_nodes nn ON nn.node2_id   = nng.node_id
    JOIN nodes n        ON nn.node2_id   = n.id
    WHERE nn.node1_id     = ? -- UserCorpusId
      AND n.typename      = ? -- NodeTypeId
      AND nng.ngrams_type = ? -- NgramsTypeId
      AND nn.delete       = False
    GROUP BY nng.node_id, ng.id, ng.terms
),
nodesByNgramsMaster AS (
  SELECT nng.node_id, ng.id, ng.terms FROM nodes_ngrams nng
    JOIN ngrams ng ON ng.id = nng.ngrams_id
    JOIN nodes n   ON n.id  = nng.node_id
    WHERE n.parent_id       = ? -- Master Corpus NodeId
      AND n.typename        = ? -- NodeTypeId
      AND nng.ngrams_type   = ? -- NgramsTypeId
    GROUP BY nng.node_id, ng.id, ng.terms
)
SELECT m.node_id, m.terms FROM nodesByNgramsMaster m
  RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
|]
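-- The RIGHT JOIN on the ngrams id restricts the master-corpus (supra)
-- occurrences to ngrams that also appear in the user corpus (infra).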
src/Gargantext/Database/Types/Node.hs
@@ -87,7 +87,9 @@ type ListId = NodeId
type DocumentId = NodeId
type DocId = DocumentId -- todo: remove this
type RootId = NodeId
type MasterCorpusId = NodeId
type MasterCorpusId = CorpusId
type UserCorpusId = CorpusId
type AnnuaireId = NodeId
type ContactId = NodeId
src/Gargantext/Text/Metrics/TFICF.hs
@@ -20,17 +20,24 @@ module Gargantext.Text.Metrics.TFICF where
import Gargantext.Prelude
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTerms)
data TficfContext n m = TficfLanguage n m | TficfCorpus n m | TficfDocument n m
data TficfContext n m = TficfLanguage n m
                      | TficfCorpus   n m
                      | TficfDocument n m
                      | TficfInfra    n m
                      | TficfSupra    n m
                      deriving (Show)
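-- Infra and Supra generalise the earlier constructors: the infra context
-- is the sub-corpus under study (here the user corpus), the supra context
-- the larger reference corpus it is weighed against (here the master corpus).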
data Tficf = Tficf { tficf_ngramsId    :: NgramsId
                   , tficf_ngramsTerms :: NgramsTerms
                   , tficf_score       :: Double
                   } deriving (Show)
data Tficf = Tficf
  { tficf_ngramsId    :: NgramsId
  , tficf_ngramsTerms :: NgramsTerms
  , tficf_score       :: Double
  } deriving (Show)
data Tficf' = Tficf'
  { tficf'_terms :: NgramsTerms
  , tficf'_score :: Double
  } deriving (Show)
data Tficf' = Tficf' { tficf'_terms :: NgramsTerms
                     , tficf'_score :: Double
                     } deriving (Show)
type SupraContext = TficfContext