Commit 4838c6b8 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[OPTIM + FIX] TFICF

parent e93236e8
...@@ -15,13 +15,14 @@ module Gargantext.Core.Text.List ...@@ -15,13 +15,14 @@ module Gargantext.Core.Text.List
-- import Data.Either (partitionEithers, Either(..)) -- import Data.Either (partitionEithers, Either(..))
import Data.Map (Map) import Data.Map (Map)
import Data.Set (Set) import Data.Set (Set, empty)
import Data.Text (Text) import Data.Text (Text)
import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList) import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
-- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..)) -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..)) import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith) import Gargantext.Database.Action.Metrics.NgramsByNode ({-ngramsGroup,-} getNodesByNgramsUser, groupNodesByNgramsWith)
import Gargantext.Database.Action.Metrics.TFICF (getTficf)
import Gargantext.Core.Text.Metrics.TFICF (sortTficf) import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
import Gargantext.Database.Prelude (Cmd) import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Database.Schema.Ngrams (NgramsType(..))
...@@ -138,8 +139,8 @@ buildNgramsTermsList :: Lang ...@@ -138,8 +139,8 @@ buildNgramsTermsList :: Lang
-> UserCorpusId -> UserCorpusId
-> MasterCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList l n m s uCid mCid = do buildNgramsTermsList _l _n _m s uCid mCid = do
candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms (ngramsGroup l n m) candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms
let let
candidatesSize = 400 candidatesSize = 400
...@@ -150,12 +151,12 @@ buildNgramsTermsList l n m s uCid mCid = do ...@@ -150,12 +151,12 @@ buildNgramsTermsList l n m s uCid mCid = do
candidatesHead = List.take candidatesSize candidates candidatesHead = List.take candidatesSize candidates
candidatesTail = List.drop candidatesSize candidates candidatesTail = List.drop candidatesSize candidates
termList = termList =
-- (toTermList a b ((isStopTerm s) . fst) candidatesHead) -- (toTermList a b ((isStopTerm s) . fst) candidatesHead)
(map (toGargList ((isStopTerm s) .fst) MapTerm) candidatesHead) (map (toGargList ((isStopTerm s) .fst) MapTerm) candidatesHead)
<> (map (toGargList ((isStopTerm s) .fst) CandidateTerm) candidatesTail) <> (map (toGargList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
ngs = List.concat $ map toNgramsElement termList ngs = List.concat $ map toNgramsElement $ map (\(lt, (t,d)) -> (lt, ((t, (d,empty))))) termList
pure $ Map.fromList [(NgramsTerms, ngs)] pure $ Map.fromList [(NgramsTerms, ngs)]
......
...@@ -25,7 +25,6 @@ module Gargantext.Core.Text.Metrics.TFICF ( TFICF ...@@ -25,7 +25,6 @@ module Gargantext.Core.Text.Metrics.TFICF ( TFICF
import Data.Text (Text) import Data.Text (Text)
import Gargantext.Prelude import Gargantext.Prelude
import Data.Set (Set)
import Gargantext.Core.Types (Ordering(..)) import Gargantext.Core.Types (Ordering(..))
import Data.Map.Strict (Map, toList) import Data.Map.Strict (Map, toList)
import qualified Data.Ord as DO (Down(..)) import qualified Data.Ord as DO (Down(..))
...@@ -54,8 +53,8 @@ tficf _ _ = panic $ "[ERR]" <> path <> "Undefined for these contexts" ...@@ -54,8 +53,8 @@ tficf _ _ = panic $ "[ERR]" <> path <> "Undefined for these contexts"
sortTficf :: Ordering sortTficf :: Ordering
-> (Map Text (Double, Set Text)) -> Map Text Double
-> [ (Text,(Double, Set Text))] -> [(Text, Double)]
sortTficf Down = List.sortOn (DO.Down . fst . snd) . toList sortTficf Down = List.sortOn (DO.Down . snd) . toList
sortTficf Up = List.sortOn (fst . snd) . toList sortTficf Up = List.sortOn snd . toList
...@@ -24,7 +24,6 @@ import Gargantext.Core.Viz.Graph.Louvain (LouvainNodeId, CommunityId, comId2node ...@@ -24,7 +24,6 @@ import Gargantext.Core.Viz.Graph.Louvain (LouvainNodeId, CommunityId, comId2node
[LouvainNode] -> Map CommunityId LouvainNodeId [LouvainNode] -> Map CommunityId LouvainNodeId
[(CommunityId, [LouvainNodeId])] [(CommunityId, [LouvainNodeId])]
sort by length LouvainNodeIds sort by length LouvainNodeIds
...@@ -39,6 +38,8 @@ subgraph with [LouvainNodeId] ...@@ -39,6 +38,8 @@ subgraph with [LouvainNodeId]
Map NodeId Label Map NodeId Label
-> map [LouvainNodeId] -> [(CommunityId, take 3 [Label])] -> map [LouvainNodeId] -> [(CommunityId, take 3 [Label])]
use specGen incExc score to order the labels
take 7 [(CommunityId, take 3 [Label])] take 7 [(CommunityId, take 3 [Label])]
......
...@@ -16,7 +16,7 @@ Ngrams by node enable contextual metrics. ...@@ -16,7 +16,7 @@ Ngrams by node enable contextual metrics.
module Gargantext.Database.Action.Metrics.NgramsByNode module Gargantext.Database.Action.Metrics.NgramsByNode
where where
import Data.Map.Strict (Map, fromListWith, elems, toList, fromList) import Data.Map.Strict (Map, fromListWith, elems, toList)
import Data.Map.Strict.Patch (PatchMap, Replace, diff) import Data.Map.Strict.Patch (PatchMap, Replace, diff)
import Data.Set (Set) import Data.Set (Set)
import Data.Text (Text) import Data.Text (Text)
...@@ -30,7 +30,6 @@ import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId) ...@@ -30,7 +30,6 @@ import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
import Gargantext.Database.Prelude (Cmd, runPGSQuery) import Gargantext.Database.Prelude (Cmd, runPGSQuery)
import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..)) import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text.Metrics.TFICF
import Gargantext.Core.Text.Terms.Mono.Stem (stem) import Gargantext.Core.Text.Terms.Mono.Stem (stem)
import qualified Data.List as List import qualified Data.List as List
import qualified Data.Map.Strict as Map import qualified Data.Map.Strict as Map
...@@ -56,52 +55,6 @@ ngramsGroup l _m _n = Text.intercalate " " ...@@ -56,52 +55,6 @@ ngramsGroup l _m _n = Text.intercalate " "
. Text.replace "-" " " . Text.replace "-" " "
getTficf :: UserCorpusId
-> MasterCorpusId
-> NgramsType
-> (Text -> Text)
-> Cmd err (Map Text (Double, Set Text))
getTficf u m nt f = do
u' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsUser u nt
m' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsMaster u m
pure $ toTficfData (countNodesByNgramsWith f u')
(countNodesByNgramsWith f m')
{-
getTficfWith :: UserCorpusId
-> MasterCorpusId
-> [ListId]
-> NgramsType
-> Map Text (Maybe Text)
-> Cmd err (Map Text (Double, Set Text))
getTficfWith u m ls nt mtxt = do
u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
m' <- getNodesByNgramsMaster u m
let f x = case Map.lookup x mtxt of
Nothing -> x
Just x' -> maybe x identity x'
pure $ toTficfData (countNodesByNgramsWith f u') (countNodesByNgramsWith f m')
-}
type Context = (Double, Map Text (Double, Set Text))
type Supra = Context
type Infra = Context
toTficfData :: Infra
-> Supra
-> Map Text (Double, Set Text)
toTficfData (ti, mi) (ts, ms) =
fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
(TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
, ns
)
)
| (t, (n,ns)) <- toList mi
]
-- | fst is size of Supra Corpus -- | fst is size of Supra Corpus
-- snd is Texts and size of Occurrences (different docs) -- snd is Texts and size of Occurrences (different docs)
......
{-|
Module : Gargantext.Database.Metrics.TFICF
Description : Ngrams by Node user and master
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE QuasiQuotes #-}
module Gargantext.Database.Action.Metrics.TFICF
where
-- import Debug.Trace (trace)
-- import Gargantext.Core (Lang(..))
import Data.Map.Strict (Map, toList, fromList)
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Gargantext.Core.Text.Metrics.TFICF
import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getOccByNgramsOnlyFast)
import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Query.Table.NodeNode (selectCountDocs)
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Prelude
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
getTficf :: UserCorpusId
-> MasterCorpusId
-> NgramsType
-> Cmd err (Map Text Double)
getTficf cId mId nt = do
mapTextDoubleLocal <- Map.filter (> 1)
<$> Map.map (fromIntegral . Set.size)
<$> getNodesByNgramsUser cId nt
mapTextDoubleGlobal <- Map.map fromIntegral
<$> getOccByNgramsOnlyFast mId nt (Map.keys mapTextDoubleLocal)
countLocal <- selectCountDocs cId
countGlobal <- selectCountDocs mId
pure $ fromList [ ( t
, tficf (TficfInfra (Count n )
(Total $ fromIntegral countLocal ))
(TficfSupra (Count $ fromMaybe 0 $ Map.lookup t mapTextDoubleGlobal)
(Total $ fromIntegral countGlobal))
)
| (t, n) <- toList mapTextDoubleLocal
]
...@@ -29,6 +29,7 @@ module Gargantext.Database.Query.Table.NodeNode ...@@ -29,6 +29,7 @@ module Gargantext.Database.Query.Table.NodeNode
, insertNodeNode , insertNodeNode
, deleteNodeNode , deleteNodeNode
, selectPublicNodes , selectPublicNodes
, selectCountDocs
) )
where where
...@@ -145,7 +146,20 @@ nodeNodesCategory inputData = map (\(PGS.Only a) -> a) ...@@ -145,7 +146,20 @@ nodeNodesCategory inputData = map (\(PGS.Only a) -> a)
|] |]
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO use UTCTime fast selectCountDocs :: CorpusId -> Cmd err Int
selectCountDocs cId = runCountOpaQuery (queryCountDocs cId)
where
queryCountDocs cId' = proc () -> do
(n, nn) <- joinInCorpus -< ()
restrict -< nn^.nn_node1_id .== (toNullable $ pgNodeId cId')
restrict -< nn^.nn_category .>= (toNullable $ pgInt4 1)
restrict -< n^.node_typename .== (pgInt4 $ nodeTypeId NodeDocument)
returnA -< n
-- | TODO use UTCTime fast
selectDocsDates :: CorpusId -> Cmd err [Text] selectDocsDates :: CorpusId -> Cmd err [Text]
selectDocsDates cId = map (head' "selectDocsDates" . splitOn "-") selectDocsDates cId = map (head' "selectDocsDates" . splitOn "-")
<$> catMaybes <$> catMaybes
......
...@@ -96,10 +96,6 @@ getOrMk_RootWithCorpus user cName c = do ...@@ -96,10 +96,6 @@ getOrMk_RootWithCorpus user cName c = do
pure (userId, rootId, corpusId) pure (userId, rootId, corpusId)
mkRoot :: HasNodeError err mkRoot :: HasNodeError err
=> User => User
-> Cmd err [RootId] -> Cmd err [RootId]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment