Commit 4838c6b8 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[OPTIM + FIX] TFICF

parent e93236e8
Pipeline #1026 failed with stage
......@@ -15,13 +15,14 @@ module Gargantext.Core.Text.List
-- import Data.Either (partitionEithers, Either(..))
import Data.Map (Map)
import Data.Set (Set)
import Data.Set (Set, empty)
import Data.Text (Text)
import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
-- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
import Gargantext.Database.Action.Metrics.NgramsByNode ({-ngramsGroup,-} getNodesByNgramsUser, groupNodesByNgramsWith)
import Gargantext.Database.Action.Metrics.TFICF (getTficf)
import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
......@@ -138,8 +139,8 @@ buildNgramsTermsList :: Lang
-> UserCorpusId
-> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList l n m s uCid mCid = do
candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms (ngramsGroup l n m)
buildNgramsTermsList _l _n _m s uCid mCid = do
candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms
let
candidatesSize = 400
......@@ -150,12 +151,12 @@ buildNgramsTermsList l n m s uCid mCid = do
candidatesHead = List.take candidatesSize candidates
candidatesTail = List.drop candidatesSize candidates
termList =
termList =
-- (toTermList a b ((isStopTerm s) . fst) candidatesHead)
(map (toGargList ((isStopTerm s) .fst) MapTerm) candidatesHead)
(map (toGargList ((isStopTerm s) .fst) MapTerm) candidatesHead)
<> (map (toGargList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
ngs = List.concat $ map toNgramsElement termList
ngs = List.concat $ map toNgramsElement $ map (\(lt, (t,d)) -> (lt, ((t, (d,empty))))) termList
pure $ Map.fromList [(NgramsTerms, ngs)]
......
......@@ -25,7 +25,6 @@ module Gargantext.Core.Text.Metrics.TFICF ( TFICF
import Data.Text (Text)
import Gargantext.Prelude
import Data.Set (Set)
import Gargantext.Core.Types (Ordering(..))
import Data.Map.Strict (Map, toList)
import qualified Data.Ord as DO (Down(..))
......@@ -54,8 +53,8 @@ tficf _ _ = panic $ "[ERR]" <> path <> "Undefined for these contexts"
sortTficf :: Ordering
-> (Map Text (Double, Set Text))
-> [ (Text,(Double, Set Text))]
sortTficf Down = List.sortOn (DO.Down . fst . snd) . toList
sortTficf Up = List.sortOn (fst . snd) . toList
-> Map Text Double
-> [(Text, Double)]
sortTficf Down = List.sortOn (DO.Down . snd) . toList
sortTficf Up = List.sortOn snd . toList
......@@ -24,7 +24,6 @@ import Gargantext.Core.Viz.Graph.Louvain (LouvainNodeId, CommunityId, comId2node
[LouvainNode] -> Map CommunityId LouvainNodeId
[(CommunityId, [LouvainNodeId])]
sort by length LouvainNodeIds
......@@ -39,6 +38,8 @@ subgraph with [LouvainNodeId]
Map NodeId Label
-> map [LouvainNodeId] -> [(CommunityId, take 3 [Label])]
use specGen incExc score to order the labels
take 7 [(CommunityId, take 3 [Label])]
......
......@@ -16,7 +16,7 @@ Ngrams by node enable contextual metrics.
module Gargantext.Database.Action.Metrics.NgramsByNode
where
import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
import Data.Map.Strict (Map, fromListWith, elems, toList)
import Data.Map.Strict.Patch (PatchMap, Replace, diff)
import Data.Set (Set)
import Data.Text (Text)
......@@ -30,7 +30,6 @@ import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
import Gargantext.Database.Prelude (Cmd, runPGSQuery)
import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Metrics.TFICF
import Gargantext.Core.Text.Terms.Mono.Stem (stem)
import qualified Data.List as List
import qualified Data.Map.Strict as Map
......@@ -56,52 +55,6 @@ ngramsGroup l _m _n = Text.intercalate " "
. Text.replace "-" " "
getTficf :: UserCorpusId
-> MasterCorpusId
-> NgramsType
-> (Text -> Text)
-> Cmd err (Map Text (Double, Set Text))
getTficf u m nt f = do
u' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsUser u nt
m' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsMaster u m
pure $ toTficfData (countNodesByNgramsWith f u')
(countNodesByNgramsWith f m')
{-
getTficfWith :: UserCorpusId
-> MasterCorpusId
-> [ListId]
-> NgramsType
-> Map Text (Maybe Text)
-> Cmd err (Map Text (Double, Set Text))
getTficfWith u m ls nt mtxt = do
u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
m' <- getNodesByNgramsMaster u m
let f x = case Map.lookup x mtxt of
Nothing -> x
Just x' -> maybe x identity x'
pure $ toTficfData (countNodesByNgramsWith f u') (countNodesByNgramsWith f m')
-}
type Context = (Double, Map Text (Double, Set Text))
type Supra = Context
type Infra = Context
toTficfData :: Infra
-> Supra
-> Map Text (Double, Set Text)
toTficfData (ti, mi) (ts, ms) =
fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
(TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
, ns
)
)
| (t, (n,ns)) <- toList mi
]
-- | fst is size of Supra Corpus
-- snd is Texts and size of Occurrences (different docs)
......
{-|
Module : Gargantext.Database.Metrics.TFICF
Description : Ngrams by Node user and master
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE QuasiQuotes #-}
module Gargantext.Database.Action.Metrics.TFICF
where
-- import Debug.Trace (trace)
-- import Gargantext.Core (Lang(..))
import Data.Map.Strict (Map, toList, fromList)
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Gargantext.Core.Text.Metrics.TFICF
import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getOccByNgramsOnlyFast)
import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Query.Table.NodeNode (selectCountDocs)
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Prelude
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
getTficf :: UserCorpusId
-> MasterCorpusId
-> NgramsType
-> Cmd err (Map Text Double)
getTficf cId mId nt = do
mapTextDoubleLocal <- Map.filter (> 1)
<$> Map.map (fromIntegral . Set.size)
<$> getNodesByNgramsUser cId nt
mapTextDoubleGlobal <- Map.map fromIntegral
<$> getOccByNgramsOnlyFast mId nt (Map.keys mapTextDoubleLocal)
countLocal <- selectCountDocs cId
countGlobal <- selectCountDocs mId
pure $ fromList [ ( t
, tficf (TficfInfra (Count n )
(Total $ fromIntegral countLocal ))
(TficfSupra (Count $ fromMaybe 0 $ Map.lookup t mapTextDoubleGlobal)
(Total $ fromIntegral countGlobal))
)
| (t, n) <- toList mapTextDoubleLocal
]
......@@ -29,6 +29,7 @@ module Gargantext.Database.Query.Table.NodeNode
, insertNodeNode
, deleteNodeNode
, selectPublicNodes
, selectCountDocs
)
where
......@@ -145,7 +146,20 @@ nodeNodesCategory inputData = map (\(PGS.Only a) -> a)
|]
------------------------------------------------------------------------
-- | TODO use UTCTime fast
selectCountDocs :: CorpusId -> Cmd err Int
selectCountDocs cId = runCountOpaQuery (queryCountDocs cId)
where
queryCountDocs cId' = proc () -> do
(n, nn) <- joinInCorpus -< ()
restrict -< nn^.nn_node1_id .== (toNullable $ pgNodeId cId')
restrict -< nn^.nn_category .>= (toNullable $ pgInt4 1)
restrict -< n^.node_typename .== (pgInt4 $ nodeTypeId NodeDocument)
returnA -< n
-- | TODO use UTCTime fast
selectDocsDates :: CorpusId -> Cmd err [Text]
selectDocsDates cId = map (head' "selectDocsDates" . splitOn "-")
<$> catMaybes
......
......@@ -96,10 +96,6 @@ getOrMk_RootWithCorpus user cName c = do
pure (userId, rootId, corpusId)
mkRoot :: HasNodeError err
=> User
-> Cmd err [RootId]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment