diff --git a/src/Gargantext/API/Ngrams/Tools.hs b/src/Gargantext/API/Ngrams/Tools.hs
index 8c552898d859e23ade8610e46fcd09dc3fc351bf..081011f006967c84ce9e2f8f805343015cbfd4f1 100644
--- a/src/Gargantext/API/Ngrams/Tools.hs
+++ b/src/Gargantext/API/Ngrams/Tools.hs
@@ -79,7 +79,7 @@ groupNodesByNgrams syn occs = Map.fromListWith (<>) occs'
   where
     occs' = map toSyn (Map.toList occs)
     toSyn (t,ns) = case Map.lookup t syn of
-      Nothing -> panic $ "Garg.API.Ngrams.Tools: groupNodesByNgrams, unknown key: " <> t
+      Nothing -> panic $ "[Garg.API.Ngrams.Tools.groupNodesByNgrams] unknown key: " <> t
       Just r  -> case r of
         Nothing -> (t, ns)
         Just r' -> (r',ns)
diff --git a/src/Gargantext/API/Node.hs b/src/Gargantext/API/Node.hs
index 1036447ddfa5ae2cb2c2cd0d6dc1dad9edf481f4..8fc70622d89b36d4214c145fdd287b91ecbb6918 100644
--- a/src/Gargantext/API/Node.hs
+++ b/src/Gargantext/API/Node.hs
@@ -47,7 +47,7 @@ import Gargantext.API.Search ( SearchAPI, searchIn, SearchInQuery)
 import Gargantext.Core.Types (Offset, Limit, ListType(..))
 import Gargantext.Core.Types.Main (Tree, NodeTree)
 import Gargantext.Database.Facet (FacetDoc , runViewDocuments, OrderBy(..),FacetChart,runViewAuthorsDoc)
-import Gargantext.Database.Metrics (getMetrics')
+import qualified Gargantext.Database.Metrics as Metrics
 import Gargantext.Database.Metrics.NgramsByNode (getNodesByNgramsOnlyUser)
 import Gargantext.Database.Node.Children (getChildren)
 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
@@ -394,12 +394,12 @@ type MetricsAPI = Summary "SepGen IncExc metrics"

 getMetrics :: NodeId -> GargServer MetricsAPI
 getMetrics cId maybeListId tabType maybeLimit = do
-  (ngs', scores) <- getMetrics' cId maybeListId tabType maybeLimit
+  (ngs', scores) <- Metrics.getMetrics' cId maybeListId tabType maybeLimit
   let
-    metrics      = map (\(Scored t s1 s2) -> Metric t s1 s2 (listType t ngs')) scores
-    errorMsg     = "API.Node.metrics: key absent"
+    metrics      = map (\(Scored t s1 s2) -> Metric t s1 s2 (listType t ngs')) scores
     listType t m = maybe (panic errorMsg) fst $ Map.lookup t m
+    errorMsg     = "API.Node.metrics: key absent"

   pure $ Metrics metrics
diff --git a/src/Gargantext/Database/Metrics.hs b/src/Gargantext/Database/Metrics.hs
index 326ff311782ef563d2e9fd08b4329ea662f8c236..0f3b59bd729ca878c8000194b862bd18ff10bc75 100644
--- a/src/Gargantext/Database/Metrics.hs
+++ b/src/Gargantext/Database/Metrics.hs
@@ -11,6 +11,7 @@ Node API

 -}

+{-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE FlexibleContexts  #-}

@@ -20,31 +21,62 @@ module Gargantext.Database.Metrics

 import Data.Map (Map)
 import Data.Text (Text)
 import Gargantext.API.Ngrams (TabType(..), ngramsTypeFromTabType)
-import Gargantext.API.Ngrams.Tools
+import Gargantext.API.Ngrams.Tools (filterListWithRoot, groupNodesByNgrams, Diagonal(..), getCoocByNgrams, mapTermListRoot, RootTerm)
 import Gargantext.Core.Types (ListType(..), Limit)
 import Gargantext.Database.Flow (FlowCmdM)
-import Gargantext.Database.Metrics.NgramsByNode (getNodesByNgramsOnlyUser)
+import Gargantext.Database.Metrics.NgramsByNode (getNodesByNgramsOnlyUser, getTficfWith)
 import Gargantext.Database.Schema.Node (defaultList)
 import Gargantext.Database.Types.Node (ListId, CorpusId)
+import Gargantext.Database.Flow (getOrMkRootWithCorpus)
+import Gargantext.Database.Config (userMaster)
 import Gargantext.Prelude
-import Gargantext.Text.Metrics (scored, Scored(..))
+import Gargantext.Text.Metrics (scored, Scored(..), localMetrics, toScored)
 import Servant (ServantErr)
-import qualified Data.Map as Map
+import qualified Data.Map as Map
+import qualified Data.Vector.Storable as Vec
+

 getMetrics' :: FlowCmdM env ServantErr m
             => CorpusId -> Maybe ListId -> TabType -> Maybe Limit
             -> m (Map Text (ListType, Maybe Text), [Scored Text])
 getMetrics' cId maybeListId tabType maybeLimit = do
+  (ngs, _, myCooc) <- getNgramsCooc cId maybeListId tabType maybeLimit
+  pure (ngs, scored myCooc)

-  lId <- case maybeListId of
-    Nothing   -> defaultList cId
-    Just lId' -> pure lId'

-  let ngramsType = ngramsTypeFromTabType tabType
+getMetrics :: FlowCmdM env ServantErr m
+           => CorpusId -> Maybe ListId -> TabType -> Maybe Limit
+           -> m (Map Text (ListType, Maybe Text), [Scored Text])
+getMetrics cId maybeListId tabType maybeLimit = do
+  (ngs, ngs', metrics) <- getLocalMetrics cId maybeListId tabType maybeLimit
+
+  (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
+
+  metrics' <- getTficfWith cId masterCorpusId (ngramsTypeFromTabType tabType) ngs'
+
+  pure (ngs , toScored [metrics, Map.fromList $ map (\(a,b) -> (a, Vec.fromList [fst b])) $ Map.toList metrics'])

-  ngs' <- mapTermListRoot [lId] ngramsType
-  let ngs = Map.unions $ map (\t -> filterListWithRoot t ngs')
-                             [GraphTerm, StopTerm, CandidateTerm]
+
+getLocalMetrics :: (FlowCmdM env ServantErr m)
+                => CorpusId -> Maybe ListId -> TabType -> Maybe Limit
+                -> m ( Map Text (ListType, Maybe Text)
+                     , Map Text (Maybe RootTerm)
+                     , Map Text (Vec.Vector Double)
+                     )
+getLocalMetrics cId maybeListId tabType maybeLimit = do
+  (ngs, ngs', myCooc) <- getNgramsCooc cId maybeListId tabType maybeLimit
+  pure (ngs, ngs', localMetrics myCooc)
+
+
+
+getNgramsCooc :: (FlowCmdM env ServantErr m)
+              => CorpusId -> Maybe ListId -> TabType -> Maybe Limit
+              -> m ( Map Text (ListType, Maybe Text)
+                   , Map Text (Maybe RootTerm)
+                   , Map (Text, Text) Int
+                   )
+getNgramsCooc cId maybeListId tabType maybeLimit = do
+  (ngs', ngs) <- getNgrams cId maybeListId tabType

   let
     take' Nothing  xs = xs
@@ -52,7 +84,22 @@ getMetrics' cId maybeListId tabType maybeLimit = do

   myCooc <- Map.filter (>1)
          <$> getCoocByNgrams (Diagonal True)
         <$> groupNodesByNgrams ngs
-        <$> getNodesByNgramsOnlyUser cId ngramsType (take' maybeLimit $ Map.keys ngs)
+        <$> getNodesByNgramsOnlyUser cId (ngramsTypeFromTabType tabType)
+                                         (take' maybeLimit $ Map.keys ngs)
+  pure $ (ngs', ngs, myCooc)
+
+

-  pure $ (ngs', scored myCooc)
+getNgrams :: (FlowCmdM env ServantErr m)
+          => CorpusId -> Maybe ListId -> TabType
+          -> m (Map Text (ListType, Maybe Text), Map Text (Maybe RootTerm))
+getNgrams cId maybeListId tabType = do
+  lId <- case maybeListId of
+    Nothing   -> defaultList cId
+    Just lId' -> pure lId'
+
+  lists <- mapTermListRoot [lId] (ngramsTypeFromTabType tabType)
+  let maybeSyn = Map.unions $ map (\t -> filterListWithRoot t lists)
+                     [GraphTerm, StopTerm, CandidateTerm]
+  pure (lists, maybeSyn)
diff --git a/src/Gargantext/Database/Metrics/NgramsByNode.hs b/src/Gargantext/Database/Metrics/NgramsByNode.hs
index df8dff9e8da6b1d11dd0e5cdf54f174310af6472..c3ef393a20f06209ff580f728167d045588bf2a7 100644
--- a/src/Gargantext/Database/Metrics/NgramsByNode.hs
+++ b/src/Gargantext/Database/Metrics/NgramsByNode.hs
@@ -59,15 +59,31 @@ sortTficf :: (Map Text (Double, Set Text))
 sortTficf = List.sortOn (fst . snd) . toList

-getTficf' :: UserCorpusId -> MasterCorpusId -> (Text -> Text)
+getTficf' :: UserCorpusId -> MasterCorpusId -> NgramsType -> (Text -> Text)
           -> Cmd err (Map Text (Double, Set Text))
-getTficf' u m f = do
-  u' <- getNodesByNgramsUser   u NgramsTerms
+getTficf' u m nt f = do
+  u' <- getNodesByNgramsUser   u nt
   m' <- getNodesByNgramsMaster u m

   pure $ toTficfData (countNodesByNgramsWith f u')
                      (countNodesByNgramsWith f m')

+--{-
+getTficfWith :: UserCorpusId -> MasterCorpusId
+             -> NgramsType -> Map Text (Maybe Text)
+             -> Cmd err (Map Text (Double, Set Text))
+getTficfWith u m nt mtxt = do
+  u' <- getNodesByNgramsOnlyUser u nt (Map.keys mtxt)
+  m' <- getNodesByNgramsMaster   u m
+
+  let f x = case Map.lookup x mtxt of
+        Nothing -> x
+        Just x' -> maybe x identity x'
+
+  pure $ toTficfData (countNodesByNgramsWith f u')
+                     (countNodesByNgramsWith f m')
+--}
+
 type Context = (Double, Map Text (Double, Set Text))
 type Supra   = Context
@@ -269,7 +285,3 @@
 SELECT m.node_id, m.terms FROM nodesByNgramsMaster m
 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
   |]
-
-
-
-
diff --git a/src/Gargantext/Text/List.hs b/src/Gargantext/Text/List.hs
index 45a296d575cec6cdedbdaa69fe25ababe4a8455f..aed97b8b63b3c69c00cf0d97a03edbcfb64a61c5 100644
--- a/src/Gargantext/Text/List.hs
+++ b/src/Gargantext/Text/List.hs
@@ -57,7 +57,7 @@ buildNgramsOthersList uCid groupIt nt = do
 buildNgramsTermsList :: Lang -> Int -> Int -> UserCorpusId -> MasterCorpusId
                      -> Cmd err (Map NgramsType [NgramsElement])
 buildNgramsTermsList l n m uCid mCid = do
-  candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup l n m)
+  candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
   --printDebug "candidate" (length candidates)

   let termList = toTermList (isStopTerm . fst) candidates
diff --git a/src/Gargantext/Text/Metrics.hs b/src/Gargantext/Text/Metrics.hs
index 31137bbc1cfa9968386af5d960eca6495be243b3..304026da4411811701cb9d2a973371fef3e2a3e4 100644
--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -42,27 +42,37 @@ type GraphListSize = Int
 type InclusionSize = Int

-takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> [t]
-takeScored listSize incSize = map _scored_terms
-                            . linearTakes listSize incSize _scored_speGen
-                                                           _scored_incExc
-                            . scored
+
+toScored :: Ord t => [Map t (Vec.Vector Double)] -> [Scored t]
+toScored = map2scored
+         . (reduceTo (Dimension 2))
+         . (Map.filter (\v -> Vec.length v > 1))
+         . (Map.unionsWith (<>))

 scored :: Ord t => Map (t,t) Int -> [Scored t]
-scored = map2scored . (reduceDim 2) . scored2map
+scored = map2scored . (reduceTo (Dimension 2)) . scored2map

+scored2map :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
+scored2map m = Map.fromList $ map (\(Scored t i s) -> (t, Vec.fromList [i,s])) $ scored' m

+map2scored :: Ord t => Map t (Vec.Vector Double) -> [Scored t]
+map2scored = map (\(t, ds) -> Scored t (Vec.head ds) (Vec.last ds)) . Map.toList
+
+-- TODO change type with (x,y)
 data Scored ts = Scored
   { _scored_terms  :: !ts
   , _scored_incExc :: !InclusionExclusion
   , _scored_speGen :: !SpecificityGenericity
   } deriving (Show)

+data Dimension = Dimension Int

-reduceDim :: Ord t => Int -> Map t (Vec.Vector Double)
-                   -> Map t (Vec.Vector Double)
-reduceDim d ss = Map.fromList $ zip txts $ elems $ pcaReduceN ss'' d
+reduceTo :: Ord t
+         => Dimension
+         -> Map t (Vec.Vector Double)
+         -> Map t (Vec.Vector Double)
+reduceTo (Dimension d) ss = Map.fromList $ zip txts $ elems $ pcaReduceN ss'' d
   where
     ss'' :: Array Int (Vec.Vector Double)
     ss'' = listArray (1, List.length ss') ss'
@@ -70,12 +80,21 @@ reduceDim d ss = Map.fromList $ zip txts $ elems $ pcaReduceN ss'' d

     (txts,ss') = List.unzip $ Map.toList ss

-scored2map :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
-scored2map m = Map.fromList $ map (\(Scored t i s) -> (t, Vec.fromList [i,s])) $ scored' m
+localMetrics :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
+localMetrics m = Map.fromList $ zipWith (\(_,t) (inc,spe) -> (t, Vec.fromList [inc,spe]))
+                                        (Map.toList fi)
+                                        scores
+  where
+    (ti, fi) = createIndices m
+    (is, ss) = incExcSpeGen $ cooc2mat ti m
+    scores   = DAA.toList
+             $ DAA.run
+             $ DAA.zip (DAA.use is) (DAA.use ss)
+

-map2scored :: Ord t => Map t (Vec.Vector Double) -> [Scored t]
-map2scored = map (\(t, ds) -> Scored t (Vec.head ds) (Vec.last ds)) . Map.toList
+
+
+-- TODO Code to be remove below
 -- TODO in the textflow we end up needing these indices , it might be
 -- better to compute them earlier and pass them around.
 scored' :: Ord t => Map (t,t) Int -> [Scored t]
@@ -87,6 +106,18 @@ scored' m = zipWith (\(_,t) (inc,spe) -> Scored t (inc) (spe)) (Map.toList fi) s
         $ DAA.run
         $ DAA.zip (DAA.use is) (DAA.use ss)

+
+
+
+
+
+takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> [t]
+takeScored listSize incSize = map _scored_terms
+                            . linearTakes listSize incSize _scored_speGen
+                                                           _scored_incExc
+                            . scored
+
+
 -- | Filter Scored data
 -- >>> linearTakes 2 3 fst snd $ Prelude.zip ([1..10] :: [Int]) (reverse $ [1..10] :: [Int])
 -- [(3,8),(6,5)]
@@ -104,17 +135,3 @@ linearTakes gls incSize speGen incExc = take gls
                             . sortOn speGen


--- | Filters
-{- splitKmeans k scores
-TODO: benchmark with accelerate-example kmeans version
-splitKmeans x xs = L.concat $ map elements
-                 $ V.take (k-1)
-                 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
-                          euclidSq x xs
-
--- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
--- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
--- each parts is then ordered by Inclusion/Exclusion
--- take n scored terms in each parts where n * SampleBins = MapListSize.
--}
-
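A minimal usage sketch (not part of the patch above; term names and numbers are made up, and it assumes toScored and Scored(..) are exported by Gargantext.Text.Metrics): the new toScored concatenates per-term metric vectors across maps with Map.unionsWith (<>), PCA-reduces them to two dimensions, and reads them back as Scored values. This mirrors what the new Gargantext.Database.Metrics.getMetrics does when it appends a one-dimensional tficf score to the local metric vectors.

import qualified Data.Map             as Map
import qualified Data.Vector.Storable as Vec

import Gargantext.Text.Metrics (Scored(..), toScored)

-- Merge two per-term metric maps and let toScored reduce them to
-- (inclusion/exclusion, specificity/genericity) scores.
example :: [Scored String]
example = toScored [localScores, extraScore]
  where
    -- e.g. the (incExc, speGen) vectors produced by localMetrics
    localScores = Map.fromList [ ("graph",  Vec.fromList [0.8, 0.1])
                               , ("corpus", Vec.fromList [0.2, 0.9])
                               , ("node",   Vec.fromList [0.5, 0.5]) ]
    -- e.g. one extra tficf dimension per term, as added by getMetrics
    extraScore  = Map.fromList [ ("graph",  Vec.fromList [1.3])
                               , ("corpus", Vec.fromList [0.4])
                               , ("node",   Vec.fromList [0.7]) ]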