[FLOW.GRAPH.METRICS] spec gen inc exc.

020e78de · Alexandre Delanoë · 07305554 · 020e78de · 020e78de · 020e78de
Commit 020e78de authored Mar 12, 2019 by Alexandre Delanoë
5 changed files
--- a/src/Gargantext/API/Node.hs
+++ b/src/Gargantext/API/Node.hs
@@ -282,8 +282,8 @@ graphAPI nId = do
                                     ]
                         -- (map (\n -> LegendField n "#FFFFFF" (pack $ show n)) [1..10])
  let cId = maybe (panic "no parentId") identity $ _node_parentId nodeGraph
-  lId <- defaultList cId
+  lId <- defaultList cId
  ngs    <- filterListWithRoot GraphTerm <$> mapTermListRoot [lId] NgramsTerms
  myCooc <- Map.filter (>1) <$> getCoocByNgrams
@@ -296,7 +296,7 @@ graphAPI nId = do
 instance HasNodeError ServantErr where
  _NodeError = prism' mk (const Nothing) -- $ panic "HasNodeError ServantErr: not a prism")
    where
-      e = "NodeError: "
+      e = "Gargantext.NodeError: "
      mk NoListFound   = err404 { errBody = e <> "No list found"         }
      mk NoRootFound   = err404 { errBody = e <> "No Root found"         }
      mk NoCorpusFound = err404 { errBody = e <> "No Corpus found"       }
@@ -333,18 +333,20 @@ rename nId (RenameNode name') = U.update (U.Rename nId name')
 getTable :: NodeId -> Maybe TabType
         -> Maybe Offset  -> Maybe Limit
         -> Maybe OrderBy -> Cmd err [FacetDoc]
-getTable cId ft o l order = case ft of
+getTable cId ft o l order =
-                                (Just Docs)  -> runViewDocuments cId False o l order
+  case ft of
-                                (Just Trash) -> runViewDocuments cId True  o l order
+    (Just Docs)  -> runViewDocuments cId False o l order
-                                _     -> panic "not implemented"
+    (Just Trash) -> runViewDocuments cId True  o l order
+    _     -> panic "not implemented"
 getPairing :: ContactId -> Maybe TabType
         -> Maybe Offset  -> Maybe Limit
         -> Maybe OrderBy -> Cmd err [FacetDoc]
-getPairing cId ft o l order = case ft of
+getPairing cId ft o l order =
-                                (Just Docs)  -> runViewAuthorsDoc cId False o l order
+  case ft of
-                                (Just Trash) -> runViewAuthorsDoc cId True  o l order
+    (Just Docs)  -> runViewAuthorsDoc cId False o l order
-                                _     -> panic "not implemented"
+    (Just Trash) -> runViewAuthorsDoc cId True  o l order
+    _     -> panic "not implemented"
 getChart :: NodeId -> Maybe UTCTime -> Maybe UTCTime

--- a/src/Gargantext/Database/Flow.hs
+++ b/src/Gargantext/Database/Flow.hs
@@ -83,6 +83,7 @@ flowCorpus u cn ff fp = do
  ids <- flowCorpusMaster ff fp
  flowCorpusUser u cn ids
+-- TODO query with complex query
 flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
          => Username -> Text -> m CorpusId
 flowCorpusSearchInDatabase u q = do
@@ -112,11 +113,7 @@ flowCorpusUser userName corpusName ids = do
  -- User List Flow
  (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
-  -- /!\ this extract NgramsTerms Only
+  ngs         <- buildNgramsLists userCorpusId masterCorpusId
-  ngs <- buildNgramsLists userCorpusId masterCorpusId
-  --printDebug "ngs" ngs
-  --TODO getNgramsElement of NgramsType...
-  --ngs <- getNgramsElementsWithParentNodeId masterCorpusId
  userListId  <- flowList userId userCorpusId ngs
  printDebug "userListId" userListId
@@ -124,11 +121,10 @@ flowCorpusUser userName corpusName ids = do
  _ <- mkGraph     userCorpusId userId
  -- User Dashboard Flow
-  _ <- mkDashboard userCorpusId userId
+  -- _ <- mkDashboard userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire  rootUserId userId
  pure userCorpusId
@@ -142,7 +138,7 @@ insertMasterDocs hs  =  do
  ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
  let documentsWithId = mergeData (toInserted ids) (toInsert hs)
-  docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
+  docsWithNgrams     <- documentIdWithNgrams extractNgramsT documentsWithId
  let maps            = mapNodeIdNgrams docsWithNgrams

--- a/src/Gargantext/Database/Metrics/NgramsByNode.hs
+++ b/src/Gargantext/Database/Metrics/NgramsByNode.hs
@@ -24,13 +24,12 @@ import Data.Set (Set)
 import Data.Text (Text)
 import Data.Tuple.Extra (second, swap)
 import Database.PostgreSQL.Simple.SqlQQ (sql)
+import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
 import Gargantext.Core (Lang(..))
 import Gargantext.Database.Config (nodeTypeId)
 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
 import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
-import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
 import Gargantext.Database.Utils (Cmd, runPGSQuery)
 import Gargantext.Prelude
 import Gargantext.Text.Metrics.TFICF
 import Gargantext.Text.Terms.Mono.Stem (stem)
@@ -54,7 +53,7 @@ ngramsGroup l n = Text.intercalate " "
 sortTficf :: (Map Text (Double, Set Text))
-          -> [(Text, (Double, Set Text))]
+          -> [   (Text,(Double, Set Text))]
 sortTficf  = List.sortOn (fst . snd) . toList
@@ -69,8 +68,8 @@ getTficf' u m f = do
 type Context = (Double, Map Text (Double, Set Text))
-type Supra = Context
+type Supra   = Context
-type Infra = Context
+type Infra   = Context
 toTficfData :: Infra -> Supra
            -> Map Text (Double, Set Text)
@@ -105,17 +104,20 @@ groupNodesByNgramsWith f m =
    $ toList m
 ------------------------------------------------------------------------
-getNodesByNgramsUser :: CorpusId -> NgramsType -> Cmd err (Map Text (Set NodeId))
+getNodesByNgramsUser :: CorpusId -> NgramsType
-getNodesByNgramsUser cId nt = fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
+                     -> Cmd err (Map Text (Set NodeId))
-                                         <$> selectNgramsByNodeUser cId nt
+getNodesByNgramsUser cId nt =
+  fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
-selectNgramsByNodeUser :: CorpusId -> NgramsType -> Cmd err [(NodeId, Text)]
+                    <$> selectNgramsByNodeUser cId nt
-selectNgramsByNodeUser cId nt = runPGSQuery
-                               queryNgramsByNodeUser
+selectNgramsByNodeUser :: CorpusId -> NgramsType
-                                 ( cId
+                       -> Cmd err [(NodeId, Text)]
-                                 , nodeTypeId NodeDocument
+selectNgramsByNodeUser cId nt =
-                                 , ngramsTypeId nt
+  runPGSQuery queryNgramsByNodeUser
-                                 )
+              ( cId
+              , nodeTypeId NodeDocument
+              , ngramsTypeId nt
+              )
 queryNgramsByNodeUser :: DPS.Query
 queryNgramsByNodeUser = [sql|
@@ -137,13 +139,17 @@ getOccByNgramsOnly :: CorpusId -> NgramsType -> [Text]
 getOccByNgramsOnly cId nt ngs = Map.map Set.size
                             <$> getNodesByNgramsOnlyUser cId nt ngs
-getNodesByNgramsOnlyUser :: CorpusId -> NgramsType -> [Text] -> Cmd err (Map Text (Set NodeId))
+getNodesByNgramsOnlyUser :: CorpusId -> NgramsType -> [Text]
-getNodesByNgramsOnlyUser cId nt ngs = fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
+                         -> Cmd err (Map Text (Set NodeId))
-                                         <$> selectNgramsOnlyByNodeUser cId nt ngs
+getNodesByNgramsOnlyUser cId nt ngs =
+  fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
+                    <$> selectNgramsOnlyByNodeUser cId nt ngs
-selectNgramsOnlyByNodeUser :: CorpusId -> NgramsType -> [Text] -> Cmd err [(NodeId, Text)]
+selectNgramsOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
-selectNgramsOnlyByNodeUser cId nt tms = runPGSQuery queryNgramsOnlyByNodeUser (DPS.Only $ Values fields tms' )
+                           -> Cmd err [(NodeId, Text)]
+selectNgramsOnlyByNodeUser cId nt tms =
+  runPGSQuery queryNgramsOnlyByNodeUser (DPS.Only $ Values fields tms' )
    where
      fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4", "int4", "int4"]
      tms'   = map (\t -> (t,cId,nodeTypeId NodeDocument, ngramsTypeId nt)) tms

--- a/src/Gargantext/Text/List.hs
+++ b/src/Gargantext/Text/List.hs
 {-|
 Module      : Gargantext.Text.Ngrams.Lists
-Description : 
+Description : Tools to build lists
 Copyright   : (c) CNRS, 2017-Present
 License     : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX
-Here is a longer description of this module, containing some
-commentary with @some markup@.
 -}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE RankNTypes        #-}
 module Gargantext.Text.List
  where
-import Data.Set (Set)
 import Data.Map (Map)
+import Data.Set (Set)
 import Data.Text (Text)
 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
 import Gargantext.Core (Lang(..))
@@ -30,11 +26,11 @@ import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGro
 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
 import Gargantext.Database.Utils (Cmd)
 import Gargantext.Prelude
+import qualified Data.Char as Char
+import qualified Data.List as List
 import qualified Data.Map as Map
 import qualified Data.Set as Set
 import qualified Data.Text as Text
-import qualified Data.List as List
-import qualified Data.Char as Char
 -- | TODO improve grouping functions of Authors, Sources, Institutes..
 buildNgramsLists :: UserCorpusId -> MasterCorpusId
@@ -56,20 +52,19 @@ buildNgramsOthersList uCid groupIt nt = do
                        )
                      ]
+-- TODO remove hard coded parameters
 buildNgramsTermsList :: UserCorpusId -> MasterCorpusId
-                -> Cmd err (Map NgramsType [NgramsElement])
+                     -> Cmd err (Map NgramsType [NgramsElement])
 buildNgramsTermsList uCid mCid = do
-            candidates   <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 2)
+  candidates   <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 2)
-            --printDebug "candidate" (length candidates)
+  --printDebug "candidate" (length candidates)
-            let termList = toTermList (isStopTerm . fst) candidates
+  let termList = toTermList (isStopTerm . fst) candidates
-            --printDebug "termlist" (length termList)
+  --printDebug "termlist" (length termList)
-            let ngs = List.concat $ map toNgramsElement termList
+  let ngs = List.concat $ map toNgramsElement termList
-            pure $ Map.fromList [(NgramsTerms, ngs)]
+  pure $ Map.fromList [(NgramsTerms, ngs)]
 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
@@ -87,10 +82,7 @@ toNgramsElement (listType, (_stem, (_score, setNgrams))) =
                                                   (mSetFromList [])
                            ) children
+-- TODO remove hard coded parameters
 toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
 toTermList stop ns =  map (toTermList' stop CandidateTerm) xs
                   <> map (toTermList' stop GraphTerm)     ys
@@ -106,8 +98,8 @@ toTermList stop ns =  map (toTermList' stop CandidateTerm) xs
      ys = take b $ drop a ns
      zs = drop b $ drop a ns
-      a = 1
+      a = 10
-      b = 10000
+      b = 3000
 isStopTerm :: Text -> Bool
 isStopTerm x = Text.length x < 3
@@ -118,4 +110,3 @@ isStopTerm x = Text.length x < 3
                       . Text.replace "/" ""
                       ) x
--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -22,7 +22,7 @@ module Gargantext.Text.Metrics
 --import Debug.Trace (trace)
 --import Math.KMeans (kmeans, euclidSq, elements)
 import Data.Map (Map)
-import Data.Ord (Down(..))
+import Data.List.Extra (sortOn)
 import GHC.Real (round)
 import Gargantext.Prelude
 import Gargantext.Viz.Graph.Distances.Matrice
@@ -32,65 +32,14 @@ import qualified Data.Array.Accelerate.Interpreter as DAA
 import qualified Data.List as L
 import qualified Data.Map  as M
-data MapListSize   = MapListSize   Int
+type GraphListSize = Int
-data InclusionSize = InclusionSize Int
+type InclusionSize = Int
-data SampleBins    = SampleBins    Double
-data Clusters      = Clusters      Int
-data DefaultValue  = DefaultValue  Int
-data FilterConfig = FilterConfig
-  { fc_mapListSize   :: MapListSize
-  , fc_inclusionSize :: InclusionSize
-  , fc_sampleBins    :: SampleBins
-  , fc_clusters      :: Clusters
-  , fc_defaultValue  :: DefaultValue
-  }
-filterCooc :: (Show t, Ord t)
-           => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
-filterCooc fc cc = (filterCooc' fc) ts cc
-  where
-    ts     = map _scored_terms $ takeSome fc $ coocScored cc
-filterCooc' :: (Show t, Ord t)
-            => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
-filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m =
-  -- trace ("coocScored " <> show ts) $
-  foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
-    M.empty (listToCombi identity ts)
-- | Map list creation
-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
-- each parts is then ordered by Inclusion/Exclusion
-- take n scored terms in each parts where n * SampleBins = MapListSize.
-takeSome :: Ord t
-         => FilterConfig -> [Scored t] -> [Scored t]
-takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
-                    $ takeSample n m
-                    $ L.take l'
-                    $ reverse $ sortWith (Down . _scored_incExc) scores
-                    -- splitKmeans k scores
-  where
-    -- TODO: benchmark with accelerate-example kmeans version
-    --splitKmeans x xs = L.concat $ map elements
-    --                 $ V.take (k-1)
-    --                 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
-    --                          euclidSq x xs
-    n = round ((fromIntegral l)/s)
-    m = round $ (fromIntegral $ length scores) / (s)
-    takeSample n' m' xs = -- trace ("splitKmeans " <> show (length xs)) $
-                        L.concat $ map (L.take n')
-                                 $ map (sortWith (Down . _scored_incExc))
-                                 -- TODO use kmeans s instead of splitEvery
-                                 -- in order to split in s heteregenous parts
-                                 -- without homogeneous order hypothesis
-                                 $ splitEvery m'
-                                 $ sortWith (Down . _scored_speGen) xs
+takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> [t]
+takeScored listSize incSize = map _scored_terms
+                            . linearTakes listSize incSize _scored_speGen
+                                                           _scored_incExc
+                            . scored
 data Scored ts = Scored
  { _scored_terms  :: !ts
@@ -98,11 +47,45 @@ data Scored ts = Scored
  , _scored_speGen :: !SpecificityGenericity
  } deriving (Show)
 -- TODO in the textflow we end up needing these indices, it might be better
 -- to compute them earlier and pass them around.
-coocScored :: Ord t => Map (t,t) Int -> [Scored t]
+scored :: Ord t => Map (t,t) Int -> [Scored t]
-coocScored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
+scored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
  where
    (ti, fi) = createIndices m
    (is, ss) = incExcSpeGen $ cooc2mat ti m
-    scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
+    scores   = DAA.toList
+             $ DAA.run
+             $ DAA.zip (DAA.use is) (DAA.use ss)
+-- | Filter Scored data
+-- >>> linearTakes 2 3 fst snd $ Prelude.zip ([1..10] :: [Int]) (reverse $ [1..10] :: [Int])
+-- [(3,8),(6,5)]
+linearTakes :: (Ord b1, Ord b2)
+            => GraphListSize -> InclusionSize -> (a -> b2) -> (a -> b1) -> [a] -> [a]
+linearTakes gls incSize speGen incExc = take gls
+                      . L.concat
+                      . map (take $ round
+                                  $ (fromIntegral gls     :: Double)
+                                  / (fromIntegral incSize :: Double)
+                             )
+                      . map (sortOn incExc)
+                      . splitEvery incSize
+                      . sortOn speGen
+-- | Filters
+{- splitKmeans k scores
+TODO: benchmark with accelerate-example kmeans version
+splitKmeans x xs = L.concat $ map elements
+                 $ V.take (k-1)
+                 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
+                          euclidSq x xs
+-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
+-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
+-- each parts is then ordered by Inclusion/Exclusion
+-- take n scored terms in each parts where n * SampleBins = MapListSize.
+-}