[TextFlow] MapList building, score needs normalization

9e3e9e0f · Alexandre Delanoë · 48eb263b · 9e3e9e0f · 9e3e9e0f · 9e3e9e0f
Commit 9e3e9e0f authored Sep 22, 2020 by Alexandre Delanoë
5 changed files
--- a/bin/gargantext-init/Main.hs
+++ b/bin/gargantext-init/Main.hs
@@ -17,7 +17,6 @@ module Main where

 import Data.Text (Text)
 import Data.Either (Either(..))
-import Data.Maybe (Maybe(..))
 import Gargantext.API.Admin.Settings (withDevEnv, runCmdDev)
 import Gargantext.API.Prelude (GargError)
 import Gargantext.API.Node () -- instances only
@@ -29,7 +28,6 @@ import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
 import Gargantext.Database.Admin.Types.Node
 import Gargantext.Database.Admin.Trigger.Init (initFirstTriggers, initLastTriggers)
 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataCorpus)
-import Gargantext.Database.Admin.Types.Node (CorpusId, RootId, ListId)
 import Gargantext.Database.Prelude (Cmd, )
 import Gargantext.Prelude
 import System.Environment (getArgs)

--- a/src/Gargantext/API/Ngrams.hs
+++ b/src/Gargantext/API/Ngrams.hs
@@ -968,9 +968,9 @@ putListNgrams' :: (HasInvalidError err, RepoCmdM env err m)
               -> Map NgramsTerm NgramsRepoElement
               -> m ()
 putListNgrams' nodeId ngramsType ns = do
-  printDebug "[putListNgrams'] nodeId" nodeId
-  printDebug "[putListNgrams'] ngramsType" ngramsType
-  printDebug "[putListNgrams'] ns" ns
+  -- printDebug "[putListNgrams'] nodeId" nodeId
+  -- printDebug "[putListNgrams'] ngramsType" ngramsType
+  -- printDebug "[putListNgrams'] ns" ns

  let p1 = NgramsTablePatch . PM.fromMap $ NgramsReplace Nothing . Just <$> ns
      (p0, p0_validity) = PM.singleton nodeId p1

--- a/src/Gargantext/Core/Text/List.hs
+++ b/src/Gargantext/Core/Text/List.hs
@@ -14,9 +14,9 @@ Portability : POSIX
 module Gargantext.Core.Text.List
  where

-- import Data.Either (partitionEithers, Either(..))
-import Control.Lens (makeLenses, set)
-import Data.Maybe (fromMaybe)
+import Control.Lens (makeLenses)
+import Data.Maybe (fromMaybe, catMaybes)
+import Data.Ord (Down(..))
 import Data.Map (Map)
 import Data.Set (Set)
 import Data.Text (Text)
@@ -29,9 +29,9 @@ import qualified Data.Text as Text
 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
 import Gargantext.Core (Lang(..))
-import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
+import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
 import Gargantext.Database.Admin.Types.Node (NodeId)
-import Gargantext.Core.Text.Metrics (scored')
+import Gargantext.Core.Text.Metrics (scored', Scored(..))
 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
 import Gargantext.Database.Query.Table.Node (defaultList)
@@ -117,12 +117,14 @@ buildNgramsTermsList l n m s uCid mCid = do

 -- Computing global speGen score
  allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
+
  -- printDebug "head candidates" (List.take 10 $ allTerms)
  -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)

  -- First remove stops terms
-  let 
-    (stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms
+  let
+    -- stopTerms ignored for now (need to be tagged already)
+    (_stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms

  -- Grouping the ngrams and keeping the maximum score for label
  let grouped = groupStems'
@@ -134,22 +136,29 @@ buildNgramsTermsList l n m s uCid mCid = do

      (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped

+-- printDebug "groupedMult" groupedMult
  -- splitting monterms and multiterms to take proportional candidates
  let
    listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
-    monoSizeGlobal = 0.6  :: Double
-    multSizeGlobal = 1 - monoSizeGlobal
+    monoSize = 0.4  :: Double
+    multSize = 1 - monoSize

-    splitAt n ns = List.splitAt (round $ n * listSizeGlobal) $ List.sort $ Map.elems ns
+    splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
+ 
+    (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
+    (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult

-    (groupedMonoHead, groupedMonoTail) = splitAt monoSizeGlobal groupedMono
-    (groupedMultHead, groupedMultTail) = splitAt multSizeGlobal groupedMult
+  printDebug "groupedMonoHead" (List.length groupedMonoHead)
+  printDebug "groupedMonoTail" (List.length groupedMonoHead)
+  printDebug "groupedMultHead" (List.length groupedMultHead)
+  printDebug "groupedMultTail" (List.length groupedMultTail)

+  let
    -- Get Local Scores now for selected grouped ngrams
    selectedTerms = Set.toList $ List.foldl'
-                      (\set (GroupedText _ l _ g _ _ _ ) -> Set.union set
-                                                         $ Set.union g
-                                                         $ Set.singleton l
+                      (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
+                                                            $ Set.union g
+                                                            $ Set.singleton l'
                      )
                      Set.empty
                      (groupedMonoHead <> groupedMultHead)
@@ -176,50 +185,84 @@ buildNgramsTermsList l n m s uCid mCid = do
                  $ Map.keys mapTextDocIds

    -- compute cooccurrences
-    mapCooc = Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
+    mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
                           | (t1, s1) <- mapStemNodeIds
                           , (t2, s2) <- mapStemNodeIds
-                           , t1 /= t2 -- Null Diagonal
                           ]
      where
        mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
+  -- printDebug "mapCooc" mapCooc

+  let
    -- computing scores
-    scores = scored' mapCooc
-    -- dilate scores
-
-
-    -- sort / filter
-
-
+    mapScores f = Map.fromList $ map (\(Scored t g s') -> (t, f (g,s'))) $ scored' mapCooc
+
+    groupsWithScores = catMaybes
+                     $ map (\(stem, g)
+                             -> case Map.lookup stem mapScores' of
+                                 Nothing -> Nothing
+                                 Just s'  -> Just $ g { _gt_score = s'}
+                            ) $ Map.toList contextsAdded
+      where
+        mapScores' = mapScores adapt1 -- identity
+        adapt1 (s1,s2) = (log' 5 s1, log' 2 s2)
+        log' n' x     = 1 + (if x <= 0 then 0 else log $ (10^(n'::Int)) * x)
+        -- adapt2 TOCHECK with DC
+  -- printDebug "groupsWithScores" groupsWithScores
  let
-    (mono, multi)          = List.partition (\t -> (size . fst) t < 2) candidateTerms
-    (monoHead , monoTail ) = List.splitAt (round $ 0.60 * listSizeGlobal) mono
-    (multiHead, multiTail) = List.splitAt (round $ 0.40 * listSizeGlobal) multi
-
-
-  -- Computing local speGen score
-    listSizeLocal  = 350  :: Double
-
-    -- Final Step building the Typed list
-    termList = (map (toGargList $ Just StopTerm) stopTerms)
-            <> (map (toGargList $ Just MapTerm)       (monoHead <> multiHead))
-            <> (map (toGargList $ Just CandidateTerm) (monoTail <> multiTail))
+    -- sort / partition / split
+      -- filter mono/multi again
+    (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
+      -- filter with max score
+    partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )

+    (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
+    (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored

+      -- splitAt
+  let
+    listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
+    inclSize = 0.4  :: Double
+    exclSize = 1 - inclSize
+    splitAt' n' = List.splitAt (round $ n' * listSizeLocal)

+    (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
+    (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl

+    (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
+    (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl

-    ngs = List.concat
-        $ map toNgramsElement
-        $ groupStems
-        $ map (\(listType, (t,d)) -> let stem = ngramsGroup l n m t
-                                      in ( stem
-                                         , GroupedText listType t d Set.empty (size t) stem Set.empty
-                                         )
-              ) termList

-  pure $ Map.fromList [(NgramsTerms, ngs)]
+    -- Final Step building the Typed list
+    -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
+    termListHead = 
+             (map (\g -> g { _gt_listType = Just MapTerm} )  (  monoScoredInclHead
+                                                 <> monoScoredExclHead
+                                                 <> multScoredInclHead
+                                                 <> multScoredExclHead
+                                                 )
+               )
+            <> (map (\g -> g { _gt_listType = Just CandidateTerm }) (  monoScoredInclTail
+                                                      <> monoScoredExclTail
+                                                      <> multScoredInclTail
+                                                      <> multScoredExclTail
+                                                      )
+               )
+
+    termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
+
+--  printDebug "monoScoredInclHead" monoScoredInclHead
+--  printDebug "monoScoredExclHead" monoScoredExclTail
+--
+  printDebug "multScoredInclHead" multScoredInclHead
+  printDebug "multScoredExclTail" multScoredExclTail
+
+
+
+  pure $ Map.fromList [(NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
+                                  <> (List.concat $ map toNgramsElement $ termListTail)
+                       )
+                      ]

 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
 groupStems = Map.elems . groupStems'
@@ -239,7 +282,7 @@ groupStems' = Map.fromListWith grouping



-toNgramsElement :: GroupedText Double -> [NgramsElement]
+toNgramsElement :: GroupedText a -> [NgramsElement]
 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
  [parentElem] <> childrenElems
    where
@@ -278,7 +321,8 @@ data GroupedText score =
              , _gt_stem     :: !Stem
              , _gt_nodes    :: !(Set NodeId)
              }
-
+instance Show score => Show (GroupedText score) where
+  show (GroupedText _ l s _ _ _ _) = show l <> ":" <> show s

 instance (Eq a) => Eq (GroupedText a) where
  (==) (GroupedText _ _ score1 _ _ _ _)

--- a/src/Gargantext/Core/Text/Metrics.hs
+++ b/src/Gargantext/Core/Text/Metrics.hs
@@ -18,18 +18,13 @@ module Gargantext.Core.Text.Metrics

 --import Data.Array.Accelerate ((:.)(..), Z(..))
 --import Math.KMeans (kmeans, euclidSq, elements)
-
--import GHC.Float (exp)
-import Data.Tuple.Extra (both)
 import Data.Map (Map)
-import Data.List.Extra (sortOn)
 import Gargantext.Prelude
 import Gargantext.Core.Viz.Graph.Distances.Matrice
 import Gargantext.Core.Viz.Graph.Index
 import Gargantext.Core.Statistics (pcaReduceTo, Dimension(..))
 import qualified Data.Array.Accelerate as DAA
 import qualified Data.Array.Accelerate.Interpreter as DAA
-import qualified Data.List as List
 import qualified Data.Map  as Map

 import qualified Data.Vector.Storable as Vec
@@ -37,14 +32,6 @@ import qualified Data.Vector.Storable as Vec
 type MapListSize = Int
 type InclusionSize = Int

-{-
-toScored' :: Ord t => [Map t (Vec.Vector Double)] -> [Scored t] 
-toScored' = map2scored
-         . (pcaReduceTo (Dimension 2))
-         . (Map.filter (\v -> Vec.length v > 1))
-         . (Map.unionsWith (<>))
-}
-
 scored :: Ord t => Map (t,t) Int -> [Scored t]
 scored = map2scored . (pcaReduceTo (Dimension 2)) . scored2map
  where
@@ -61,6 +48,7 @@ data Scored ts = Scored
  , _scored_speExc :: !SpecificityExclusion
  } deriving (Show)

+
 localMetrics' :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
 localMetrics' m = Map.fromList $ zipWith (\(_,t) (inc,spe) -> (t, Vec.fromList [inc,spe]))
                                         (Map.toList fi)
@@ -85,39 +73,3 @@ scored' m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (Map.toList fi) score
             $ DAA.zip (DAA.use is) (DAA.use ss)


-takeScored :: Ord t => MapListSize -> InclusionSize -> Map (t,t) Int -> ([t],[t])
-takeScored listSize incSize = both (map _scored_terms)
-                            . takeLinear listSize incSize _scored_genInc
-                                                          _scored_speExc
-                            . scored
-
-
-- | Filter Scored data
-- >>> takeLinear 2 3 fst snd $ Prelude.zip ([1..10] :: [Int]) (reverse $ [1..10] :: [Int])
-- [(3,8),(6,5)]
-takeLinear :: (Ord b1, Ord b2)
-            => MapListSize -> InclusionSize
-            -> (a -> b2) -> (a -> b1) -> [a] -> ([a],[a])
-takeLinear mls incSize speGen incExc = (List.splitAt mls)
-                      . List.concat
-                      . map (take $ round
-                                  $ (fromIntegral mls     :: Double)
-                                  / (fromIntegral incSize :: Double)
-                             )
-                      . map (sortOn speGen)
-                      . splitEvery incSize
-                      . take 5000
-                      . takePercent (0.70)
-                      . sortOn incExc
-
-takePercent :: Double -> [a] -> [a]
-takePercent l xs = List.take l' xs
-  where
-    l' = round $ l * (fromIntegral $ List.length xs)
-
-splitTake :: (Int, a -> Bool) -> (Int, a -> Bool) -> [a] -> ([a], [a])
-splitTake (a, af) (b, bf) xs = (mpa <> mpb, ca <> cb)
-  where
-    (mpa, ca) = List.splitAt a $ List.filter af xs
-    (mpb, cb) = List.splitAt b $ List.filter bf xs
-
--- a/src/Gargantext/Database/Action/Metrics.hs
+++ b/src/Gargantext/Database/Action/Metrics.hs
@@ -46,7 +46,7 @@ getNgramsCooc :: (FlowCmdM env err m)
                 )
 getNgramsCooc cId maybeListId tabType maybeLimit = do
  (ngs', ngs) <- getNgrams cId maybeListId tabType
-  
+
  let
    take' Nothing  xs = xs
    take' (Just n) xs = take n xs