SynchronicClustering.hs 12.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
{-|
Module      : Gargantext.Viz.Phylo.SynchronicClustering
Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}


qlobbe's avatar
qlobbe committed
12 13 14
module Gargantext.Viz.Phylo.SynchronicClustering where

import Gargantext.Prelude
qlobbe's avatar
qlobbe committed
15 16
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
qlobbe's avatar
qlobbe committed
17
import Gargantext.Viz.Phylo.TemporalMatching (weightedLogJaccard', filterDiago, reduceDiagos)
18
import Gargantext.Viz.Phylo.PhyloExport (processDynamics)
qlobbe's avatar
qlobbe committed
19

20 21 22
import Data.List ((++), null, intersect, nub, concat, sort, sortOn, all, groupBy, group, maximum)
import Data.Map  (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member)
import Data.Text (Text)
qlobbe's avatar
qlobbe committed
23 24

import Control.Lens hiding (Level)
qlobbe's avatar
qlobbe committed
25
import Control.Parallel.Strategies (parList, rdeepseq, using)
26
-- import Debug.Trace (trace)
qlobbe's avatar
qlobbe committed
27

qlobbe's avatar
qlobbe committed
28
import qualified Data.Map as Map
29
import qualified Data.Set as Set
qlobbe's avatar
qlobbe committed
30

qlobbe's avatar
qlobbe committed
31 32 33 34 35

-------------------------
-- | New Level Maker | --
-------------------------

36
mergeBranchIds :: [[Int]] -> [Int]
37
mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
38
  where
39
    --  2) find the most Up Left ids in the hierarchy of similarity
40 41 42 43
    -- mostUpLeft :: [[Int]] -> [[Int]]
    -- mostUpLeft ids' = 
    --      let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
    --          inf = (fst . minimum) groupIds
44
    --      in map snd $ filter (\gIds -> fst gIds == inf) groupIds
45
    --  1) find the most frequent ids
46 47 48 49 50 51 52 53 54 55 56
    mostFreq' :: [[Int]] -> [[Int]]
    mostFreq' ids' = 
       let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
           sup = (fst . maximum) groupIds
        in map snd $ filter (\gIds -> fst gIds == sup) groupIds


mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
mergeMeta bId groups = 
  let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups  
   in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]     
57 58 59 60


groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
groupsToBranches' groups =
61
    --  run the related component algorithm
qlobbe's avatar
qlobbe committed
62 63 64 65
    let egos  = map (\g -> [getGroupId g] 
                        ++ (map fst $ g ^. phylo_groupPeriodParents)
                        ++ (map fst $ g ^. phylo_groupPeriodChilds) ) $ elems groups
        graph = relatedComponents egos
66
    --  update each group's branch id
67 68 69
    in map (\ids ->
        let groups' = elems $ restrictKeys groups (Set.fromList ids)
            bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
70
         in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
71

qlobbe's avatar
qlobbe committed
72 73 74

mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
mergeGroups coocs id mapIds childs = 
qlobbe's avatar
qlobbe committed
75
    let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
qlobbe's avatar
qlobbe committed
76 77
    in PhyloGroup (fst $ fst id) (snd $ fst id) (snd id)  ""
                  (sum $ map _phylo_groupSupport childs)  ngrams
78
                  (ngramsToCooc ngrams coocs) 
79 80
                  ((snd $ fst id),bId)
                  (mergeMeta bId childs) [] (map (\g -> (getGroupId g, 1)) childs)
qlobbe's avatar
qlobbe committed
81 82
                  (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
                  (updatePointers $ concat $ map _phylo_groupPeriodChilds  childs)
83 84 85 86 87
    where
        --------------------
        bId :: [Int]
        bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) childs
        --------------------
qlobbe's avatar
qlobbe committed
88 89 90 91 92 93 94 95 96 97
        updatePointers :: [Pointer] -> [Pointer]
        updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers


addPhyloLevel :: Level -> Phylo -> Phylo
addPhyloLevel lvl phylo = 
  over ( phylo_periods .  traverse ) 
       (\phyloPrd -> phyloPrd & phylo_periodLevels 
                        %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl) (PhyloLevel (phyloPrd ^. phylo_periodPeriod) lvl empty))) phylo

qlobbe's avatar
qlobbe committed
98

99
toNextLevel' :: Phylo -> [PhyloGroup] -> Phylo
100
toNextLevel' phylo groups =
qlobbe's avatar
qlobbe committed
101 102
    let curLvl = getLastLevel phylo
        oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
103 104
        newGroups = concat $ groupsToBranches'
                  $ fromList $ map (\g -> (getGroupId g, g))
qlobbe's avatar
qlobbe committed
105
                  $ foldlWithKey (\acc id groups' ->
106
                        --  4) create the parent group
qlobbe's avatar
qlobbe committed
107
                        let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
108
                        in  acc ++ [parent]) []
109
                  --  3) group the current groups by parentId
qlobbe's avatar
qlobbe committed
110
                  $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
111 112

        newPeriods = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) newGroups
qlobbe's avatar
qlobbe committed
113 114
    in  traceSynchronyEnd 
      $ over ( phylo_periods . traverse . phylo_periodLevels . traverse
115
             --  6) update each period at curLvl + 1
qlobbe's avatar
qlobbe committed
116
             . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (curLvl + 1)))
117
             --  7) by adding the parents
qlobbe's avatar
qlobbe committed
118
             (\phyloLvl -> 
119
                if member (phyloLvl ^. phylo_levelPeriod) newPeriods
qlobbe's avatar
qlobbe committed
120
                    then phyloLvl & phylo_levelGroups
121
                            .~ fromList (map (\g -> (getGroupId g, g)) $ newPeriods ! (phyloLvl ^. phylo_levelPeriod))
qlobbe's avatar
qlobbe committed
122
                    else phyloLvl)
123
      --  2) add the curLvl + 1 phyloLevel to the phylo
qlobbe's avatar
qlobbe committed
124
      $ addPhyloLevel (curLvl + 1)
125
      --  1) update the current groups (with level parent pointers) in the phylo
qlobbe's avatar
qlobbe committed
126
      $ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo 
qlobbe's avatar
qlobbe committed
127 128 129 130 131

--------------------
-- | Clustering | --
--------------------

132 133 134 135 136 137 138 139
toPairs :: SynchronyStrategy -> [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
toPairs strategy groups = case strategy of 
  MergeRegularGroups -> pairs
                      $ filter (\g -> all (== 3) $ (g ^. phylo_groupMeta) ! "dynamics") groups
  MergeAllGroups -> pairs groups
  where 
    pairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
    pairs gs = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) (listToCombi' gs)
qlobbe's avatar
qlobbe committed
140

qlobbe's avatar
qlobbe committed
141 142 143 144 145 146 147 148 149 150 151 152 153 154

toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
toDiamonds groups = foldl' (\acc groups' ->
                        acc ++ ( elems
                               $ Map.filter (\v -> length v > 1)
                               $ fromListWith (++)
                               $ foldl' (\acc' g -> 
                                    acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
                  $ elems
                  $ Map.filter (\v -> length v > 1)
                  $ fromListWith (++)
                  $ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents)  ) [] groups


qlobbe's avatar
qlobbe committed
155 156
groupsToEdges :: Proximity -> Synchrony -> Double -> Map Int Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
groupsToEdges prox sync nbDocs diago groups =
qlobbe's avatar
qlobbe committed
157
    case sync of
158 159 160 161 162
        ByProximityThreshold  thr sens _ strat ->
            filter (\(_,w) -> w >= thr)
          $ toEdges sens
          $ toPairs strat groups         
        ByProximityDistribution sens strat -> 
qlobbe's avatar
qlobbe committed
163
            let diamonds = sortOn snd 
164 165
                         $ toEdges sens $ concat
                         $ map (\gs -> toPairs strat gs) $ toDiamonds groups 
qlobbe's avatar
qlobbe committed
166 167 168 169 170
             in take (div (length diamonds) 2) diamonds
    where 
        toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
        toEdges sens edges = 
            case prox of
171 172 173
                WeightedLogJaccard _ -> map (\(g,g') -> 
                                                     ((g,g'), weightedLogJaccard' sens nbDocs diago
                                                                  (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
qlobbe's avatar
qlobbe committed
174 175
                _ -> undefined  

qlobbe's avatar
qlobbe committed
176 177 178


toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
qlobbe's avatar
qlobbe committed
179 180 181 182
toRelatedComponents nodes edges = 
  let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
      clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes)) 
   in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters 
qlobbe's avatar
qlobbe committed
183

qlobbe's avatar
qlobbe committed
184 185 186
toParentId :: PhyloGroup -> PhyloGroupId
toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupLevel + 1), child ^. phylo_groupIndex) 

qlobbe's avatar
qlobbe committed
187

qlobbe's avatar
qlobbe committed
188
reduceGroups :: Proximity -> Synchrony -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
189
reduceGroups prox sync docs diagos branch =
190
    --  1) reduce a branch as a set of periods & groups
qlobbe's avatar
qlobbe committed
191 192 193
    let periods = fromListWith (++)
                 $ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
    in  (concat . concat . elems)
qlobbe's avatar
qlobbe committed
194
      $ mapWithKey (\prd groups -> 
195
            --  2) for each period, transform the groups as a proximity graph filtered by a threshold
qlobbe's avatar
qlobbe committed
196 197
            let diago = reduceDiagos $ filterDiago diagos [prd]
                edges = groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago groups
qlobbe's avatar
qlobbe committed
198
             in map (\comp -> 
199
                    --  4) add to each groups their futur level parent group
qlobbe's avatar
qlobbe committed
200
                    let parentId = toParentId (head' "parentId" comp)
qlobbe's avatar
qlobbe committed
201
                    in  map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId,1)]) ) comp )
202
                -- 3) reduce the graph a a set of related components
203 204 205
              $ toRelatedComponents groups edges) periods 


206 207 208
adjustClustering :: Synchrony -> [[PhyloGroup]] -> [[PhyloGroup]]
adjustClustering sync branches = case sync of
  ByProximityThreshold _ _ scope _ -> case scope of 
209
      SingleBranch -> branches
210 211
      SiblingBranches -> groupBy (\g g' -> (last' "adjustClustering" $ (g  ^. phylo_groupMeta) ! "breaks") 
                                        == (last' "adjustClustering" $ (g' ^. phylo_groupMeta) ! "breaks"))
212 213 214 215
                       $ sortOn _phylo_groupBranchId $ concat branches
      AllBranches -> [concat branches]
  ByProximityDistribution _ _ -> branches

qlobbe's avatar
qlobbe committed
216 217 218


synchronicClustering :: Phylo -> Phylo
qlobbe's avatar
qlobbe committed
219 220 221 222
synchronicClustering phylo =
    let prox = phyloProximity $ getConfig phylo
        sync = phyloSynchrony $ getConfig phylo
        docs = phylo ^. phylo_timeDocs
qlobbe's avatar
qlobbe committed
223 224
        diagos = map coocToDiago $ phylo ^. phylo_timeCooc
        newBranches  = map (\branch -> reduceGroups prox sync docs diagos branch) 
225
                     $ map processDynamics
226
                     $ adjustClustering sync
227 228 229 230
                     $ phyloToLastBranches 
                     $ traceSynchronyStart phylo
        newBranches' = newBranches `using` parList rdeepseq
     in toNextLevel' phylo $ concat newBranches'
qlobbe's avatar
qlobbe committed
231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253


----------------
-- | probes | --
----------------

-- synchronicDistance :: Phylo -> Level -> String
-- synchronicDistance phylo lvl = 
--     foldl' (\acc branch -> 
--              acc <> (foldl' (\acc' period ->
--                               acc' <> let prox  = phyloProximity $ getConfig phylo
--                                           sync  = phyloSynchrony $ getConfig phylo
--                                           docs  = _phylo_timeDocs phylo
--                                           prd   = _phylo_groupPeriod $ head' "distance" period
--                                           edges = groupsToEdges prox 0.1 (_bpt_sensibility sync) 
--                                                   ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
--                                       in foldl' (\mem (_,w) -> 
--                                           mem <> show (prd)
--                                               <> "\t"
--                                               <> show (w)
--                                               <> "\n"
--                                         ) "" edges 
--                      ) ""  $ elems $ groupByField _phylo_groupPeriod branch)
254
--     ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo