PhyloMaker.hs 17.6 KB
Newer Older
1
{-|
2
Module      : Gargantext.Core.Viz.Phylo.PhyloMaker
3 4 5 6 7 8 9 10
Description : Maker engine for rebuilding a Phylo
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}

11
module Gargantext.Core.Viz.Phylo.PhyloMaker where
qlobbe's avatar
qlobbe committed
12

13 14 15
import Control.DeepSeq (NFData)
import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parList, rdeepseq, using)
qlobbe's avatar
qlobbe committed
16 17
import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
qlobbe's avatar
qlobbe committed
18
import Data.Text (Text)
19 20
import Data.Vector (Vector)
import Debug.Trace (trace)
qlobbe's avatar
qlobbe committed
21

22 23
import Gargantext.Core.Methods.Distances (Distance(Conditional))
import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
24
import Gargantext.Core.Text.Context (TermList)
qlobbe's avatar
qlobbe committed
25
import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
26
import Gargantext.Core.Viz.Phylo
27
import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
28 29 30 31
import Gargantext.Core.Viz.Phylo.PhyloTools
import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
import Gargantext.Prelude
32 33

import qualified Data.Set as Set
34
import qualified Data.Vector as Vector
35 36 37 38 39

------------------
-- | To Phylo | --
------------------

40 41 42 43 44 45
{-
-- TODO AD
data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
            | PhyloN    { _phylo'_phylo1    :: Phylo}


46
toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
47 48 49 50
toPhylo' (PhyloN    phylo) = toPhylo' 
toPhylo' (PhyloBase phylo) = toPhylo 
-}

51

qlobbe's avatar
qlobbe committed
52 53 54 55 56
toPhylo :: Phylo -> Phylo
toPhylo phyloStep = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
                      $ traceToPhylo (phyloLevel $ getConfig phyloStep) $
    if (phyloLevel $ getConfig phyloStep) > 1
      then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel $ getConfig phyloStep)]
qlobbe's avatar
qlobbe committed
57
      else phylo1 
58
    where
59 60
        --------------------------------------
        phyloAncestors :: Phylo
61
        phyloAncestors = 
qlobbe's avatar
qlobbe committed
62
            if (findAncestors $ getConfig phyloStep)
63 64
              then toHorizon phylo1
              else phylo1
65
        --------------------------------------
qlobbe's avatar
qlobbe committed
66
        phylo1 :: Phylo
qlobbe's avatar
qlobbe committed
67
        phylo1 = toPhylo1 phyloStep
68 69 70 71 72 73 74
        --------------------------------------


--------------------
-- | To Phylo 1 | --
--------------------

75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
toGroupsProxi :: Level -> Phylo -> Phylo
toGroupsProxi lvl phylo = 
  let proximity = phyloProximity $ getConfig phylo
      groupsProxi = foldlWithKey (\acc pId pds -> 
                      -- 1) process period by period
                      let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
                               $ elems 
                               $ view ( phylo_periodLevels 
                                      . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl) 
                                      . phylo_levelGroups ) pds
                          next    = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
                          targets = map (\g ->  (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
                          docs    = filterDocs  (phylo ^. phylo_timeDocs) ([pId] ++ next)
                          diagos  = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
                          -- 2) compute the pairs in parallel
                          pairs  = map (\(id,ngrams) -> 
                                        map (\(id',ngrams') -> 
                                            let nbDocs = (sum . elems) $ filterDocs docs    ([idToPrd id, idToPrd id'])
                                                diago  = reduceDiagos  $ filterDiago diagos ([idToPrd id, idToPrd id'])
                                             in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
                                        ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets 
                                 ) egos
                          pairs' = pairs `using` parList rdeepseq
                       in acc ++ (concat pairs')
                    ) [] $ phylo ^. phylo_periods
   in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi) 


qlobbe's avatar
qlobbe committed
103
appendGroups :: (a -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
qlobbe's avatar
qlobbe committed
104 105 106 107 108 109 110
appendGroups f lvl m phylo =  trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
    $ over ( phylo_periods
           .  traverse
           . phylo_periodLevels
           .  traverse)
           (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
                         then
qlobbe's avatar
qlobbe committed
111 112
                            let pId  = phyloLvl ^. phylo_levelPeriod
                                pId' = phyloLvl ^. phylo_levelPeriod' 
113
                                phyloCUnit = m ! pId
qlobbe's avatar
qlobbe committed
114 115
                            in  phyloLvl 
                              & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
qlobbe's avatar
qlobbe committed
116
                                    groups ++ [ (((pId,lvl),length groups)
qlobbe's avatar
qlobbe committed
117
                                              , f obj pId pId' lvl (length groups)
qlobbe's avatar
qlobbe committed
118
                                                  (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
119
                                              ] ) [] phyloCUnit)
qlobbe's avatar
qlobbe committed
120 121 122 123 124
                         else 
                            phyloLvl )
           phylo  


qlobbe's avatar
qlobbe committed
125 126
cliqueToGroup :: PhyloClique -> PhyloPeriodId -> (Text,Text) -> Level ->  Int -> [Cooc] -> PhyloGroup
cliqueToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
127
                   (fis ^. phyloClique_support)
qlobbe's avatar
qlobbe committed
128
                   (fis ^. phyloClique_weight)
qlobbe's avatar
qlobbe committed
129
                   (fis ^. phyloClique_sources)
qlobbe's avatar
qlobbe committed
130 131
                   (fis ^. phyloClique_nodes)
                   (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
132
                   (1,[0]) -- branchid (lvl,[path in the branching tree])
133
                   (fromList [("breaks",[0]),("seaLevels",[0])])
qlobbe's avatar
qlobbe committed
134
                   [] [] [] [] [] [] []
qlobbe's avatar
qlobbe committed
135 136


qlobbe's avatar
qlobbe committed
137 138 139 140 141 142 143
toPhylo1 :: Phylo -> Phylo
toPhylo1 phyloStep = case (getSeaElevation phyloStep) of 
    Constante start gap -> constanteTemporalMatching  start gap phyloStep
    Adaptative steps    -> adaptativeTemporalMatching steps phyloStep

-----------------------
-- | To Phylo Step | --
qlobbe's avatar
qlobbe committed
144 145 146 147 148 149 150 151 152 153 154 155 156
-----------------------    


indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)  
indexDates' m = map (\docs -> 
  let ds = map (\d -> date' d) docs
      f = if (null ds)
            then ""
            else toFstDate ds
      l = if (null ds) 
            then ""
            else toLstDate ds
   in (f,l)) m
qlobbe's avatar
qlobbe committed
157 158 159


-- To build the first phylo step from docs and terms
160
-- QL: backend entre phyloBase et phyloClique
161
toPhyloStep :: [Document] -> TermList -> PhyloConfig -> Phylo
qlobbe's avatar
qlobbe committed
162
toPhyloStep docs lst conf = case (getSeaElevation phyloBase) of 
qlobbe's avatar
qlobbe committed
163 164 165
    Constante  _ _ -> appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
    Adaptative _   -> toGroupsProxi 1 
                    $ appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
166 167
    where
        --------------------------------------
168 169
        phyloClique :: Map (Date,Date) [PhyloClique]
        phyloClique =  toPhyloClique phyloBase docs'
170
        --------------------------------------
qlobbe's avatar
qlobbe committed
171
        docs' :: Map (Date,Date) [Document]
172
        -- QL: Time Consuming here
qlobbe's avatar
qlobbe committed
173
        docs' =  groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
qlobbe's avatar
qlobbe committed
174 175 176 177
        --------------------------------------
        phyloBase :: Phylo
        phyloBase = toPhyloBase docs lst conf
        --------------------------------------
178 179 180 181 182 183

---------------------------
-- | Frequent Item Set | --
---------------------------


184
--  To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
185 186
filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
filterClique keep thr f m = case keep of
187 188 189 190
  False -> map (\l -> f thr l) m
  True  -> map (\l -> keepFilled (f) thr l) m


191
--  To filter Fis with small Support
192 193
filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
194 195


196
--  To filter Fis with small Clique size
197
filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
qlobbe's avatar
qlobbe committed
198
filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
199 200


201
--  To filter nested Fis
202 203 204
filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
filterCliqueByNested m = 
  let clq  = map (\l -> 
qlobbe's avatar
qlobbe committed
205
                foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
206 207
                                 then mem
                                 else 
qlobbe's avatar
qlobbe committed
208
                                    let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
209 210
                                    in  fMax ++ [f] ) [] l)
           $ elems m 
211 212
      clq' = clq `using` parList rdeepseq
  in  fromList $ zip (keys m) clq' 
213 214


qlobbe's avatar
qlobbe committed
215
-- | To transform a time map of docs into a time map of Fis with some filters
216 217
toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of 
qlobbe's avatar
qlobbe committed
218 219
    Fis s s'    -> -- traceFis "Filtered Fis"
                   filterCliqueByNested 
220
                 {- \$ traceFis "Filtered by clique size" -}
qlobbe's avatar
qlobbe committed
221
                 $ filterClique True s' (filterCliqueBySize)
222
                 {- \$ traceFis "Filtered by support" -}
qlobbe's avatar
qlobbe committed
223
                 $ filterClique True s (filterCliqueBySupport)
224
                 {- \$ traceFis "Unfiltered Fis" -}
qlobbe's avatar
qlobbe committed
225
                 phyloClique
226 227
    MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
                       phyloClique
228
    where
qlobbe's avatar
qlobbe committed
229
        -------------------------------------- 
230 231
        phyloClique :: Map (Date,Date) [PhyloClique]
        phyloClique = case (clique $ getConfig phylo) of 
qlobbe's avatar
qlobbe committed
232 233
          Fis _ _     ->  
                      let fis  = map (\(prd,docs) -> 
qlobbe's avatar
qlobbe committed
234
                                      case (corpusParser $ getConfig phylo) of
qlobbe's avatar
qlobbe committed
235 236 237
                                        Csv' _  -> let lst = toList 
                                                                  $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
                                                           in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
qlobbe's avatar
qlobbe committed
238 239
                                        _  -> let lst = toList 
                                                      $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
qlobbe's avatar
qlobbe committed
240
                                              in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
qlobbe's avatar
qlobbe committed
241
                                      )
242 243 244
                               $ toList phyloDocs
                          fis' = fis `using` parList rdeepseq
                       in fromList fis'
245
          MaxClique _ thr filterType -> 
qlobbe's avatar
qlobbe committed
246 247 248 249 250
                      let mcl  = map (\(prd,docs) -> 
                                    let cooc = map round
                                             $ foldl sumCooc empty
                                             $ map listToMatrix 
                                             $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
qlobbe's avatar
qlobbe committed
251
                                     in (prd, map (\cl -> PhyloClique cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc)) 
qlobbe's avatar
qlobbe committed
252 253 254
                               $ toList phyloDocs
                          mcl' = mcl `using` parList rdeepseq                               
                       in fromList mcl' 
255
        -------------------------------------- 
qlobbe's avatar
qlobbe committed
256

qlobbe's avatar
qlobbe committed
257 258
        -- dev viz graph maxClique getMaxClique

qlobbe's avatar
qlobbe committed
259 260

--------------------
261
-- | Coocurency | --
qlobbe's avatar
qlobbe committed
262 263
--------------------

264

265
--  To transform the docs into a time map of coocurency matrix 
qlobbe's avatar
qlobbe committed
266 267
docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
docsToTimeScaleCooc docs fdt = 
268 269 270 271 272
    let mCooc  = fromListWith sumCooc
               $ map (\(_d,l) -> (_d, listToMatrix l))
               $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
        mCooc' = fromList
               $ map (\t -> (t,empty))
qlobbe's avatar
qlobbe committed
273
               $ toTimeScale (map date docs) 1
274
    in  trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
275 276 277 278 279 280
       $ unionWith sumCooc mCooc mCooc'


-----------------------
-- | to Phylo Base | --
-----------------------
Alexandre Delanoë's avatar
Alexandre Delanoë committed
281
-- TODO anoe
qlobbe's avatar
qlobbe committed
282
groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
283
groupDocsByPeriodRec f prds docs acc =
qlobbe's avatar
qlobbe committed
284 285 286 287 288 289 290 291
    if ((null prds) || (null docs))
      then acc 
      else 
        let prd = head' "groupBy" prds
            docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
         in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)


292
--  To group a list of Documents by fixed periods
qlobbe's avatar
qlobbe committed
293
groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
294
groupDocsByPeriod' f pds docs =
qlobbe's avatar
qlobbe committed
295 296 297 298 299 300 301 302 303 304 305 306
  let docs'    = groupBy (\d d' -> f d == f d') $ sortOn f docs
      periods  = map (inPeriode f docs') pds
      periods' = periods `using` parList rdeepseq
   in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n") 
    $ fromList $ zip pds periods'
  where
    --------------------------------------
    inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
    inPeriode f' h (start,end) =
      concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h


307

308
--  To group a list of Documents by fixed periods
309 310
groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
groupDocsByPeriod _ _   [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
Alexandre Delanoë's avatar
Alexandre Delanoë committed
311
groupDocsByPeriod f pds es =
312 313 314 315 316 317 318 319 320 321 322 323 324
  let periods  = map (inPeriode f es) pds
      periods' = periods `using` parList rdeepseq

  in  trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n") 
    $ fromList $ zip pds periods'
  where
    --------------------------------------
    inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
    inPeriode f' h (start,end) =
      fst $ partition (\d -> f' d >= start && f' d <= end) h
    --------------------------------------   


qlobbe's avatar
qlobbe committed
325 326 327
docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
docsToTermFreq docs fdt =
  let nbDocs = fromIntegral $ length docs
qlobbe's avatar
qlobbe committed
328
      freqs = map (/(nbDocs))
qlobbe's avatar
qlobbe committed
329
             $ fromList
qlobbe's avatar
qlobbe committed
330
             $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst)) 
qlobbe's avatar
qlobbe committed
331 332 333 334
             $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
      sumFreqs = sum $ elems freqs
   in map (/sumFreqs) freqs

qlobbe's avatar
qlobbe committed
335 336 337 338
docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
docsToLastTermFreq n docs fdt = 
  let last   = take n $ reverse $ sort $ map date docs
      nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
qlobbe's avatar
qlobbe committed
339
      freqs  = map (/(nbDocs))
qlobbe's avatar
qlobbe committed
340 341 342 343 344 345
             $ fromList
             $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst)) 
             $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
      sumFreqs = sum $ elems freqs
   in map (/sumFreqs) freqs  

qlobbe's avatar
qlobbe committed
346

347
--  To count the number of docs by unit of time
qlobbe's avatar
qlobbe committed
348 349
docsToTimeScaleNb :: [Document] -> Map Date Double
docsToTimeScaleNb docs = 
qlobbe's avatar
qlobbe committed
350
    let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
qlobbe's avatar
qlobbe committed
351
        time  = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
352 353
    in  trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n") 
      $ unionWith (+) time docs'
qlobbe's avatar
qlobbe committed
354 355


qlobbe's avatar
qlobbe committed
356 357
initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
initPhyloLevels lvlMax pId = 
qlobbe's avatar
qlobbe committed
358 359
    fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId ("","") lvl empty)) [1..lvlMax]

qlobbe's avatar
qlobbe committed
360 361


362
--  To init the basic elements of a Phylo
363
toPhyloBase :: [Document] -> TermList -> PhyloConfig -> Phylo
364 365
toPhyloBase docs lst conf = 
    let foundations  = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
qlobbe's avatar
qlobbe committed
366
        docsSources  = PhyloSources     (Vector.fromList $ nub $ concat $ map sources docs)
367
        params = defaultPhyloParam { _phyloParam_config = conf }
qlobbe's avatar
qlobbe committed
368
        periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
369 370
    in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n") 
       $ Phylo foundations
qlobbe's avatar
qlobbe committed
371
               docsSources
qlobbe's avatar
qlobbe committed
372
               (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
qlobbe's avatar
qlobbe committed
373
               (docsToTimeScaleNb docs)
qlobbe's avatar
qlobbe committed
374
               (docsToTermFreq docs (foundations ^. foundations_roots))
qlobbe's avatar
qlobbe committed
375
               (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
376
               empty
qlobbe's avatar
qlobbe committed
377
               empty
378
               params
qlobbe's avatar
qlobbe committed
379
               (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloLevels 1 prd))) periods)