AdaptativePhylo.hs 14.2 KB
Newer Older
1
{-|
2
Module      : Gargantext.Core.Viz.AdaptativePhylo
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
Description : Phylomemy definitions and types.
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Specifications of Phylomemy export format.

Phylomemy can be described as a Temporal Graph with different scale of
granularity of group of ngrams (terms and multi-terms).

The main type is Phylo which is synonym of Phylomemy (only difference is
the number of chars).

References:
Chavalarias, D., Cointet, J.-P., 2013. Phylomemetic patterns
in science evolution — the rise and fall of scientific fields. PloS
one 8, e54847.
-}

24
{-# LANGUAGE DeriveAnyClass   #-}
25 26
{-# LANGUAGE TemplateHaskell   #-}

27
module Gargantext.Core.Viz.AdaptativePhylo where
28 29 30

import Data.Aeson
import Data.Aeson.TH (deriveJSON)
qlobbe's avatar
qlobbe committed
31
import Data.Text   (Text, pack)
32
import Data.Vector (Vector)
qlobbe's avatar
qlobbe committed
33
import Data.Map (Map)
34 35 36

import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
37
import Gargantext.Core.Text.Context (TermList)
38 39 40 41 42 43

import GHC.Generics
import GHC.IO (FilePath)
import Control.DeepSeq (NFData)
import Control.Lens (makeLenses)

44 45
import qualified Data.Text.Lazy as TextLazy

46 47 48 49 50 51

----------------
-- | Config | --
----------------  


qlobbe's avatar
qlobbe committed
52
data CorpusParser = 
qlobbe's avatar
qlobbe committed
53 54 55
      Wos  {_wos_limit  :: Int}
    | Csv  {_csv_limit  :: Int}
    | Csv' {_csv'_limit :: Int}
qlobbe's avatar
qlobbe committed
56 57
    deriving (Show,Generic,Eq) 

58 59 60 61 62 63 64
data SeaElevation = 
      Constante  
      { _cons_start :: Double
      , _cons_step  :: Double }
    | Adaptative 
      { _adap_granularity :: Double }
    deriving (Show,Generic,Eq)
qlobbe's avatar
qlobbe committed
65 66 67

data Proximity = 
      WeightedLogJaccard 
68 69
      { _wlj_sensibility   :: Double
{-
70 71 72 73
      -- , _wlj_thresholdInit :: Double
      -- , _wlj_thresholdStep :: Double
      -- | max height for sea level in temporal matching
      -- , _wlj_elevation     :: Double
74
-}
75
      }
76 77 78 79 80 81 82 83 84
    | WeightedLogSim 
      { _wlj_sensibility   :: Double
{-
      -- , _wlj_thresholdInit :: Double
      -- , _wlj_thresholdStep :: Double
      -- | max height for sea level in temporal matching
      -- , _wlj_elevation     :: Double
-}
      } 
qlobbe's avatar
qlobbe committed
85 86 87 88
    | Hamming 
    deriving (Show,Generic,Eq) 


89 90 91 92
data SynchronyScope = SingleBranch | SiblingBranches | AllBranches deriving (Show,Generic,Eq)

data SynchronyStrategy = MergeRegularGroups | MergeAllGroups deriving (Show,Generic,Eq)

qlobbe's avatar
qlobbe committed
93
data Synchrony = 
94
      ByProximityThreshold
qlobbe's avatar
qlobbe committed
95
      { _bpt_threshold :: Double 
96 97 98
      , _bpt_sensibility :: Double
      , _bpt_scope :: SynchronyScope
      , _bpt_strategy :: SynchronyStrategy }
qlobbe's avatar
qlobbe committed
99
    | ByProximityDistribution
100 101
      { _bpd_sensibility :: Double
      , _bpd_strategy :: SynchronyStrategy } 
qlobbe's avatar
qlobbe committed
102 103 104
    deriving (Show,Generic,Eq)     


qlobbe's avatar
qlobbe committed
105 106 107 108 109
data TimeUnit = 
      Year 
      { _year_period :: Int
      , _year_step   :: Int
      , _year_matchingFrame :: Int }
qlobbe's avatar
qlobbe committed
110 111 112 113 114 115 116 117 118 119 120 121
    | Month 
      { _month_period :: Int
      , _month_step   :: Int
      , _month_matchingFrame :: Int }      
    | Week 
      { _week_period :: Int
      , _week_step   :: Int
      , _week_matchingFrame :: Int }
    | Day 
      { _day_period :: Int
      , _day_step   :: Int
      , _day_matchingFrame :: Int }      
qlobbe's avatar
qlobbe committed
122 123
      deriving (Show,Generic,Eq) 

124
data CliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
qlobbe's avatar
qlobbe committed
125

126
data Clique = 
qlobbe's avatar
qlobbe committed
127 128 129
      Fis 
      { _fis_support :: Int
      , _fis_size    :: Int }
130
    | MaxClique
131 132 133
      { _mcl_size      :: Int
      , _mcl_threshold :: Double
      , _mcl_filter    :: CliqueFilter } 
134 135 136 137
      deriving (Show,Generic,Eq)      


data Quality = 
138 139
     Quality { _qua_granularity :: Double
             , _qua_minBranch   :: Int }
140
      deriving (Show,Generic,Eq)   
141

qlobbe's avatar
qlobbe committed
142

143
data Config = 
qlobbe's avatar
qlobbe committed
144 145 146 147 148 149 150
     Config { corpusPath     :: FilePath
            , listPath       :: FilePath
            , outputPath     :: FilePath
            , corpusParser   :: CorpusParser
            , phyloName      :: Text
            , phyloLevel     :: Int
            , phyloProximity :: Proximity
151
            , seaElevation   :: SeaElevation
152
            , findAncestors  :: Bool
qlobbe's avatar
qlobbe committed
153
            , phyloSynchrony :: Synchrony
154
            , phyloQuality   :: Quality
qlobbe's avatar
qlobbe committed
155
            , timeUnit       :: TimeUnit
156
            , clique         :: Clique
qlobbe's avatar
qlobbe committed
157 158 159
            , exportLabel    :: [PhyloLabel]
            , exportSort     :: Sort
            , exportFilter   :: [Filter]  
qlobbe's avatar
qlobbe committed
160 161
            } deriving (Show,Generic,Eq)

qlobbe's avatar
qlobbe committed
162

163
defaultConfig :: Config
qlobbe's avatar
qlobbe committed
164
defaultConfig = 
qlobbe's avatar
qlobbe committed
165 166 167
     Config { corpusPath     = ""
            , listPath       = ""
            , outputPath     = ""
qlobbe's avatar
qlobbe committed
168
            , corpusParser   = Csv 1000
qlobbe's avatar
qlobbe committed
169
            , phyloName      = pack "Default Phylo"
qlobbe's avatar
qlobbe committed
170
            , phyloLevel     = 2
171
            , phyloProximity = WeightedLogJaccard 10
qlobbe's avatar
qlobbe committed
172
            , seaElevation   = Constante 0.1 0.1
173
            , findAncestors  = True
174
            , phyloSynchrony = ByProximityThreshold 0.1 10 SiblingBranches MergeAllGroups
qlobbe's avatar
qlobbe committed
175
            , phyloQuality   = Quality 0 1
qlobbe's avatar
qlobbe committed
176
            , timeUnit       = Year 3 1 5
177
            , clique         = MaxClique 0 3 ByNeighbours
qlobbe's avatar
qlobbe committed
178
            , exportLabel    = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
qlobbe's avatar
qlobbe committed
179 180
            , exportSort     = ByHierarchy
            , exportFilter   = [ByBranchSize 2]  
qlobbe's avatar
qlobbe committed
181
            }
182 183 184 185 186

instance FromJSON Config
instance ToJSON Config
instance FromJSON CorpusParser
instance ToJSON CorpusParser
qlobbe's avatar
qlobbe committed
187 188
instance FromJSON Proximity
instance ToJSON Proximity
189 190
instance FromJSON SeaElevation
instance ToJSON SeaElevation
qlobbe's avatar
qlobbe committed
191 192
instance FromJSON TimeUnit
instance ToJSON TimeUnit
193 194
instance FromJSON CliqueFilter
instance ToJSON CliqueFilter
195 196
instance FromJSON Clique
instance ToJSON Clique
qlobbe's avatar
qlobbe committed
197 198
instance FromJSON PhyloLabel
instance ToJSON PhyloLabel
199 200
instance FromJSON Tagger
instance ToJSON Tagger
qlobbe's avatar
qlobbe committed
201 202 203 204 205 206
instance FromJSON Sort
instance ToJSON Sort
instance FromJSON Order
instance ToJSON Order
instance FromJSON Filter
instance ToJSON Filter
207 208 209 210
instance FromJSON SynchronyScope
instance ToJSON SynchronyScope
instance FromJSON SynchronyStrategy
instance ToJSON SynchronyStrategy
qlobbe's avatar
qlobbe committed
211 212
instance FromJSON Synchrony
instance ToJSON Synchrony
213 214
instance FromJSON Quality
instance ToJSON Quality
215 216


qlobbe's avatar
qlobbe committed
217 218 219 220 221 222
-- | Software parameters
data Software =
     Software { _software_name    :: Text
              , _software_version :: Text
     } deriving (Generic, Show, Eq)

223
defaultSoftware :: Software
qlobbe's avatar
qlobbe committed
224 225 226 227 228 229 230 231 232 233 234 235
defaultSoftware = 
      Software { _software_name    = pack "Gargantext"
               , _software_version = pack "v4" }


-- | Global parameters of a Phylo
data PhyloParam =
     PhyloParam { _phyloParam_version  :: Text
                , _phyloParam_software :: Software
                , _phyloParam_config   :: Config
     } deriving (Generic, Show, Eq)

236
defaultPhyloParam :: PhyloParam
qlobbe's avatar
qlobbe committed
237 238 239 240 241 242
defaultPhyloParam =
      PhyloParam { _phyloParam_version  = pack "v2.adaptative"
                 , _phyloParam_software = defaultSoftware
                 , _phyloParam_config   = defaultConfig }


243 244 245 246 247 248 249 250 251 252
------------------
-- | Document | --
------------------

-- | Date : a simple Integer
type Date = Int

-- | Ngrams : a contiguous sequence of n terms
type Ngrams = Text

qlobbe's avatar
qlobbe committed
253 254
-- Document : a piece of Text linked to a Date
-- date = computational date; date' = original string date yyyy-mm-dd
255
data Document = Document
qlobbe's avatar
qlobbe committed
256 257 258 259 260
      { date    :: Date
      , date'   :: Text
      , text    :: [Ngrams]
      , weight  :: Maybe Double
      , sources :: [Text]
qlobbe's avatar
qlobbe committed
261
      } deriving (Eq,Show,Generic,NFData)  
262 263 264 265 266 267 268 269 270 271 272 273 274 275


--------------------
-- | Foundation | --
--------------------


-- | The Foundations of a Phylo created from a given TermList 
data PhyloFoundations = PhyloFoundations
      { _foundations_roots   :: !(Vector Ngrams)
      , _foundations_mapList :: TermList
      } deriving (Generic, Show, Eq)


qlobbe's avatar
qlobbe committed
276 277 278 279
data PhyloSources = PhyloSources
      { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)


qlobbe's avatar
qlobbe committed
280 281 282 283 284
---------------------------
-- | Coocurency Matrix | --
---------------------------


285 286
-- | Cooc : a coocurency matrix between two ngrams
type Cooc =  Map (Int,Int) Double
qlobbe's avatar
qlobbe committed
287 288 289 290 291 292 293 294 295 296 297 298


-------------------
-- | Phylomemy | --
-------------------


-- | Phylo datatype of a phylomemy
--  foundations : the foundations of the phylo
--  timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
--  timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
--  param : the parameters of the phylomemy (with the user's configuration)
299
--  periods : the temporal steps of a phylomemy
qlobbe's avatar
qlobbe committed
300
data Phylo =
qlobbe's avatar
qlobbe committed
301
     Phylo { _phylo_foundations  :: PhyloFoundations
qlobbe's avatar
qlobbe committed
302
           , _phylo_sources      :: PhyloSources
qlobbe's avatar
qlobbe committed
303 304 305 306 307 308 309 310
           , _phylo_timeCooc     :: !(Map Date Cooc)
           , _phylo_timeDocs     :: !(Map Date Double)
           , _phylo_termFreq     :: !(Map Int Double)
           , _phylo_lastTermFreq :: !(Map Int Double)           
           , _phylo_horizon      :: !(Map (PhyloGroupId,PhyloGroupId) Double)           
           , _phylo_groupsProxi  :: !(Map (PhyloGroupId,PhyloGroupId) Double)
           , _phylo_param        :: PhyloParam
           , _phylo_periods      :: Map PhyloPeriodId PhyloPeriod
qlobbe's avatar
qlobbe committed
311 312 313 314
           }
           deriving (Generic, Show, Eq)


315 316 317 318 319 320 321
-- | PhyloPeriodId : the id of a given period
type PhyloPeriodId = (Date,Date)

-- | PhyloPeriod : steps of a phylomemy on a temporal axis
--  id: tuple (start date, end date) of the temporal step of the phylomemy
--  levels: levels of granularity
data PhyloPeriod =
qlobbe's avatar
qlobbe committed
322 323 324
     PhyloPeriod { _phylo_periodPeriod  :: (Date,Date)
                 , _phylo_periodPeriod' :: (Text,Text)
                 , _phylo_periodLevels  :: Map PhyloLevelId PhyloLevel
qlobbe's avatar
qlobbe committed
325
                 } deriving (Generic, Show, Eq)   
326 327 328 329 330 331 332 333 334 335 336 337 338 339


-- | Level : a level of clustering
type Level = Int

-- | PhyloLevelId : the id of a level of clustering in a given period 
type PhyloLevelId  = (PhyloPeriodId,Level)

-- | PhyloLevel : levels of phylomemy on a synchronic axis
-- Levels description:
-- Level 0: The foundations and the base of the phylo
-- Level 1: First level of clustering (the Fis)
-- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
data PhyloLevel =
qlobbe's avatar
qlobbe committed
340 341 342 343
     PhyloLevel { _phylo_levelPeriod  :: (Date,Date)
                , _phylo_levelPeriod' :: (Text,Text)
                , _phylo_levelLevel   :: Level 
                , _phylo_levelGroups  :: Map PhyloGroupId PhyloGroup
qlobbe's avatar
qlobbe committed
344
                } 
345 346 347
                deriving (Generic, Show, Eq)   


qlobbe's avatar
qlobbe committed
348
type PhyloGroupId  = (PhyloLevelId, Int)
349

qlobbe's avatar
qlobbe committed
350 351 352
-- | BranchId : (a level, a sequence of branch index)
-- the sequence is a path of heritage from the most to the less specific branch
type PhyloBranchId = (Level, [Int])
353 354 355

-- | PhyloGroup : group of ngrams at each level and period
data PhyloGroup = 
qlobbe's avatar
qlobbe committed
356
      PhyloGroup { _phylo_groupPeriod   :: (Date,Date)
qlobbe's avatar
qlobbe committed
357
                 , _phylo_groupPeriod'  :: (Text,Text)
qlobbe's avatar
qlobbe committed
358
                 , _phylo_groupLevel    :: Level
qlobbe's avatar
qlobbe committed
359
                 , _phylo_groupIndex    :: Int         
360
                 , _phylo_groupLabel    :: Text
qlobbe's avatar
qlobbe committed
361
                 , _phylo_groupSupport  :: Support
qlobbe's avatar
qlobbe committed
362
                 , _phylo_groupWeight   :: Maybe Double
qlobbe's avatar
qlobbe committed
363
                 , _phylo_groupSources  :: [Int]                 
qlobbe's avatar
qlobbe committed
364
                 , _phylo_groupNgrams   :: [Int]
qlobbe's avatar
qlobbe committed
365
                 , _phylo_groupCooc     :: !(Cooc)
qlobbe's avatar
qlobbe committed
366
                 , _phylo_groupBranchId :: PhyloBranchId
367
                 , _phylo_groupMeta     :: Map Text [Double]
qlobbe's avatar
qlobbe committed
368 369 370 371
                 , _phylo_groupLevelParents  :: [Pointer]
                 , _phylo_groupLevelChilds   :: [Pointer]
                 , _phylo_groupPeriodParents :: [Pointer]
                 , _phylo_groupPeriodChilds  :: [Pointer]
qlobbe's avatar
qlobbe committed
372
                 , _phylo_groupAncestors     :: [Pointer]
373
                 }
qlobbe's avatar
qlobbe committed
374
                 deriving (Generic, Show, Eq, NFData)
375

qlobbe's avatar
qlobbe committed
376 377 378 379
-- | Weight : A generic mesure that can be associated with an Id
type Weight = Double

-- | Pointer : A weighted pointer to a given PhyloGroup
qlobbe's avatar
qlobbe committed
380 381 382 383
type Pointer = (PhyloGroupId, Weight)

data Filiation = ToParents | ToChilds deriving (Generic, Show)    
data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)                
qlobbe's avatar
qlobbe committed
384

385

386 387 388
----------------------
-- | Phylo Clique | --
----------------------
389 390 391 392

-- | Support : Number of Documents where a Clique occurs
type Support  = Int

393
data PhyloClique = PhyloClique
qlobbe's avatar
qlobbe committed
394
  { _phyloClique_nodes   :: [Int]
395 396
  , _phyloClique_support :: Support
  , _phyloClique_period  :: (Date,Date)
qlobbe's avatar
qlobbe committed
397
  , _phyloClique_weight  :: Maybe Double
qlobbe's avatar
qlobbe committed
398
  , _phyloClique_sources :: [Int]
399 400
  } deriving (Generic,NFData,Show,Eq)

401 402 403 404 405 406
----------------
-- | Export | --
----------------

type DotId = TextLazy.Text

qlobbe's avatar
qlobbe committed
407
data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | GroupToAncestor | PeriodToPeriod deriving (Show,Generic,Eq)
qlobbe's avatar
qlobbe committed
408

qlobbe's avatar
qlobbe committed
409 410 411 412 413 414
data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)

data Order = Asc | Desc deriving (Show,Generic,Eq)

data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)

qlobbe's avatar
qlobbe committed
415
data Tagger = MostInclusive | MostEmergentInclusive | MostEmergentTfIdf deriving (Show,Generic,Eq)
416

qlobbe's avatar
qlobbe committed
417
data PhyloLabel = 
418 419 420 421 422 423 424 425 426 427 428
      BranchLabel
      { _branch_labelTagger :: Tagger
      , _branch_labelSize   :: Int }
    | GroupLabel
      { _group_labelTagger  :: Tagger
      , _group_labelSize    :: Int }
    deriving (Show,Generic,Eq)

data PhyloBranch =
      PhyloBranch
      { _branch_id :: PhyloBranchId
429 430 431 432
      , _branch_canonId  :: [Int]
      , _branch_seaLevel :: [Double]
      , _branch_x        :: Double
      , _branch_y        :: Double
qlobbe's avatar
qlobbe committed
433 434
      , _branch_w        :: Double
      , _branch_t        :: Double
435 436 437
      , _branch_label    :: Text
      , _branch_meta     :: Map Text [Double]
      } deriving (Generic, Show, Eq)
438 439 440

data PhyloExport =
      PhyloExport
qlobbe's avatar
qlobbe committed
441 442
      { _export_groups    :: [PhyloGroup]
      , _export_branches  :: [PhyloBranch]
443 444
      } deriving (Generic, Show)

445 446 447 448
----------------
-- | Lenses | --
----------------

qlobbe's avatar
qlobbe committed
449
makeLenses ''Config
qlobbe's avatar
qlobbe committed
450
makeLenses ''Proximity
451
makeLenses ''SeaElevation
452
makeLenses ''Quality
453
makeLenses ''Clique
qlobbe's avatar
qlobbe committed
454
makeLenses ''PhyloLabel
qlobbe's avatar
qlobbe committed
455
makeLenses ''TimeUnit
456
makeLenses ''PhyloFoundations
457
makeLenses ''PhyloClique
458 459 460 461 462
makeLenses ''Phylo
makeLenses ''PhyloPeriod
makeLenses ''PhyloLevel
makeLenses ''PhyloGroup
makeLenses ''PhyloParam
463 464
makeLenses ''PhyloExport
makeLenses ''PhyloBranch
465 466 467 468 469

------------------------
-- | JSON instances | --
------------------------

qlobbe's avatar
qlobbe committed
470 471
instance FromJSON Phylo
instance ToJSON Phylo
qlobbe's avatar
qlobbe committed
472 473
instance FromJSON PhyloSources
instance ToJSON PhyloSources
qlobbe's avatar
qlobbe committed
474 475 476 477 478 479 480 481 482 483
instance FromJSON PhyloParam
instance ToJSON PhyloParam
instance FromJSON PhyloPeriod
instance ToJSON PhyloPeriod
instance FromJSON PhyloLevel
instance ToJSON PhyloLevel
instance FromJSON Software
instance ToJSON Software
instance FromJSON PhyloGroup
instance ToJSON PhyloGroup
484

485
$(deriveJSON (unPrefix "_foundations_"  ) ''PhyloFoundations)