Commit a7cafc56 authored by lobbeque's avatar lobbeque

comment the phylo.hs file

parent 95553245
...@@ -41,70 +41,114 @@ import Gargantext.Core.Utils.Prefix (unPrefixSwagger) ...@@ -41,70 +41,114 @@ import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
import Gargantext.Prelude import Gargantext.Prelude
import qualified Data.Text.Lazy as TextLazy import qualified Data.Text.Lazy as TextLazy
--------------------- ---------------------
-- | PhyloConfig | -- -- | PhyloConfig | --
--------------------- ---------------------
-- | CorpusParser : control which csv columns should be taken into account for reconstructing a phylo
data CorpusParser = data CorpusParser =
Wos {_wos_limit :: Int} Wos
| Csv {_csv_limit :: Int} -- not used anymore
| Csv' {_csv'_limit :: Int} {_wos_limit :: Int}
| Csv
-- consider Publication_Day, Publication_Month, Publication_Year, Authors, Title, Abstract
{_csv_limit :: Int}
| Csv'
-- consider Publication_Day, Publication_Month, Publication_Year, Authors, Title, Abstract, Source, Weight
{_csv'_limit :: Int}
deriving (Show,Generic,Eq) deriving (Show,Generic,Eq)
instance ToSchema CorpusParser where instance ToSchema CorpusParser where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_") declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
data ListParser = V3 | V4 deriving (Show,Generic,Eq) -- | ListParser : is the map list in Csv or in Json?
data ListParser =
V3
-- the map list is in Csv
| V4
-- the map list is in Json
deriving (Show,Generic,Eq)
instance ToSchema ListParser instance ToSchema ListParser
-- | SeaElevation : for a given level of observation,
-- define a set of similarity values that will be tested by the sea level rise algorithm
data SeaElevation = data SeaElevation =
Constante Constante
-- test a constant set of values (see Gargantext.Core.Viz.Phylo.PhyloMaker.constSeaLadder)
{ _cons_start :: Double { _cons_start :: Double
, _cons_gap :: Double } , _cons_gap :: Double }
| Adaptative | Adaptative
-- test a set of values that matches the similarity spectrum of the corpus
-- (see Gargantext.Core.Viz.Phylo.PhyloMaker.constSeaLadder)
{ _adap_steps :: Double } { _adap_steps :: Double }
| Evolving | Evolving
-- test a set of values that tries to directly maximize the quality of the phylo
-- the similarity spectrum of the corpus (see Gargantext.Core.Viz.Phylo.PhyloMaker.evolvSeaLadder)
{ _evol_neighborhood :: Bool } { _evol_neighborhood :: Bool }
deriving (Show,Generic,Eq) deriving (Show,Generic,Eq)
instance ToSchema SeaElevation instance ToSchema SeaElevation
-- | PhyloSimilarity : define the similarity measure used for intertemporal matching
data PhyloSimilarity = data PhyloSimilarity =
WeightedLogJaccard WeightedLogJaccard
-- the default one (see Gargantext.Core.Viz.Phylo.TemporalMatching.weightedLogJaccard')
{ _wlj_sensibility :: Double { _wlj_sensibility :: Double
, _wlj_minSharedNgrams :: Int } , _wlj_minSharedNgrams :: Int }
| WeightedLogSim | WeightedLogSim
-- not used
{ _wls_sensibility :: Double { _wls_sensibility :: Double
, _wls_minSharedNgrams :: Int } , _wls_minSharedNgrams :: Int }
| Hamming | Hamming
-- not implemented
{ _hmg_sensibility :: Double { _hmg_sensibility :: Double
, _hmg_minSharedNgrams :: Int} , _hmg_minSharedNgrams :: Int}
deriving (Show,Generic,Eq) deriving (Show,Generic,Eq)
instance ToSchema PhyloSimilarity where instance ToSchema PhyloSimilarity where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "") declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
data SynchronyScope = SingleBranch | SiblingBranches | AllBranches -- | SynchronyScope : define which groups should be considered by the synchronic clustering
data SynchronyScope =
SingleBranch
-- consider only the groups belonging to the same branch
| SiblingBranches
-- consider only the groups belonging to sibling branches
-- (ie. branches that split at the same level of similarity)
| AllBranches
-- consider every group of every branch
deriving (Show,Generic,Eq, ToSchema) deriving (Show,Generic,Eq, ToSchema)
data SynchronyStrategy = MergeRegularGroups | MergeAllGroups
-- | SynchronyStrategy : define which groups should be merged when satisfying the synchronic clustering
data SynchronyStrategy =
MergeRegularGroups
-- only merge groups that don't contain emerging or declining ngrams
| MergeAllGroups
-- merge every group
deriving (Show,Generic,Eq) deriving (Show,Generic,Eq)
instance ToSchema SynchronyStrategy where instance ToSchema SynchronyStrategy where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "") declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
-- | Synchrony : define the synchronic clustering strategy
data Synchrony = data Synchrony =
ByProximityThreshold ByProximityThreshold
-- select all groups that satisfy a given similarity threshold
{ _bpt_threshold :: Double { _bpt_threshold :: Double
, _bpt_sensibility :: Double , _bpt_sensibility :: Double
, _bpt_scope :: SynchronyScope , _bpt_scope :: SynchronyScope
, _bpt_strategy :: SynchronyStrategy } , _bpt_strategy :: SynchronyStrategy }
| ByProximityDistribution | ByProximityDistribution
-- select the top x groups sorted by similarity
{ _bpd_sensibility :: Double { _bpd_sensibility :: Double
, _bpd_strategy :: SynchronyStrategy } , _bpd_strategy :: SynchronyStrategy }
deriving (Show,Generic,Eq) deriving (Show,Generic,Eq)
...@@ -113,7 +157,10 @@ instance ToSchema Synchrony where ...@@ -113,7 +157,10 @@ instance ToSchema Synchrony where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_") declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
-- | TimeUnit : define the temporal granularity
-- period = size of a PhyloPeriod
-- step = step between two subsequent PhyloPeriod
-- frame = maximum number of PhyloPeriod considered for intertemporal matching
data TimeUnit = data TimeUnit =
Epoch Epoch
{ _epoch_period :: Int { _epoch_period :: Int
...@@ -141,18 +188,28 @@ instance ToSchema TimeUnit where ...@@ -141,18 +188,28 @@ instance ToSchema TimeUnit where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "") declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
data MaxCliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq) -- | MaxCliqueFilter : define a strategy for computing MaxClique
data MaxCliqueFilter =
ByThreshold
-- consider ngrams whose confidence probability satisfies a given threshold
| ByNeighbours
-- consider the top x ngrams sorted by confidence probability
deriving (Show,Generic,Eq)
instance ToSchema MaxCliqueFilter where instance ToSchema MaxCliqueFilter where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "") declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
-- | Cluster : define an ngrams clustering method for computing PhyloGroups
-- Reference : Uno, Takeaki et al. “LCM ver. 2: Efficient Mining Algorithms for Frequent/Closed/Maximal Itemsets.”
-- Workshop on Frequent Itemset Mining Implementations (2004).
data Cluster = data Cluster =
Fis Fis
-- frequent item set can be filtered by support and size
{ _fis_support :: Int { _fis_support :: Int
, _fis_size :: Int } , _fis_size :: Int }
| MaxClique | MaxClique
-- max clique can be filtered by size and threshold
{ _mcl_size :: Int { _mcl_size :: Int
, _mcl_threshold :: Double , _mcl_threshold :: Double
, _mcl_filter :: MaxCliqueFilter } , _mcl_filter :: MaxCliqueFilter }
...@@ -162,8 +219,13 @@ instance ToSchema Cluster where ...@@ -162,8 +219,13 @@ instance ToSchema Cluster where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "") declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
-- | Quality : define a level of observation
data Quality = data Quality =
Quality { _qua_granularity :: Double Quality
-- _qua_granularity <=> level of observation or λ in (Chavalarias, Lobbe & Delanoe 2021)
-- if λ = 0 then we have one big branch
-- if λ = 1 then we have many little branches
{ _qua_granularity :: Double
, _qua_minBranch :: Int } , _qua_minBranch :: Int }
deriving (Show,Generic,Eq) deriving (Show,Generic,Eq)
...@@ -171,6 +233,7 @@ instance ToSchema Quality where ...@@ -171,6 +233,7 @@ instance ToSchema Quality where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_") declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_")
-- | PhyloConfig : full list of parameters used to reconstruct a Phylomemy from the command line
data PhyloConfig = data PhyloConfig =
PhyloConfig { corpusPath :: FilePath PhyloConfig { corpusPath :: FilePath
, listPath :: FilePath , listPath :: FilePath
...@@ -197,6 +260,8 @@ data PhyloConfig = ...@@ -197,6 +260,8 @@ data PhyloConfig =
-- | SubConfig API & 1Click | -- -- | SubConfig API & 1Click | --
-------------------------------- --------------------------------
-- | PhyloSubConfigAPI : selected list of parameters used to reconstruct a Phylomemy from the API
data PhyloSubConfigAPI = data PhyloSubConfigAPI =
PhyloSubConfigAPI { _sc_phyloProximity :: Double PhyloSubConfigAPI { _sc_phyloProximity :: Double
, _sc_phyloSynchrony :: Double , _sc_phyloSynchrony :: Double
...@@ -223,6 +288,8 @@ subConfigAPI2config subConfig = defaultConfig ...@@ -223,6 +288,8 @@ subConfigAPI2config subConfig = defaultConfig
-- | SubConfig 1Click | -- -- | SubConfig 1Click | --
-------------------------- --------------------------
-- | defaultConfig : default configuration used by the 1Click feature
defaultConfig :: PhyloConfig defaultConfig :: PhyloConfig
defaultConfig = defaultConfig =
PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
...@@ -304,7 +371,7 @@ instance FromJSON Quality ...@@ -304,7 +371,7 @@ instance FromJSON Quality
instance ToJSON Quality instance ToJSON Quality
-- | Software parameters -- | Software : software parameters
data Software = data Software =
Software { _software_name :: Text Software { _software_name :: Text
, _software_version :: Text , _software_version :: Text
...@@ -314,14 +381,13 @@ instance ToSchema Software where ...@@ -314,14 +381,13 @@ instance ToSchema Software where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_") declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_")
defaultSoftware :: Software defaultSoftware :: Software
defaultSoftware = defaultSoftware =
Software { _software_name = pack "GarganText" Software { _software_name = pack "GarganText"
, _software_version = pack "v5" } , _software_version = pack "v5" }
-- | Global parameters of a Phylo -- | PhyloParam : global parameters of a Phylo
data PhyloParam = data PhyloParam =
PhyloParam { _phyloParam_version :: Text PhyloParam { _phyloParam_version :: Text
, _phyloParam_software :: Software , _phyloParam_software :: Software
...@@ -332,7 +398,6 @@ instance ToSchema PhyloParam where ...@@ -332,7 +398,6 @@ instance ToSchema PhyloParam where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_") declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_")
defaultPhyloParam :: PhyloParam defaultPhyloParam :: PhyloParam
defaultPhyloParam = defaultPhyloParam =
PhyloParam { _phyloParam_version = pack "v3" PhyloParam { _phyloParam_version = pack "v3"
...@@ -353,14 +418,18 @@ type DateStr = Text ...@@ -353,14 +418,18 @@ type DateStr = Text
-- | Ngrams : a contiguous sequence of n terms -- | Ngrams : a contiguous sequence of n terms
type Ngrams = Text type Ngrams = Text
-- Document : a piece of Text linked to a Date -- | Document : a piece of Text linked to a Date
-- date = computational date; date' = original string date yyyy-mm-dd
-- Export Database to Document
data Document = Document data Document = Document
{ date :: Date -- datatype Date {unDate :: Int} { date :: Date
, date' :: DateStr -- show date -- the Int date used to compute the periods, groups, etc.
-- created by toPhyloDate in Gargantext.Core.Viz.Phylo.API.Tools
, date' :: DateStr
-- the original String date (yyyy-mm-dd) that will be displayed in the interface
-- created by toPhyloDate' in Gargantext.Core.Viz.Phylo.API.Tools
, text :: [Ngrams] , text :: [Ngrams]
, weight :: Maybe Double , weight :: Maybe Double
-- a Double attached to each Document that will be used to set up the size of the phylogroup in the interface
-- only taken into account when CorpusParser is Csv'
, sources :: [Text] , sources :: [Text]
, docTime :: TimeUnit , docTime :: TimeUnit
} deriving (Eq,Show,Generic,NFData) } deriving (Eq,Show,Generic,NFData)
...@@ -371,12 +440,14 @@ data Document = Document ...@@ -371,12 +440,14 @@ data Document = Document
-------------------- --------------------
-- | The Foundations of a Phylo created from a given TermList -- | PhyloFoundations : store and index all the ngrams (named roots) that will appear in the Phylomemy
data PhyloFoundations = PhyloFoundations data PhyloFoundations = PhyloFoundations
{ _foundations_roots :: (Vector Ngrams) { _foundations_roots :: (Vector Ngrams)
, _foundations_rootsInGroups :: Map Int [PhyloGroupId] -- map of roots associated to groups , _foundations_rootsInGroups :: Map Int [PhyloGroupId] -- map of roots associated to groups
} deriving (Generic, Show, Eq) } deriving (Generic, Show, Eq)
-- | PhyloCounts : store various counters related to roots or dates
data PhyloCounts = PhyloCounts data PhyloCounts = PhyloCounts
{ coocByDate :: !(Map Date Cooc) { coocByDate :: !(Map Date Cooc)
, docsByDate :: !(Map Date Double) , docsByDate :: !(Map Date Double)
...@@ -386,9 +457,12 @@ data PhyloCounts = PhyloCounts ...@@ -386,9 +457,12 @@ data PhyloCounts = PhyloCounts
, lastRootsFreq :: !(Map Int Double) , lastRootsFreq :: !(Map Int Double)
} deriving (Generic, Show, Eq) } deriving (Generic, Show, Eq)
-- | PhyloSources : store sources that will be used in the interface to highlight some PhyloGroups
data PhyloSources = PhyloSources data PhyloSources = PhyloSources
{ _sources :: !(Vector Text) } deriving (Generic, Show, Eq) { _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
instance ToSchema PhyloFoundations where instance ToSchema PhyloFoundations where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_") declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
instance ToSchema PhyloCounts where instance ToSchema PhyloCounts where
...@@ -521,10 +595,13 @@ type Thr = Double ...@@ -521,10 +595,13 @@ type Thr = Double
-- | Pointer : A weighted pointer to a given PhyloGroup -- | Pointer : A weighted pointer to a given PhyloGroup
type Pointer = (PhyloGroupId, Weight) type Pointer = (PhyloGroupId, Weight)
-- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold -- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold
type Pointer' = (PhyloGroupId, (Thr,Weight)) type Pointer' = (PhyloGroupId, (Thr,Weight))
data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show) data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show)
data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show) data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show)
...@@ -535,6 +612,8 @@ data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show) ...@@ -535,6 +612,8 @@ data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show)
-- | Support : Number of Documents where a Cluster occurs -- | Support : Number of Documents where a Cluster occurs
type Support = Int type Support = Int
-- | Clustering : define the structure of a cluster of ngrams
data Clustering = Clustering data Clustering = Clustering
{ _clustering_roots :: [Int] { _clustering_roots :: [Int]
, _clustering_support :: Support , _clustering_support :: Support
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment