Commit a7cafc56 authored by lobbeque's avatar lobbeque

comment the phylo.hs file

parent 95553245
......@@ -41,70 +41,114 @@ import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
import Gargantext.Prelude
import qualified Data.Text.Lazy as TextLazy
---------------------
-- | PhyloConfig | --
---------------------
-- | CorpusParser : control which csv columns should be taken into account for reconstructing a phylo
data CorpusParser =
Wos {_wos_limit :: Int}
| Csv {_csv_limit :: Int}
| Csv' {_csv'_limit :: Int}
Wos
-- not used anymore
{_wos_limit :: Int}
| Csv
-- consider Publication_Day, Publication_Month, Publication_Year, Authors, Title, Abstract
{_csv_limit :: Int}
| Csv'
-- consider Publication_Day, Publication_Month, Publication_Year, Authors, Title, Abstract, Source, Weight
{_csv'_limit :: Int}
deriving (Show,Generic,Eq)
instance ToSchema CorpusParser where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
data ListParser = V3 | V4 deriving (Show,Generic,Eq)
-- | ListParser : is the map list in Csv or in Json?
data ListParser =
V3
-- the map list is in Csv
| V4
-- the map list is in Json
deriving (Show,Generic,Eq)
instance ToSchema ListParser
-- | SeaElevation : for a given level of observation,
-- define a set of similarity values that will be tested by the sea level rise algorithm.
-- Each constructor selects one way of building that set of values.
data SeaElevation =
Constante
-- test a constant set of values (see Gargantext.Core.Viz.Phylo.PhyloMaker.constSeaLadder)
{ _cons_start :: Double
, _cons_gap :: Double }
| Adaptative
-- test a set of values that matches the similarity spectrum of the corpus
-- (see Gargantext.Core.Viz.Phylo.PhyloMaker.constSeaLadder)
-- NOTE(review): this is the same function reference as for Constante; confirm whether
-- a dedicated adaptive ladder function was meant here
{ _adap_steps :: Double }
| Evolving
-- test a set of values that tries to directly maximize the quality of the phylo
-- the similarity spectrum of the corpus (see Gargantext.Core.Viz.Phylo.PhyloMaker.evolvSeaLadder)
-- NOTE(review): "the similarity spectrum of the corpus" reads like a leftover from the
-- Adaptative comment; confirm the intended wording
{ _evol_neighborhood :: Bool }
deriving (Show,Generic,Eq)
instance ToSchema SeaElevation
-- | PhyloSimilarity : define the similarity measure used for inter-temporal matching.
-- The three constructors share the same field shape: a Double sensibility
-- and an Int minSharedNgrams.
data PhyloSimilarity =
WeightedLogJaccard
-- the default one (see Gargantext.Core.Viz.Phylo.TemporalMatching.weightedLogJaccard')
{ _wlj_sensibility :: Double
, _wlj_minSharedNgrams :: Int }
| WeightedLogSim
-- not used
{ _wls_sensibility :: Double
, _wls_minSharedNgrams :: Int }
| Hamming
-- not implemented
{ _hmg_sensibility :: Double
, _hmg_minSharedNgrams :: Int}
deriving (Show,Generic,Eq)
instance ToSchema PhyloSimilarity where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
data SynchronyScope = SingleBranch | SiblingBranches | AllBranches
-- | SynchronyScope : define which groups should be considered by the synchronic clustering
data SynchronyScope =
SingleBranch
-- consider only the groups belonging to the same branch
| SiblingBranches
-- consider only the groups belonging to sibling branches
-- (ie. branches that split at the same level of similarity)
| AllBranches
-- consider every group of every branch
deriving (Show,Generic,Eq, ToSchema)
data SynchronyStrategy = MergeRegularGroups | MergeAllGroups
-- | SynchronyStrategy : define which groups should be merged when satisfying the synchronic clustering
data SynchronyStrategy =
MergeRegularGroups
-- only merge groups that don't contain emerging or declining ngrams
| MergeAllGroups
-- merge every group
deriving (Show,Generic,Eq)
instance ToSchema SynchronyStrategy where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
-- | Synchrony : define the synchronic clustering strategy
data Synchrony =
ByProximityThreshold
-- select all groups that satisfy a given similarity threshold
-- (carries both a SynchronyScope and a SynchronyStrategy field)
{ _bpt_threshold :: Double
, _bpt_sensibility :: Double
, _bpt_scope :: SynchronyScope
, _bpt_strategy :: SynchronyStrategy }
| ByProximityDistribution
-- select the top x groups sorted by similarity
-- (carries a SynchronyStrategy field but no SynchronyScope field)
{ _bpd_sensibility :: Double
, _bpd_strategy :: SynchronyStrategy }
deriving (Show,Generic,Eq)
......@@ -113,7 +157,10 @@ instance ToSchema Synchrony where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
-- | TimeUnit : define the temporal granularity
-- period = size of a PhyloPeriod
-- step = step between two subsequent PhyloPeriod
-- frame = maximum number of PhyloPeriod considered for intertemporal matching
data TimeUnit =
Epoch
{ _epoch_period :: Int
......@@ -141,18 +188,28 @@ instance ToSchema TimeUnit where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
data MaxCliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq)
-- | MaxCliqueFilter : define a strategy for computing MaxClique
data MaxCliqueFilter =
ByThreshold
-- consider ngrams whose confidence probability satisfies a given threshold
| ByNeighbours
-- consider the top x ngrams sorted by confidence probability
deriving (Show,Generic,Eq)
instance ToSchema MaxCliqueFilter where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
-- | Cluster : define a ngrams clustering method for computing PhyloGroups
-- Reference : Uno, Takeaki et al. “LCM ver. 2: Efficient Mining Algorithms for Frequent/Closed/Maximal Itemsets.”
-- Workshop on Frequent Itemset Mining Implementations (2004).
data Cluster =
Fis
-- frequent item set can be filtered by support and size
{ _fis_support :: Int
, _fis_size :: Int }
| MaxClique
-- max clique can be filtered by size and threshold
{ _mcl_size :: Int
, _mcl_threshold :: Double
, _mcl_filter :: MaxCliqueFilter }
......@@ -162,15 +219,21 @@ instance ToSchema Cluster where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
-- | Quality : define a level of observation
data Quality =
Quality { _qua_granularity :: Double
, _qua_minBranch :: Int }
Quality
-- _qua_granularity <=> level of observation or λ in (chavalarias, lobbe & delanoe 2021)
-- if λ = 0 then we have one big branch
-- if λ = 1 then we have many little branches
{ _qua_granularity :: Double
, _qua_minBranch :: Int }
deriving (Show,Generic,Eq)
instance ToSchema Quality where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_qua_")
-- | PhyloConfig : full list of parameters used to reconstruct a Phylomemy from the command line
data PhyloConfig =
PhyloConfig { corpusPath :: FilePath
, listPath :: FilePath
......@@ -197,6 +260,8 @@ data PhyloConfig =
-- | SubConfig API & 1Click | --
--------------------------------
-- | PhyloSubConfigAPI : selected list of parameters used to reconstruct a Phylomemy from the API
data PhyloSubConfigAPI =
PhyloSubConfigAPI { _sc_phyloProximity :: Double
, _sc_phyloSynchrony :: Double
......@@ -223,6 +288,8 @@ subConfigAPI2config subConfig = defaultConfig
-- | SubConfig 1Click | --
--------------------------
-- | defaultConfig : default configuration used by the 1'Click feature
defaultConfig :: PhyloConfig
defaultConfig =
PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
......@@ -304,7 +371,7 @@ instance FromJSON Quality
instance ToJSON Quality
-- | Software parameters
-- | Software : software parameters
data Software =
Software { _software_name :: Text
, _software_version :: Text
......@@ -314,14 +381,13 @@ instance ToSchema Software where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_software_")
defaultSoftware :: Software
defaultSoftware =
Software { _software_name = pack "GarganText"
, _software_version = pack "v5" }
-- | Global parameters of a Phylo
-- | PhyloParam : global parameters of a Phylo
data PhyloParam =
PhyloParam { _phyloParam_version :: Text
, _phyloParam_software :: Software
......@@ -332,7 +398,6 @@ instance ToSchema PhyloParam where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phyloParam_")
defaultPhyloParam :: PhyloParam
defaultPhyloParam =
PhyloParam { _phyloParam_version = pack "v3"
......@@ -353,14 +418,18 @@ type DateStr = Text
-- | Ngrams : a contiguous sequence of n terms
type Ngrams = Text
-- Document : a piece of Text linked to a Date
-- date = computational date; date' = original string date yyyy-mm-dd
-- Export Database to Document
-- | Document : a piece of Text linked to a Date
data Document = Document
{ date :: Date -- datatype Date {unDate :: Int}
, date' :: DateStr -- show date
{ date :: Date
-- the Int date used to compute the periods, groups, etc.
-- created by toPhyloDate in Gargantext.Core.Viz.Phylo.API.Tools
, date' :: DateStr
-- the original String date (yyyy-mm-dd) that will be displayed in the interface
-- created by toPhyloDate' in Gargantext.Core.Viz.Phylo.API.Tools
, text :: [Ngrams]
, weight :: Maybe Double
-- a Double attached to each Document that will be used to set up the size of the phylogroup in the interface
-- only taken into account when CorpusParser is Csv'
, sources :: [Text]
, docTime :: TimeUnit
} deriving (Eq,Show,Generic,NFData)
......@@ -371,12 +440,14 @@ data Document = Document
--------------------
-- | The Foundations of a Phylo created from a given TermList
-- | PhyloFoundations : store and index all the ngrams (named roots) that will appear in the Phylomemy
data PhyloFoundations = PhyloFoundations
{ _foundations_roots :: (Vector Ngrams)
, _foundations_rootsInGroups :: Map Int [PhyloGroupId] -- map of roots associated to groups
} deriving (Generic, Show, Eq)
-- | PhyloCounts : store various counters related to roots or dates
data PhyloCounts = PhyloCounts
{ coocByDate :: !(Map Date Cooc)
, docsByDate :: !(Map Date Double)
......@@ -386,9 +457,12 @@ data PhyloCounts = PhyloCounts
, lastRootsFreq :: !(Map Int Double)
} deriving (Generic, Show, Eq)
-- | PhyloSources : store sources that will be used in the interface to highlight some PhyloGroups
data PhyloSources = PhyloSources
{ _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
instance ToSchema PhyloFoundations where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
instance ToSchema PhyloCounts where
......@@ -521,10 +595,13 @@ type Thr = Double
-- | Pointer : A weighted pointer to a given PhyloGroup
type Pointer = (PhyloGroupId, Weight)
-- | Pointer' : A weighted pointer to a given PhyloGroup with a lower bounded threshold
type Pointer' = (PhyloGroupId, (Thr,Weight))
data Filiation = ToParents | ToChilds | ToParentsMemory | ToChildsMemory deriving (Generic, Show)
data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show)
......@@ -535,6 +612,8 @@ data PointerType = TemporalPointer | ScalePointer deriving (Generic, Show)
-- | Support : Number of Documents where a Cluster occurs
type Support = Int
-- | Clustering : define the structure of a cluster of ngrams
data Clustering = Clustering
{ _clustering_roots :: [Int]
, _clustering_support :: Support
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment