Commit 92b4221b authored by qlobbe's avatar qlobbe

add branching

parent d3097207
Pipeline #548 failed with stage
......@@ -128,16 +128,16 @@ csvToCorpus limit path = Vector.toList
-- | To use the correct parser given a CorpusType
fileToCorpus :: CorpusParser -> Int -> FilePath -> IO ([(Int,Text)])
fileToCorpus parser limit path = case parser of
Wos -> wosToCorpus limit path
Csv -> csvToCorpus limit path
fileToCorpus :: CorpusParser -> FilePath -> IO ([(Int,Text)])
fileToCorpus parser path = case parser of
Wos limit -> wosToCorpus limit path
Csv limit -> csvToCorpus limit path
-- | To parse a file into a list of Document
fileToDocs :: CorpusParser -> Int -> FilePath -> TermList -> IO [Document]
fileToDocs parser limit path lst = do
corpus <- fileToCorpus parser limit path
fileToDocs :: CorpusParser -> FilePath -> TermList -> IO [Document]
fileToDocs parser path lst = do
corpus <- fileToCorpus parser path
let patterns = buildPatterns lst
pure $ map ( (\(y,t) -> Document y t) . filterTerms patterns) corpus
......@@ -162,7 +162,7 @@ main = do
printIOMsg "Parse the corpus"
mapList <- csvGraphTermList (listPath config)
corpus <- fileToDocs (corpusParser config) (corpusLimit config) (corpusPath config) mapList
corpus <- fileToDocs (corpusParser config) (corpusPath config) mapList
printIOComment (show (length corpus) <> " parsed docs from the corpus")
printIOMsg "Reconstruct the Phylo"
......
......@@ -50,46 +50,63 @@ import Control.Lens (makeLenses)
----------------
data CorpusParser = Wos | Csv deriving (Show,Generic,Eq)
data CorpusParser =
Wos {_wos_limit :: Int}
| Csv {_csv_limit :: Int}
deriving (Show,Generic,Eq)
data Proximity =
WeightedLogJaccard
{ _wlj_sensibility :: Double
, _wlj_thresholdInit :: Double
, _wlj_thresholdStep :: Double }
| Hamming
deriving (Show,Generic,Eq)
data TimeUnit =
Year
{ _year_period :: Int
, _year_step :: Int
, _year_matchingFrame :: Int }
deriving (Show,Generic,Eq)
data ContextualUnit =
Fis
{ _fis_support :: Int
, _fis_size :: Int }
deriving (Show,Generic,Eq)
data Proximity = WeightedLogJaccard {_sensibility :: Double}
| Hamming
deriving (Show,Generic,Eq)
data Config =
Config { corpusPath :: FilePath
, listPath :: FilePath
, outputPath :: FilePath
, corpusParser :: CorpusParser
, corpusLimit :: Int
, phyloName :: Text
, phyloLevel :: Int
, phyloQuality :: Double
, phyloProximity :: Proximity
, timeUnit :: Int
, maxTimeMatch :: Int
, timePeriod :: Int
, timeStep :: Int
, fisSupport :: Int
, fisSize :: Int
, timeUnit :: TimeUnit
, contextualUnit :: ContextualUnit
, branchSize :: Int
} deriving (Show,Generic,Eq)
defaultConfig :: Config
defaultConfig =
Config { corpusPath = ""
, listPath = ""
, outputPath = ""
, corpusParser = Csv
, corpusLimit = 1000
, corpusParser = Csv 1000
, phyloName = pack "Default Phylo"
, phyloLevel = 2
, phyloProximity = WeightedLogJaccard 10
, timeUnit = 1
, maxTimeMatch = 5
, timePeriod = 3
, timeStep = 1
, fisSupport = 2
, fisSize = 4
, phyloQuality = 0.5
, phyloProximity = WeightedLogJaccard 10 0 0.05
, timeUnit = Year 3 1 5
, contextualUnit = Fis 2 4
, branchSize = 3
}
......@@ -99,6 +116,10 @@ instance FromJSON CorpusParser
instance ToJSON CorpusParser
instance FromJSON Proximity
instance ToJSON Proximity
instance FromJSON TimeUnit
instance ToJSON TimeUnit
instance FromJSON ContextualUnit
instance ToJSON ContextualUnit
-- | Software parameters
......@@ -237,7 +258,7 @@ data PhyloGroup =
, _phylo_groupLevelChilds :: [Pointer]
, _phylo_groupPeriodParents :: [Pointer]
, _phylo_groupPeriodChilds :: [Pointer]
, _phylo_groupBreakPointer :: Maybe Pointer
, _phylo_groupGhostPointers :: [Pointer]
}
deriving (Generic, Show, Eq)
......@@ -276,6 +297,9 @@ data PhyloFis = PhyloFis
----------------
makeLenses ''Config
makeLenses ''Proximity
makeLenses ''ContextualUnit
makeLenses ''TimeUnit
makeLenses ''PhyloFoundations
makeLenses ''PhyloFis
makeLenses ''Phylo
......
......@@ -49,7 +49,7 @@ phylo1 = appendGroups fisToGroup 1 phyloFis phyloBase
phyloFis :: Map (Date,Date) [PhyloFis]
phyloFis = toPhyloFis docsByPeriods (fisSupport config) (fisSize config)
phyloFis = toPhyloFis docsByPeriods (getFisSupport $ contextualUnit config) (getFisSize $ contextualUnit config)
docsByPeriods :: Map (Date,Date) [Document]
......@@ -66,23 +66,22 @@ phyloBase = toPhyloBase docs mapList config
phyloCooc :: Map Date Cooc
phyloCooc = docsToCoocByYear docs (foundations ^. foundations_roots) config
phyloCooc = docsToTimeScaleCooc docs (foundations ^. foundations_roots) config
periods :: [(Date,Date)]
periods = toPeriods (sort $ nub $ map date docs) (timePeriod config) (timeStep config)
periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit config) (getTimeStep $ timeUnit config)
nbDocsByYear :: Map Date Double
nbDocsByYear = nbDocsByTime docs (timeUnit config)
nbDocsByYear = docsToTimeScaleNb docs
config :: Config
config =
defaultConfig { phyloName = "Cesar et Cleopatre"
, branchSize = 0
, fisSupport = 0
, fisSize = 0 }
, contextualUnit = Fis 0 0 }
docs :: [Document]
......
......@@ -87,8 +87,7 @@ fisToGroup fis pId lvl idx fdt coocs =
ngrams
(ngramsToCooc ngrams coocs)
(1,[])
[] [] [] []
Nothing
[] [] [] [] []
toPhylo1 :: [Document] -> Phylo -> Phylo
......@@ -96,7 +95,7 @@ toPhylo1 docs phyloBase = appendGroups fisToGroup 1 phyloFis phyloBase
where
--------------------------------------
phyloFis :: Map (Date,Date) [PhyloFis]
phyloFis = toPhyloFis docs' (fisSupport $ getConfig phyloBase) (fisSize $ getConfig phyloBase)
phyloFis = toPhyloFis docs' (getFisSupport $ contextualUnit $ getConfig phyloBase) (getFisSize $ contextualUnit $ getConfig phyloBase)
--------------------------------------
docs' :: Map (Date,Date) [Document]
docs' = groupDocsByPeriod date (getPeriodIds phyloBase) docs
......@@ -174,14 +173,14 @@ ngramsToCooc ngrams coocs =
-- | To transform the docs into a time map of coocurency matrix
docsToCoocByYear :: [Document] -> Vector Ngrams -> Config -> Map Date Cooc
docsToCoocByYear docs fdt conf =
docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Config -> Map Date Cooc
docsToTimeScaleCooc docs fdt conf =
let mCooc = fromListWith sumCooc
$ map (\(_d,l) -> (_d, listToMatrix l))
$ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
mCooc' = fromList
$ map (\t -> (t,empty))
$ toTimeScale (map date docs) (timeUnit conf)
$ toTimeScale (map date docs) 1
in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
$ unionWith sumCooc mCooc mCooc'
......@@ -208,11 +207,11 @@ groupDocsByPeriod f pds es =
--------------------------------------
-- | To count the number of docs by unit of time (like a year)
nbDocsByTime :: [Document] -> Int -> Map Date Double
nbDocsByTime docs step =
-- | To count the number of docs by unit of time
docsToTimeScaleNb :: [Document] -> Map Date Double
docsToTimeScaleNb docs =
let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') step
time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
$ unionWith (+) time docs'
......@@ -227,10 +226,10 @@ toPhyloBase :: [Document] -> TermList -> Config -> Phylo
toPhyloBase docs lst conf =
let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
params = defaultPhyloParam { _phyloParam_config = conf }
periods = toPeriods (sort $ nub $ map date docs) (timePeriod conf) (timeStep conf)
periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
$ Phylo foundations
(docsToCoocByYear docs (foundations ^. foundations_roots) conf)
(nbDocsByTime docs $ timeUnit conf)
(docsToTimeScaleCooc docs (foundations ^. foundations_roots) conf)
(docsToTimeScaleNb docs)
params
(fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels (phyloLevel conf) prd))) periods)
......@@ -91,6 +91,18 @@ toTimeScale dates step =
in [start, (start + step) .. end]
getTimeStep :: TimeUnit -> Int
getTimeStep time = case time of
Year _ s _ -> s
getTimePeriod :: TimeUnit -> Int
getTimePeriod time = case time of
Year p _ _ -> p
getTimeFrame :: TimeUnit -> Int
getTimeFrame time = case time of
Year _ _ f -> f
-------------
-- | Fis | --
-------------
......@@ -136,6 +148,22 @@ traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map l
<> "Clique : " <> (traceClique mFis) <> "\n" ) mFis
-------------------------
-- | Contextual unit | --
-------------------------
getFisSupport :: ContextualUnit -> Int
getFisSupport unit = case unit of
Fis s _ -> s
_ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a support")
getFisSize :: ContextualUnit -> Int
getFisSize unit = case unit of
Fis _ s -> s
_ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a clique size")
--------------
-- | Cooc | --
--------------
......@@ -223,10 +251,31 @@ updatePhyloGroups lvl m phylo =
-- | Pointers | --
------------------
pointersToLinks :: PhyloGroupId -> [Pointer] -> [Link]
pointersToLinks id pointers = map (\p -> ((id,fst p),snd p)) pointers
mergeLinks :: [Link] -> [Link] -> [Link]
mergeLinks toChilds toParents =
let toChilds' = fromList $ map (\((from,to),w) -> ((to,from),w)) toChilds
in toList $ unionWith max (fromList toParents) toChilds'
\ No newline at end of file
in toList $ unionWith max (fromList toParents) toChilds'
-------------------
-- | Proximity | --
-------------------
getSensibility :: Proximity -> Double
getSensibility proxi = case proxi of
WeightedLogJaccard s _ _ -> s
Hamming -> undefined
getThresholdInit :: Proximity -> Double
getThresholdInit proxi = case proxi of
WeightedLogJaccard _ t _ -> t
Hamming -> undefined
getThresholdStep :: Proximity -> Double
getThresholdStep proxi = case proxi of
WeightedLogJaccard _ _ s -> s
Hamming -> undefined
\ No newline at end of file
......@@ -19,11 +19,19 @@ import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Data.List (foldl', (++), null, intersect, (\\), union, nub, concat)
--------------------
-- | Clustering | --
--------------------
relatedComponents :: [PhyloGroup] -> [[PhyloGroup]]
relatedComponents groups = undefined
\ No newline at end of file
relatedComponents :: Eq a => [[a]] -> [[a]]
relatedComponents graphs = foldl' (\mem groups ->
if (null mem)
then mem ++ [groups]
else
let related = filter (\groups' -> (not . null) $ intersect groups groups') mem
in if (null related)
then mem ++ [groups]
else (mem \\ related) ++ [union groups (nub $ concat related)] ) [] graphs
\ No newline at end of file
......@@ -16,7 +16,7 @@ Portability : POSIX
module Gargantext.Viz.Phylo.TemporalMatching where
import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, any, nub, union)
import Data.Map (Map, fromList, toList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith)
import Data.Map (Map, fromList, toList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith, member, (!))
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
......@@ -25,6 +25,8 @@ import Gargantext.Viz.Phylo.SynchronicClustering
import Control.Lens hiding (Level)
import qualified Data.Set as Set
-------------------
-- | Proximity | --
......@@ -73,8 +75,8 @@ weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
-- | To choose a proximity function
pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
WeightedLogJaccard sens -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
Hamming -> undefined
WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
Hamming -> undefined
-- | To process the proximity between a current group and a pair of targets group
......@@ -96,23 +98,20 @@ toProximity docs proximity group target target' =
-- | Find pairs of valuable candidates to be matched
makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> Double -> Map Date Double -> Proximity -> PhyloGroup -> [(PhyloGroup,PhyloGroup)]
makePairs candidates periods thr docs proximity group = case null periods of
makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> Map Date Double -> PhyloGroup -> [(PhyloGroup,PhyloGroup)]
makePairs candidates periods docs group = case null periods of
True -> []
-- | at least on of the pair candidates should be from the last added period
False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
|| (inLastPeriod cdt' periods))
$ listToKeys
-- | remove poor candidates from previous periods
$ filter (\cdt -> (inLastPeriod cdt periods)
|| ((toProximity (reframeDocs docs periods) proximity group group cdt) >= thr)) candidates
$ listToKeys candidates
where
inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Double -> Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup
phyloGroupMatching candidates fil thr docs proxi group = case pointers of
phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> PhyloGroup -> PhyloGroup
phyloGroupMatching candidates fil proxi docs group = case pointers of
Nothing -> addPointers group fil TemporalPointer []
Just pts -> addPointers group fil TemporalPointer
$ head' "phyloGroupMatching"
......@@ -126,18 +125,55 @@ phyloGroupMatching candidates fil thr docs proxi group = case pointers of
-- | for each time frame, process the proximity on relevant pairs of targeted groups
$ scanl (\acc groups ->
let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups
pairs = makePairs (concat groups) periods thr docs proxi group
in acc ++ ( filter (\(_,proximity) -> proximity >= thr )
$ concat
pairs = makePairs (concat groups) periods docs group
in acc ++ ( concat
$ map (\(c,c') ->
-- | process the proximity between the current group and a pair of candidates
let proximity = toProximity (reframeDocs docs periods) proxi group c c'
let proximity = toProximity (filterDocs docs periods) proxi group c c'
in if (c == c')
then [(getGroupId c,proximity)]
else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
) []
-- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
$ inits candidates
--------------------------------------
filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
filterDocs d pds = restrictKeys d $ periodsToYears pds
------------------
-- | Pointers | --
------------------
-- ghostHunter :: [[PhyloGroup]] -> [[PhyloGroup]]
-- ghostHunter branches =
-- map (\branch ->
-- -- | il manque une référence au group source de chaque pointer
-- let pointers = elems $ fromList
-- $ map (\pt -> (groupIds ! (fst pt),pt))
-- $ filter (\pt -> member (fst pt) groupIds) $ concat $ map (\g -> g ^. phylo_groupGhostPointers) branch
-- in undefined
-- ) branches
-- where
-- groupIds :: Map PhyloGroupId Int
-- groupIds = fromList $ map (\g -> (getGroupId g, last' "ghostHunter" $ snd $ g ^. phylo_groupBranchId)) $ concat branches
-- --------------------------------------
-- selectBest :: [Pointers] -> [Pointers]
-- se
filterPointers :: Double -> [PhyloGroup] -> [PhyloGroup]
filterPointers thr groups =
map (\group ->
let ghosts = filter (\(_,w) -> w < thr) $ group ^. phylo_groupPeriodParents
in group & phylo_groupPeriodParents %~ (filter (\(_,w) -> w >= thr))
& phylo_groupPeriodChilds %~ (filter (\(_,w) -> w >= thr))
& phylo_groupGhostPointers %~ (++ ghosts)
) groups
-----------------------------
......@@ -174,14 +210,26 @@ toBranchQuality :: [[PhyloGroup]] -> [(Double,[PhyloGroup])]
toBranchQuality branches = undefined
reframeDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
reframeDocs docs periods = restrictKeys docs $ periodsToYears periods
groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
groupsToBranches groups =
-- | run the related component algorithm
let graph = zip [1..]
$ relatedComponents
$ map (\group -> [getGroupId group]
++ (map fst $ group ^. phylo_groupPeriodParents)
++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
-- | update each group's branch id
in map (\(bId,ids) ->
map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
$ elems $ restrictKeys groups (Set.fromList ids)
) graph
-- findGhostLinks :: [Link] -> [[Link]] -> Map PhyloGroupId
adaptativeMatching :: Int -> Double -> Double -> Double -> Map Date Double -> Proximity -> [PhyloGroup] -> [PhyloGroup] -> [PhyloPeriodId] -> [PhyloGroup]
adaptativeMatching maxTime thrStep thrMatch thrQua docs proximity groups candidates periods =
adaptativeMatching :: Proximity -> Double -> Double -> [PhyloGroup] -> [PhyloGroup]
adaptativeMatching proximity thr thrQua groups =
-- | check if we should break some of the new branches or not
case shouldBreak thrQua branches' of
True -> concat $ map (\(s,b) ->
......@@ -190,12 +238,7 @@ adaptativeMatching maxTime thrStep thrMatch thrQua docs proximity groups candida
then b
-- | we break the branch using an increased temporal matching threshold
else let nextGroups = undefined
nextCandidates = undefined
nextPeriods = undefined
in adaptativeMatching maxTime thrStep (thrMatch + thrStep) thrQua
(reframeDocs docs nextPeriods)
proximity
nextGroups nextCandidates nextPeriods
in adaptativeMatching proximity (thr + (getThresholdStep proximity)) thrQua nextGroups
) branches'
-- | the quality of all the new branches is sufficient
False -> concat branches
......@@ -205,25 +248,41 @@ adaptativeMatching maxTime thrStep thrMatch thrQua docs proximity groups candida
branches' = toBranchQuality branches
-- | 2) group the new groups into branches
branches :: [[PhyloGroup]]
branches = relatedComponents groups'
-- | 1) connect each group to its parents and childs
branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
-- | 1) filter the pointers of each groups regarding the current state of the quality threshold
groups' :: [PhyloGroup]
groups' = map (\group ->
let childs = getCandidates ToChilds group
(getNextPeriods ToChilds maxTime (group ^. phylo_groupPeriod) periods) candidates
parents = getCandidates ToParents group
(getNextPeriods ToParents maxTime (group ^. phylo_groupPeriod) periods) candidates
-- | match the group to its possible childs then parents
in phyloGroupMatching parents ToParents thrMatch docs proximity
$ phyloGroupMatching childs ToChilds thrMatch docs proximity group
) groups
groups' = filterPointers thr groups
temporalMatching :: Phylo -> Phylo
temporalMatching phylo =
let branches = fromList $ map (\g -> (getGroupId g, g))
$ adaptativeMatching (maxTimeMatch $ getConfig phylo) 0 0 0
(phylo ^. phylo_timeDocs)
(phyloProximity $ getConfig phylo)
(getGroupsFromLevel 1 phylo) (getGroupsFromLevel 1 phylo) (getPeriodIds phylo)
in updatePhyloGroups 1 branches phylo
\ No newline at end of file
temporalMatching phylo = updatePhyloGroups 1 branches phylo
where
-- | 4) find the ghost links and postprocess the branches
branches' :: Map PhyloGroupId PhyloGroup
branches' = undefined
-- | 3) run the adaptative matching to find the best repartition among branches
branches :: Map PhyloGroupId PhyloGroup
branches = fromList
$ map (\g -> (getGroupId g, g))
$ adaptativeMatching proximity (getThresholdInit proximity) (phyloQuality $ getConfig phylo) groups'
-- | 2) for each group process an initial temporal Matching
groups' :: [PhyloGroup]
groups' =
let maxTime = getTimeFrame $ timeUnit $ getConfig phylo
periods = getPeriodIds phylo
docs = phylo ^. phylo_timeDocs
--------------------------------------
in map (\group ->
let childs = getCandidates ToChilds group
(getNextPeriods ToChilds maxTime (group ^. phylo_groupPeriod) periods) groups
parents = getCandidates ToParents group
(getNextPeriods ToParents maxTime (group ^. phylo_groupPeriod) periods) groups
in phyloGroupMatching parents ToParents proximity docs
$ phyloGroupMatching childs ToChilds proximity docs group
) groups
-- | 1) start with all the groups from a given level
groups :: [PhyloGroup]
groups = getGroupsFromLevel 1 phylo
--------------------------------------
proximity :: Proximity
proximity = phyloProximity $ getConfig phylo
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment