Commit 406ae431 authored by qlobbe's avatar qlobbe

optimisation for temporal matching

parent b0826576
...@@ -91,6 +91,12 @@ data ContextualUnit = ...@@ -91,6 +91,12 @@ data ContextualUnit =
deriving (Show,Generic,Eq) deriving (Show,Generic,Eq)
data Quality =
Quality { _qua_relevance :: Double
, _qua_minBranch :: Int }
deriving (Show,Generic,Eq)
data Config = data Config =
Config { corpusPath :: FilePath Config { corpusPath :: FilePath
, listPath :: FilePath , listPath :: FilePath
...@@ -100,6 +106,7 @@ data Config = ...@@ -100,6 +106,7 @@ data Config =
, phyloLevel :: Int , phyloLevel :: Int
, phyloProximity :: Proximity , phyloProximity :: Proximity
, phyloSynchrony :: Synchrony , phyloSynchrony :: Synchrony
, phyloQuality :: Quality
, timeUnit :: TimeUnit , timeUnit :: TimeUnit
, contextualUnit :: ContextualUnit , contextualUnit :: ContextualUnit
, exportLabel :: [PhyloLabel] , exportLabel :: [PhyloLabel]
...@@ -118,8 +125,9 @@ defaultConfig = ...@@ -118,8 +125,9 @@ defaultConfig =
, phyloLevel = 1 , phyloLevel = 1
, phyloProximity = WeightedLogJaccard 10 0 0.1 , phyloProximity = WeightedLogJaccard 10 0 0.1
, phyloSynchrony = ByProximityDistribution 0 , phyloSynchrony = ByProximityDistribution 0
, phyloQuality = Quality 1 1
, timeUnit = Year 3 1 5 , timeUnit = Year 3 1 5
, contextualUnit = Fis 2 4 , contextualUnit = Fis 1 4
, exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2] , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
, exportSort = ByHierarchy , exportSort = ByHierarchy
, exportFilter = [ByBranchSize 2] , exportFilter = [ByBranchSize 2]
...@@ -147,6 +155,8 @@ instance FromJSON Filter ...@@ -147,6 +155,8 @@ instance FromJSON Filter
instance ToJSON Filter instance ToJSON Filter
instance FromJSON Synchrony instance FromJSON Synchrony
instance ToJSON Synchrony instance ToJSON Synchrony
instance FromJSON Quality
instance ToJSON Quality
-- | Software parameters -- | Software parameters
...@@ -362,6 +372,7 @@ data PhyloExport = ...@@ -362,6 +372,7 @@ data PhyloExport =
makeLenses ''Config makeLenses ''Config
makeLenses ''Proximity makeLenses ''Proximity
makeLenses ''Quality
makeLenses ''ContextualUnit makeLenses ''ContextualUnit
makeLenses ''PhyloLabel makeLenses ''PhyloLabel
makeLenses ''TimeUnit makeLenses ''TimeUnit
......
...@@ -83,7 +83,7 @@ queryViewEx = "level=3" ...@@ -83,7 +83,7 @@ queryViewEx = "level=3"
phyloQueryView :: PhyloQueryView phyloQueryView :: PhyloQueryView
phyloQueryView = PhyloQueryView 2 Merge False 2 [BranchAge,BranchBirth,BranchGroups] [] [BranchPeakInc,GroupLabelIncDyn] (Just (ByBranchBirth,Asc)) Json Flat True phyloQueryView = PhyloQueryView 1 Merge False 1 [BranchAge,BranchBirth,BranchGroups] [] [BranchPeakInc,GroupLabelIncDyn] (Just (ByBranchBirth,Asc)) Json Flat True
-------------------------------------------------- --------------------------------------------------
...@@ -110,7 +110,7 @@ queryEx = "title=Cesar et Cleôpatre" ...@@ -110,7 +110,7 @@ queryEx = "title=Cesar et Cleôpatre"
phyloQueryBuild :: PhyloQueryBuild phyloQueryBuild :: PhyloQueryBuild
phyloQueryBuild = PhyloQueryBuild "Cesar et Cleôpatre" "An example of Phylomemy (french without accent)" phyloQueryBuild = PhyloQueryBuild "Cesar et Cleôpatre" "An example of Phylomemy (french without accent)"
3 1 defaultFis [] [] (WeightedLogJaccard $ WLJParams 0.5 20) 5 0.8 0.5 4 2 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.3 0) 3 1 defaultFis [] [] (WeightedLogJaccard $ WLJParams 0.9 10) 5 0.8 0.5 4 1 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.3 0)
......
...@@ -156,6 +156,8 @@ mergePointers groups = ...@@ -156,6 +156,8 @@ mergePointers groups =
exportToDot :: Phylo -> PhyloExport -> DotGraph DotId exportToDot :: Phylo -> PhyloExport -> DotGraph DotId
exportToDot phylo export = exportToDot phylo export =
trace ("\n-- | Convert " <> show(length $ export ^. export_branches) <> " branches and "
<> show(length $ export ^. export_groups) <> " groups to a dot file\n") $
digraph ((Str . fromStrict) $ (phyloName $ getConfig phylo)) $ do digraph ((Str . fromStrict) $ (phyloName $ getConfig phylo)) $ do
-- | 1) init the dot graph -- | 1) init the dot graph
...@@ -238,10 +240,12 @@ filterByBranchSize thr export = ...@@ -238,10 +240,12 @@ filterByBranchSize thr export =
& export_groups %~ (filter (\g -> not $ elem (g ^. phylo_groupBranchId) (map _branch_id $ snd branches'))) & export_groups %~ (filter (\g -> not $ elem (g ^. phylo_groupBranchId) (map _branch_id $ snd branches')))
processFilters :: [Filter] -> PhyloExport -> PhyloExport processFilters :: [Filter] -> Quality -> PhyloExport -> PhyloExport
processFilters filters export = processFilters filters qua export =
foldl (\export' f -> case f of foldl (\export' f -> case f of
ByBranchSize thr -> filterByBranchSize thr export' ByBranchSize thr -> if (thr < (fromIntegral $ qua ^. qua_minBranch))
then filterByBranchSize (fromIntegral $ qua ^. qua_minBranch) export'
else filterByBranchSize thr export'
) export filters ) export filters
-------------- --------------
...@@ -439,7 +443,7 @@ processDynamics groups = ...@@ -439,7 +443,7 @@ processDynamics groups =
toPhyloExport :: Phylo -> DotGraph DotId toPhyloExport :: Phylo -> DotGraph DotId
toPhyloExport phylo = exportToDot phylo toPhyloExport phylo = exportToDot phylo
$ processFilters (exportFilter $ getConfig phylo) $ processFilters (exportFilter $ getConfig phylo) (phyloQuality $ getConfig phylo)
$ processSort (exportSort $ getConfig phylo) $ processSort (exportSort $ getConfig phylo)
$ processLabels (exportLabel $ getConfig phylo) (getRoots phylo) $ processLabels (exportLabel $ getConfig phylo) (getRoots phylo)
$ processMetrics export $ processMetrics export
......
...@@ -15,14 +15,14 @@ Portability : POSIX ...@@ -15,14 +15,14 @@ Portability : POSIX
module Gargantext.Viz.Phylo.TemporalMatching where module Gargantext.Viz.Phylo.TemporalMatching where
import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, elemIndex, (!!), dropWhile, partition) import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, elemIndex, (!!), dropWhile)
import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, filterWithKey) import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, filterWithKey, keys, (!))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools import Gargantext.Viz.Phylo.PhyloTools
import Prelude (logBase) -- import Prelude (logBase)
import Control.Lens hiding (Level) import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parList, rdeepseq, using) import Control.Parallel.Strategies (parList, rdeepseq, using)
-- import Debug.Trace (trace) -- import Debug.Trace (trace)
...@@ -98,90 +98,84 @@ toProximity docs proximity ego target target' = ...@@ -98,90 +98,84 @@ toProximity docs proximity ego target target' =
-- | Local Matching | -- -- | Local Matching | --
------------------------ ------------------------
toLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
toLastPeriod fil periods = case fil of
ToParents -> head' "toLastPeriod" (sortOn fst periods)
ToChilds -> last' "toLastPeriod" (sortOn fst periods)
toLazyPairs :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)]
toLazyPairs pointers fil thr prox prd pairs =
if null pointers then pairs
else let rest = filterPointers prox thr pointers
in if null rest
then let prd' = toLastPeriod fil (map (fst . fst . fst) pointers)
in if prd' == prd
then []
else filter (\(g,g') ->
case fil of
ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst prd'))
|| ((fst $ g' ^. phylo_groupPeriod) < (fst prd'))
ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst prd'))
|| ((fst $ g' ^. phylo_groupPeriod) > (fst prd'))) pairs
else []
-- | Find pairs of valuable candidates to be matched -- | Find pairs of valuable candidates to be matched
makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> [PhyloPeriodId] -> [(PhyloGroup,PhyloGroup)] makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> [(PhyloGroup,PhyloGroup)]
makePairs candidates periods periods' = case null periods of makePairs' ego candidates periods pointers fil thr prox docs =
case null periods of
True -> [] True -> []
False -> toLazyPairs pointers fil thr prox lastPrd
-- | at least on of the pair candidates should be from the last added period -- | at least on of the pair candidates should be from the last added period
False -> filter (\(cdt,cdt') -> $ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd)
((inLastPeriod cdt periods) || (inLastPeriod cdt' periods)) || ((g' ^. phylo_groupPeriod) == lastPrd))
&& (not $ inOldPeriods cdt periods') $ listToKeys
&& (not $ inOldPeriods cdt' periods')) $ filter (\g -> (g ^. phylo_groupPeriod == lastPrd)
$ listToKeys candidates || ((toProximity docs prox ego ego g) >= thr)) candidates
where where
inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool lastPrd :: PhyloPeriodId
inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds) lastPrd = toLastPeriod fil periods
--------------------------------------
inOldPeriods :: PhyloGroup -> [PhyloPeriodId] -> Bool
inOldPeriods g prds = elem (g ^. phylo_groupPeriod) prds
keepOldOnes :: Filiation -> Proximity -> Double -> PhyloGroup -> Bool
keepOldOnes fil proxi thr ego = any (\(_,w) -> filterProximity proxi thr w)
$ getPeriodPointers fil ego
filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer] filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
findLastPeriod :: Filiation -> [Pointer] -> PhyloPeriodId
findLastPeriod fil pts = case fil of
ToParents -> head' "findLastPeriod" $ sortOn fst $ map (fst . fst . fst) pts
ToChilds -> head' "findLastPeriod" $ reverse $ sortOn fst $ map (fst . fst . fst) pts
phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
phyloGroupMatching candidates fil proxi docs thr ego = phyloGroupMatching candidates fil proxi docs thr ego =
if keepOldOnes fil proxi thr ego case null nextPointers of
-- | keep some of the old pointers
then addPointers ego fil TemporalPointer
$ filterPointers proxi thr
$ getPeriodPointers fil ego
else case null pointers of
-- | let's find new pointers -- | let's find new pointers
True -> addPointers ego fil TemporalPointer [] True -> if null $ filterPointers proxi thr $ getPeriodPointers fil ego
then addPointers ego fil TemporalPointer []
-- | or keep the old ones
else addPointers ego fil TemporalPointer
$ filterPointers proxi thr $ getPeriodPointers fil ego
False -> addPointers ego fil TemporalPointer False -> addPointers ego fil TemporalPointer
$ head' "phyloGroupMatching" $ head' "phyloGroupMatching"
-- | Keep only the best set of pointers grouped by proximity -- | Keep only the best set of pointers grouped by proximity
$ groupBy (\pt pt' -> snd pt == snd pt') $ groupBy (\pt pt' -> snd pt == snd pt')
$ reverse $ sortOn snd $ head' "pointers" pointers $ reverse $ sortOn snd $ head' "pointers"
$ nextPointers
-- | Find the first time frame where at leats one pointer satisfies the proximity threshold -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
where where
-------------------------------------- nextPointers :: [[Pointer]]
oldPeriods :: [PhyloPeriodId] -> [PhyloPeriodId] nextPointers = take 1
oldPeriods periods =
if (null $ getPeriodPointers fil ego)
then []
else
let period = findLastPeriod fil $ getPeriodPointers fil ego
in fst $ partition (\prd -> case fil of
ToChilds -> prd <= period
ToParents -> prd >= period ) periods
--------------------------------------
pointers :: [[Pointer]]
pointers = take 1
$ dropWhile (null) $ dropWhile (null)
-- | for each time frame, process the proximity on relevant pairs of targeted groups -- | for each time frame, process the proximity on relevant pairs of targeted groups
$ scanl (\acc groups -> $ scanl (\acc groups ->
let periods = nub let periods = nub $ map _phylo_groupPeriod $ concat groups
$ concat $ map (\gs -> if null gs docs' = (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods))
then [] pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs
else [_phylo_groupPeriod $ head' "pointers" gs]) groups
periods' = oldPeriods periods
pairs = makePairs (concat groups) periods periods'
in acc ++ ( filterPointers proxi thr in acc ++ ( filterPointers proxi thr
$ concat $ concat
$ map (\(c,c') -> $ map (\(c,c') ->
-- | process the proximity between the current group and a pair of candidates -- | process the proximity between the current group and a pair of candidates
let proximity = toProximity (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods)) proxi ego c c' let proximity = toProximity docs' proxi ego c c'
in if (c == c') in if (c == c')
then [(getGroupId c,proximity)] then [(getGroupId c,proximity)]
else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs) else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) []
) [] $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
-- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
$ inits candidates
filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
...@@ -215,18 +209,22 @@ getCandidates fil ego targets = ...@@ -215,18 +209,22 @@ getCandidates fil ego targets =
phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup] phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
$ matchByPeriods ToParents -- $ matchByPeriods ToParents
$ groupByField _phylo_groupPeriod -- $ groupByField _phylo_groupPeriod
$ matchByPeriods ToChilds $ matchByPeriods
$ groupByField _phylo_groupPeriod branch $ groupByField _phylo_groupPeriod branch
where where
-------------------------------------- --------------------------------------
matchByPeriods :: Filiation -> Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup] matchByPeriods :: Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
matchByPeriods fil branch' = foldl' (\acc prd -> matchByPeriods branch' = foldl' (\acc prd ->
let periods' = getNextPeriods fil frame prd periods let periodsPar = getNextPeriods ToParents frame prd periods
candidates = map (\prd' -> findWithDefault [] prd' branch') periods' periodsChi = getNextPeriods ToChilds frame prd periods
docs' = filterDocs docs ([prd] ++ periods') candidatesPar = map (\prd' -> findWithDefault [] prd' branch') periodsPar
egos = map (\g -> phyloGroupMatching (getCandidates fil g candidates) fil proximity docs' thr g) candidatesChi = map (\prd' -> findWithDefault [] prd' branch') periodsChi
docsPar = filterDocs docs ([prd] ++ periodsPar)
docsChi = filterDocs docs ([prd] ++ periodsChi)
egos = map (\ego -> phyloGroupMatching (getCandidates ToParents ego candidatesPar) ToParents proximity docsPar thr
$ phyloGroupMatching (getCandidates ToChilds ego candidatesChi) ToChilds proximity docsChi thr ego)
$ findWithDefault [] prd branch' $ findWithDefault [] prd branch'
egos' = egos `using` parList rdeepseq egos' = egos `using` parList rdeepseq
in acc ++ egos' ) [] periods in acc ++ egos' ) [] periods
...@@ -237,48 +235,78 @@ phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatchin ...@@ -237,48 +235,78 @@ phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatchin
----------------------- -----------------------
termFreq :: Int -> [[PhyloGroup]] -> Double count :: Eq a => a -> [a] -> Int
termFreq term branches = (sum $ map (\g -> findWithDefault 0 (term,term) (g ^. phylo_groupCooc)) $ concat branches) count x = length . filter (== x)
/ (sum $ map (\g -> getTrace $ g ^. phylo_groupCooc) $ concat branches)
termFreq' :: Int -> [PhyloGroup] -> Double
termFreq' term groups =
let ngrams = concat $ map _phylo_groupNgrams groups
in (fromIntegral $ count term ngrams)
/ (fromIntegral $ length ngrams)
entropy :: [[PhyloGroup]] -> Double relevantBranches :: Int -> Int -> [[PhyloGroup]] -> [[PhyloGroup]]
entropy branches = relevantBranches term thr branches =
let terms = ngramsInBranches branches filter (\groups -> (length groups >= thr)
in sum $ map (\term -> (1 / log (termFreq term branches)) && (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
/ (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches)
branchCov' :: [PhyloGroup] -> [[PhyloGroup]] -> Double
branchCov' branch branches =
(fromIntegral $ length branch) / (fromIntegral $ length $ concat branches)
toRecall :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
toRecall freq term thr branches =
-- | given a random term in a phylo
freq
-- | for each relevant branches
* (sum $ map (\branch -> * (sum $ map (\branch ->
let q = branchObs term (length $ concat branches) branch -- | given its local coverage
in if (q == 0) ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
then 0 -- | compute the local recall
else - q * logBase 2 q ) branches) ) terms * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
/ (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) $ concat branches'))) branches')
where
branches' :: [[PhyloGroup]]
branches' = relevantBranches term thr branches
toAccuracy :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
toAccuracy freq term thr branches =
-- | given a random term in a phylo
freq
-- | for each relevant branches
* (sum $ map (\branch ->
-- | given its local coverage
((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
-- | compute the local accuracy
* ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
/ (fromIntegral $ length branch))) branches')
where where
-- | Probability to observe a branch given a random term of the phylo branches' :: [[PhyloGroup]]
branchObs :: Int -> Int -> [PhyloGroup] -> Double branches' = relevantBranches term thr branches
branchObs term total branch = (fromIntegral $ length $ filter (\g -> elem term $ g ^. phylo_groupNgrams) branch)
/ (fromIntegral total)
toPhyloQuality' :: Quality -> Map Int Double -> [[PhyloGroup]] -> Double
toPhyloQuality' quality frequency branches =
homogeneity :: [[PhyloGroup]] -> Double if (foldl' (\acc b -> acc && (length b < (quality ^. qua_minBranch))) True branches)
homogeneity branches = -- | the local phylo is composed of small branches
let nbGroups = length $ concat branches then 0
in sum else
$ map (\branch -> (if (length branch == nbGroups) let relevance = quality ^. qua_relevance
then 1 -- | compute the F score for a given relevance
else (1 / log (branchCov branch nbGroups)) in ((1 + relevance ** 2) * accuracy * recall)
/ (sum $ map (\branch' -> 1 / log (branchCov branch' nbGroups)) branches)) / (((relevance ** 2) * accuracy + recall))
* (sum $ map (\term -> (termFreq term branches)
/ (sum $ map (\term' -> termFreq term' branches) $ ngramsInBranches [branch])
* (fromIntegral $ sum $ ngramsInBranches [filter (\g -> elem term $ g ^. phylo_groupNgrams) branch])
/ (fromIntegral $ sum $ ngramsInBranches [branch])
) $ ngramsInBranches [branch]) ) branches
where where
branchCov :: [PhyloGroup] -> Int -> Double terms :: [Int]
branchCov branch total = (fromIntegral $ length branch) / (fromIntegral total) terms = keys frequency
-- | for each term compute the global accuracy
accuracy :: Double
accuracy = sum $ map (\term -> toAccuracy (frequency ! term) term (quality ^. qua_minBranch) branches) terms
-- | for each term compute the global recall
recall :: Double
recall = sum $ map (\term -> toRecall (frequency ! term) term (quality ^. qua_minBranch) branches) terms
toPhyloQuality :: [[PhyloGroup]] -> Double
toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
----------------------------- -----------------------------
...@@ -289,37 +317,44 @@ toPhyloQuality branches = sqrt (homogeneity branches / entropy branches) ...@@ -289,37 +317,44 @@ toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]] groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
groupsToBranches groups = groupsToBranches groups =
-- | run the related component algorithm -- | run the related component algorithm
let graph = zip [1..] let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
$ relatedComponents $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
$ map (\group -> [getGroupId group] $ map (\group -> [getGroupId group]
++ (map fst $ group ^. phylo_groupPeriodParents) ++ (map fst $ group ^. phylo_groupPeriodParents)
++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
-- | first find the related components by inside each ego's period
graph' = map relatedComponents egos
-- | then run it for the all the periods
graph = zip [1..]
$ relatedComponents $ concat (graph' `using` parList rdeepseq)
-- | update each group's branch id -- | update each group's branch id
in map (\(bId,ids) -> in map (\(bId,ids) ->
map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId]))) let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
$ elems $ restrictKeys groups (Set.fromList ids)) graph $ elems $ restrictKeys groups (Set.fromList ids)
in groups' `using` parList rdeepseq ) graph
recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup] recursiveMatching :: Proximity -> Quality -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup]
recursiveMatching proximity thr frame periods docs quality branches = recursiveMatching proximity qua freq thr frame periods docs quality branches =
if (length branches == (length $ concat branches)) if (length branches == (length $ concat branches))
then concat branches then concat branches
else if thr > 1 else if thr >= 1
then concat branches then concat branches
else else
-- trace (show(quality) <> " (vs) sum of " <> show(nextQualities))
case quality <= (sum nextQualities) of case quality <= (sum nextQualities) of
-- | success : the new threshold improves the quality score, let's go deeper (traceMatchSuccess thr quality (sum nextQualities)) -- | success : the new threshold improves the quality score, let's go deeper (traceMatchSuccess thr quality (sum nextQualities))
True -> concat True -> concat
$ map (\branches' -> $ map (\branches' ->
let idx = fromJust $ elemIndex branches' nextBranches let idx = fromJust $ elemIndex branches' nextBranches
in recursiveMatching proximity (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches') in recursiveMatching proximity qua freq (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches')
$ nextBranches $ nextBranches
-- | failure : last step was a local maximum of quality, let's validate it (traceMatchFailure thr quality (sum nextQualities)) -- | failure : last step was a local maximum of quality, let's validate it (traceMatchFailure thr quality (sum nextQualities))
False -> concat branches False -> concat branches
where where
-- | 2) for each of the possible next branches process the phyloQuality score -- | 2) for each of the possible next branches process the phyloQuality score
nextQualities :: [Double] nextQualities :: [Double]
nextQualities = map toPhyloQuality nextBranches nextQualities = map (\nextBranch -> toPhyloQuality' qua freq nextBranch) nextBranches
-- | 1) for each local branch process a temporal matching then find the resulting branches -- | 1) for each local branch process a temporal matching then find the resulting branches
nextBranches :: [[[PhyloGroup]]] nextBranches :: [[[PhyloGroup]]]
nextBranches = nextBranches =
...@@ -339,6 +374,8 @@ temporalMatching phylo = updatePhyloGroups 1 branches' phylo ...@@ -339,6 +374,8 @@ temporalMatching phylo = updatePhyloGroups 1 branches' phylo
$ map (\g -> (getGroupId g, g)) $ map (\g -> (getGroupId g, g))
$ traceMatchEnd $ traceMatchEnd
$ recursiveMatching (phyloProximity $ getConfig phylo) $ recursiveMatching (phyloProximity $ getConfig phylo)
(phyloQuality $ getConfig phylo)
frequency
( (getThresholdInit $ phyloProximity $ getConfig phylo) ( (getThresholdInit $ phyloProximity $ getConfig phylo)
+ (getThresholdStep $ phyloProximity $ getConfig phylo)) + (getThresholdStep $ phyloProximity $ getConfig phylo))
(getTimeFrame $ timeUnit $ getConfig phylo) (getTimeFrame $ timeUnit $ getConfig phylo)
...@@ -346,7 +383,12 @@ temporalMatching phylo = updatePhyloGroups 1 branches' phylo ...@@ -346,7 +383,12 @@ temporalMatching phylo = updatePhyloGroups 1 branches' phylo
(phylo ^. phylo_timeDocs) quality branches (phylo ^. phylo_timeDocs) quality branches
-- | 3) process the quality score -- | 3) process the quality score
quality :: Double quality :: Double
quality = toPhyloQuality branches quality = toPhyloQuality' (phyloQuality $ getConfig phylo) frequency branches
-- | 3) process the constants of the quality score
frequency :: Map Int Double
frequency =
let terms = ngramsInBranches branches
in fromList $ map (\t -> (t, ((termFreq' t $ concat branches) / (sum $ map (\t' -> termFreq' t' $ concat branches) terms)))) terms
-- | 2) group into branches -- | 2) group into branches
branches :: [[PhyloGroup]] branches :: [[PhyloGroup]]
branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups' branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
......
...@@ -815,7 +815,7 @@ getProximity cluster = case cluster of ...@@ -815,7 +815,7 @@ getProximity cluster = case cluster of
-- | To initialize all the Cluster / Proximity with their default parameters -- | To initialize all the Cluster / Proximity with their default parameters
initFis :: Maybe Bool -> Maybe Support -> Maybe Int -> FisParams initFis :: Maybe Bool -> Maybe Support -> Maybe Int -> FisParams
initFis (def True -> kmf) (def 2 -> min') (def 4 -> thr) = FisParams kmf min' thr initFis (def True -> kmf) (def 0 -> min') (def 0 -> thr) = FisParams kmf min' thr
initHamming :: Maybe Double -> HammingParams initHamming :: Maybe Double -> HammingParams
initHamming (def 0.01 -> sens) = HammingParams sens initHamming (def 0.01 -> sens) = HammingParams sens
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment