Commit 09db0b41 authored by qlobbe's avatar qlobbe

fix the grand bleu effect

parent 3517130e
Pipeline #587 failed with stage
...@@ -122,12 +122,12 @@ defaultConfig = ...@@ -122,12 +122,12 @@ defaultConfig =
, outputPath = "" , outputPath = ""
, corpusParser = Csv 1000 , corpusParser = Csv 1000
, phyloName = pack "Default Phylo" , phyloName = pack "Default Phylo"
, phyloLevel = 1 , phyloLevel = 2
, phyloProximity = WeightedLogJaccard 10 0 0.1 , phyloProximity = WeightedLogJaccard 10 0 0.1
, phyloSynchrony = ByProximityDistribution 0 , phyloSynchrony = ByProximityDistribution 0
, phyloQuality = Quality 0.5 1 , phyloQuality = Quality 0.1 1
, timeUnit = Year 3 1 5 , timeUnit = Year 3 1 5
, contextualUnit = Fis 1 4 , contextualUnit = Fis 1 5
, exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2] , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
, exportSort = ByHierarchy , exportSort = ByHierarchy
, exportFilter = [ByBranchSize 2] , exportFilter = [ByBranchSize 2]
......
...@@ -15,8 +15,8 @@ Portability : POSIX ...@@ -15,8 +15,8 @@ Portability : POSIX
module Gargantext.Viz.Phylo.TemporalMatching where module Gargantext.Viz.Phylo.TemporalMatching where
import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, elemIndex, (!!), dropWhile) import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile)
import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, filterWithKey, keys, (!)) import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, keys, (!), filterWithKey)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo import Gargantext.Viz.AdaptativePhylo
...@@ -25,7 +25,7 @@ import Gargantext.Viz.Phylo.PhyloTools ...@@ -25,7 +25,7 @@ import Gargantext.Viz.Phylo.PhyloTools
-- import Prelude (logBase) -- import Prelude (logBase)
import Control.Lens hiding (Level) import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parList, rdeepseq, using) import Control.Parallel.Strategies (parList, rdeepseq, using)
-- import Debug.Trace (trace) import Debug.Trace (trace)
import qualified Data.Set as Set import qualified Data.Set as Set
...@@ -68,6 +68,7 @@ weightedLogJaccard sens docs cooc cooc' ngrams ngrams' ...@@ -68,6 +68,7 @@ weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
-------------------------------------- --------------------------------------
coocInter :: [Double] coocInter :: [Double]
coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc' coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
-- coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
-------------------------------------- --------------------------------------
coocUnion :: [Double] coocUnion :: [Double]
coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc' coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
...@@ -241,8 +242,8 @@ count x = length . filter (== x) ...@@ -241,8 +242,8 @@ count x = length . filter (== x)
termFreq' :: Int -> [PhyloGroup] -> Double termFreq' :: Int -> [PhyloGroup] -> Double
termFreq' term groups = termFreq' term groups =
let ngrams = concat $ map _phylo_groupNgrams groups let ngrams = concat $ map _phylo_groupNgrams groups
in (fromIntegral $ count term ngrams) in log((fromIntegral $ count term ngrams)
/ (fromIntegral $ length ngrams) / (fromIntegral $ length ngrams))
relevantBranches :: Int -> Int -> [[PhyloGroup]] -> [[PhyloGroup]] relevantBranches :: Int -> Int -> [[PhyloGroup]] -> [[PhyloGroup]]
relevantBranches term thr branches = relevantBranches term thr branches =
...@@ -285,58 +286,30 @@ toAccuracy freq term thr branches = ...@@ -285,58 +286,30 @@ toAccuracy freq term thr branches =
branches' :: [[PhyloGroup]] branches' :: [[PhyloGroup]]
branches' = relevantBranches term thr branches branches' = relevantBranches term thr branches
toRecallWeighted :: Double -> [Double] -> [Double]
toRecallWeighted old curr =
let old' = old + sum curr
in map (\r -> (r / old') * r) curr
toRecallWeighted :: Double -> Double -> Double
toRecallWeighted old curr = curr / (old + curr)
toRecall' :: Quality -> Map Int Double -> [[PhyloGroup]] -> Double
toRecall' quality frequency branches =
let terms = keys frequency
in sum $ map (\term -> toRecall (frequency ! term) term (quality ^. qua_minBranch) branches) terms
toPhyloQuality :: Quality -> Map Int Double -> Double -> [[PhyloGroup]] -> Double
toPhyloQuality quality frequency recall branches =
if (foldl' (\acc b -> acc && (length b < (quality ^. qua_minBranch))) True branches)
-- | the local phylo is composed of small branches
then 0
else
let relevance = quality ^. qua_relevance
-- | compute the F score for a given relevance
in ((1 + relevance ** 2) * accuracy * recall)
/ (((relevance ** 2) * accuracy + recall))
where
terms :: [Int]
terms = keys frequency
-- | for each term compute the global accuracy
accuracy :: Double
accuracy = sum $ map (\term -> toAccuracy (frequency ! term) term (quality ^. qua_minBranch) branches) terms
toRecall' :: Int -> Map Int Double -> [[PhyloGroup]] -> Double
toRecall' minBranch frequency branches =
let terms = keys frequency
in sum $ map (\term -> toRecall (frequency ! term) term minBranch branches) terms
toPhyloQuality' :: Quality -> Map Int Double -> [[PhyloGroup]] -> Double toPhyloQuality :: Double -> Int -> Map Int Double -> Double -> [[PhyloGroup]] -> Double
toPhyloQuality' quality frequency branches = toPhyloQuality beta minBranch frequency recall branches =
if (foldl' (\acc b -> acc && (length b < (quality ^. qua_minBranch))) True branches) if (foldl' (\acc b -> acc && (length b < minBranch)) True branches)
-- | the local phylo is composed of small branches -- | the local phylo is composed of small branches
then 0 then 0
else else ((1 + beta ** 2) * accuracy * recall)
let relevance = quality ^. qua_relevance / (((beta ** 2) * accuracy + recall))
-- | compute the F score for a given relevance
in ((1 + relevance ** 2) * accuracy * recall)
/ (((relevance ** 2) * accuracy + recall))
where where
terms :: [Int] terms :: [Int]
terms = keys frequency terms = keys frequency
-- | for each term compute the global accuracy -- | for each term compute the global accuracy
accuracy :: Double accuracy :: Double
accuracy = sum $ map (\term -> toAccuracy (frequency ! term) term (quality ^. qua_minBranch) branches) terms accuracy = sum $ map (\term -> toAccuracy (frequency ! term) term minBranch branches) terms
-- | for each term compute the global recall
recall :: Double
recall = sum $ map (\term -> toRecall (frequency ! term) term (quality ^. qua_minBranch) branches) terms
----------------------------- -----------------------------
...@@ -364,68 +337,66 @@ groupsToBranches groups = ...@@ -364,68 +337,66 @@ groupsToBranches groups =
in groups' `using` parList rdeepseq ) graph in groups' `using` parList rdeepseq ) graph
recursiveMatching :: Proximity -> Quality -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> Double -> [[PhyloGroup]] -> [PhyloGroup] recursiveMatching :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> Double -> [PhyloGroup] -> [PhyloGroup]
recursiveMatching proximity qua freq thr frame periods docs quality oldRecall branches = recursiveMatching proximity beta minBranch frequency egoThr frame periods docs quality recall groups =
if (length branches == (length $ concat branches)) if (length groups == 1)
then concat branches then trace ("stop : just one group")
else if thr >= 1 $ groups
then concat branches else if (egoThr >= 1)
else then trace ("stop : thr >= 1")
-- trace (show(quality) <> " (vs) sum of " <> show(nextQualities)) $ groups
case quality <= (sum nextQualities) of else if (quality > quality')
-- | success : the new threshold improves the quality score, let's go deeper (traceMatchSuccess thr quality (sum nextQualities)) then trace ("stop : " <> show(quality) <> " > " <> show(quality'))
True -> concat -- $ trace (show(length groups) <> " groups " <> show(length branches'))
$ map (\branches' -> -- $ trace (show(recall) <> " recall " <> show(recall'))
let idx = fromJust $ elemIndex branches' nextBranches $ groups
in recursiveMatching proximity qua else trace ("go : " <> show(quality) <> " <= " <> show(quality'))
freq (thr + (getThresholdStep proximity)) $ concat
frame periods docs (nextQualities !! idx) $ map (\branch -> recursiveMatching proximity beta minBranch frequency (egoThr + (getThresholdStep proximity))
(sum $ dropByIdx idx nextRecalls) branches') frame periods docs quality' recall' branch)
$ nextBranches $ branches'
-- | failure : last step was a local maximum of quality, let's validate it (traceMatchFailure thr quality (sum nextQualities))
False -> concat branches
where where
-- | 2) for each of the possible next branches process the phyloQuality score -- | 2) for each of the possible next branches process the phyloQuality score
nextQualities :: [Double] quality' :: Double
nextQualities = map (\(nextBranch,recall) -> toPhyloQuality qua freq recall nextBranch) $ zip nextBranches nextRecalls quality' = toPhyloQuality beta minBranch frequency recall' branches'
-- nextQualities = map (\nextBranch -> toPhyloQuality' qua freq nextBranch) nextBranches -- | 3) process a new recall weigted by the last one
------- recall' :: Double
nextRecalls :: [Double] recall' = toRecallWeighted recall
nextRecalls = toRecallWeighted oldRecall $ toRecall' minBranch frequency branches'
$ map (\nextBranch -> toRecall' qua freq nextBranch) nextBranches
-- | 1) for each local branch process a temporal matching then find the resulting branches -- | 1) for each local branch process a temporal matching then find the resulting branches
nextBranches :: [[[PhyloGroup]]] branches' :: [[PhyloGroup]]
nextBranches = branches' =
let branches' = map (\branch -> phyloBranchMatching frame periods proximity thr docs branch) branches let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
clusters = map (\branch -> groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) branch) branches' $ phyloBranchMatching frame periods proximity egoThr docs groups
clusters' = clusters `using` parList rdeepseq in branches `using` parList rdeepseq
in clusters'
temporalMatching :: Phylo -> Phylo temporalMatching :: Phylo -> Phylo
temporalMatching phylo = updatePhyloGroups 1 branches' phylo temporalMatching phylo = updatePhyloGroups 1 branches' phylo
where where
-- | 4) run the recursive matching to find the best repartition among branches -- | 6) apply the recursive matching
branches' :: Map PhyloGroupId PhyloGroup branches' :: Map PhyloGroupId PhyloGroup
branches' = fromList branches' = fromList
$ map (\g -> (getGroupId g, g)) $ map (\g -> (getGroupId g, g))
$ traceMatchEnd $ traceMatchEnd
$ recursiveMatching (phyloProximity $ getConfig phylo) $ concat
(phyloQuality $ getConfig phylo) $ map (\branch ->
recursiveMatching (phyloProximity $ getConfig phylo)
(_qua_relevance $ phyloQuality $ getConfig phylo)
(_qua_minBranch $ phyloQuality $ getConfig phylo)
frequency frequency
( (getThresholdInit $ phyloProximity $ getConfig phylo) ( (getThresholdInit $ phyloProximity $ getConfig phylo)
+ (getThresholdStep $ phyloProximity $ getConfig phylo)) + (getThresholdStep $ phyloProximity $ getConfig phylo))
(getTimeFrame $ timeUnit $ getConfig phylo) (getTimeFrame $ timeUnit $ getConfig phylo)
(getPeriodIds phylo) (getPeriodIds phylo)
(phylo ^. phylo_timeDocs) quality recall branches (phylo ^. phylo_timeDocs) quality recall branch
-- | 3) process the quality score ) branches
-- | 5) process the quality score
quality :: Double quality :: Double
quality = toPhyloQuality (phyloQuality $ getConfig phylo) frequency recall branches quality = toPhyloQuality (_qua_relevance $ phyloQuality $ getConfig phylo) (_qua_minBranch $ phyloQuality $ getConfig phylo) frequency recall branches
-- quality = toPhyloQuality' (phyloQuality $ getConfig phylo) frequency branches -- | 4) find the recall
-------
recall :: Double recall :: Double
recall = toRecall' (phyloQuality $ getConfig phylo) frequency branches recall = toRecall' (_qua_minBranch $ phyloQuality $ getConfig phylo) frequency branches
-- | 3) process the constants of the quality score -- | 3) process the constants of the quality score
frequency :: Map Int Double frequency :: Map Int Double
frequency = frequency =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment