Commit 4f17f5dd authored by qlobbe's avatar qlobbe

fix the freq

parent f55c2f59
Pipeline #595 failed with stage
...@@ -125,7 +125,7 @@ defaultConfig = ...@@ -125,7 +125,7 @@ defaultConfig =
, phyloLevel = 2 , phyloLevel = 2
, phyloProximity = WeightedLogJaccard 10 0 0.1 , phyloProximity = WeightedLogJaccard 10 0 0.1
, phyloSynchrony = ByProximityDistribution 0 , phyloSynchrony = ByProximityDistribution 0
, phyloQuality = Quality 0.5 1 , phyloQuality = Quality 10 3
, timeUnit = Year 3 1 5 , timeUnit = Year 3 1 5
, contextualUnit = Fis 1 5 , contextualUnit = Fis 1 5
, exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2] , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
...@@ -239,6 +239,7 @@ data Phylo = ...@@ -239,6 +239,7 @@ data Phylo =
Phylo { _phylo_foundations :: PhyloFoundations Phylo { _phylo_foundations :: PhyloFoundations
, _phylo_timeCooc :: !(Map Date Cooc) , _phylo_timeCooc :: !(Map Date Cooc)
, _phylo_timeDocs :: !(Map Date Double) , _phylo_timeDocs :: !(Map Date Double)
, _phylo_termFreq :: !(Map Int Double)
, _phylo_param :: PhyloParam , _phylo_param :: PhyloParam
, _phylo_periods :: Map PhyloPeriodId PhyloPeriod , _phylo_periods :: Map PhyloPeriodId PhyloPeriod
} }
......
...@@ -338,11 +338,12 @@ branchDating export = ...@@ -338,11 +338,12 @@ branchDating export =
$ foldl' (\acc g -> if (g ^. phylo_groupBranchId == b ^. branch_id) $ foldl' (\acc g -> if (g ^. phylo_groupBranchId == b ^. branch_id)
then acc ++ [g ^. phylo_groupPeriod] then acc ++ [g ^. phylo_groupPeriod]
else acc ) [] $ export ^. export_groups else acc ) [] $ export ^. export_groups
periods = nub groups
birth = fst $ head' "birth" groups birth = fst $ head' "birth" groups
age = (snd $ last' "age" groups) - birth age = (snd $ last' "age" groups) - birth
in b & branch_meta %~ insert "birth" [fromIntegral birth] in b & branch_meta %~ insert "birth" [fromIntegral birth]
& branch_meta %~ insert "age" [fromIntegral age] & branch_meta %~ insert "age" [fromIntegral age]
& branch_meta %~ insert "size" [fromIntegral $ length groups] ) export & branch_meta %~ insert "size" [fromIntegral $ length periods] ) export
processMetrics :: PhyloExport -> PhyloExport processMetrics :: PhyloExport -> PhyloExport
processMetrics export = ngramsMetrics processMetrics export = ngramsMetrics
......
...@@ -15,7 +15,7 @@ Portability : POSIX ...@@ -15,7 +15,7 @@ Portability : POSIX
module Gargantext.Viz.Phylo.PhyloMaker where module Gargantext.Viz.Phylo.PhyloMaker where
import Data.List (concat, nub, partition, sort, (++)) import Data.List (concat, nub, partition, sort, (++), group)
import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys) import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys)
import Data.Set (size) import Data.Set (size)
import Data.Vector (Vector) import Data.Vector (Vector)
...@@ -206,6 +206,17 @@ groupDocsByPeriod f pds es = ...@@ -206,6 +206,17 @@ groupDocsByPeriod f pds es =
-------------------------------------- --------------------------------------
docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
docsToTermFreq docs fdt =
let nbDocs = fromIntegral $ length docs
freqs = map (/nbDocs)
$ fromList
$ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
$ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
sumFreqs = sum $ elems freqs
in map (/sumFreqs) freqs
-- | To count the number of docs by unit of time -- | To count the number of docs by unit of time
docsToTimeScaleNb :: [Document] -> Map Date Double docsToTimeScaleNb :: [Document] -> Map Date Double
docsToTimeScaleNb docs = docsToTimeScaleNb docs =
...@@ -230,5 +241,6 @@ toPhyloBase docs lst conf = ...@@ -230,5 +241,6 @@ toPhyloBase docs lst conf =
$ Phylo foundations $ Phylo foundations
(docsToTimeScaleCooc docs (foundations ^. foundations_roots)) (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
(docsToTimeScaleNb docs) (docsToTimeScaleNb docs)
(docsToTermFreq docs (foundations ^. foundations_roots))
params params
(fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods) (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)
...@@ -15,7 +15,7 @@ Portability : POSIX ...@@ -15,7 +15,7 @@ Portability : POSIX
module Gargantext.Viz.Phylo.TemporalMatching where module Gargantext.Viz.Phylo.TemporalMatching where
import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile, partition, delete, and) import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile, partition, delete, or)
import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, keys, (!), filterWithKey) import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, keys, (!), filterWithKey)
import Gargantext.Prelude import Gargantext.Prelude
...@@ -370,23 +370,23 @@ alterBorder :: Int -> [[PhyloGroup]] -> [PhyloGroup] -> Int ...@@ -370,23 +370,23 @@ alterBorder :: Int -> [[PhyloGroup]] -> [PhyloGroup] -> Int
alterBorder border branches branch = border + (length $ concat branches) - (length branch) alterBorder border branches branch = border + (length $ concat branches) - (length branch)
-- | Important ne pas virer les filtree mais les mettre en false
seqMatching :: Proximity -> Double -> Map Int Double -> Int -> Double -> Int -> Map Date Double -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)] seqMatching :: Proximity -> Double -> Map Int Double -> Int -> Double -> Int -> Map Date Double -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
seqMatching proximity beta frequency minBranch egoThr frame docs periods done ego rest = seqMatching proximity beta frequency minBranch egoThr frame docs periods done ego rest =
-- | 1) keep or not the new division of ego -- | 1) keep or not the new division of ego
let done' = done ++ (if snd ego let done' = done ++ (if snd ego
then (if ((null ego') || (quality > quality')) then (if ((null (fst ego')) || (quality > quality'))
then trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality') then trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
<> " | " <> show((length done) + (length ego') + (length rest)) <> " | " <> show(length $ fst ego) <> " groups : "
<> "[" <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
<> show((length $ concat $ map fst done) + (length $ concat ego') + (length $ concat $ map fst rest)) <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
<> "]")
$ [(fst ego,False)] $ [(fst ego,False)]
else trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality') else trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
<> " | " <> show((length done) + (length ego') + (length rest)) <> " | " <> show(length $ fst ego) <> " groups : "
<> "[" <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
<> show((length $ concat $ map fst done) + (length $ concat ego') + (length $ concat $ map fst rest)) <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
<> "]") $ ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
$ (map (\e -> (e,True)) ego'))
else [ego]) else [ego])
in in
-- | 2) if there is no more branches in rest then return else continue -- | 2) if there is no more branches in rest then return else continue
...@@ -399,21 +399,21 @@ seqMatching proximity beta frequency minBranch egoThr frame docs periods done eg ...@@ -399,21 +399,21 @@ seqMatching proximity beta frequency minBranch egoThr frame docs periods done eg
quality :: Double quality :: Double
quality = toPhyloQuality' beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest)) quality = toPhyloQuality' beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
-------------------------------------- --------------------------------------
ego' :: [[PhyloGroup]] ego' :: ([[PhyloGroup]],[[PhyloGroup]])
ego' = ego' =
let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g)) let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
$ phyloBranchMatching frame periods proximity egoThr docs (fst ego) $ phyloBranchMatching frame periods proximity egoThr docs (fst ego)
branches' = branches `using` parList rdeepseq branches' = branches `using` parList rdeepseq
in filter (\b -> length b >= minBranch) branches' in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch) branches'
-------------------------------------- --------------------------------------
quality' :: Double quality' :: Double
quality' = toPhyloQuality' beta (reduceFrequency frequency ((map fst done) ++ ego' ++ (map fst rest))) quality' = toPhyloQuality' beta frequency
((map fst done) ++ ego' ++ (map fst rest)) ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
recursiveMatching' :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)] recursiveMatching' :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
recursiveMatching' proximity beta minBranch frequency egoThr frame periods docs branches = recursiveMatching' proximity beta minBranch frequency egoThr frame periods docs branches =
if (egoThr >= 1) || ((not . and) $ map snd branches) if (egoThr >= 1) || ((not . or) $ map snd branches)
then branches then branches
else else
let branches' = seqMatching proximity beta frequency minBranch egoThr frame docs periods let branches' = seqMatching proximity beta frequency minBranch egoThr frame docs periods
...@@ -459,23 +459,18 @@ temporalMatching phylo = updatePhyloGroups 1 ...@@ -459,23 +459,18 @@ temporalMatching phylo = updatePhyloGroups 1
(fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches) (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
phylo phylo
where where
-- | 2) init the recursiveMatching
branches :: [[PhyloGroup]] branches :: [[PhyloGroup]]
branches = map fst branches = map fst
$ recursiveMatching' (phyloProximity $ getConfig phylo) $ recursiveMatching' (phyloProximity $ getConfig phylo)
(_qua_relevance $ phyloQuality $ getConfig phylo) (_qua_relevance $ phyloQuality $ getConfig phylo)
(_qua_minBranch $ phyloQuality $ getConfig phylo) (_qua_minBranch $ phyloQuality $ getConfig phylo)
frequency (phylo ^. phylo_termFreq)
(getThresholdInit $ phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
(getTimeFrame $ timeUnit $ getConfig phylo) (getTimeFrame $ timeUnit $ getConfig phylo)
(getPeriodIds phylo) (getPeriodIds phylo)
(phylo ^. phylo_timeDocs) (phylo ^. phylo_timeDocs)
[(groups,True)] [(groups,True)]
-- | 2) process the constants of the quality score
frequency :: Map Int Double
frequency =
let terms = ngramsInBranches [groups]
freqs = map (\t -> termFreq' t groups) terms
in fromList $ map (\(t,freq) -> (t,freq/(sum freqs))) $ zip terms freqs
-- | 1) for each group process an initial temporal Matching -- | 1) for each group process an initial temporal Matching
groups :: [PhyloGroup] groups :: [PhyloGroup]
groups = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo) groups = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment