Commit e2b8b663 authored by qlobbe's avatar qlobbe

temporal is close to be ok, start export

parent 5d618e53
......@@ -44,6 +44,8 @@ import GHC.IO (FilePath)
import Control.DeepSeq (NFData)
import Control.Lens (makeLenses)
import qualified Data.Text.Lazy as TextLazy
----------------
-- | Config | --
......@@ -102,7 +104,7 @@ defaultConfig =
, corpusParser = Csv 1000
, phyloName = pack "Default Phylo"
, phyloLevel = 2
, phyloProximity = WeightedLogJaccard 10 0 0.05
, phyloProximity = WeightedLogJaccard 10 0 0.2
, timeUnit = Year 3 1 5
, contextualUnit = Fis 2 4
, branchSize = 3
......@@ -290,6 +292,12 @@ data PhyloFis = PhyloFis
} deriving (Generic,NFData,Show,Eq)
----------------
-- | Export | --
----------------
type DotId = TextLazy.Text
----------------
-- | Lenses | --
----------------
......
......@@ -29,6 +29,7 @@ import Gargantext.Text.Terms.Mono (monoTexts)
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.Phylo.PhyloMaker
import Gargantext.Viz.Phylo.TemporalMatching (temporalMatching)
import Control.Lens
......@@ -40,7 +41,8 @@ import qualified Data.Vector as Vector
-----------------------------------------------
phylo1 :: Phylo
phylo1 = appendGroups fisToGroup 1 phyloFis phyloBase
phylo1 = temporalMatching
$ appendGroups fisToGroup 1 phyloFis phyloBase
---------------------------------------------
......
......@@ -13,4 +13,55 @@ Portability : POSIX
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module Gargantext.Viz.Phylo.PhyloExport where
\ No newline at end of file
module Gargantext.Viz.Phylo.PhyloExport where
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Data.GraphViz.Types.Generalised (DotGraph)
--------------------
-- | Dot export | --
--------------------
toDot :: [PhyloGroup] -> DotGraph DotId
toDot branches = undefined
----------------------
-- | post process | --
----------------------
processFilters :: [PhyloGroup] -> [PhyloGroup]
processFilters branches = branches
processSort :: [PhyloGroup] -> [PhyloGroup]
processSort branches = branches
processMetrics :: [PhyloGroup] -> [PhyloGroup]
processMetrics branches = branches
processDynamics :: [PhyloGroup] -> [PhyloGroup]
processDynamics branches = branches
processLabels :: [PhyloGroup] -> [PhyloGroup]
processLabels branches = branches
phyloPostProcess :: [PhyloGroup] -> [PhyloGroup]
phyloPostProcess branches = branches
---------------------
-- | phyloExport | --
---------------------
toPhyloExport :: Phylo -> DotGraph DotId
toPhyloExport phylo = toDot
$ phyloPostProcess groups
where
groups :: [PhyloGroup]
groups = getGroupsFromLevel (phyloLevel $ getConfig phylo) phylo
\ No newline at end of file
......@@ -23,6 +23,7 @@ import Data.Vector (Vector)
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.Phylo.TemporalMatching (temporalMatching)
import Gargantext.Text.Context (TermList)
import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
......@@ -86,12 +87,13 @@ fisToGroup fis pId lvl idx fdt coocs =
(fis ^. phyloFis_support)
ngrams
(ngramsToCooc ngrams coocs)
(1,[])
(1,[0])
[] [] [] [] []
toPhylo1 :: [Document] -> Phylo -> Phylo
toPhylo1 docs phyloBase = appendGroups fisToGroup 1 phyloFis phyloBase
toPhylo1 docs phyloBase = temporalMatching
$ appendGroups fisToGroup 1 phyloFis phyloBase
where
--------------------------------------
phyloFis :: Map (Date,Date) [PhyloFis]
......@@ -181,7 +183,7 @@ docsToTimeScaleCooc docs fdt =
mCooc' = fromList
$ map (\t -> (t,empty))
$ toTimeScale (map date docs) 1
in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
$ unionWith sumCooc mCooc mCooc'
......
......@@ -17,7 +17,7 @@ Portability : POSIX
module Gargantext.Viz.Phylo.PhyloTools where
import Data.Vector (Vector, elemIndex)
import Data.List (sort, concat, null, union, (++), tails, sortOn, nub)
import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init)
import Data.Set (Set, size)
import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey)
import Data.String (String)
......@@ -202,11 +202,11 @@ addPointers :: PhyloGroup -> Filiation -> PointerType -> [Pointer] -> PhyloGroup
addPointers group fil pty pointers =
case pty of
TemporalPointer -> case fil of
ToChilds -> group & phylo_groupPeriodChilds %~ (++ pointers)
ToParents -> group & phylo_groupPeriodParents %~ (++ pointers)
ToChilds -> group & phylo_groupPeriodChilds .~ pointers
ToParents -> group & phylo_groupPeriodParents .~ pointers
LevelPointer -> case fil of
ToChilds -> group & phylo_groupLevelChilds %~ (++ pointers)
ToParents -> group & phylo_groupLevelParents %~ (++ pointers)
ToChilds -> group & phylo_groupLevelChilds .~ pointers
ToParents -> group & phylo_groupLevelParents .~ pointers
getPeriodIds :: Phylo -> [(Date,Date)]
......@@ -284,4 +284,47 @@ getThresholdStep proxi = case proxi of
----------------
ngramsInBranches :: [[PhyloGroup]] -> [Int]
ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
\ No newline at end of file
ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
traceMatchSuccess thr qua qua' nextBranches =
trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
$ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
<> ",(1.." <> show (length nextBranches) <> ")]"
<> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
<> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
<> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
traceMatchFailure thr qua qua' branches =
trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
<> ",(1.." <> show (length branches) <> ")]"
<> " | " <> show (length $ concat branches) <> " groups" <> "\n"
<> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
) branches
traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
traceMatchNoSplit branches =
trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
<> ",(1.." <> show (length branches) <> ")]"
<> " | " <> show (length $ concat branches) <> " groups" <> "\n"
<> " - unable to split in smaller branches" <> "\n"
) branches
traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
traceMatchLimit branches =
trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
<> ",(1.." <> show (length branches) <> ")]"
<> " | " <> show (length $ concat branches) <> " groups" <> "\n"
<> " - unable to increase the threshold above 1" <> "\n"
) branches
traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
traceMatchEnd groups =
trace ("\n" <> "-- | End of temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
<> " branches and " <> show (length groups) <> " groups" <> "\n") groups
\ No newline at end of file
......@@ -15,7 +15,7 @@ Portability : POSIX
module Gargantext.Viz.Phylo.TemporalMatching where
import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, nub, union)
import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, nub, union, elemIndex, (!!))
import Data.Map (Map, fromList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith, findWithDefault)
import Gargantext.Prelude
......@@ -23,11 +23,13 @@ import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.Phylo.SynchronicClustering
import Debug.Trace (trace)
import Prelude (logBase)
import Control.Lens hiding (Level)
import qualified Data.Set as Set
-------------------
-- | Proximity | --
-------------------
......@@ -204,7 +206,9 @@ entropy branches =
/ (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches)
* (sum $ map (\branch ->
let q = branchObs term (length $ concat branches) branch
in q * logBase 2 q ) branches) ) terms
in if (q == 0)
then 0
else - q * logBase 2 q ) branches) ) terms
where
-- | Probability to observe a branch given a random term of the phylo
branchObs :: Int -> Int -> [PhyloGroup] -> Double
......@@ -213,10 +217,21 @@ entropy branches =
homogeneity :: [[PhyloGroup]] -> Double
homogeneity _ = undefined
-- where
-- branchCov :: [PhyloGroup] -> Int -> Double
-- branchCov branch total = (fromIntegral $ length branch) / (fromIntegral total)
homogeneity branches =
let nbGroups = length $ concat branches
in sum
$ map (\branch -> (if (length branch == nbGroups)
then 1
else (1 / log (branchCov branch nbGroups))
/ (sum $ map (\branch' -> 1 / log (branchCov branch' nbGroups)) branches))
* (sum $ map (\term -> (termFreq term branches)
/ (sum $ map (\term' -> termFreq term' branches) $ ngramsInBranches [branch])
* (fromIntegral $ sum $ ngramsInBranches [filter (\g -> elem term $ g ^. phylo_groupNgrams) branch])
/ (fromIntegral $ sum $ ngramsInBranches [branch])
) $ ngramsInBranches [branch]) ) branches
where
branchCov :: [PhyloGroup] -> Int -> Double
branchCov branch total = (fromIntegral $ length branch) / (fromIntegral total)
toPhyloQuality :: [[PhyloGroup]] -> Double
......@@ -243,42 +258,57 @@ groupsToBranches groups =
) graph
recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [PhyloGroup] -> [PhyloGroup]
recursiveMatching proximity thr max' periods docs quality groups =
case quality < quality' of
-- | success : we localy improve the quality of the branch, let's go deeper
True -> concat
$ map (\branch ->
recursiveMatching proximity (thr + (getThresholdStep proximity)) max' periods docs quality' branch
) branches
-- | failure : last step was the local maximum, let's validate it
False -> groups
recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup]
recursiveMatching proximity thr frame periods docs quality branches =
if (length branches == (length $ concat branches))
then concat $ traceMatchNoSplit branches
else if thr > 1
then concat $ traceMatchLimit branches
else
case quality <= (sum nextQualities) of
-- | success : the new threshold improves the quality score, let's go deeper
True -> concat
$ map (\branches' ->
let idx = fromJust $ elemIndex branches' nextBranches
in recursiveMatching proximity (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches')
$ traceMatchSuccess thr quality (sum nextQualities) nextBranches
-- | failure : last step was a local maximum of quality, let's validate it
False -> concat $ traceMatchFailure thr quality (sum nextQualities) branches
where
-- | 3) process a quality score on the local set of branches
quality' :: Double
quality' = toPhyloQuality branches
-- | 2) group the new groups into branches
branches :: [[PhyloGroup]]
branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
-- | 1) process a temporal matching for each group
groups' :: [PhyloGroup]
groups' = processMatching max' periods proximity thr docs groups
-- | 2) for each of the possible next branches process the phyloQuality score
nextQualities :: [Double]
nextQualities = map toPhyloQuality nextBranches
-- | 1) for each local branch process a temporal matching then find the resulting branches
nextBranches :: [[[PhyloGroup]]]
nextBranches = map (\branch ->
let branch' = processMatching frame periods proximity thr docs branch
in groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) branch'
) branches
temporalMatching :: Phylo -> Phylo
temporalMatching phylo = updatePhyloGroups 1 branches phylo
temporalMatching phylo = updatePhyloGroups 1 branches' phylo
where
-- | 2) run the recursive matching to find the best repartition among branches
branches :: Map PhyloGroupId PhyloGroup
branches = fromList
-- | 4) run the recursive matching to find the best repartition among branches
branches' :: Map PhyloGroupId PhyloGroup
branches' = fromList
$ map (\g -> (getGroupId g, g))
$ traceMatchEnd
$ recursiveMatching (phyloProximity $ getConfig phylo)
(getThresholdInit $ phyloProximity $ getConfig phylo)
( (getThresholdInit $ phyloProximity $ getConfig phylo)
+ (getThresholdStep $ phyloProximity $ getConfig phylo))
(getTimeFrame $ timeUnit $ getConfig phylo)
(getPeriodIds phylo)
(phylo ^. phylo_timeDocs) (toPhyloQuality [groups']) groups'
(phylo ^. phylo_timeDocs) quality branches
-- | 3) process the quality score
quality :: Double
quality = toPhyloQuality branches
-- | 2) group into branches
branches :: [[PhyloGroup]]
branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group))
$ trace ("\n" <> "-- | Init temporal matching for " <> show (length $ groups') <> " groups" <> "\n") groups'
-- | 1) for each group process an initial temporal Matching
groups' :: [PhyloGroup]
groups' = processMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
(phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
(phylo ^. phylo_timeDocs) (getGroupsFromLevel 1 phylo)
(phylo ^. phylo_timeDocs) (getGroupsFromLevel 1 phylo)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment