Commit 4ee73701 authored by qlobbe's avatar qlobbe Committed by Alexandre Delanoë

new strategy for choosing temporal matching candidates

parent 2120f449
......@@ -278,7 +278,7 @@ main = do
pure $ toPhylo (setConfig config phyloWithoutLink)
else do
printIOMsg "Reconstruct the phylo from scratch"
phyloWithoutLink <- pure $ toPhyloWithoutLink corpus mapList config
phyloWithoutLink <- pure $ toPhyloWithoutLink corpus config
writePhylo backupPhyloWithoutLink phyloWithoutLink
pure $ toPhylo (setConfig config phyloWithoutLink)
......
......@@ -37,7 +37,6 @@ import Data.Text (Text, pack)
import Data.Vector (Vector)
import GHC.Generics
import GHC.IO (FilePath)
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
import Gargantext.Prelude
......@@ -362,8 +361,8 @@ data Document = Document
-- | The Foundations of a Phylo created from a given TermList
data PhyloFoundations = PhyloFoundations
{ _foundations_roots :: !(Vector Ngrams)
, _foundations_mapList :: TermList
{ _foundations_roots :: (Vector Ngrams)
, _foundations_rootsInGroups :: Map Int [PhyloGroupId] -- map of roots associated to groups
} deriving (Generic, Show, Eq)
instance ToSchema PhyloFoundations where
......
......@@ -86,8 +86,8 @@ phylo2dot2json phylo = do
flowPhyloAPI :: PhyloConfig -> CorpusId -> GargNoServer Phylo
flowPhyloAPI config cId = do
(mapList, corpus) <- corpusIdtoDocuments (timeUnit config) cId
phyloWithCliques <- pure $ toPhyloWithoutLink corpus mapList config
(_, corpus) <- corpusIdtoDocuments (timeUnit config) cId
phyloWithCliques <- pure $ toPhyloWithoutLink corpus config
-- writePhylo phyloWithCliquesFile phyloWithCliques
pure $ toPhylo (setConfig config phyloWithCliques)
......
......@@ -19,8 +19,8 @@ import Control.Lens
import Data.GraphViz.Types.Generalised (DotGraph)
import Data.List (sortOn, nub, sort)
import Data.Map (Map)
import Data.Vector (Vector)
import Data.Text (Text, toLower)
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Terms.Mono (monoTexts)
import Gargantext.Core.Viz.Phylo
import Gargantext.Core.Viz.Phylo.PhyloExport
......@@ -31,6 +31,7 @@ import Gargantext.Core.Viz.Phylo.TemporalMatching (temporalMatching)
import Gargantext.Prelude
import qualified Data.Vector as Vector
import qualified Data.Set as Set
import qualified Data.Map as Map
---------------------------------
-- | STEP 5 | -- Export the phylo
......@@ -62,6 +63,7 @@ flatPhylo = case (getSeaElevation emptyPhylo) of
emptyPhylo' :: Phylo
emptyPhylo' = scanSimilarity 1
$ joinRootsToGroups
$ appendGroups clusterToGroup 1 seriesOfClustering emptyPhylo
---------------------------------------------
......@@ -83,7 +85,7 @@ docsByPeriods = groupDocsByPeriod date periods docs
emptyPhylo :: Phylo
emptyPhylo = initPhylo docs mapList config
emptyPhylo = initPhylo docs config
phyloCooc :: Map Date Cooc
......@@ -120,7 +122,11 @@ docs = map (\(d,t)
foundations :: PhyloFoundations
foundations = PhyloFoundations (Vector.fromList $ map toLower actants) mapList
foundations = PhyloFoundations roots Map.empty
roots :: Vector Ngrams
roots = Vector.fromList $ map toLower actants
--------------------------------------------
......@@ -128,8 +134,8 @@ foundations = PhyloFoundations (Vector.fromList $ map toLower actants) mapList
--------------------------------------------
mapList :: TermList
mapList = map (\a -> ([toLower a],[])) actants
-- mapList :: TermList
-- mapList = map (\a -> ([toLower a],[])) actants
actants :: [Ngrams]
......
......@@ -25,7 +25,6 @@ import Prelude (floor)
import Gargantext.Core.Methods.Similarities (Similarity(Conditional))
import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
import Gargantext.Core.Viz.Phylo
import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
......@@ -203,13 +202,28 @@ indexDates' m = map (\docs ->
in (f,l)) m
-- create a map of roots and group ids
joinRootsToGroups :: Phylo -> Phylo
joinRootsToGroups phylo = set (phylo_foundations . foundations_rootsInGroups) rootsMap phylo
where
--------------------------------------
rootsMap :: Map Int [PhyloGroupId]
rootsMap = fromListWith (++)
$ concat -- flatten
$ map (\g ->
map (\n -> (n,[getGroupId g])) $ _phylo_groupNgrams g)
$ getGroupsFromScale 1 phylo
-- To build the first phylo step from docs and terms
-- QL: backend entre phyloBase et Clustering
-- tophylowithoutLink
toPhyloWithoutLink :: [Document] -> TermList -> PhyloConfig -> Phylo
toPhyloWithoutLink docs lst conf = case (getSeaElevation phyloBase) of
Constante _ _ -> appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
Adaptative _ -> scanSimilarity 1
toPhyloWithoutLink :: [Document] -> PhyloConfig -> Phylo
toPhyloWithoutLink docs conf = case (getSeaElevation phyloBase) of
Constante _ _ -> joinRootsToGroups
$ appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
Adaptative _ -> joinRootsToGroups
$ scanSimilarity 1
$ appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
where
--------------------------------------
......@@ -221,7 +235,7 @@ toPhyloWithoutLink docs lst conf = case (getSeaElevation phyloBase) of
docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
--------------------------------------
phyloBase :: Phylo
phyloBase = initPhylo docs lst conf
phyloBase = initPhylo docs conf
--------------------------------------
---------------------------
......@@ -409,9 +423,10 @@ initPhyloScales lvlMax pId =
-- Init the basic elements of a Phylo
--
initPhylo :: [Document] -> TermList -> PhyloConfig -> Phylo
initPhylo docs lst conf =
let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
initPhylo :: [Document] -> PhyloConfig -> Phylo
initPhylo docs conf =
let roots = Vector.fromList $ nub $ concat $ map text docs
foundations = PhyloFoundations roots empty
docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
params = defaultPhyloParam { _phyloParam_config = conf }
periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
......
......@@ -316,6 +316,9 @@ ngramsToCooc ngrams coocs =
getGroupId :: PhyloGroup -> PhyloGroupId
getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupScale), g ^. phylo_groupIndex)
getGroupNgrams :: PhyloGroup -> [Int]
getGroupNgrams g = g ^. phylo_groupNgrams
idToPrd :: PhyloGroupId -> Period
idToPrd id = (fst . fst) id
......@@ -427,6 +430,9 @@ setConfig config phylo = phylo
getRoots :: Phylo -> Vector Ngrams
getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
getRootsInGroups :: Phylo -> Map Int [PhyloGroupId]
getRootsInGroups phylo = (phylo ^. phylo_foundations) ^. foundations_rootsInGroups
getSources :: Phylo -> Vector Text
getSources phylo = _sources (phylo ^. phylo_sources)
......
......@@ -15,7 +15,7 @@ import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parList, rdeepseq, using)
import Data.Ord
import Data.List (concat, splitAt, tail, sortOn, sortBy, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or)
import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), empty, mapKeys, adjust)
import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), empty, mapKeys, adjust, filterWithKey)
import Debug.Trace (trace)
import Gargantext.Core.Viz.Phylo
import Gargantext.Core.Viz.Phylo.PhyloTools
......@@ -361,13 +361,82 @@ reconstructTemporalLinks frame periods similarity thr docs coocs groups =
) [] periods
{-
-- find all the groups matching a list of ngrams
-}
findIdsFromNgrams :: [Int] -> Map Int [PhyloGroupId] -> [PhyloGroupId]
findIdsFromNgrams ngrams roots = nub $ concat $ elems $ filterWithKey (\k _ -> elem k ngrams) roots
formatCandidates :: Filiation -> [PhyloGroup] -> [[(PhyloGroupId,[Int])]]
formatCandidates fil groups = case fil of
ToChilds -> map (\groups' -> map (\g -> (getGroupId g, getGroupNgrams g)) groups')
$ elems
$ groupByField _phylo_groupPeriod groups
ToParents -> reverse
$ map (\groups' -> map (\g -> (getGroupId g, getGroupNgrams g)) groups')
$ elems
$ groupByField _phylo_groupPeriod groups
ToChildsMemory -> undefined
ToParentsMemory -> undefined
filterByIds :: PhyloGroupId -> [PhyloGroupId] -> [PhyloGroup] -> [PhyloGroup]
filterByIds egoId ids groups = filter (\g -> ((getGroupId g) /= egoId) && (elem (getGroupId g) ids)) groups
filterByPeriods :: [Period] -> [PhyloGroup] -> [PhyloGroup]
filterByPeriods periods groups = filter (\g -> elem (g ^. phylo_groupPeriod) periods) groups
filterByNgrams :: Int -> [Int] -> [PhyloGroup] -> [PhyloGroup]
filterByNgrams inf ngrams groups =
if (length ngrams) > 1
then
filter (\g -> (> inf) $ length $ intersect (ngrams) (getGroupNgrams g)) groups
else
filter (\g -> (not . null) $ intersect (ngrams) (getGroupNgrams g)) groups
{-
-- perform the upstream/downstream inter‐temporal matching process group by group
-}
reconstructTemporalLinks' :: Int -> [Period] -> Similarity -> Double -> Map Date Double -> Map Date Cooc -> Map Int [PhyloGroupId] -> [PhyloGroup] -> [PhyloGroup]
reconstructTemporalLinks' frame periods similarity thr docs coocs roots groups =
let egos = map (\ego ->
let -- 1) find the parents/childs matching periods
periodsPar = getNextPeriods ToParents frame (ego ^. phylo_groupPeriod) periods
periodsChi = getNextPeriods ToChilds frame (ego ^. phylo_groupPeriod) periods
-- 2) find the parents/childs matching candidates
candidatesPar = formatCandidates ToParents
$ filterByNgrams (getMinSharedNgrams similarity) (getGroupNgrams ego)
$ filterByPeriods periodsPar
$ filterByIds (getGroupId ego) (findIdsFromNgrams (getGroupNgrams ego) roots) groups
candidatesChi = formatCandidates ToChilds
$ filterByNgrams (getMinSharedNgrams similarity) (getGroupNgrams ego)
$ filterByPeriods periodsChi
$ filterByIds (getGroupId ego) (findIdsFromNgrams (getGroupNgrams ego) roots) groups
-- 3) find the parents/childs number of docs by years
docsPar = filterDocs docs ([(ego ^. phylo_groupPeriod)] ++ periodsPar)
docsChi = filterDocs docs ([(ego ^. phylo_groupPeriod)] ++ periodsChi)
-- 4) find the parents/child diago by years
diagoPar = filterDiago (map coocToDiago coocs) ([(ego ^. phylo_groupPeriod)] ++ periodsPar)
diagoChi = filterDiago (map coocToDiago coocs) ([(ego ^. phylo_groupPeriod)] ++ periodsPar)
-- 5) match ego to their candidates through time
pointersPar = phyloGroupMatching candidatesPar ToParents similarity docsPar diagoPar thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
pointersChi = phyloGroupMatching candidatesChi ToParents similarity docsChi diagoChi thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
in addPointers ToChilds TemporalPointer pointersChi
$ addPointers ToParents TemporalPointer pointersPar
$ addMemoryPointers ToChildsMemory TemporalPointer thr pointersChi
$ addMemoryPointers ToParentsMemory TemporalPointer thr pointersPar ego
) groups
in egos `using` parList rdeepseq
{-
-- reconstruct a phylomemetic network from a list of groups and from a given threshold
-}
toPhylomemeticNetwork :: Int -> [Period] -> Similarity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [Branch]
toPhylomemeticNetwork timescale periods similarity thr docs coocs groups =
toPhylomemeticNetwork :: Int -> [Period] -> Similarity -> Double -> Map Date Double -> Map Date Cooc -> Map Int [PhyloGroupId] -> [PhyloGroup] -> [Branch]
toPhylomemeticNetwork timescale periods similarity thr docs coocs roots groups =
groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
$ reconstructTemporalLinks timescale periods similarity thr docs coocs groups
-- $ reconstructTemporalLinks timescale periods similarity thr docs coocs groups
$ reconstructTemporalLinks' timescale periods similarity thr docs coocs roots groups
----------------------------
......@@ -524,10 +593,10 @@ thrToMeta thr branches =
-- rest = all the branches we still have to separate
-}
separateBranches :: Double -> Similarity -> Double -> Map Int Double -> Int -> Double -> Double
-> Int -> Map Date Double -> Map Date Cooc -> [Period]
-> Int -> Map Date Double -> Map Date Cooc -> Map Int [PhyloGroupId] -> [Period]
-> [(Branch,ShouldTry)] -> (Branch,ShouldTry) -> [(Branch,ShouldTry)]
-> [(Branch,ShouldTry)]
separateBranches fdt similarity lambda frequency minBranch thr rise timescale docs coocs periods done currentBranch rest =
separateBranches fdt similarity lambda frequency minBranch thr rise timescale docs coocs roots periods done currentBranch rest =
let done' = done ++ (if snd currentBranch
then
(if ((null (fst branches')) || (quality > quality'))
......@@ -551,7 +620,7 @@ separateBranches fdt similarity lambda frequency minBranch thr rise timescale do
-- 6) if there is no more branch to separate tne return [done'] else continue with [rest]
if null rest
then done'
else separateBranches fdt similarity lambda frequency minBranch thr rise timescale docs coocs periods
else separateBranches fdt similarity lambda frequency minBranch thr rise timescale docs coocs roots periods
done' (List.head rest) (List.tail rest)
where
------- 1) compute the quality before splitting any branch
......@@ -560,7 +629,7 @@ separateBranches fdt similarity lambda frequency minBranch thr rise timescale do
------------------- 2) split the current branch and create a new phylomemetic network
phylomemeticNetwork :: [Branch]
phylomemeticNetwork = toPhylomemeticNetwork timescale periods similarity thr docs coocs (fst currentBranch)
phylomemeticNetwork = toPhylomemeticNetwork timescale periods similarity thr docs coocs roots (fst currentBranch)
--------- 3) change the new phylomemetic network into a tuple of new branches
--------- on the left : the long branches, on the right : the small ones
......@@ -582,9 +651,10 @@ seaLevelRise :: Double -> Similarity -> Double -> Int -> Map Int Double
-> [Double] -> Double
-> Int -> [Period]
-> Map Date Double -> Map Date Cooc
-> Map Int [PhyloGroupId]
-> [(Branch,ShouldTry)]
-> ([(Branch,ShouldTry)],FinalQuality)
seaLevelRise fdt similarity lambda minBranch frequency ladder rise frame periods docs coocs branches =
seaLevelRise fdt similarity lambda minBranch frequency ladder rise frame periods docs coocs roots branches =
-- if the ladder is empty or thr > 1 or there is no branch to break then stop
if (null ladder) || ((List.head ladder) > 1) || (stopRise branches)
then (branches, toPhyloQuality fdt lambda frequency (map fst branches))
......@@ -596,9 +666,9 @@ seaLevelRise fdt similarity lambda minBranch frequency ladder rise frame periods
<> " ξ = " <> printf "%.5f" (globalAccuracy frequency (map fst branches))
<> " ρ = " <> printf "%.5f" (globalRecall frequency (map fst branches))
<> " branches = " <> show(length branches))
$ separateBranches fdt similarity lambda frequency minBranch thr rise frame docs coocs periods
$ separateBranches fdt similarity lambda frequency minBranch thr rise frame docs coocs roots periods
[] (List.head branches) (List.tail branches)
in seaLevelRise fdt similarity lambda minBranch frequency (List.tail ladder) (rise + 1) frame periods docs coocs branches'
in seaLevelRise fdt similarity lambda minBranch frequency (List.tail ladder) (rise + 1) frame periods docs coocs roots branches'
where
--------
stopRise :: [(Branch,ShouldTry)] -> Bool
......@@ -633,6 +703,7 @@ temporalMatching ladder phylo = updatePhyloGroups 1
(getPeriodIds phylo)
(phylo ^. phylo_timeDocs)
(phylo ^. phylo_timeCooc)
((phylo ^. phylo_foundations) ^. foundations_rootsInGroups)
(reverse $ sortOn (length . fst) seabed)
------ 1) for each group, process an initial temporal Matching and create a 'seabed'
......@@ -645,4 +716,5 @@ temporalMatching ladder phylo = updatePhyloGroups 1
(List.head ladder)
(phylo ^. phylo_timeDocs)
(phylo ^. phylo_timeCooc)
((phylo ^. phylo_foundations) ^. foundations_rootsInGroups)
(traceTemporalMatching $ getGroupsFromScale 1 phylo)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment