Commit c60dc0b7 authored by qlobbe's avatar qlobbe

end of phylo clustering

parent bea591f6
Pipeline #571 failed with stage
...@@ -26,7 +26,7 @@ import Data.ByteString.Lazy (ByteString) ...@@ -26,7 +26,7 @@ import Data.ByteString.Lazy (ByteString)
import Data.Maybe (isJust, fromJust) import Data.Maybe (isJust, fromJust)
import Data.List (concat, nub, isSuffixOf, take) import Data.List (concat, nub, isSuffixOf, take)
import Data.String (String) import Data.String (String)
import Data.Text (Text, unwords) import Data.Text (Text, unwords, unpack)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..)) import Gargantext.Database.Types.Node (HyperdataDocument(..))
...@@ -37,6 +37,8 @@ import Gargantext.Text.List.CSV (csvGraphTermList) ...@@ -37,6 +37,8 @@ import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList) import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Viz.AdaptativePhylo import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloMaker (toPhylo) import Gargantext.Viz.Phylo.PhyloMaker (toPhylo)
import Gargantext.Viz.Phylo.PhyloTools (printIOMsg, printIOComment)
import Gargantext.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
import GHC.IO (FilePath) import GHC.IO (FilePath)
import Prelude (Either(..)) import Prelude (Either(..))
...@@ -54,21 +56,6 @@ import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv ...@@ -54,21 +56,6 @@ import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
--------------- ---------------
-- | To print an important message as an IO()
printIOMsg :: String -> IO ()
printIOMsg msg =
putStrLn ( "\n"
<> "------------"
<> "\n"
<> "-- | " <> msg <> "\n" )
-- | To print a comment as an IO()
printIOComment :: String -> IO ()
printIOComment cmt =
putStrLn ( "\n" <> cmt <> "\n" )
-- | To get all the files in a directory or just a file -- | To get all the files in a directory or just a file
getFilesFromPath :: FilePath -> IO([FilePath]) getFilesFromPath :: FilePath -> IO([FilePath])
getFilesFromPath path = do getFilesFromPath path = do
...@@ -166,6 +153,17 @@ main = do ...@@ -166,6 +153,17 @@ main = do
printIOComment (show (length corpus) <> " parsed docs from the corpus") printIOComment (show (length corpus) <> " parsed docs from the corpus")
printIOMsg "Reconstruct the Phylo" printIOMsg "Reconstruct the Phylo"
let phylo = toPhylo corpus mapList config let phylo = toPhylo corpus mapList config
printIOMsg "End of reconstruction" printIOMsg "End of reconstruction, start the export"
\ No newline at end of file
let dot = toPhyloExport phylo
printIOMsg "End of export to dot"
let output = (outputPath config)
<> (unpack $ phyloName config)
<> "_V2.dot"
dotToFile output dot
\ No newline at end of file
...@@ -115,7 +115,7 @@ defaultConfig = ...@@ -115,7 +115,7 @@ defaultConfig =
, phyloName = pack "Default Phylo" , phyloName = pack "Default Phylo"
, phyloLevel = 1 , phyloLevel = 1
, phyloProximity = WeightedLogJaccard 10 0 0.1 , phyloProximity = WeightedLogJaccard 10 0 0.1
, phyloSynchrony = ByProximityThreshold 0.4 , phyloSynchrony = ByProximityThreshold 0.1
, timeUnit = Year 3 1 5 , timeUnit = Year 3 1 5
, contextualUnit = Fis 2 4 , contextualUnit = Fis 2 4
, exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2] , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
......
...@@ -45,7 +45,6 @@ phyloExport = dotToFile "/home/qlobbe/data/phylo/output/cesar_cleopatre_V2.dot" ...@@ -45,7 +45,6 @@ phyloExport = dotToFile "/home/qlobbe/data/phylo/output/cesar_cleopatre_V2.dot"
phyloDot :: DotGraph DotId phyloDot :: DotGraph DotId
phyloDot = toPhyloExport phylo2 phyloDot = toPhyloExport phylo2
phylo2 :: Phylo phylo2 :: Phylo
phylo2 = synchronicClustering phylo1 phylo2 = synchronicClustering phylo1
......
...@@ -447,15 +447,12 @@ toPhyloExport phylo = exportToDot phylo ...@@ -447,15 +447,12 @@ toPhyloExport phylo = exportToDot phylo
export = PhyloExport groups branches export = PhyloExport groups branches
-------------------------------------- --------------------------------------
branches :: [PhyloBranch] branches :: [PhyloBranch]
branches = traceBranches $ map (\bId -> PhyloBranch bId "" empty) $ nub $ map _phylo_groupBranchId groups branches = traceExportBranches $ map (\bId -> PhyloBranch bId "" empty) $ nub $ map _phylo_groupBranchId groups
-------------------------------------- --------------------------------------
groups :: [PhyloGroup] groups :: [PhyloGroup]
groups = traceGroups $ processDynamics groups = processDynamics
$ getGroupsFromLevel (phyloLevel $ getConfig phylo) phylo $ getGroupsFromLevel (phyloLevel $ getConfig phylo) phylo
traceBranches :: [PhyloBranch] -> [PhyloBranch] traceExportBranches :: [PhyloBranch] -> [PhyloBranch]
traceBranches branches = trace (">>>> nb branches : " <> show(length branches)) branches traceExportBranches branches = trace ("\n" <> "-- | Export " <> show(length branches) <> " branches") branches
traceGroups :: [PhyloGroup] -> [PhyloGroup]
traceGroups groups = trace (">>>> nb groups : " <> show(length groups)) groups
\ No newline at end of file
...@@ -43,13 +43,14 @@ import qualified Data.Set as Set ...@@ -43,13 +43,14 @@ import qualified Data.Set as Set
toPhylo :: [Document] -> TermList -> Config -> Phylo toPhylo :: [Document] -> TermList -> Config -> Phylo
toPhylo docs lst conf = phylo1 toPhylo docs lst conf = traceToPhylo (phyloLevel conf) $
if (phyloLevel conf) > 1
then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
else phylo1
where where
-------------------------------------- --------------------------------------
phylo1 :: Phylo phylo1 :: Phylo
phylo1 = synchronicClustering phylo1 = toPhylo1 docs phyloBase
$ temporalMatching
$ toPhylo1 docs phyloBase
-------------------------------------- --------------------------------------
phyloBase :: Phylo phyloBase :: Phylo
phyloBase = toPhyloBase docs lst conf phyloBase = toPhyloBase docs lst conf
...@@ -230,4 +231,4 @@ toPhyloBase docs lst conf = ...@@ -230,4 +231,4 @@ toPhyloBase docs lst conf =
(docsToTimeScaleCooc docs (foundations ^. foundations_roots)) (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
(docsToTimeScaleNb docs) (docsToTimeScaleNb docs)
params params
(fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels (phyloLevel conf) prd))) periods) (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)
...@@ -33,6 +33,25 @@ import qualified Data.Vector as Vector ...@@ -33,6 +33,25 @@ import qualified Data.Vector as Vector
import qualified Data.List as List import qualified Data.List as List
import qualified Data.Set as Set import qualified Data.Set as Set
------------
-- | Io | --
------------
-- | To print an important message as an IO()
printIOMsg :: String -> IO ()
printIOMsg msg =
putStrLn ( "\n"
<> "------------"
<> "\n"
<> "-- | " <> msg <> "\n" )
-- | To print a comment as an IO()
printIOComment :: String -> IO ()
printIOComment cmt =
putStrLn ( "\n" <> cmt <> "\n" )
-------------- --------------
-- | Misc | -- -- | Misc | --
-------------- --------------
...@@ -232,6 +251,9 @@ getPeriodIds phylo = sortOn fst ...@@ -232,6 +251,9 @@ getPeriodIds phylo = sortOn fst
$ keys $ keys
$ phylo ^. phylo_periods $ phylo ^. phylo_periods
getLevelParentId :: PhyloGroup -> PhyloGroupId
getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
getLastLevel :: Phylo -> Level getLastLevel :: Phylo -> Level
getLastLevel phylo = last' "lastLevel" $ getLevels phylo getLastLevel phylo = last' "lastLevel" $ getLevels phylo
...@@ -282,6 +304,13 @@ updatePhyloGroups lvl m phylo = ...@@ -282,6 +304,13 @@ updatePhyloGroups lvl m phylo =
then m ! id then m ! id
else group ) phylo else group ) phylo
traceToPhylo :: Level -> Phylo -> Phylo
traceToPhylo lvl phylo =
trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
<> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
<> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
-------------------- --------------------
-- | Clustering | -- -- | Clustering | --
-------------------- --------------------
...@@ -298,6 +327,21 @@ relatedComponents graphs = foldl' (\mem groups -> ...@@ -298,6 +327,21 @@ relatedComponents graphs = foldl' (\mem groups ->
else (mem \\ related) ++ [union groups (nub $ concat related)] ) [] graphs else (mem \\ related) ++ [union groups (nub $ concat related)] ) [] graphs
traceSynchronyEnd :: Phylo -> Phylo
traceSynchronyEnd phylo =
trace ( "\n" <> "-- | End of synchronic clustering for level " <> show (getLastLevel phylo)
<> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
<> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
<> "\n" ) phylo
traceSynchronyStart :: Phylo -> Phylo
traceSynchronyStart phylo =
trace ( "\n" <> "-- | Start of synchronic clustering for level " <> show (getLastLevel phylo)
<> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
<> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
<> "\n" ) phylo
------------------- -------------------
-- | Proximity | -- -- | Proximity | --
------------------- -------------------
......
...@@ -31,51 +31,56 @@ import Debug.Trace (trace) ...@@ -31,51 +31,56 @@ import Debug.Trace (trace)
-- | New Level Maker | -- -- | New Level Maker | --
------------------------- -------------------------
mergeGroups :: [Cooc] -> PhyloGroupId -> [PhyloGroup] -> PhyloGroup toBranchId :: PhyloGroup -> PhyloBranchId
mergeGroups coocs id childs = toBranchId child = ((child ^. phylo_groupLevel) + 1, snd (child ^. phylo_groupBranchId))
mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
mergeGroups coocs id mapIds childs =
let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
in PhyloGroup (fst $ fst id) in PhyloGroup (fst $ fst id) (snd $ fst id) (snd id) ""
(snd $ fst id) (sum $ map _phylo_groupSupport childs) ngrams
(snd id) (ngramsToCooc ngrams coocs) (toBranchId (head' "mergeGroups" childs))
"" empty [] (map (\g -> (getGroupId g, 1)) childs)
(sum $ map _phylo_groupSupport childs) (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
ngrams (updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
(ngramsToCooc ngrams coocs) where
(((head' "mergeGroups" childs) ^. phylo_groupLevel) + 1, snd ((head' "mergeGroups" childs) ^. phylo_groupBranchId)) updatePointers :: [Pointer] -> [Pointer]
empty updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
[]
(map (\g -> (getGroupId g, 1)) childs)
(concat $ map _phylo_groupPeriodParents childs) addPhyloLevel :: Level -> Phylo -> Phylo
(concat $ map _phylo_groupPeriodChilds childs) addPhyloLevel lvl phylo =
over ( phylo_periods . traverse )
(\phyloPrd -> phyloPrd & phylo_periodLevels
addNewLevel :: Level -> Phylo -> Phylo %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl) (PhyloLevel (phyloPrd ^. phylo_periodPeriod) lvl empty))) phylo
addNewLevel lvl phylo =
over ( phylo_periods
. traverse )
(\phyloPrd ->
phyloPrd & phylo_periodLevels %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl + 1)
(PhyloLevel (phyloPrd ^. phylo_periodPeriod) (lvl + 1) empty))) phylo
toNextLevel :: Phylo -> [PhyloGroup] -> Phylo toNextLevel :: Phylo -> [PhyloGroup] -> Phylo
toNextLevel phylo groups = toNextLevel phylo groups =
let level = getLastLevel phylo let curLvl = getLastLevel phylo
phylo' = updatePhyloGroups level (fromList $ map (\g -> (getGroupId g, g)) groups) phylo oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
nextGroups = fromListWith (++) newGroups = fromListWith (++)
$ foldlWithKey (\acc k v -> -- | 5) group the parents by periods
let group = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [fst $ fst k]) k v $ foldlWithKey (\acc id groups' ->
in acc ++ [(group ^. phylo_groupPeriod,[group])]) [] -- | 4) create the parent group
$ fromListWith (++) $ map (\g -> (fst $ head' "nextGroups" $ g ^. phylo_groupLevelParents,[g])) groups let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
in trace (">>>>>>>>>>>>>>>>>>>>>>>>" <> show (nextGroups)) over ( phylo_periods in acc ++ [(parent ^. phylo_groupPeriod, [parent])]) []
. traverse -- | 3) group the current groups by parentId
. phylo_periodLevels $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
. traverse in traceSynchronyEnd
. filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (level + 1))) $ over ( phylo_periods . traverse . phylo_periodLevels . traverse
(\phyloLvl -> if member (phyloLvl ^. phylo_levelPeriod) nextGroups -- | 6) update each period at curLvl + 1
then phyloLvl & phylo_levelGroups .~ fromList ( map (\g -> (getGroupId g,g)) . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (curLvl + 1)))
$ nextGroups ! (phyloLvl ^. phylo_levelPeriod)) -- | 7) by adding the parents
else phyloLvl (\phyloLvl ->
) $ addNewLevel level phylo' if member (phyloLvl ^. phylo_levelPeriod) newGroups
then phyloLvl & phylo_levelGroups
.~ fromList (map (\g -> (getGroupId g, g)) $ newGroups ! (phyloLvl ^. phylo_levelPeriod))
else phyloLvl)
-- | 2) add the curLvl + 1 phyloLevel to the phylo
$ addPhyloLevel (curLvl + 1)
-- | 1) update the current groups (with level parent pointers) in the phylo
$ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
-------------------- --------------------
...@@ -99,6 +104,9 @@ groupsToEdges prox thr docs groups = ...@@ -99,6 +104,9 @@ groupsToEdges prox thr docs groups =
toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]] toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
toRelatedComponents nodes edges = relatedComponents $ ((map (\((g,g'),_) -> [g,g']) edges) ++ (map (\g -> [g]) nodes)) toRelatedComponents nodes edges = relatedComponents $ ((map (\((g,g'),_) -> [g,g']) edges) ++ (map (\g -> [g]) nodes))
toParentId :: PhyloGroup -> PhyloGroupId
toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupLevel + 1), child ^. phylo_groupIndex)
reduceBranch :: Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup] reduceBranch :: Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
reduceBranch prox thr docs branch = reduceBranch prox thr docs branch =
...@@ -109,12 +117,12 @@ reduceBranch prox thr docs branch = ...@@ -109,12 +117,12 @@ reduceBranch prox thr docs branch =
$ mapWithKey (\prd groups -> $ mapWithKey (\prd groups ->
-- | 2) for each period, transform the groups as a proximity graph filtered by a threshold -- | 2) for each period, transform the groups as a proximity graph filtered by a threshold
let edges = groupsToEdges prox thr ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) groups let edges = groupsToEdges prox thr ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) groups
in map (\(idx,comp) -> in map (\comp ->
-- | 4) add to each groups their futur level parent group -- | 4) add to each groups their futur level parent group
let parentId = (((head' "reduceBranch" comp) ^. phylo_groupPeriod, 1 + (head' "reduceBranch" comp) ^. phylo_groupLevel), idx) let parentId = toParentId (head' "parentId" comp)
in map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId,1)]) ) comp ) in map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId,1)]) ) comp )
-- |3) reduce the graph a a set of related components -- |3) reduce the graph a a set of related components
$ zip [1..] (toRelatedComponents groups edges)) periods $ toRelatedComponents groups edges) periods
synchronicClustering :: Phylo -> Phylo synchronicClustering :: Phylo -> Phylo
...@@ -123,5 +131,6 @@ synchronicClustering phylo = ...@@ -123,5 +131,6 @@ synchronicClustering phylo =
ByProximityThreshold thr -> toNextLevel phylo ByProximityThreshold thr -> toNextLevel phylo
$ concat $ concat
$ map (\branch -> reduceBranch (phyloProximity $ getConfig phylo) thr (phylo ^. phylo_timeDocs) branch) $ map (\branch -> reduceBranch (phyloProximity $ getConfig phylo) thr (phylo ^. phylo_timeDocs) branch)
$ phyloToLastBranches phylo $ phyloToLastBranches
$ traceSynchronyStart phylo
ByProximityDistribution -> undefined ByProximityDistribution -> undefined
\ No newline at end of file
...@@ -260,19 +260,19 @@ groupsToBranches groups = ...@@ -260,19 +260,19 @@ groupsToBranches groups =
recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup] recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup]
recursiveMatching proximity thr frame periods docs quality branches = recursiveMatching proximity thr frame periods docs quality branches =
if (length branches == (length $ concat branches)) if (length branches == (length $ concat branches))
then concat $ traceMatchNoSplit branches then concat branches
else if thr > 1 else if thr > 1
then concat $ traceMatchLimit branches then concat branches
else else
case quality <= (sum nextQualities) of case quality <= (sum nextQualities) of
-- | success : the new threshold improves the quality score, let's go deeper -- | success : the new threshold improves the quality score, let's go deeper (traceMatchSuccess thr quality (sum nextQualities))
True -> concat True -> concat
$ map (\branches' -> $ map (\branches' ->
let idx = fromJust $ elemIndex branches' nextBranches let idx = fromJust $ elemIndex branches' nextBranches
in recursiveMatching proximity (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches') in recursiveMatching proximity (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches')
$ traceMatchSuccess thr quality (sum nextQualities) nextBranches $ nextBranches
-- | failure : last step was a local maximum of quality, let's validate it -- | failure : last step was a local maximum of quality, let's validate it (traceMatchFailure thr quality (sum nextQualities))
False -> concat $ traceMatchFailure thr quality (sum nextQualities) branches False -> concat branches
where where
-- | 2) for each of the possible next branches process the phyloQuality score -- | 2) for each of the possible next branches process the phyloQuality score
nextQualities :: [Double] nextQualities :: [Double]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment