Commit c60dc0b7 authored by qlobbe's avatar qlobbe

end of phylo clustering

parent bea591f6
Pipeline #571 failed with stage
......@@ -26,7 +26,7 @@ import Data.ByteString.Lazy (ByteString)
import Data.Maybe (isJust, fromJust)
import Data.List (concat, nub, isSuffixOf, take)
import Data.String (String)
import Data.Text (Text, unwords)
import Data.Text (Text, unwords, unpack)
import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
......@@ -37,6 +37,8 @@ import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloMaker (toPhylo)
import Gargantext.Viz.Phylo.PhyloTools (printIOMsg, printIOComment)
import Gargantext.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
import GHC.IO (FilePath)
import Prelude (Either(..))
......@@ -54,21 +56,6 @@ import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
---------------
-- | To print an important message as an IO()
printIOMsg :: String -> IO ()
printIOMsg msg =
putStrLn ( "\n"
<> "------------"
<> "\n"
<> "-- | " <> msg <> "\n" )
-- | To print a comment as an IO()
printIOComment :: String -> IO ()
printIOComment cmt =
putStrLn ( "\n" <> cmt <> "\n" )
-- | To get all the files in a directory or just a file
getFilesFromPath :: FilePath -> IO([FilePath])
getFilesFromPath path = do
......@@ -166,6 +153,17 @@ main = do
printIOComment (show (length corpus) <> " parsed docs from the corpus")
printIOMsg "Reconstruct the Phylo"
let phylo = toPhylo corpus mapList config
printIOMsg "End of reconstruction"
\ No newline at end of file
printIOMsg "End of reconstruction, start the export"
let dot = toPhyloExport phylo
printIOMsg "End of export to dot"
let output = (outputPath config)
<> (unpack $ phyloName config)
<> "_V2.dot"
dotToFile output dot
\ No newline at end of file
......@@ -115,7 +115,7 @@ defaultConfig =
, phyloName = pack "Default Phylo"
, phyloLevel = 1
, phyloProximity = WeightedLogJaccard 10 0 0.1
, phyloSynchrony = ByProximityThreshold 0.4
, phyloSynchrony = ByProximityThreshold 0.1
, timeUnit = Year 3 1 5
, contextualUnit = Fis 2 4
, exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
......
......@@ -45,7 +45,6 @@ phyloExport = dotToFile "/home/qlobbe/data/phylo/output/cesar_cleopatre_V2.dot"
phyloDot :: DotGraph DotId
phyloDot = toPhyloExport phylo2
phylo2 :: Phylo
phylo2 = synchronicClustering phylo1
......
......@@ -447,15 +447,12 @@ toPhyloExport phylo = exportToDot phylo
export = PhyloExport groups branches
--------------------------------------
branches :: [PhyloBranch]
branches = traceBranches $ map (\bId -> PhyloBranch bId "" empty) $ nub $ map _phylo_groupBranchId groups
branches = traceExportBranches $ map (\bId -> PhyloBranch bId "" empty) $ nub $ map _phylo_groupBranchId groups
--------------------------------------
groups :: [PhyloGroup]
groups = traceGroups $ processDynamics
groups = processDynamics
$ getGroupsFromLevel (phyloLevel $ getConfig phylo) phylo
traceBranches :: [PhyloBranch] -> [PhyloBranch]
traceBranches branches = trace (">>>> nb branches : " <> show(length branches)) branches
traceGroups :: [PhyloGroup] -> [PhyloGroup]
traceGroups groups = trace (">>>> nb groups : " <> show(length groups)) groups
\ No newline at end of file
traceExportBranches :: [PhyloBranch] -> [PhyloBranch]
traceExportBranches branches = trace ("\n" <> "-- | Export " <> show(length branches) <> " branches") branches
......@@ -43,13 +43,14 @@ import qualified Data.Set as Set
toPhylo :: [Document] -> TermList -> Config -> Phylo
toPhylo docs lst conf = phylo1
toPhylo docs lst conf = traceToPhylo (phyloLevel conf) $
if (phyloLevel conf) > 1
then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
else phylo1
where
--------------------------------------
phylo1 :: Phylo
phylo1 = synchronicClustering
$ temporalMatching
$ toPhylo1 docs phyloBase
phylo1 = toPhylo1 docs phyloBase
--------------------------------------
phyloBase :: Phylo
phyloBase = toPhyloBase docs lst conf
......@@ -230,4 +231,4 @@ toPhyloBase docs lst conf =
(docsToTimeScaleCooc docs (foundations ^. foundations_roots))
(docsToTimeScaleNb docs)
params
(fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels (phyloLevel conf) prd))) periods)
(fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)
......@@ -33,6 +33,25 @@ import qualified Data.Vector as Vector
import qualified Data.List as List
import qualified Data.Set as Set
------------
-- | Io | --
------------
-- | To print an important message as an IO()
printIOMsg :: String -> IO ()
printIOMsg msg =
putStrLn ( "\n"
<> "------------"
<> "\n"
<> "-- | " <> msg <> "\n" )
-- | To print a comment as an IO()
printIOComment :: String -> IO ()
printIOComment cmt =
putStrLn ( "\n" <> cmt <> "\n" )
--------------
-- | Misc | --
--------------
......@@ -232,6 +251,9 @@ getPeriodIds phylo = sortOn fst
$ keys
$ phylo ^. phylo_periods
getLevelParentId :: PhyloGroup -> PhyloGroupId
getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
getLastLevel :: Phylo -> Level
getLastLevel phylo = last' "lastLevel" $ getLevels phylo
......@@ -282,6 +304,13 @@ updatePhyloGroups lvl m phylo =
then m ! id
else group ) phylo
traceToPhylo :: Level -> Phylo -> Phylo
traceToPhylo lvl phylo =
trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
<> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
<> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
--------------------
-- | Clustering | --
--------------------
......@@ -298,6 +327,21 @@ relatedComponents graphs = foldl' (\mem groups ->
else (mem \\ related) ++ [union groups (nub $ concat related)] ) [] graphs
traceSynchronyEnd :: Phylo -> Phylo
traceSynchronyEnd phylo =
trace ( "\n" <> "-- | End of synchronic clustering for level " <> show (getLastLevel phylo)
<> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
<> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
<> "\n" ) phylo
traceSynchronyStart :: Phylo -> Phylo
traceSynchronyStart phylo =
trace ( "\n" <> "-- | Start of synchronic clustering for level " <> show (getLastLevel phylo)
<> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
<> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
<> "\n" ) phylo
-------------------
-- | Proximity | --
-------------------
......
......@@ -31,51 +31,56 @@ import Debug.Trace (trace)
-- | New Level Maker | --
-------------------------
mergeGroups :: [Cooc] -> PhyloGroupId -> [PhyloGroup] -> PhyloGroup
mergeGroups coocs id childs =
toBranchId :: PhyloGroup -> PhyloBranchId
toBranchId child = ((child ^. phylo_groupLevel) + 1, snd (child ^. phylo_groupBranchId))
mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
mergeGroups coocs id mapIds childs =
let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
in PhyloGroup (fst $ fst id)
(snd $ fst id)
(snd id)
""
(sum $ map _phylo_groupSupport childs)
ngrams
(ngramsToCooc ngrams coocs)
(((head' "mergeGroups" childs) ^. phylo_groupLevel) + 1, snd ((head' "mergeGroups" childs) ^. phylo_groupBranchId))
empty
[]
(map (\g -> (getGroupId g, 1)) childs)
(concat $ map _phylo_groupPeriodParents childs)
(concat $ map _phylo_groupPeriodChilds childs)
addNewLevel :: Level -> Phylo -> Phylo
addNewLevel lvl phylo =
over ( phylo_periods
. traverse )
(\phyloPrd ->
phyloPrd & phylo_periodLevels %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl + 1)
(PhyloLevel (phyloPrd ^. phylo_periodPeriod) (lvl + 1) empty))) phylo
in PhyloGroup (fst $ fst id) (snd $ fst id) (snd id) ""
(sum $ map _phylo_groupSupport childs) ngrams
(ngramsToCooc ngrams coocs) (toBranchId (head' "mergeGroups" childs))
empty [] (map (\g -> (getGroupId g, 1)) childs)
(updatePointers $ concat $ map _phylo_groupPeriodParents childs)
(updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
where
updatePointers :: [Pointer] -> [Pointer]
updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
addPhyloLevel :: Level -> Phylo -> Phylo
addPhyloLevel lvl phylo =
over ( phylo_periods . traverse )
(\phyloPrd -> phyloPrd & phylo_periodLevels
%~ (insert (phyloPrd ^. phylo_periodPeriod, lvl) (PhyloLevel (phyloPrd ^. phylo_periodPeriod) lvl empty))) phylo
toNextLevel :: Phylo -> [PhyloGroup] -> Phylo
toNextLevel phylo groups =
let level = getLastLevel phylo
phylo' = updatePhyloGroups level (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
nextGroups = fromListWith (++)
$ foldlWithKey (\acc k v ->
let group = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [fst $ fst k]) k v
in acc ++ [(group ^. phylo_groupPeriod,[group])]) []
$ fromListWith (++) $ map (\g -> (fst $ head' "nextGroups" $ g ^. phylo_groupLevelParents,[g])) groups
in trace (">>>>>>>>>>>>>>>>>>>>>>>>" <> show (nextGroups)) over ( phylo_periods
. traverse
. phylo_periodLevels
. traverse
. filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (level + 1)))
(\phyloLvl -> if member (phyloLvl ^. phylo_levelPeriod) nextGroups
then phyloLvl & phylo_levelGroups .~ fromList ( map (\g -> (getGroupId g,g))
$ nextGroups ! (phyloLvl ^. phylo_levelPeriod))
else phyloLvl
) $ addNewLevel level phylo'
let curLvl = getLastLevel phylo
oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
newGroups = fromListWith (++)
-- | 5) group the parents by periods
$ foldlWithKey (\acc id groups' ->
-- | 4) create the parent group
let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
in acc ++ [(parent ^. phylo_groupPeriod, [parent])]) []
-- | 3) group the current groups by parentId
$ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
in traceSynchronyEnd
$ over ( phylo_periods . traverse . phylo_periodLevels . traverse
-- | 6) update each period at curLvl + 1
. filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (curLvl + 1)))
-- | 7) by adding the parents
(\phyloLvl ->
if member (phyloLvl ^. phylo_levelPeriod) newGroups
then phyloLvl & phylo_levelGroups
.~ fromList (map (\g -> (getGroupId g, g)) $ newGroups ! (phyloLvl ^. phylo_levelPeriod))
else phyloLvl)
-- | 2) add the curLvl + 1 phyloLevel to the phylo
$ addPhyloLevel (curLvl + 1)
-- | 1) update the current groups (with level parent pointers) in the phylo
$ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
--------------------
......@@ -99,6 +104,9 @@ groupsToEdges prox thr docs groups =
toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
toRelatedComponents nodes edges = relatedComponents $ ((map (\((g,g'),_) -> [g,g']) edges) ++ (map (\g -> [g]) nodes))
toParentId :: PhyloGroup -> PhyloGroupId
toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupLevel + 1), child ^. phylo_groupIndex)
reduceBranch :: Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
reduceBranch prox thr docs branch =
......@@ -109,12 +117,12 @@ reduceBranch prox thr docs branch =
$ mapWithKey (\prd groups ->
-- | 2) for each period, transform the groups as a proximity graph filtered by a threshold
let edges = groupsToEdges prox thr ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) groups
in map (\(idx,comp) ->
in map (\comp ->
-- | 4) add to each groups their futur level parent group
let parentId = (((head' "reduceBranch" comp) ^. phylo_groupPeriod, 1 + (head' "reduceBranch" comp) ^. phylo_groupLevel), idx)
let parentId = toParentId (head' "parentId" comp)
in map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId,1)]) ) comp )
-- |3) reduce the graph a a set of related components
$ zip [1..] (toRelatedComponents groups edges)) periods
$ toRelatedComponents groups edges) periods
synchronicClustering :: Phylo -> Phylo
......@@ -123,5 +131,6 @@ synchronicClustering phylo =
ByProximityThreshold thr -> toNextLevel phylo
$ concat
$ map (\branch -> reduceBranch (phyloProximity $ getConfig phylo) thr (phylo ^. phylo_timeDocs) branch)
$ phyloToLastBranches phylo
$ phyloToLastBranches
$ traceSynchronyStart phylo
ByProximityDistribution -> undefined
\ No newline at end of file
......@@ -260,19 +260,19 @@ groupsToBranches groups =
recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup]
recursiveMatching proximity thr frame periods docs quality branches =
if (length branches == (length $ concat branches))
then concat $ traceMatchNoSplit branches
then concat branches
else if thr > 1
then concat $ traceMatchLimit branches
then concat branches
else
case quality <= (sum nextQualities) of
-- | success : the new threshold improves the quality score, let's go deeper
-- | success : the new threshold improves the quality score, let's go deeper (traceMatchSuccess thr quality (sum nextQualities))
True -> concat
$ map (\branches' ->
let idx = fromJust $ elemIndex branches' nextBranches
in recursiveMatching proximity (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches')
$ traceMatchSuccess thr quality (sum nextQualities) nextBranches
-- | failure : last step was a local maximum of quality, let's validate it
False -> concat $ traceMatchFailure thr quality (sum nextQualities) branches
$ nextBranches
-- | failure : last step was a local maximum of quality, let's validate it (traceMatchFailure thr quality (sum nextQualities))
False -> concat branches
where
-- | 2) for each of the possible next branches process the phyloQuality score
nextQualities :: [Double]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment