Commit eeeb82c8 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch 'dev-phylo' into dev-merge

parents 76e11752 077bf19a
Pipeline #592 failed with stage
......@@ -26,7 +26,7 @@ import Data.ByteString.Lazy (ByteString)
import Data.Maybe (isJust, fromJust)
import Data.List (concat, nub, isSuffixOf, take)
import Data.String (String)
import Data.Text (Text, unwords)
import Data.Text (Text, unwords, unpack)
import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
......@@ -37,6 +37,9 @@ import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloMaker (toPhylo)
import Gargantext.Viz.Phylo.PhyloTools (printIOMsg, printIOComment)
import Gargantext.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
-- import Gargantext.Viz.Phylo.SynchronicClustering (synchronicDistance')
import GHC.IO (FilePath)
import Prelude (Either(..))
......@@ -54,21 +57,6 @@ import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
---------------
-- | To print an important message as an IO()
printIOMsg :: String -> IO ()
printIOMsg msg =
putStrLn ( "\n"
<> "------------"
<> "\n"
<> "-- | " <> msg <> "\n" )
-- | To print a comment as an IO()
printIOComment :: String -> IO ()
printIOComment cmt =
putStrLn ( "\n" <> cmt <> "\n" )
-- | To get all the files in a directory or just a file
getFilesFromPath :: FilePath -> IO([FilePath])
getFilesFromPath path = do
......@@ -166,6 +154,23 @@ main = do
printIOComment (show (length corpus) <> " parsed docs from the corpus")
printIOMsg "Reconstruct the Phylo"
let phylo = toPhylo corpus mapList config
printIOMsg "End of reconstruction"
\ No newline at end of file
-- | probes
-- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
-- $ synchronicDistance' phylo 1
-- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
-- $ inflexionPoints phylo 1
printIOMsg "End of reconstruction, start the export"
let dot = toPhyloExport phylo
let output = (outputPath config)
<> (unpack $ phyloName config)
<> "_V2.dot"
dotToFile output dot
\ No newline at end of file
......@@ -72,6 +72,9 @@ library:
- Gargantext.Viz.AdaptativePhylo
- Gargantext.Viz.Phylo.PhyloMaker
- Gargantext.Viz.Phylo.Tools
- Gargantext.Viz.Phylo.PhyloTools
- Gargantext.Viz.Phylo.PhyloExport
- Gargantext.Viz.Phylo.SynchronicClustering
- Gargantext.Viz.Phylo.Example
- Gargantext.Viz.Phylo.LevelMaker
- Gargantext.Viz.Phylo.View.Export
......
......@@ -44,6 +44,8 @@ import GHC.IO (FilePath)
import Control.DeepSeq (NFData)
import Control.Lens (makeLenses)
import qualified Data.Text.Lazy as TextLazy
----------------
-- | Config | --
......@@ -65,6 +67,15 @@ data Proximity =
deriving (Show,Generic,Eq)
data Synchrony =
ByProximityThreshold
{ _bpt_threshold :: Double
, _bpt_sensibility :: Double}
| ByProximityDistribution
{ _bpd_sensibility :: Double}
deriving (Show,Generic,Eq)
data TimeUnit =
Year
{ _year_period :: Int
......@@ -80,6 +91,12 @@ data ContextualUnit =
deriving (Show,Generic,Eq)
data Quality =
Quality { _qua_relevance :: Double
, _qua_minBranch :: Int }
deriving (Show,Generic,Eq)
data Config =
Config { corpusPath :: FilePath
, listPath :: FilePath
......@@ -88,9 +105,13 @@ data Config =
, phyloName :: Text
, phyloLevel :: Int
, phyloProximity :: Proximity
, phyloSynchrony :: Synchrony
, phyloQuality :: Quality
, timeUnit :: TimeUnit
, contextualUnit :: ContextualUnit
, branchSize :: Int
, exportLabel :: [PhyloLabel]
, exportSort :: Sort
, exportFilter :: [Filter]
} deriving (Show,Generic,Eq)
......@@ -102,10 +123,14 @@ defaultConfig =
, corpusParser = Csv 1000
, phyloName = pack "Default Phylo"
, phyloLevel = 2
, phyloProximity = WeightedLogJaccard 10 0 0.05
, phyloProximity = WeightedLogJaccard 10 0 0.1
, phyloSynchrony = ByProximityDistribution 0
, phyloQuality = Quality 0.1 1
, timeUnit = Year 3 1 5
, contextualUnit = Fis 2 4
, branchSize = 3
, contextualUnit = Fis 1 5
, exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
, exportSort = ByHierarchy
, exportFilter = [ByBranchSize 2]
}
instance FromJSON Config
......@@ -118,6 +143,20 @@ instance FromJSON TimeUnit
instance ToJSON TimeUnit
instance FromJSON ContextualUnit
instance ToJSON ContextualUnit
instance FromJSON PhyloLabel
instance ToJSON PhyloLabel
instance FromJSON Tagger
instance ToJSON Tagger
instance FromJSON Sort
instance ToJSON Sort
instance FromJSON Order
instance ToJSON Order
instance FromJSON Filter
instance ToJSON Filter
instance FromJSON Synchrony
instance ToJSON Synchrony
instance FromJSON Quality
instance ToJSON Quality
-- | Software parameters
......@@ -248,17 +287,18 @@ data PhyloGroup =
PhyloGroup { _phylo_groupPeriod :: (Date,Date)
, _phylo_groupLevel :: Level
, _phylo_groupIndex :: Int
, _phylo_groupLabel :: Text
, _phylo_groupSupport :: Support
, _phylo_groupNgrams :: [Int]
, _phylo_groupCooc :: !(Cooc)
, _phylo_groupBranchId :: PhyloBranchId
, _phylo_groupMeta :: Map Text [Double]
, _phylo_groupLevelParents :: [Pointer]
, _phylo_groupLevelChilds :: [Pointer]
, _phylo_groupPeriodParents :: [Pointer]
, _phylo_groupPeriodChilds :: [Pointer]
, _phylo_groupGhostPointers :: [Pointer]
}
deriving (Generic, Show, Eq)
deriving (Generic, Show, Eq, NFData)
-- | Weight : A generic mesure that can be associated with an Id
type Weight = Double
......@@ -266,8 +306,6 @@ type Weight = Double
-- | Pointer : A weighted pointer to a given PhyloGroup
type Pointer = (PhyloGroupId, Weight)
type Link = ((PhyloGroupId, PhyloGroupId), Weight)
data Filiation = ToParents | ToChilds deriving (Generic, Show)
data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
......@@ -290,13 +328,53 @@ data PhyloFis = PhyloFis
} deriving (Generic,NFData,Show,Eq)
----------------
-- | Export | --
----------------
type DotId = TextLazy.Text
data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | PeriodToPeriod deriving (Show,Generic,Eq)
data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
data Order = Asc | Desc deriving (Show,Generic,Eq)
data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
data Tagger = MostInclusive | MostEmergentInclusive deriving (Show,Generic,Eq)
data PhyloLabel =
BranchLabel
{ _branch_labelTagger :: Tagger
, _branch_labelSize :: Int }
| GroupLabel
{ _group_labelTagger :: Tagger
, _group_labelSize :: Int }
deriving (Show,Generic,Eq)
data PhyloBranch =
PhyloBranch
{ _branch_id :: PhyloBranchId
, _branch_label :: Text
, _branch_meta :: Map Text [Double]
} deriving (Generic, Show)
data PhyloExport =
PhyloExport
{ _export_groups :: [PhyloGroup]
, _export_branches :: [PhyloBranch]
} deriving (Generic, Show)
----------------
-- | Lenses | --
----------------
makeLenses ''Config
makeLenses ''Proximity
makeLenses ''Quality
makeLenses ''ContextualUnit
makeLenses ''PhyloLabel
makeLenses ''TimeUnit
makeLenses ''PhyloFoundations
makeLenses ''PhyloFis
......@@ -305,6 +383,8 @@ makeLenses ''PhyloPeriod
makeLenses ''PhyloLevel
makeLenses ''PhyloGroup
makeLenses ''PhyloParam
makeLenses ''PhyloExport
makeLenses ''PhyloBranch
------------------------
-- | JSON instances | --
......
......@@ -83,7 +83,7 @@ queryViewEx = "level=3"
phyloQueryView :: PhyloQueryView
phyloQueryView = PhyloQueryView 2 Merge False 2 [BranchAge,BranchBirth,BranchGroups] [] [BranchPeakInc,GroupLabelIncDyn] (Just (ByBranchBirth,Asc)) Json Flat True
phyloQueryView = PhyloQueryView 1 Merge False 1 [BranchAge,BranchBirth,BranchGroups] [] [BranchPeakInc,GroupLabelIncDyn] (Just (ByBranchBirth,Asc)) Json Flat True
--------------------------------------------------
......@@ -110,7 +110,7 @@ queryEx = "title=Cesar et Cleôpatre"
phyloQueryBuild :: PhyloQueryBuild
phyloQueryBuild = PhyloQueryBuild "Cesar et Cleôpatre" "An example of Phylomemy (french without accent)"
3 1 defaultFis [] [] (WeightedLogJaccard $ WLJParams 0.5 20) 5 0.8 0.5 4 2 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.3 0)
3 1 defaultFis [] [] (WeightedLogJaccard $ WLJParams 0.9 10) 5 0.8 0.5 4 1 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.3 0)
......
......@@ -29,18 +29,32 @@ import Gargantext.Text.Terms.Mono (monoTexts)
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.Phylo.PhyloMaker
import Gargantext.Viz.Phylo.PhyloExport
import Gargantext.Viz.Phylo.TemporalMatching (temporalMatching)
import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
import Control.Lens
import Data.GraphViz.Types.Generalised (DotGraph)
import qualified Data.Vector as Vector
phyloExport :: IO ()
phyloExport = dotToFile "/home/qlobbe/data/phylo/output/cesar_cleopatre_V2.dot" phyloDot
phyloDot :: DotGraph DotId
phyloDot = toPhyloExport phylo2
phylo2 :: Phylo
phylo2 = synchronicClustering phylo1
-----------------------------------------------
-- | STEP 3 | -- Build the Level 1 of the Phylo
-----------------------------------------------
phylo1 :: Phylo
phylo1 = appendGroups fisToGroup 1 phyloFis phyloBase
phylo1 = temporalMatching
$ appendGroups fisToGroup 1 phyloFis phyloBase
---------------------------------------------
......@@ -80,7 +94,8 @@ nbDocsByYear = docsToTimeScaleNb docs
config :: Config
config =
defaultConfig { phyloName = "Cesar et Cleopatre"
, branchSize = 0
, phyloLevel = 2
, exportFilter = [ByBranchSize 0]
, contextualUnit = Fis 0 0 }
......
......@@ -12,5 +12,461 @@ Portability : POSIX
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
module Gargantext.Viz.Phylo.PhyloExport where
import Data.Map (Map, fromList, empty, fromListWith, insert, (!), elems, unionWith, findWithDefault, toList)
import Data.List ((++), sort, nub, concat, sortOn, reverse, groupBy, union, (\\), (!!), init, partition, unwords, nubBy)
import Data.Vector (Vector)
import Prelude (writeFile)
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Control.Lens
import Data.GraphViz hiding (DotGraph, Order)
import Data.GraphViz.Types.Generalised (DotGraph)
import Data.GraphViz.Attributes.Complete hiding (EdgeType, Order)
import Data.GraphViz.Types.Monadic
import Data.Text.Lazy (fromStrict, pack, unpack)
import System.FilePath
import Debug.Trace (trace)
import qualified Data.Text as Text
import qualified Data.Text.Lazy as Lazy
import qualified Data.GraphViz.Attributes.HTML as H
--------------------
-- | Dot export | --
--------------------
dotToFile :: FilePath -> DotGraph DotId -> IO ()
dotToFile filePath dotG = writeFile filePath $ dotToString dotG
dotToString :: DotGraph DotId -> [Char]
dotToString dotG = unpack (printDotGraph dotG)
dynamicToColor :: Double -> H.Attribute
dynamicToColor d
| d == 0 = H.BGColor (toColor LightCoral)
| d == 1 = H.BGColor (toColor Khaki)
| d == 2 = H.BGColor (toColor SkyBlue)
| otherwise = H.Color (toColor Black)
pickLabelColor :: [Double] -> H.Attribute
pickLabelColor lst
| elem 0 lst = dynamicToColor 0
| elem 2 lst = dynamicToColor 2
| elem 1 lst = dynamicToColor 1
| otherwise = dynamicToColor 3
toDotLabel :: Text.Text -> Label
toDotLabel lbl = StrLabel $ fromStrict lbl
toAttr :: AttributeName -> Lazy.Text -> CustomAttribute
toAttr k v = customAttribute k v
metaToAttr :: Map Text.Text [Double] -> [CustomAttribute]
metaToAttr meta = map (\(k,v) -> toAttr (fromStrict k) $ (pack . unwords) $ map show v) $ toList meta
groupIdToDotId :: PhyloGroupId -> DotId
groupIdToDotId (((d,d'),lvl),idx) = (fromStrict . Text.pack) $ ("group" <> (show d) <> (show d') <> (show lvl) <> (show idx))
branchIdToDotId :: PhyloBranchId -> DotId
branchIdToDotId bId = (fromStrict . Text.pack) $ ("branch" <> show (snd bId))
periodIdToDotId :: PhyloPeriodId -> DotId
periodIdToDotId prd = (fromStrict . Text.pack) $ ("period" <> show (fst prd) <> show (snd prd))
groupToTable :: Vector Ngrams -> PhyloGroup -> H.Label
groupToTable fdt g = H.Table H.HTable
{ H.tableFontAttrs = Just [H.PointSize 14, H.Align H.HLeft]
, H.tableAttrs = [H.Border 0, H.CellBorder 0, H.BGColor (toColor White)]
, H.tableRows = [header]
<> [H.Cells [H.LabelCell [H.Height 10] $ H.Text [H.Str $ fromStrict ""]]]
<> ( map ngramsToRow $ splitEvery 4
$ reverse $ sortOn (snd . snd)
$ zip (ngramsToText fdt (g ^. phylo_groupNgrams))
$ zip ((g ^. phylo_groupMeta) ! "dynamics") ((g ^. phylo_groupMeta) ! "inclusion"))}
where
--------------------------------------
ngramsToRow :: [(Ngrams,(Double,Double))] -> H.Row
ngramsToRow ns = H.Cells $ map (\(n,(d,_)) ->
H.LabelCell [H.Align H.HLeft,dynamicToColor d] $ H.Text [H.Str $ fromStrict n]) ns
--------------------------------------
header :: H.Row
header =
H.Cells [ H.LabelCell [pickLabelColor ((g ^. phylo_groupMeta) ! "dynamics")]
$ H.Text [H.Str $ (((fromStrict . Text.toUpper) $ g ^. phylo_groupLabel)
<> (fromStrict " ( ")
<> (pack $ show (fst $ g ^. phylo_groupPeriod))
<> (fromStrict " , ")
<> (pack $ show (snd $ g ^. phylo_groupPeriod))
<> (fromStrict " ) "))]]
--------------------------------------
branchToDotNode :: PhyloBranch -> Dot DotId
branchToDotNode b =
node (branchIdToDotId $ b ^. branch_id)
([FillColor [toWColor CornSilk], FontName "Arial", FontSize 40, Shape Egg, Style [SItem Bold []], Label (toDotLabel $ b ^. branch_label)]
<> (metaToAttr $ b ^. branch_meta)
<> [ toAttr "nodeType" "branch"
, toAttr "branchId" (pack $ unwords (map show $ snd $ b ^. branch_id)) ])
periodToDotNode :: (Date,Date) -> Dot DotId
periodToDotNode prd =
node (periodIdToDotId prd)
([Shape Square, FontSize 50, Label (toDotLabel $ Text.pack (show (fst prd) <> " " <> show (snd prd)))]
<> [ toAttr "nodeType" "period"
, toAttr "from" (fromStrict $ Text.pack $ (show $ fst prd))
, toAttr "to" (fromStrict $ Text.pack $ (show $ snd prd))])
groupToDotNode :: Vector Ngrams -> PhyloGroup -> Dot DotId
groupToDotNode fdt g =
node (groupIdToDotId $ getGroupId g)
([FontName "Arial", Shape Square, toLabel (groupToTable fdt g)]
<> [ toAttr "nodeType" "group"
, toAttr "from" (pack $ show (fst $ g ^. phylo_groupPeriod))
, toAttr "to" (pack $ show (snd $ g ^. phylo_groupPeriod))
, toAttr "branchId" (pack $ unwords (init $ map show $ snd $ g ^. phylo_groupBranchId))
, toAttr "support" (pack $ show (g ^. phylo_groupSupport))])
toDotEdge :: DotId -> DotId -> Text.Text -> EdgeType -> Dot DotId
toDotEdge source target lbl edgeType = edge source target
(case edgeType of
GroupToGroup -> [ Width 10, Color [toWColor Black], Constraint True
, Label (StrLabel $ fromStrict lbl)]
BranchToGroup -> [ Width 3, Color [toWColor Black], ArrowHead (AType [(ArrMod FilledArrow RightSide,DotArrow)])
, Label (StrLabel $ fromStrict lbl)]
BranchToBranch -> [ Width 2, Color [toWColor Black], Style [SItem Dashed []], ArrowHead (AType [(ArrMod FilledArrow BothSides,DotArrow)])
, Label (StrLabel $ fromStrict lbl)]
PeriodToPeriod -> [ Width 5, Color [toWColor Black]])
mergePointers :: [PhyloGroup] -> Map (PhyloGroupId,PhyloGroupId) Double
mergePointers groups =
let toChilds = fromList $ concat $ map (\g -> map (\(target,w) -> ((getGroupId g,target),w)) $ g ^. phylo_groupPeriodChilds) groups
toParents = fromList $ concat $ map (\g -> map (\(target,w) -> ((target,getGroupId g),w)) $ g ^. phylo_groupPeriodParents) groups
in unionWith (\w w' -> max w w') toChilds toParents
exportToDot :: Phylo -> PhyloExport -> DotGraph DotId
exportToDot phylo export =
trace ("\n-- | Convert " <> show(length $ export ^. export_branches) <> " branches and "
<> show(length $ export ^. export_groups) <> " groups to a dot file\n") $
digraph ((Str . fromStrict) $ (phyloName $ getConfig phylo)) $ do
-- | 1) init the dot graph
graphAttrs ( [ Label (toDotLabel $ (phyloName $ getConfig phylo))]
<> [ FontSize 30, LabelLoc VTop, NodeSep 1, RankSep [1], Rank SameRank, Splines SplineEdges, Overlap ScaleOverlaps
, Ratio FillRatio
, Style [SItem Filled []],Color [toWColor White]]
-- | home made attributes
<> [(toAttr (fromStrict "nbDocs") $ pack $ show (sum $ elems $ phylo ^. phylo_timeDocs))
,(toAttr (fromStrict "proxiName") $ pack $ show (getProximityName $ phyloProximity $ getConfig phylo))
,(toAttr (fromStrict "proxiInit") $ pack $ show (getProximityInit $ phyloProximity $ getConfig phylo))
,(toAttr (fromStrict "proxiStep") $ pack $ show (getProximityStep $ phyloProximity $ getConfig phylo))
,(toAttr (fromStrict "quaFactor") $ pack $ show (_qua_relevance $ phyloQuality $ getConfig phylo))
])
-- toAttr (fromStrict k) $ (pack . unwords) $ map show v
-- | 2) create a layer for the branches labels
subgraph (Str "Branches peaks") $ do
graphAttrs [Rank SameRank]
-- | 3) group the branches by hierarchy
-- mapM (\branches ->
-- subgraph (Str "Branches clade") $ do
-- graphAttrs [Rank SameRank]
-- -- | 4) create a node for each branch
-- mapM branchToDotNode branches
-- ) $ elems $ fromListWith (++) $ map (\b -> ((init . snd) $ b ^. branch_id,[b])) $ export ^. export_branches
mapM branchToDotNode $ export ^. export_branches
-- | 5) create a layer for each period
_ <- mapM (\period ->
subgraph ((Str . fromStrict . Text.pack) $ ("Period" <> show (fst period) <> show (snd period))) $ do
graphAttrs [Rank SameRank]
periodToDotNode period
-- | 6) create a node for each group
mapM (\g -> groupToDotNode (getRoots phylo) g) (filter (\g -> g ^. phylo_groupPeriod == period) $ export ^. export_groups)
) $ getPeriodIds phylo
-- | 7) create the edges between a branch and its first groups
_ <- mapM (\(bId,groups) ->
mapM (\g -> toDotEdge (branchIdToDotId bId) (groupIdToDotId $ getGroupId g) "" BranchToGroup) groups
)
$ toList
$ map (\groups -> head' "toDot"
$ groupBy (\g g' -> g' ^. phylo_groupPeriod == g ^. phylo_groupPeriod)
$ sortOn (fst . _phylo_groupPeriod) groups)
$ fromListWith (++) $ map (\g -> (g ^. phylo_groupBranchId,[g])) $ export ^. export_groups
-- | 8) create the edges between the groups
_ <- mapM (\((k,k'),_) ->
toDotEdge (groupIdToDotId k) (groupIdToDotId k') "" GroupToGroup
) $ (toList . mergePointers) $ export ^. export_groups
-- | 7) create the edges between the periods
_ <- mapM (\(prd,prd') ->
toDotEdge (periodIdToDotId prd) (periodIdToDotId prd') "" PeriodToPeriod
) $ nubBy (\combi combi' -> fst combi == fst combi') $ listToCombi' $ getPeriodIds phylo
-- | 8) create the edges between the branches
_ <- mapM (\(bId,bId') ->
toDotEdge (branchIdToDotId bId) (branchIdToDotId bId')
(Text.pack $ show(branchIdsToProximity bId bId'
(getThresholdInit $ phyloProximity $ getConfig phylo)
(getThresholdStep $ phyloProximity $ getConfig phylo))) BranchToBranch
) $ nubBy (\combi combi' -> fst combi == fst combi') $ listToCombi' $ map _branch_id $ export ^. export_branches
graphAttrs [Rank SameRank]
----------------
-- | Filter | --
----------------
filterByBranchSize :: Double -> PhyloExport -> PhyloExport
filterByBranchSize thr export =
let branches' = partition (\b -> head' "filter" ((b ^. branch_meta) ! "size") >= thr) $ export ^. export_branches
in export & export_branches .~ (fst branches')
& export_groups %~ (filter (\g -> not $ elem (g ^. phylo_groupBranchId) (map _branch_id $ snd branches')))
processFilters :: [Filter] -> Quality -> PhyloExport -> PhyloExport
processFilters filters qua export =
foldl (\export' f -> case f of
ByBranchSize thr -> if (thr < (fromIntegral $ qua ^. qua_minBranch))
then filterByBranchSize (fromIntegral $ qua ^. qua_minBranch) export'
else filterByBranchSize thr export'
) export filters
--------------
-- | Sort | --
--------------
sortByHierarchy :: Int -> [PhyloBranch] -> [PhyloBranch]
sortByHierarchy depth branches =
if (length branches == 1)
then branches
else concat
$ map (\branches' ->
let partitions = partition (\b -> depth + 1 == ((length . snd) $ b ^. branch_id)) branches'
in (sortOn (\b -> (b ^. branch_meta) ! "birth") (fst partitions))
++ (sortByHierarchy (depth + 1) (snd partitions)))
$ groupBy (\b b' -> ((take depth . snd) $ b ^. branch_id) == ((take depth . snd) $ b' ^. branch_id) )
$ sortOn (\b -> (take depth . snd) $ b ^. branch_id) branches
sortByBirthDate :: Order -> PhyloExport -> PhyloExport
sortByBirthDate order export =
let branches = sortOn (\b -> (b ^. branch_meta) ! "birth") $ export ^. export_branches
branches' = case order of
Asc -> branches
Desc -> reverse branches
in export & export_branches .~ branches'
processSort :: Sort -> PhyloExport -> PhyloExport
processSort sort' export = case sort' of
ByBirthDate o -> sortByBirthDate o export
ByHierarchy -> export & export_branches .~ sortByHierarchy 0 (export ^. export_branches)
-----------------
-- | Metrics | --
-----------------
-- | Return the conditional probability of i knowing j
conditional :: Ord a => Map (a,a) Double -> a -> a -> Double
conditional m i j = (findWithDefault 0 (i,j) m)
/ (m ! (j,j))
-- | Return the genericity score of a given ngram
genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double
genericity m l i = ( (sum $ map (\j -> conditional m i j) l)
- (sum $ map (\j -> conditional m j i) l)) / (fromIntegral $ (length l) + 1)
-- | Return the specificity score of a given ngram
specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double
specificity m l i = ( (sum $ map (\j -> conditional m j i) l)
- (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
-- | Return the inclusion score of a given ngram
inclusion :: Map (Int, Int) Double -> [Int] -> Int -> Double
inclusion m l i = ( (sum $ map (\j -> conditional m j i) l)
+ (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
ngramsMetrics :: PhyloExport -> PhyloExport
ngramsMetrics export =
over ( export_groups
. traverse )
(\g -> g & phylo_groupMeta %~ insert "genericity"
(map (\n -> genericity (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
& phylo_groupMeta %~ insert "specificity"
(map (\n -> specificity (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
& phylo_groupMeta %~ insert "inclusion"
(map (\n -> inclusion (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
) export
branchDating :: PhyloExport -> PhyloExport
branchDating export =
over ( export_branches
. traverse )
(\b ->
let groups = sortOn fst
$ foldl' (\acc g -> if (g ^. phylo_groupBranchId == b ^. branch_id)
then acc ++ [g ^. phylo_groupPeriod]
else acc ) [] $ export ^. export_groups
birth = fst $ head' "birth" groups
age = (snd $ last' "age" groups) - birth
in b & branch_meta %~ insert "birth" [fromIntegral birth]
& branch_meta %~ insert "age" [fromIntegral age]
& branch_meta %~ insert "size" [fromIntegral $ length groups] ) export
processMetrics :: PhyloExport -> PhyloExport
processMetrics export = ngramsMetrics
$ branchDating export
-----------------
-- | Taggers | --
-----------------
getNthMostMeta :: Int -> [Double] -> [Int] -> [Int]
getNthMostMeta nth meta ns = map (\(idx,_) -> (ns !! idx))
$ take nth
$ reverse
$ sortOn snd $ zip [0..] meta
mostInclusive :: Int -> Vector Ngrams -> PhyloExport -> PhyloExport
mostInclusive nth foundations export =
over ( export_branches
. traverse )
(\b ->
let groups = filter (\g -> g ^. phylo_groupBranchId == b ^. branch_id) $ export ^. export_groups
cooc = foldl (\acc g -> unionWith (+) acc (g ^. phylo_groupCooc)) empty groups
ngrams = sort $ foldl (\acc g -> union acc (g ^. phylo_groupNgrams)) [] groups
inc = map (\n -> inclusion cooc (ngrams \\ [n]) n) ngrams
lbl = ngramsToLabel foundations $ getNthMostMeta nth inc ngrams
in b & branch_label .~ lbl ) export
mostEmergentInclusive :: Int -> Vector Ngrams -> PhyloExport -> PhyloExport
mostEmergentInclusive nth foundations export =
over ( export_groups
. traverse )
(\g ->
let lbl = ngramsToLabel foundations
$ take nth
$ map (\(_,(_,idx)) -> idx)
$ concat
$ map (\groups -> sortOn (fst . snd) groups)
$ groupBy ((==) `on` fst) $ reverse $ sortOn fst
$ zip ((g ^. phylo_groupMeta) ! "inclusion")
$ zip ((g ^. phylo_groupMeta) ! "dynamics") (g ^. phylo_groupNgrams)
in g & phylo_groupLabel .~ lbl ) export
processLabels :: [PhyloLabel] -> Vector Ngrams -> PhyloExport -> PhyloExport
processLabels labels foundations export =
foldl (\export' label ->
case label of
GroupLabel tagger nth ->
case tagger of
MostEmergentInclusive -> mostEmergentInclusive nth foundations export'
_ -> panic "[ERR][Viz.Phylo.PhyloExport] unknown tagger"
BranchLabel tagger nth ->
case tagger of
MostInclusive -> mostInclusive nth foundations export'
_ -> panic "[ERR][Viz.Phylo.PhyloExport] unknown tagger" ) export labels
------------------
-- | Dynamics | --
------------------
toDynamics :: Int -> [PhyloGroup] -> PhyloGroup -> Map Int (Date,Date) -> Double
toDynamics n parents group m =
let prd = group ^. phylo_groupPeriod
end = last' "dynamics" (sort $ map snd $ elems m)
in if (((snd prd) == (snd $ m ! n)) && (snd prd /= end))
-- | decrease
then 2
else if ((fst prd) == (fst $ m ! n))
-- | recombination
then 0
else if isNew
-- | emergence
then 1
else 3
where
--------------------------------------
isNew :: Bool
isNew = not $ elem n $ concat $ map _phylo_groupNgrams parents
processDynamics :: [PhyloGroup] -> [PhyloGroup]
processDynamics groups =
map (\g ->
let parents = filter (\g' -> (g ^. phylo_groupBranchId == g' ^. phylo_groupBranchId)
&& ((fst $ g ^. phylo_groupPeriod) > (fst $ g' ^. phylo_groupPeriod))) groups
in g & phylo_groupMeta %~ insert "dynamics" (map (\n -> toDynamics n parents g mapNgrams) $ g ^. phylo_groupNgrams) ) groups
where
--------------------------------------
mapNgrams :: Map Int (Date,Date)
mapNgrams = map (\dates ->
let dates' = sort dates
in (head' "dynamics" dates', last' "dynamics" dates'))
$ fromListWith (++)
$ foldl (\acc g -> acc ++ ( map (\n -> (n,[fst $ g ^. phylo_groupPeriod, snd $ g ^. phylo_groupPeriod]))
$ (g ^. phylo_groupNgrams))) [] groups
---------------------
-- | phyloExport | --
---------------------
toPhyloExport :: Phylo -> DotGraph DotId
toPhyloExport phylo = exportToDot phylo
$ processFilters (exportFilter $ getConfig phylo) (phyloQuality $ getConfig phylo)
$ processSort (exportSort $ getConfig phylo)
$ processLabels (exportLabel $ getConfig phylo) (getRoots phylo)
$ processMetrics export
where
export :: PhyloExport
export = PhyloExport groups branches
--------------------------------------
branches :: [PhyloBranch]
branches = traceExportBranches $ map (\bId -> PhyloBranch bId "" empty) $ nub $ map _phylo_groupBranchId groups
--------------------------------------
groups :: [PhyloGroup]
groups = processDynamics
$ getGroupsFromLevel (phyloLevel $ getConfig phylo) phylo
traceExportBranches :: [PhyloBranch] -> [PhyloBranch]
traceExportBranches branches = trace ("\n" <> "-- | Export " <> show(length branches) <> " branches") branches
......@@ -16,13 +16,15 @@ Portability : POSIX
module Gargantext.Viz.Phylo.PhyloMaker where
import Data.List (concat, nub, partition, sort, (++))
import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), filterWithKey, restrictKeys)
import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys)
import Data.Set (size)
import Data.Vector (Vector)
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.Phylo.TemporalMatching (temporalMatching)
import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
import Gargantext.Text.Context (TermList)
import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
......@@ -41,7 +43,10 @@ import qualified Data.Set as Set
toPhylo :: [Document] -> TermList -> Config -> Phylo
toPhylo docs lst conf = phylo1
toPhylo docs lst conf = traceToPhylo (phyloLevel conf) $
if (phyloLevel conf) > 1
then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
else phylo1
where
--------------------------------------
phylo1 :: Phylo
......@@ -82,16 +87,18 @@ appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ co
fisToGroup :: PhyloFis -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
fisToGroup fis pId lvl idx fdt coocs =
let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloFis_clique) fdt
in PhyloGroup pId lvl idx
in PhyloGroup pId lvl idx ""
(fis ^. phyloFis_support)
ngrams
(ngramsToCooc ngrams coocs)
(1,[])
[] [] [] [] []
(1,[0])
empty
[] [] [] []
toPhylo1 :: [Document] -> Phylo -> Phylo
toPhylo1 docs phyloBase = appendGroups fisToGroup 1 phyloFis phyloBase
toPhylo1 docs phyloBase = temporalMatching
$ appendGroups fisToGroup 1 phyloFis phyloBase
where
--------------------------------------
phyloFis :: Map (Date,Date) [PhyloFis]
......@@ -164,14 +171,6 @@ toPhyloFis phyloDocs support clique = traceFis "Filtered Fis"
--------------------
-- | To build the local cooc matrix of each phylogroup
ngramsToCooc :: [Int] -> [Cooc] -> Cooc
ngramsToCooc ngrams coocs =
let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
pairs = listToKeys ngrams
in filterWithKey (\k _ -> elem k pairs) cooc
-- | To transform the docs into a time map of coocurency matrix
docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
docsToTimeScaleCooc docs fdt =
......@@ -232,4 +231,4 @@ toPhyloBase docs lst conf =
(docsToTimeScaleCooc docs (foundations ^. foundations_roots))
(docsToTimeScaleNb docs)
params
(fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels (phyloLevel conf) prd))) periods)
(fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)
......@@ -17,13 +17,16 @@ Portability : POSIX
module Gargantext.Viz.Phylo.PhyloTools where
import Data.Vector (Vector, elemIndex)
import Data.List (sort, concat, null, union, (++), tails, sortOn, nub)
import Data.Set (Set, size)
import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey)
import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
import Data.Set (Set, size, disjoint)
import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
import Data.String (String)
import Data.Text (Text, unwords)
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import Text.Printf
import Debug.Trace (trace)
import Control.Lens hiding (Level)
......@@ -32,14 +35,40 @@ import qualified Data.Vector as Vector
import qualified Data.List as List
import qualified Data.Set as Set
------------
-- | Io | --
------------
-- | To print an important message as an IO()
printIOMsg :: String -> IO ()
printIOMsg msg =
putStrLn ( "\n"
<> "------------"
<> "\n"
<> "-- | " <> msg <> "\n" )
-- | To print a comment as an IO()
printIOComment :: String -> IO ()
printIOComment cmt =
putStrLn ( "\n" <> cmt <> "\n" )
--------------
-- | Misc | --
--------------
roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
roundToStr = printf "%0.*f"
countSup :: Double -> [Double] -> Int
countSup s l = length $ filter (>s) l
dropByIdx :: Int -> [a] -> [a]
dropByIdx k l = take k l ++ drop (k+1) l
elemIndex' :: Eq a => a -> [a] -> Int
elemIndex' e l = case (List.elemIndex e l) of
......@@ -60,6 +89,15 @@ isRoots n ns = Vector.elem n ns
ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
-- | To transform a list of Ngrams Indexes into a Label
ngramsToLabel :: Vector Ngrams -> [Int] -> Text
ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
-- | To transform a list of Ngrams Indexes into a list of Text
ngramsToText :: Vector Ngrams -> [Int] -> [Text]
ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
--------------
-- | Time | --
......@@ -168,7 +206,6 @@ getFisSize unit = case unit of
-- | Cooc | --
--------------
listToCombi' :: [a] -> [(a,a)]
listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
......@@ -181,12 +218,24 @@ listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
listToMatrix :: [Int] -> Map (Int,Int) Double
listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
listToSeq :: Eq a => [a] -> [(a,a)]
listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
sumCooc :: Cooc -> Cooc -> Cooc
sumCooc cooc cooc' = unionWith (+) cooc cooc'
getTrace :: Cooc -> Double
getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
-- | To build the local cooc matrix of each phylogroup
ngramsToCooc :: [Int] -> [Cooc] -> Cooc
ngramsToCooc ngrams coocs =
let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
pairs = listToKeys ngrams
in filterWithKey (\k _ -> elem k pairs) cooc
--------------------
-- | PhyloGroup | --
--------------------
......@@ -194,6 +243,40 @@ getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
getGroupId :: PhyloGroup -> PhyloGroupId
getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
getPeriodPointers fil group =
case fil of
ToChilds -> group ^. phylo_groupPeriodChilds
ToParents -> group ^. phylo_groupPeriodParents
filterProximity :: Proximity -> Double -> Double -> Bool
filterProximity proximity thr local =
case proximity of
WeightedLogJaccard _ _ _ -> local >= thr
Hamming -> undefined
getProximityName :: Proximity -> String
getProximityName proximity =
case proximity of
WeightedLogJaccard _ _ _ -> "WLJaccard"
Hamming -> "Hamming"
getProximityInit :: Proximity -> Double
getProximityInit proximity =
case proximity of
WeightedLogJaccard _ i _ -> i
Hamming -> undefined
getProximityStep :: Proximity -> Double
getProximityStep proximity =
case proximity of
WeightedLogJaccard _ _ s -> s
Hamming -> undefined
---------------
-- | Phylo | --
---------------
......@@ -202,11 +285,11 @@ addPointers :: PhyloGroup -> Filiation -> PointerType -> [Pointer] -> PhyloGroup
addPointers group fil pty pointers =
case pty of
TemporalPointer -> case fil of
ToChilds -> group & phylo_groupPeriodChilds %~ (++ pointers)
ToParents -> group & phylo_groupPeriodParents %~ (++ pointers)
ToChilds -> group & phylo_groupPeriodChilds .~ pointers
ToParents -> group & phylo_groupPeriodParents .~ pointers
LevelPointer -> case fil of
ToChilds -> group & phylo_groupLevelChilds %~ (++ pointers)
ToParents -> group & phylo_groupLevelParents %~ (++ pointers)
ToChilds -> group & phylo_groupLevelChilds .~ pointers
ToParents -> group & phylo_groupLevelParents .~ pointers
getPeriodIds :: Phylo -> [(Date,Date)]
......@@ -214,6 +297,19 @@ getPeriodIds phylo = sortOn fst
$ keys
$ phylo ^. phylo_periods
getLevelParentId :: PhyloGroup -> PhyloGroupId
getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
getLastLevel :: Phylo -> Level
getLastLevel phylo = last' "lastLevel" $ getLevels phylo
getLevels :: Phylo -> [Level]
getLevels phylo = nub
$ map snd
$ keys $ view ( phylo_periods
. traverse
. phylo_periodLevels ) phylo
getConfig :: Phylo -> Config
getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
......@@ -222,6 +318,11 @@ getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
getRoots :: Phylo -> Vector Ngrams
getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
phyloToLastBranches :: Phylo -> [[PhyloGroup]]
phyloToLastBranches phylo = elems
$ fromListWith (++)
$ map (\g -> (g ^. phylo_groupBranchId, [g]))
$ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
getGroupsFromLevel lvl phylo =
......@@ -250,13 +351,38 @@ updatePhyloGroups lvl m phylo =
else group ) phylo
------------------
-- | Pointers | --
------------------
traceToPhylo :: Level -> Phylo -> Phylo
traceToPhylo lvl phylo =
trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
<> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
<> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
--------------------
-- | Clustering | --
--------------------
pointersToLinks :: PhyloGroupId -> [Pointer] -> [Link]
pointersToLinks id pointers = map (\p -> ((id,fst p),snd p)) pointers
relatedComponents :: Ord a => [[a]] -> [[a]]
relatedComponents graph = foldl' (\acc groups ->
if (null acc)
then acc ++ [groups]
else
let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
traceSynchronyEnd :: Phylo -> Phylo
traceSynchronyEnd phylo =
trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
<> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
<> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
<> "\n" ) phylo
traceSynchronyStart :: Phylo -> Phylo
traceSynchronyStart phylo =
trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
<> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
<> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
<> "\n" ) phylo
-------------------
......@@ -279,9 +405,77 @@ getThresholdStep proxi = case proxi of
Hamming -> undefined
traceBranchMatching :: Proximity -> Double -> [PhyloGroup] -> [PhyloGroup]
traceBranchMatching proxi thr groups = case proxi of
WeightedLogJaccard _ i s -> trace (
roundToStr 2 thr <> " "
<> foldl (\acc _ -> acc <> ".") "." [(10*i),(10*i + 10*s)..(10*thr)]
<> " " <> show(length groups) <> " groups"
) groups
Hamming -> undefined
----------------
-- | Branch | --
----------------
intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
intersectInit acc lst lst' =
if (null lst) || (null lst')
then acc
else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
else acc
branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
ngramsInBranches :: [[PhyloGroup]] -> [Int]
ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
traceMatchSuccess thr qua qua' nextBranches =
trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
$ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
<> ",(1.." <> show (length nextBranches) <> ")]"
<> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
<> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
<> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
traceMatchFailure thr qua qua' branches =
trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
<> ",(1.." <> show (length branches) <> ")]"
<> " | " <> show (length $ concat branches) <> " groups" <> "\n"
<> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
) branches
traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
traceMatchNoSplit branches =
trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
<> ",(1.." <> show (length branches) <> ")]"
<> " | " <> show (length $ concat branches) <> " groups" <> "\n"
<> " - unable to split in smaller branches" <> "\n"
) branches
traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
traceMatchLimit branches =
trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
<> ",(1.." <> show (length branches) <> ")]"
<> " | " <> show (length $ concat branches) <> " groups" <> "\n"
<> " - unable to increase the threshold above 1" <> "\n"
) branches
traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
traceMatchEnd groups =
trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
<> " branches and " <> show (length groups) <> " groups" <> "\n") groups
traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
traceTemporalMatching groups =
trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
\ No newline at end of file
......@@ -16,22 +16,180 @@ Portability : POSIX
module Gargantext.Viz.Phylo.SynchronicClustering where
import Gargantext.Prelude
-- import Gargantext.Viz.AdaptativePhylo
-- import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.Phylo.TemporalMatching (weightedLogJaccard)
import Data.List ((++), null, intersect, nub, concat, sort, sortOn)
import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member)
import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parList, rdeepseq, using)
import qualified Data.Map as Map
-------------------------
-- | New Level Maker | --
-------------------------
toBranchId :: PhyloGroup -> PhyloBranchId
toBranchId child = ((child ^. phylo_groupLevel) + 1, snd (child ^. phylo_groupBranchId))
mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
mergeGroups coocs id mapIds childs =
let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
in PhyloGroup (fst $ fst id) (snd $ fst id) (snd id) ""
(sum $ map _phylo_groupSupport childs) ngrams
(ngramsToCooc ngrams coocs) (toBranchId (head' "mergeGroups" childs))
empty [] (map (\g -> (getGroupId g, 1)) childs)
(updatePointers $ concat $ map _phylo_groupPeriodParents childs)
(updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
where
updatePointers :: [Pointer] -> [Pointer]
updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
addPhyloLevel :: Level -> Phylo -> Phylo
addPhyloLevel lvl phylo =
over ( phylo_periods . traverse )
(\phyloPrd -> phyloPrd & phylo_periodLevels
%~ (insert (phyloPrd ^. phylo_periodPeriod, lvl) (PhyloLevel (phyloPrd ^. phylo_periodPeriod) lvl empty))) phylo
toNextLevel :: Phylo -> [PhyloGroup] -> Phylo
toNextLevel phylo groups =
let curLvl = getLastLevel phylo
oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
newGroups = fromListWith (++)
-- | 5) group the parents by periods
$ foldlWithKey (\acc id groups' ->
-- | 4) create the parent group
let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
in acc ++ [(parent ^. phylo_groupPeriod, [parent])]) []
-- | 3) group the current groups by parentId
$ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
in traceSynchronyEnd
$ over ( phylo_periods . traverse . phylo_periodLevels . traverse
-- | 6) update each period at curLvl + 1
. filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (curLvl + 1)))
-- | 7) by adding the parents
(\phyloLvl ->
if member (phyloLvl ^. phylo_levelPeriod) newGroups
then phyloLvl & phylo_levelGroups
.~ fromList (map (\g -> (getGroupId g, g)) $ newGroups ! (phyloLvl ^. phylo_levelPeriod))
else phyloLvl)
-- | 2) add the curLvl + 1 phyloLevel to the phylo
$ addPhyloLevel (curLvl + 1)
-- | 1) update the current groups (with level parent pointers) in the phylo
$ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
import Data.List (foldl', (++), null, intersect, (\\), union, nub, concat)
--------------------
-- | Clustering | --
--------------------
relatedComponents :: Eq a => [[a]] -> [[a]]
relatedComponents graphs = foldl' (\mem groups ->
if (null mem)
then mem ++ [groups]
else
let related = filter (\groups' -> (not . null) $ intersect groups groups') mem
in if (null related)
then mem ++ [groups]
else (mem \\ related) ++ [union groups (nub $ concat related)] ) [] graphs
\ No newline at end of file
toPairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
toPairs groups = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))
$ listToCombi' groups
toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
toDiamonds groups = foldl' (\acc groups' ->
acc ++ ( elems
$ Map.filter (\v -> length v > 1)
$ fromListWith (++)
$ foldl' (\acc' g ->
acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
$ elems
$ Map.filter (\v -> length v > 1)
$ fromListWith (++)
$ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
groupsToEdges :: Proximity -> Synchrony -> Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
groupsToEdges prox sync docs groups =
case sync of
ByProximityThreshold t s -> filter (\(_,w) -> w >= t)
$ toEdges s
$ toPairs groups
ByProximityDistribution s ->
let diamonds = sortOn snd
$ toEdges s $ concat
$ map toPairs $ toDiamonds groups
in take (div (length diamonds) 2) diamonds
where
toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
toEdges sens edges =
case prox of
WeightedLogJaccard _ _ _ -> map (\(g,g') ->
((g,g'), weightedLogJaccard sens docs
(g ^. phylo_groupCooc) (g' ^. phylo_groupCooc)
(g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
_ -> undefined
toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
toRelatedComponents nodes edges =
let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
toParentId :: PhyloGroup -> PhyloGroupId
toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupLevel + 1), child ^. phylo_groupIndex)
reduceBranch :: Proximity -> Synchrony -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
reduceBranch prox sync docs branch =
-- | 1) reduce a branch as a set of periods & groups
let periods = fromListWith (++)
$ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
in (concat . concat . elems)
$ mapWithKey (\prd groups ->
-- | 2) for each period, transform the groups as a proximity graph filtered by a threshold
let edges = groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) groups
in map (\comp ->
-- | 4) add to each groups their futur level parent group
let parentId = toParentId (head' "parentId" comp)
in map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId,1)]) ) comp )
-- |3) reduce the graph a a set of related components
$ toRelatedComponents groups edges) periods
synchronicClustering :: Phylo -> Phylo
synchronicClustering phylo =
let prox = phyloProximity $ getConfig phylo
sync = phyloSynchrony $ getConfig phylo
docs = phylo ^. phylo_timeDocs
branches = map (\branch -> reduceBranch prox sync docs branch)
$ phyloToLastBranches
$ traceSynchronyStart phylo
branches' = branches `using` parList rdeepseq
in toNextLevel phylo $ concat branches'
----------------
-- | probes | --
----------------
-- synchronicDistance :: Phylo -> Level -> String
-- synchronicDistance phylo lvl =
-- foldl' (\acc branch ->
-- acc <> (foldl' (\acc' period ->
-- acc' <> let prox = phyloProximity $ getConfig phylo
-- sync = phyloSynchrony $ getConfig phylo
-- docs = _phylo_timeDocs phylo
-- prd = _phylo_groupPeriod $ head' "distance" period
-- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
-- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
-- in foldl' (\mem (_,w) ->
-- mem <> show (prd)
-- <> "\t"
-- <> show (w)
-- <> "\n"
-- ) "" edges
-- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
-- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo
\ No newline at end of file
......@@ -15,19 +15,21 @@ Portability : POSIX
module Gargantext.Viz.Phylo.TemporalMatching where
import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, nub, union)
import Data.Map (Map, fromList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith, findWithDefault)
import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile, partition, delete)
import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, keys, (!), filterWithKey)
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.Phylo.SynchronicClustering
import Prelude (logBase)
-- import Prelude (logBase)
import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parList, rdeepseq, using)
import Debug.Trace (trace)
import qualified Data.Set as Set
-------------------
-- | Proximity | --
-------------------
......@@ -65,10 +67,11 @@ weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
ngramsUnion = union ngrams ngrams'
--------------------------------------
coocInter :: [Double]
coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
-- coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
--------------------------------------
coocUnion :: [Double]
coocUnion = elems $ map (/docs) $ unionWith (+) cooc cooc'
coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
--------------------------------------
......@@ -79,13 +82,6 @@ pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
Hamming -> undefined
filterProximity :: Proximity -> Double -> Double -> Bool
filterProximity proximity thr local =
case proximity of
WeightedLogJaccard _ _ _ -> local >= thr
Hamming -> undefined
-- | To process the proximity between a current group and a pair of targets group
toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
toProximity docs proximity ego target target' =
......@@ -103,50 +99,88 @@ toProximity docs proximity ego target target' =
-- | Local Matching | --
------------------------
toLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
toLastPeriod fil periods = case fil of
ToParents -> head' "toLastPeriod" (sortOn fst periods)
ToChilds -> last' "toLastPeriod" (sortOn fst periods)
toLazyPairs :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)]
toLazyPairs pointers fil thr prox prd pairs =
if null pointers then pairs
else let rest = filterPointers prox thr pointers
in if null rest
then let prd' = toLastPeriod fil (map (fst . fst . fst) pointers)
in if prd' == prd
then []
else filter (\(g,g') ->
case fil of
ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst prd'))
|| ((fst $ g' ^. phylo_groupPeriod) < (fst prd'))
ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst prd'))
|| ((fst $ g' ^. phylo_groupPeriod) > (fst prd'))) pairs
else []
-- | Find pairs of valuable candidates to be matched
makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> [(PhyloGroup,PhyloGroup)]
makePairs candidates periods = case null periods of
makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> [(PhyloGroup,PhyloGroup)]
makePairs' ego candidates periods pointers fil thr prox docs =
case null periods of
True -> []
False -> toLazyPairs pointers fil thr prox lastPrd
-- | at least on of the pair candidates should be from the last added period
False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
|| (inLastPeriod cdt' periods))
$ listToKeys candidates
$ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd)
|| ((g' ^. phylo_groupPeriod) == lastPrd))
$ listToKeys
$ filter (\g -> (g ^. phylo_groupPeriod == lastPrd)
|| ((toProximity docs prox ego ego g) >= thr)) candidates
where
inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
lastPrd :: PhyloPeriodId
lastPrd = toLastPeriod fil periods
phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double-> PhyloGroup -> PhyloGroup
phyloGroupMatching candidates fil proxi docs thr ego = case pointers of
Nothing -> addPointers ego fil TemporalPointer []
Just pts -> addPointers ego fil TemporalPointer
filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
phyloGroupMatching candidates fil proxi docs thr ego =
case null nextPointers of
-- | let's find new pointers
True -> if null $ filterPointers proxi thr $ getPeriodPointers fil ego
then addPointers ego fil TemporalPointer []
-- | or keep the old ones
else addPointers ego fil TemporalPointer
$ filterPointers proxi thr $ getPeriodPointers fil ego
False -> addPointers ego fil TemporalPointer
$ head' "phyloGroupMatching"
-- | Keep only the best set of pointers grouped by proximity
$ groupBy (\pt pt' -> snd pt == snd pt')
$ reverse $ sortOn snd pts
$ reverse $ sortOn snd $ head' "pointers"
$ nextPointers
-- | Find the first time frame where at leats one pointer satisfies the proximity threshold
where
pointers :: Maybe [Pointer]
pointers = find (not . null)
nextPointers :: [[Pointer]]
nextPointers = take 1
$ dropWhile (null)
-- | for each time frame, process the proximity on relevant pairs of targeted groups
$ scanl (\acc groups ->
let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups
pairs = makePairs (concat groups) periods
in acc ++ ( filter (\(_,proximity) -> filterProximity proxi thr proximity)
let periods = nub $ map _phylo_groupPeriod $ concat groups
docs' = (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods))
pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs
in acc ++ ( filterPointers proxi thr
$ concat
$ map (\(c,c') ->
-- | process the proximity between the current group and a pair of candidates
let proximity = toProximity (filterDocs docs periods) proxi ego c c'
let proximity = toProximity docs' proxi ego c c'
in if (c == c')
then [(getGroupId c,proximity)]
else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
) []
-- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
$ inits candidates
--------------------------------------
filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
filterDocs d pds = restrictKeys d $ periodsToYears pds
else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) []
$ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
filterDocs d pds = restrictKeys d $ periodsToYears pds
-----------------------------
......@@ -161,30 +195,40 @@ getNextPeriods fil max' pId pIds =
ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
getCandidates :: Filiation -> PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [[PhyloGroup]]
getCandidates fil ego pIds targets =
getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
getCandidates fil ego targets =
case fil of
ToChilds -> targets'
ToParents -> reverse targets'
where
targets' :: [[PhyloGroup]]
targets' = map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) groups') $ elems
$ filterWithKey (\k _ -> elem k pIds)
$ fromListWith (++)
$ sortOn (fst . fst)
$ map (\g' -> (g' ^. phylo_groupPeriod,[g'])) targets
processMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
processMatching max' periods proximity thr docs groups =
map (\group ->
let childs = getCandidates ToChilds group
(getNextPeriods ToChilds max' (group ^. phylo_groupPeriod) periods) groups
parents = getCandidates ToParents group
(getNextPeriods ToParents max' (group ^. phylo_groupPeriod) periods) groups
in phyloGroupMatching parents ToParents proximity docs thr
$ phyloGroupMatching childs ToChilds proximity docs thr group
) groups
targets' =
map (\groups' ->
filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
) groups') targets
phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
-- $ matchByPeriods ToParents
-- $ groupByField _phylo_groupPeriod
$ matchByPeriods
$ groupByField _phylo_groupPeriod branch
where
--------------------------------------
matchByPeriods :: Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
matchByPeriods branch' = foldl' (\acc prd ->
let periodsPar = getNextPeriods ToParents frame prd periods
periodsChi = getNextPeriods ToChilds frame prd periods
candidatesPar = map (\prd' -> findWithDefault [] prd' branch') periodsPar
candidatesChi = map (\prd' -> findWithDefault [] prd' branch') periodsChi
docsPar = filterDocs docs ([prd] ++ periodsPar)
docsChi = filterDocs docs ([prd] ++ periodsChi)
egos = map (\ego -> phyloGroupMatching (getCandidates ToParents ego candidatesPar) ToParents proximity docsPar thr
$ phyloGroupMatching (getCandidates ToChilds ego candidatesChi) ToChilds proximity docsChi thr ego)
$ findWithDefault [] prd branch'
egos' = egos `using` parList rdeepseq
in acc ++ egos' ) [] periods
-----------------------
......@@ -192,35 +236,80 @@ processMatching max' periods proximity thr docs groups =
-----------------------
termFreq :: Int -> [[PhyloGroup]] -> Double
termFreq term branches = (sum $ map (\g -> findWithDefault 0 (term,term) (g ^. phylo_groupCooc)) $ concat branches)
/ (sum $ map (\g -> getTrace $ g ^. phylo_groupCooc) $ concat branches)
count :: Eq a => a -> [a] -> Int
count x = length . filter (== x)
termFreq' :: Int -> [PhyloGroup] -> Double
termFreq' term groups =
let ngrams = concat $ map _phylo_groupNgrams groups
in log ((fromIntegral $ count term ngrams)
/ (fromIntegral $ length ngrams))
entropy :: [[PhyloGroup]] -> Double
entropy branches =
let terms = ngramsInBranches branches
in sum $ map (\term -> (1 / log (termFreq term branches))
/ (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches)
* (sum $ map (\branch ->
let q = branchObs term (length $ concat branches) branch
in q * logBase 2 q ) branches) ) terms
where
-- | Probability to observe a branch given a random term of the phylo
branchObs :: Int -> Int -> [PhyloGroup] -> Double
branchObs term total branch = (fromIntegral $ length $ filter (\g -> elem term $ g ^. phylo_groupNgrams) branch)
/ (fromIntegral total)
relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
relevantBranches term branches =
filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
branchCov' :: [PhyloGroup] -> [[PhyloGroup]] -> Double
branchCov' branch branches =
(fromIntegral $ length branch) / (fromIntegral $ length $ concat branches)
homogeneity :: [[PhyloGroup]] -> Double
homogeneity _ = undefined
-- where
-- branchCov :: [PhyloGroup] -> Int -> Double
-- branchCov branch total = (fromIntegral $ length branch) / (fromIntegral total)
toRecall :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
toRecall freq term border branches =
-- | given a random term in a phylo
freq
-- | for each relevant branches
* (sum $ map (\branch ->
-- | given its local coverage
((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
-- | compute the local recall
* ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
/ ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) $ concat branches')
-- | with a ponderation from border branches
+ (fromIntegral border)) )) branches')
where
branches' :: [[PhyloGroup]]
branches' = relevantBranches term branches
toAccuracy :: Double -> Int -> [[PhyloGroup]] -> Double
toAccuracy freq term branches =
if (null branches)
then 0
else
-- | given a random term in a phylo
freq
-- | for each relevant branches
* (sum $ map (\branch ->
-- | given its local coverage
((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
-- | compute the local accuracy
* ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
/ (fromIntegral $ length branch))) branches')
where
branches' :: [[PhyloGroup]]
branches' = relevantBranches term branches
toPhyloQuality :: Double -> Map Int Double -> Int -> Double -> [[PhyloGroup]] -> Double
toPhyloQuality beta frequency border oldAcc branches =
-- trace (" rec : " <> show(recall)) $
-- trace (" acc : " <> show(accuracy)) $
if (null branches)
then 0
else ((1 + beta ** 2) * accuracy * recall)
/ (((beta ** 2) * accuracy + recall))
where
-- | for each term compute the global accuracy
accuracy :: Double
accuracy = oldAcc + (sum $ map (\term -> toAccuracy (frequency ! term) term branches) $ keys frequency)
-- | for each term compute the global recall
recall :: Double
recall = sum $ map (\term -> toRecall (frequency ! term) term border branches) $ keys frequency
toPhyloQuality :: [[PhyloGroup]] -> Double
toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
toBorderAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
toBorderAccuracy freq branches = sum $ map (\t -> toAccuracy (freq ! t) t branches) $ keys freq
-----------------------------
......@@ -231,54 +320,102 @@ toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
groupsToBranches groups =
-- | run the related component algorithm
let graph = zip [1..]
$ relatedComponents
let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
$ sortOn (\gs -> fst $ fst $ head' "egos" gs)
$ map (\group -> [getGroupId group]
++ (map fst $ group ^. phylo_groupPeriodParents)
++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
-- | first find the related components by inside each ego's period
graph' = map relatedComponents egos
-- | then run it for the all the periods
graph = zip [1..]
$ relatedComponents $ concat (graph' `using` parList rdeepseq)
-- | update each group's branch id
in map (\(bId,ids) ->
map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
$ elems $ restrictKeys groups (Set.fromList ids)
) graph
recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [PhyloGroup] -> [PhyloGroup]
recursiveMatching proximity thr max' periods docs quality groups =
case quality < quality' of
-- | success : we localy improve the quality of the branch, let's go deeper
True -> concat
$ map (\branch ->
recursiveMatching proximity (thr + (getThresholdStep proximity)) max' periods docs quality' branch
) branches
-- | failure : last step was the local maximum, let's validate it
False -> groups
in groups' `using` parList rdeepseq ) graph
reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
reduceFrequency frequency branches =
restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
alterBorder :: Int -> [[PhyloGroup]] -> [PhyloGroup] -> Int
alterBorder border branches branch = border + (length $ concat branches) - (length branch)
recursiveMatching :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> Int -> Double -> [PhyloGroup] -> [PhyloGroup]
recursiveMatching proximity beta minBranch frequency egoThr frame periods docs quality border oldAcc groups =
if ((egoThr >= 1) || (quality > quality') || ((length $ concat $ snd branches') == (length groups)))
then
trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality') <> "\n"
<> " |✓ " <> show(length $ fst branches') <> show(map length $ fst branches')
<> " |✗ " <> show(length $ snd branches') <> "[" <> show(length $ concat $ snd branches') <> "]") $
groups
else
let next = map (\b -> recursiveMatching proximity beta minBranch
(reduceFrequency frequency (fst branches'))
(egoThr + (getThresholdStep proximity))
frame periods docs quality'
(alterBorder border (fst branches') b)
(oldAcc + (toBorderAccuracy frequency (delete b ((fst branches') ++ (snd branches')))))
b ) (fst branches')
in trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality') <> "\n"
<> " |✓ " <> show(length $ fst branches') <> show(map length $ fst branches')
<> " |✗ " <> show(length $ snd branches') <> "[" <> show(length $ concat $ snd branches') <> "]") $
concat (next ++ (snd branches'))
where
-- | 3) process a quality score on the local set of branches
-- | 2) for each of the possible next branches process the phyloQuality score
quality' :: Double
quality' = toPhyloQuality branches
-- | 2) group the new groups into branches
branches :: [[PhyloGroup]]
branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
-- | 1) process a temporal matching for each group
groups' :: [PhyloGroup]
groups' = processMatching max' periods proximity thr docs groups
quality' = toPhyloQuality beta frequency border oldAcc ((fst branches') ++ (snd branches'))
-- | 1) for each local branch process a temporal matching then find the resulting branches
branches' :: ([[PhyloGroup]],[[PhyloGroup]])
branches' =
let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
$ phyloBranchMatching frame periods proximity egoThr docs groups
in partition (\b -> length b >= minBranch) (branches `using` parList rdeepseq)
temporalMatching :: Phylo -> Phylo
temporalMatching phylo = updatePhyloGroups 1 branches phylo
temporalMatching phylo = updatePhyloGroups 1 branches' phylo
where
-- | 2) run the recursive matching to find the best repartition among branches
branches :: Map PhyloGroupId PhyloGroup
branches = fromList
$ map (\g -> (getGroupId g, g))
$ recursiveMatching (phyloProximity $ getConfig phylo)
(getThresholdInit $ phyloProximity $ getConfig phylo)
-- | 5) apply the recursive matching
branches' :: Map PhyloGroupId PhyloGroup
branches' =
let next = trace (" ✓ F(β) = " <> show(quality)
<> " |✓ " <> show(length $ fst branches) <> show(map length $ fst branches)
<> " |✗ " <> show(length $ snd branches) <> "[" <> show(length $ concat $ snd branches) <> "]")
$ map (\branch -> recursiveMatching (phyloProximity $ getConfig phylo)
(_qua_relevance $ phyloQuality $ getConfig phylo)
(_qua_minBranch $ phyloQuality $ getConfig phylo)
(reduceFrequency frequency (fst branches))
( (getThresholdInit $ phyloProximity $ getConfig phylo)
+ (getThresholdStep $ phyloProximity $ getConfig phylo))
(getTimeFrame $ timeUnit $ getConfig phylo)
(getPeriodIds phylo)
(phylo ^. phylo_timeDocs) (toPhyloQuality [groups']) groups'
(phylo ^. phylo_timeDocs) quality (alterBorder 0 (fst branches) branch)
(toBorderAccuracy frequency (delete branch ((fst branches) ++ (snd branches))))
branch
) (fst branches)
in fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd (concat (next ++ (snd branches)))
-- | 4) process the quality score
quality :: Double
quality = toPhyloQuality (_qua_relevance $ phyloQuality $ getConfig phylo) frequency 0 0 ((fst branches) ++ (snd branches))
-- | 3) process the constants of the quality score
frequency :: Map Int Double
frequency =
let terms = ngramsInBranches ((fst branches) ++ (snd branches))
freqs = map (\t -> termFreq' t $ concat ((fst branches) ++ (snd branches))) terms
in fromList $ map (\(t,freq) -> (t,freq/(sum freqs))) $ zip terms freqs
-- | 2) group into branches
branches :: ([[PhyloGroup]],[[PhyloGroup]])
branches = partition (\b -> length b >= (_qua_minBranch $ phyloQuality $ getConfig phylo))
$ groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
-- | 1) for each group process an initial temporal Matching
groups' :: [PhyloGroup]
groups' = processMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
groups' = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
(phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
(phylo ^. phylo_timeDocs) (getGroupsFromLevel 1 phylo)
(phylo ^. phylo_timeDocs)
(traceTemporalMatching $ getGroupsFromLevel 1 phylo)
......@@ -815,7 +815,7 @@ getProximity cluster = case cluster of
-- | To initialize all the Cluster / Proximity with their default parameters
initFis :: Maybe Bool -> Maybe Support -> Maybe Int -> FisParams
initFis (def True -> kmf) (def 2 -> min') (def 4 -> thr) = FisParams kmf min' thr
initFis (def True -> kmf) (def 0 -> min') (def 0 -> thr) = FisParams kmf min' thr
initHamming :: Maybe Double -> HammingParams
initHamming (def 0.01 -> sens) = HammingParams sens
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment