Commit eeeb82c8 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch 'dev-phylo' into dev-merge

parents 76e11752 077bf19a
...@@ -26,7 +26,7 @@ import Data.ByteString.Lazy (ByteString) ...@@ -26,7 +26,7 @@ import Data.ByteString.Lazy (ByteString)
import Data.Maybe (isJust, fromJust) import Data.Maybe (isJust, fromJust)
import Data.List (concat, nub, isSuffixOf, take) import Data.List (concat, nub, isSuffixOf, take)
import Data.String (String) import Data.String (String)
import Data.Text (Text, unwords) import Data.Text (Text, unwords, unpack)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..)) import Gargantext.Database.Types.Node (HyperdataDocument(..))
...@@ -37,6 +37,9 @@ import Gargantext.Text.List.CSV (csvGraphTermList) ...@@ -37,6 +37,9 @@ import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList) import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Viz.AdaptativePhylo import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloMaker (toPhylo) import Gargantext.Viz.Phylo.PhyloMaker (toPhylo)
import Gargantext.Viz.Phylo.PhyloTools (printIOMsg, printIOComment)
import Gargantext.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
-- import Gargantext.Viz.Phylo.SynchronicClustering (synchronicDistance')
import GHC.IO (FilePath) import GHC.IO (FilePath)
import Prelude (Either(..)) import Prelude (Either(..))
...@@ -54,21 +57,6 @@ import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv ...@@ -54,21 +57,6 @@ import qualified Gargantext.Text.Corpus.Parsers.CSV as Csv
--------------- ---------------
-- | To print an important message as an IO()
printIOMsg :: String -> IO ()
printIOMsg msg =
putStrLn ( "\n"
<> "------------"
<> "\n"
<> "-- | " <> msg <> "\n" )
-- | To print a comment as an IO()
printIOComment :: String -> IO ()
printIOComment cmt =
putStrLn ( "\n" <> cmt <> "\n" )
-- | To get all the files in a directory or just a file -- | To get all the files in a directory or just a file
getFilesFromPath :: FilePath -> IO([FilePath]) getFilesFromPath :: FilePath -> IO([FilePath])
getFilesFromPath path = do getFilesFromPath path = do
...@@ -166,6 +154,23 @@ main = do ...@@ -166,6 +154,23 @@ main = do
printIOComment (show (length corpus) <> " parsed docs from the corpus") printIOComment (show (length corpus) <> " parsed docs from the corpus")
printIOMsg "Reconstruct the Phylo" printIOMsg "Reconstruct the Phylo"
let phylo = toPhylo corpus mapList config let phylo = toPhylo corpus mapList config
printIOMsg "End of reconstruction" -- | probes
\ No newline at end of file
-- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_synchronic_distance_cumu_jaccard.txt")
-- $ synchronicDistance' phylo 1
-- writeFile ((outputPath config) <> (unpack $ phyloName config) <> "_inflexion_points.txt")
-- $ inflexionPoints phylo 1
printIOMsg "End of reconstruction, start the export"
let dot = toPhyloExport phylo
let output = (outputPath config)
<> (unpack $ phyloName config)
<> "_V2.dot"
dotToFile output dot
\ No newline at end of file
...@@ -72,6 +72,9 @@ library: ...@@ -72,6 +72,9 @@ library:
- Gargantext.Viz.AdaptativePhylo - Gargantext.Viz.AdaptativePhylo
- Gargantext.Viz.Phylo.PhyloMaker - Gargantext.Viz.Phylo.PhyloMaker
- Gargantext.Viz.Phylo.Tools - Gargantext.Viz.Phylo.Tools
- Gargantext.Viz.Phylo.PhyloTools
- Gargantext.Viz.Phylo.PhyloExport
- Gargantext.Viz.Phylo.SynchronicClustering
- Gargantext.Viz.Phylo.Example - Gargantext.Viz.Phylo.Example
- Gargantext.Viz.Phylo.LevelMaker - Gargantext.Viz.Phylo.LevelMaker
- Gargantext.Viz.Phylo.View.Export - Gargantext.Viz.Phylo.View.Export
......
...@@ -44,6 +44,8 @@ import GHC.IO (FilePath) ...@@ -44,6 +44,8 @@ import GHC.IO (FilePath)
import Control.DeepSeq (NFData) import Control.DeepSeq (NFData)
import Control.Lens (makeLenses) import Control.Lens (makeLenses)
import qualified Data.Text.Lazy as TextLazy
---------------- ----------------
-- | Config | -- -- | Config | --
...@@ -65,6 +67,15 @@ data Proximity = ...@@ -65,6 +67,15 @@ data Proximity =
deriving (Show,Generic,Eq) deriving (Show,Generic,Eq)
data Synchrony =
ByProximityThreshold
{ _bpt_threshold :: Double
, _bpt_sensibility :: Double}
| ByProximityDistribution
{ _bpd_sensibility :: Double}
deriving (Show,Generic,Eq)
data TimeUnit = data TimeUnit =
Year Year
{ _year_period :: Int { _year_period :: Int
...@@ -80,6 +91,12 @@ data ContextualUnit = ...@@ -80,6 +91,12 @@ data ContextualUnit =
deriving (Show,Generic,Eq) deriving (Show,Generic,Eq)
data Quality =
Quality { _qua_relevance :: Double
, _qua_minBranch :: Int }
deriving (Show,Generic,Eq)
data Config = data Config =
Config { corpusPath :: FilePath Config { corpusPath :: FilePath
, listPath :: FilePath , listPath :: FilePath
...@@ -88,9 +105,13 @@ data Config = ...@@ -88,9 +105,13 @@ data Config =
, phyloName :: Text , phyloName :: Text
, phyloLevel :: Int , phyloLevel :: Int
, phyloProximity :: Proximity , phyloProximity :: Proximity
, phyloSynchrony :: Synchrony
, phyloQuality :: Quality
, timeUnit :: TimeUnit , timeUnit :: TimeUnit
, contextualUnit :: ContextualUnit , contextualUnit :: ContextualUnit
, branchSize :: Int , exportLabel :: [PhyloLabel]
, exportSort :: Sort
, exportFilter :: [Filter]
} deriving (Show,Generic,Eq) } deriving (Show,Generic,Eq)
...@@ -102,10 +123,14 @@ defaultConfig = ...@@ -102,10 +123,14 @@ defaultConfig =
, corpusParser = Csv 1000 , corpusParser = Csv 1000
, phyloName = pack "Default Phylo" , phyloName = pack "Default Phylo"
, phyloLevel = 2 , phyloLevel = 2
, phyloProximity = WeightedLogJaccard 10 0 0.05 , phyloProximity = WeightedLogJaccard 10 0 0.1
, phyloSynchrony = ByProximityDistribution 0
, phyloQuality = Quality 0.1 1
, timeUnit = Year 3 1 5 , timeUnit = Year 3 1 5
, contextualUnit = Fis 2 4 , contextualUnit = Fis 1 5
, branchSize = 3 , exportLabel = [BranchLabel MostInclusive 2, GroupLabel MostEmergentInclusive 2]
, exportSort = ByHierarchy
, exportFilter = [ByBranchSize 2]
} }
instance FromJSON Config instance FromJSON Config
...@@ -118,6 +143,20 @@ instance FromJSON TimeUnit ...@@ -118,6 +143,20 @@ instance FromJSON TimeUnit
instance ToJSON TimeUnit instance ToJSON TimeUnit
instance FromJSON ContextualUnit instance FromJSON ContextualUnit
instance ToJSON ContextualUnit instance ToJSON ContextualUnit
instance FromJSON PhyloLabel
instance ToJSON PhyloLabel
instance FromJSON Tagger
instance ToJSON Tagger
instance FromJSON Sort
instance ToJSON Sort
instance FromJSON Order
instance ToJSON Order
instance FromJSON Filter
instance ToJSON Filter
instance FromJSON Synchrony
instance ToJSON Synchrony
instance FromJSON Quality
instance ToJSON Quality
-- | Software parameters -- | Software parameters
...@@ -248,17 +287,18 @@ data PhyloGroup = ...@@ -248,17 +287,18 @@ data PhyloGroup =
PhyloGroup { _phylo_groupPeriod :: (Date,Date) PhyloGroup { _phylo_groupPeriod :: (Date,Date)
, _phylo_groupLevel :: Level , _phylo_groupLevel :: Level
, _phylo_groupIndex :: Int , _phylo_groupIndex :: Int
, _phylo_groupLabel :: Text
, _phylo_groupSupport :: Support , _phylo_groupSupport :: Support
, _phylo_groupNgrams :: [Int] , _phylo_groupNgrams :: [Int]
, _phylo_groupCooc :: !(Cooc) , _phylo_groupCooc :: !(Cooc)
, _phylo_groupBranchId :: PhyloBranchId , _phylo_groupBranchId :: PhyloBranchId
, _phylo_groupMeta :: Map Text [Double]
, _phylo_groupLevelParents :: [Pointer] , _phylo_groupLevelParents :: [Pointer]
, _phylo_groupLevelChilds :: [Pointer] , _phylo_groupLevelChilds :: [Pointer]
, _phylo_groupPeriodParents :: [Pointer] , _phylo_groupPeriodParents :: [Pointer]
, _phylo_groupPeriodChilds :: [Pointer] , _phylo_groupPeriodChilds :: [Pointer]
, _phylo_groupGhostPointers :: [Pointer]
} }
deriving (Generic, Show, Eq) deriving (Generic, Show, Eq, NFData)
-- | Weight : A generic mesure that can be associated with an Id -- | Weight : A generic mesure that can be associated with an Id
type Weight = Double type Weight = Double
...@@ -266,8 +306,6 @@ type Weight = Double ...@@ -266,8 +306,6 @@ type Weight = Double
-- | Pointer : A weighted pointer to a given PhyloGroup -- | Pointer : A weighted pointer to a given PhyloGroup
type Pointer = (PhyloGroupId, Weight) type Pointer = (PhyloGroupId, Weight)
type Link = ((PhyloGroupId, PhyloGroupId), Weight)
data Filiation = ToParents | ToChilds deriving (Generic, Show) data Filiation = ToParents | ToChilds deriving (Generic, Show)
data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show) data PointerType = TemporalPointer | LevelPointer deriving (Generic, Show)
...@@ -290,13 +328,53 @@ data PhyloFis = PhyloFis ...@@ -290,13 +328,53 @@ data PhyloFis = PhyloFis
} deriving (Generic,NFData,Show,Eq) } deriving (Generic,NFData,Show,Eq)
----------------
-- | Export | --
----------------
type DotId = TextLazy.Text
data EdgeType = GroupToGroup | BranchToGroup | BranchToBranch | PeriodToPeriod deriving (Show,Generic,Eq)
data Filter = ByBranchSize { _branch_size :: Double } deriving (Show,Generic,Eq)
data Order = Asc | Desc deriving (Show,Generic,Eq)
data Sort = ByBirthDate { _sort_order :: Order } | ByHierarchy deriving (Show,Generic,Eq)
data Tagger = MostInclusive | MostEmergentInclusive deriving (Show,Generic,Eq)
data PhyloLabel =
BranchLabel
{ _branch_labelTagger :: Tagger
, _branch_labelSize :: Int }
| GroupLabel
{ _group_labelTagger :: Tagger
, _group_labelSize :: Int }
deriving (Show,Generic,Eq)
data PhyloBranch =
PhyloBranch
{ _branch_id :: PhyloBranchId
, _branch_label :: Text
, _branch_meta :: Map Text [Double]
} deriving (Generic, Show)
data PhyloExport =
PhyloExport
{ _export_groups :: [PhyloGroup]
, _export_branches :: [PhyloBranch]
} deriving (Generic, Show)
---------------- ----------------
-- | Lenses | -- -- | Lenses | --
---------------- ----------------
makeLenses ''Config makeLenses ''Config
makeLenses ''Proximity makeLenses ''Proximity
makeLenses ''Quality
makeLenses ''ContextualUnit makeLenses ''ContextualUnit
makeLenses ''PhyloLabel
makeLenses ''TimeUnit makeLenses ''TimeUnit
makeLenses ''PhyloFoundations makeLenses ''PhyloFoundations
makeLenses ''PhyloFis makeLenses ''PhyloFis
...@@ -305,6 +383,8 @@ makeLenses ''PhyloPeriod ...@@ -305,6 +383,8 @@ makeLenses ''PhyloPeriod
makeLenses ''PhyloLevel makeLenses ''PhyloLevel
makeLenses ''PhyloGroup makeLenses ''PhyloGroup
makeLenses ''PhyloParam makeLenses ''PhyloParam
makeLenses ''PhyloExport
makeLenses ''PhyloBranch
------------------------ ------------------------
-- | JSON instances | -- -- | JSON instances | --
......
...@@ -83,7 +83,7 @@ queryViewEx = "level=3" ...@@ -83,7 +83,7 @@ queryViewEx = "level=3"
phyloQueryView :: PhyloQueryView phyloQueryView :: PhyloQueryView
phyloQueryView = PhyloQueryView 2 Merge False 2 [BranchAge,BranchBirth,BranchGroups] [] [BranchPeakInc,GroupLabelIncDyn] (Just (ByBranchBirth,Asc)) Json Flat True phyloQueryView = PhyloQueryView 1 Merge False 1 [BranchAge,BranchBirth,BranchGroups] [] [BranchPeakInc,GroupLabelIncDyn] (Just (ByBranchBirth,Asc)) Json Flat True
-------------------------------------------------- --------------------------------------------------
...@@ -110,7 +110,7 @@ queryEx = "title=Cesar et Cleôpatre" ...@@ -110,7 +110,7 @@ queryEx = "title=Cesar et Cleôpatre"
phyloQueryBuild :: PhyloQueryBuild phyloQueryBuild :: PhyloQueryBuild
phyloQueryBuild = PhyloQueryBuild "Cesar et Cleôpatre" "An example of Phylomemy (french without accent)" phyloQueryBuild = PhyloQueryBuild "Cesar et Cleôpatre" "An example of Phylomemy (french without accent)"
3 1 defaultFis [] [] (WeightedLogJaccard $ WLJParams 0.5 20) 5 0.8 0.5 4 2 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.3 0) 3 1 defaultFis [] [] (WeightedLogJaccard $ WLJParams 0.9 10) 5 0.8 0.5 4 1 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.3 0)
......
...@@ -29,18 +29,32 @@ import Gargantext.Text.Terms.Mono (monoTexts) ...@@ -29,18 +29,32 @@ import Gargantext.Text.Terms.Mono (monoTexts)
import Gargantext.Viz.AdaptativePhylo import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.Phylo.PhyloMaker import Gargantext.Viz.Phylo.PhyloMaker
import Gargantext.Viz.Phylo.PhyloExport
import Gargantext.Viz.Phylo.TemporalMatching (temporalMatching)
import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
import Control.Lens import Control.Lens
import Data.GraphViz.Types.Generalised (DotGraph)
import qualified Data.Vector as Vector import qualified Data.Vector as Vector
phyloExport :: IO ()
phyloExport = dotToFile "/home/qlobbe/data/phylo/output/cesar_cleopatre_V2.dot" phyloDot
phyloDot :: DotGraph DotId
phyloDot = toPhyloExport phylo2
phylo2 :: Phylo
phylo2 = synchronicClustering phylo1
----------------------------------------------- -----------------------------------------------
-- | STEP 3 | -- Build the Level 1 of the Phylo -- | STEP 3 | -- Build the Level 1 of the Phylo
----------------------------------------------- -----------------------------------------------
phylo1 :: Phylo phylo1 :: Phylo
phylo1 = appendGroups fisToGroup 1 phyloFis phyloBase phylo1 = temporalMatching
$ appendGroups fisToGroup 1 phyloFis phyloBase
--------------------------------------------- ---------------------------------------------
...@@ -80,7 +94,8 @@ nbDocsByYear = docsToTimeScaleNb docs ...@@ -80,7 +94,8 @@ nbDocsByYear = docsToTimeScaleNb docs
config :: Config config :: Config
config = config =
defaultConfig { phyloName = "Cesar et Cleopatre" defaultConfig { phyloName = "Cesar et Cleopatre"
, branchSize = 0 , phyloLevel = 2
, exportFilter = [ByBranchSize 0]
, contextualUnit = Fis 0 0 } , contextualUnit = Fis 0 0 }
......
...@@ -12,5 +12,461 @@ Portability : POSIX ...@@ -12,5 +12,461 @@ Portability : POSIX
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
module Gargantext.Viz.Phylo.PhyloExport where module Gargantext.Viz.Phylo.PhyloExport where
import Data.Map (Map, fromList, empty, fromListWith, insert, (!), elems, unionWith, findWithDefault, toList)
import Data.List ((++), sort, nub, concat, sortOn, reverse, groupBy, union, (\\), (!!), init, partition, unwords, nubBy)
import Data.Vector (Vector)
import Prelude (writeFile)
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Control.Lens
import Data.GraphViz hiding (DotGraph, Order)
import Data.GraphViz.Types.Generalised (DotGraph)
import Data.GraphViz.Attributes.Complete hiding (EdgeType, Order)
import Data.GraphViz.Types.Monadic
import Data.Text.Lazy (fromStrict, pack, unpack)
import System.FilePath
import Debug.Trace (trace)
import qualified Data.Text as Text
import qualified Data.Text.Lazy as Lazy
import qualified Data.GraphViz.Attributes.HTML as H
--------------------
-- | Dot export | --
--------------------
dotToFile :: FilePath -> DotGraph DotId -> IO ()
dotToFile filePath dotG = writeFile filePath $ dotToString dotG
dotToString :: DotGraph DotId -> [Char]
dotToString dotG = unpack (printDotGraph dotG)
dynamicToColor :: Double -> H.Attribute
dynamicToColor d
| d == 0 = H.BGColor (toColor LightCoral)
| d == 1 = H.BGColor (toColor Khaki)
| d == 2 = H.BGColor (toColor SkyBlue)
| otherwise = H.Color (toColor Black)
pickLabelColor :: [Double] -> H.Attribute
pickLabelColor lst
| elem 0 lst = dynamicToColor 0
| elem 2 lst = dynamicToColor 2
| elem 1 lst = dynamicToColor 1
| otherwise = dynamicToColor 3
toDotLabel :: Text.Text -> Label
toDotLabel lbl = StrLabel $ fromStrict lbl
toAttr :: AttributeName -> Lazy.Text -> CustomAttribute
toAttr k v = customAttribute k v
metaToAttr :: Map Text.Text [Double] -> [CustomAttribute]
metaToAttr meta = map (\(k,v) -> toAttr (fromStrict k) $ (pack . unwords) $ map show v) $ toList meta
groupIdToDotId :: PhyloGroupId -> DotId
groupIdToDotId (((d,d'),lvl),idx) = (fromStrict . Text.pack) $ ("group" <> (show d) <> (show d') <> (show lvl) <> (show idx))
branchIdToDotId :: PhyloBranchId -> DotId
branchIdToDotId bId = (fromStrict . Text.pack) $ ("branch" <> show (snd bId))
periodIdToDotId :: PhyloPeriodId -> DotId
periodIdToDotId prd = (fromStrict . Text.pack) $ ("period" <> show (fst prd) <> show (snd prd))
groupToTable :: Vector Ngrams -> PhyloGroup -> H.Label
groupToTable fdt g = H.Table H.HTable
{ H.tableFontAttrs = Just [H.PointSize 14, H.Align H.HLeft]
, H.tableAttrs = [H.Border 0, H.CellBorder 0, H.BGColor (toColor White)]
, H.tableRows = [header]
<> [H.Cells [H.LabelCell [H.Height 10] $ H.Text [H.Str $ fromStrict ""]]]
<> ( map ngramsToRow $ splitEvery 4
$ reverse $ sortOn (snd . snd)
$ zip (ngramsToText fdt (g ^. phylo_groupNgrams))
$ zip ((g ^. phylo_groupMeta) ! "dynamics") ((g ^. phylo_groupMeta) ! "inclusion"))}
where
--------------------------------------
ngramsToRow :: [(Ngrams,(Double,Double))] -> H.Row
ngramsToRow ns = H.Cells $ map (\(n,(d,_)) ->
H.LabelCell [H.Align H.HLeft,dynamicToColor d] $ H.Text [H.Str $ fromStrict n]) ns
--------------------------------------
header :: H.Row
header =
H.Cells [ H.LabelCell [pickLabelColor ((g ^. phylo_groupMeta) ! "dynamics")]
$ H.Text [H.Str $ (((fromStrict . Text.toUpper) $ g ^. phylo_groupLabel)
<> (fromStrict " ( ")
<> (pack $ show (fst $ g ^. phylo_groupPeriod))
<> (fromStrict " , ")
<> (pack $ show (snd $ g ^. phylo_groupPeriod))
<> (fromStrict " ) "))]]
--------------------------------------
branchToDotNode :: PhyloBranch -> Dot DotId
branchToDotNode b =
node (branchIdToDotId $ b ^. branch_id)
([FillColor [toWColor CornSilk], FontName "Arial", FontSize 40, Shape Egg, Style [SItem Bold []], Label (toDotLabel $ b ^. branch_label)]
<> (metaToAttr $ b ^. branch_meta)
<> [ toAttr "nodeType" "branch"
, toAttr "branchId" (pack $ unwords (map show $ snd $ b ^. branch_id)) ])
periodToDotNode :: (Date,Date) -> Dot DotId
periodToDotNode prd =
node (periodIdToDotId prd)
([Shape Square, FontSize 50, Label (toDotLabel $ Text.pack (show (fst prd) <> " " <> show (snd prd)))]
<> [ toAttr "nodeType" "period"
, toAttr "from" (fromStrict $ Text.pack $ (show $ fst prd))
, toAttr "to" (fromStrict $ Text.pack $ (show $ snd prd))])
groupToDotNode :: Vector Ngrams -> PhyloGroup -> Dot DotId
groupToDotNode fdt g =
node (groupIdToDotId $ getGroupId g)
([FontName "Arial", Shape Square, toLabel (groupToTable fdt g)]
<> [ toAttr "nodeType" "group"
, toAttr "from" (pack $ show (fst $ g ^. phylo_groupPeriod))
, toAttr "to" (pack $ show (snd $ g ^. phylo_groupPeriod))
, toAttr "branchId" (pack $ unwords (init $ map show $ snd $ g ^. phylo_groupBranchId))
, toAttr "support" (pack $ show (g ^. phylo_groupSupport))])
toDotEdge :: DotId -> DotId -> Text.Text -> EdgeType -> Dot DotId
toDotEdge source target lbl edgeType = edge source target
(case edgeType of
GroupToGroup -> [ Width 10, Color [toWColor Black], Constraint True
, Label (StrLabel $ fromStrict lbl)]
BranchToGroup -> [ Width 3, Color [toWColor Black], ArrowHead (AType [(ArrMod FilledArrow RightSide,DotArrow)])
, Label (StrLabel $ fromStrict lbl)]
BranchToBranch -> [ Width 2, Color [toWColor Black], Style [SItem Dashed []], ArrowHead (AType [(ArrMod FilledArrow BothSides,DotArrow)])
, Label (StrLabel $ fromStrict lbl)]
PeriodToPeriod -> [ Width 5, Color [toWColor Black]])
mergePointers :: [PhyloGroup] -> Map (PhyloGroupId,PhyloGroupId) Double
mergePointers groups =
let toChilds = fromList $ concat $ map (\g -> map (\(target,w) -> ((getGroupId g,target),w)) $ g ^. phylo_groupPeriodChilds) groups
toParents = fromList $ concat $ map (\g -> map (\(target,w) -> ((target,getGroupId g),w)) $ g ^. phylo_groupPeriodParents) groups
in unionWith (\w w' -> max w w') toChilds toParents
exportToDot :: Phylo -> PhyloExport -> DotGraph DotId
exportToDot phylo export =
trace ("\n-- | Convert " <> show(length $ export ^. export_branches) <> " branches and "
<> show(length $ export ^. export_groups) <> " groups to a dot file\n") $
digraph ((Str . fromStrict) $ (phyloName $ getConfig phylo)) $ do
-- | 1) init the dot graph
graphAttrs ( [ Label (toDotLabel $ (phyloName $ getConfig phylo))]
<> [ FontSize 30, LabelLoc VTop, NodeSep 1, RankSep [1], Rank SameRank, Splines SplineEdges, Overlap ScaleOverlaps
, Ratio FillRatio
, Style [SItem Filled []],Color [toWColor White]]
-- | home made attributes
<> [(toAttr (fromStrict "nbDocs") $ pack $ show (sum $ elems $ phylo ^. phylo_timeDocs))
,(toAttr (fromStrict "proxiName") $ pack $ show (getProximityName $ phyloProximity $ getConfig phylo))
,(toAttr (fromStrict "proxiInit") $ pack $ show (getProximityInit $ phyloProximity $ getConfig phylo))
,(toAttr (fromStrict "proxiStep") $ pack $ show (getProximityStep $ phyloProximity $ getConfig phylo))
,(toAttr (fromStrict "quaFactor") $ pack $ show (_qua_relevance $ phyloQuality $ getConfig phylo))
])
-- toAttr (fromStrict k) $ (pack . unwords) $ map show v
-- | 2) create a layer for the branches labels
subgraph (Str "Branches peaks") $ do
graphAttrs [Rank SameRank]
-- | 3) group the branches by hierarchy
-- mapM (\branches ->
-- subgraph (Str "Branches clade") $ do
-- graphAttrs [Rank SameRank]
-- -- | 4) create a node for each branch
-- mapM branchToDotNode branches
-- ) $ elems $ fromListWith (++) $ map (\b -> ((init . snd) $ b ^. branch_id,[b])) $ export ^. export_branches
mapM branchToDotNode $ export ^. export_branches
-- | 5) create a layer for each period
_ <- mapM (\period ->
subgraph ((Str . fromStrict . Text.pack) $ ("Period" <> show (fst period) <> show (snd period))) $ do
graphAttrs [Rank SameRank]
periodToDotNode period
-- | 6) create a node for each group
mapM (\g -> groupToDotNode (getRoots phylo) g) (filter (\g -> g ^. phylo_groupPeriod == period) $ export ^. export_groups)
) $ getPeriodIds phylo
-- | 7) create the edges between a branch and its first groups
_ <- mapM (\(bId,groups) ->
mapM (\g -> toDotEdge (branchIdToDotId bId) (groupIdToDotId $ getGroupId g) "" BranchToGroup) groups
)
$ toList
$ map (\groups -> head' "toDot"
$ groupBy (\g g' -> g' ^. phylo_groupPeriod == g ^. phylo_groupPeriod)
$ sortOn (fst . _phylo_groupPeriod) groups)
$ fromListWith (++) $ map (\g -> (g ^. phylo_groupBranchId,[g])) $ export ^. export_groups
-- | 8) create the edges between the groups
_ <- mapM (\((k,k'),_) ->
toDotEdge (groupIdToDotId k) (groupIdToDotId k') "" GroupToGroup
) $ (toList . mergePointers) $ export ^. export_groups
-- | 7) create the edges between the periods
_ <- mapM (\(prd,prd') ->
toDotEdge (periodIdToDotId prd) (periodIdToDotId prd') "" PeriodToPeriod
) $ nubBy (\combi combi' -> fst combi == fst combi') $ listToCombi' $ getPeriodIds phylo
-- | 8) create the edges between the branches
_ <- mapM (\(bId,bId') ->
toDotEdge (branchIdToDotId bId) (branchIdToDotId bId')
(Text.pack $ show(branchIdsToProximity bId bId'
(getThresholdInit $ phyloProximity $ getConfig phylo)
(getThresholdStep $ phyloProximity $ getConfig phylo))) BranchToBranch
) $ nubBy (\combi combi' -> fst combi == fst combi') $ listToCombi' $ map _branch_id $ export ^. export_branches
graphAttrs [Rank SameRank]
----------------
-- | Filter | --
----------------
filterByBranchSize :: Double -> PhyloExport -> PhyloExport
filterByBranchSize thr export =
let branches' = partition (\b -> head' "filter" ((b ^. branch_meta) ! "size") >= thr) $ export ^. export_branches
in export & export_branches .~ (fst branches')
& export_groups %~ (filter (\g -> not $ elem (g ^. phylo_groupBranchId) (map _branch_id $ snd branches')))
processFilters :: [Filter] -> Quality -> PhyloExport -> PhyloExport
processFilters filters qua export =
foldl (\export' f -> case f of
ByBranchSize thr -> if (thr < (fromIntegral $ qua ^. qua_minBranch))
then filterByBranchSize (fromIntegral $ qua ^. qua_minBranch) export'
else filterByBranchSize thr export'
) export filters
--------------
-- | Sort | --
--------------
sortByHierarchy :: Int -> [PhyloBranch] -> [PhyloBranch]
sortByHierarchy depth branches =
if (length branches == 1)
then branches
else concat
$ map (\branches' ->
let partitions = partition (\b -> depth + 1 == ((length . snd) $ b ^. branch_id)) branches'
in (sortOn (\b -> (b ^. branch_meta) ! "birth") (fst partitions))
++ (sortByHierarchy (depth + 1) (snd partitions)))
$ groupBy (\b b' -> ((take depth . snd) $ b ^. branch_id) == ((take depth . snd) $ b' ^. branch_id) )
$ sortOn (\b -> (take depth . snd) $ b ^. branch_id) branches
sortByBirthDate :: Order -> PhyloExport -> PhyloExport
sortByBirthDate order export =
let branches = sortOn (\b -> (b ^. branch_meta) ! "birth") $ export ^. export_branches
branches' = case order of
Asc -> branches
Desc -> reverse branches
in export & export_branches .~ branches'
processSort :: Sort -> PhyloExport -> PhyloExport
processSort sort' export = case sort' of
ByBirthDate o -> sortByBirthDate o export
ByHierarchy -> export & export_branches .~ sortByHierarchy 0 (export ^. export_branches)
-----------------
-- | Metrics | --
-----------------
-- | Return the conditional probability of i knowing j
conditional :: Ord a => Map (a,a) Double -> a -> a -> Double
conditional m i j = (findWithDefault 0 (i,j) m)
/ (m ! (j,j))
-- | Return the genericity score of a given ngram
genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double
genericity m l i = ( (sum $ map (\j -> conditional m i j) l)
- (sum $ map (\j -> conditional m j i) l)) / (fromIntegral $ (length l) + 1)
-- | Return the specificity score of a given ngram
specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double
specificity m l i = ( (sum $ map (\j -> conditional m j i) l)
- (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
-- | Return the inclusion score of a given ngram
inclusion :: Map (Int, Int) Double -> [Int] -> Int -> Double
inclusion m l i = ( (sum $ map (\j -> conditional m j i) l)
+ (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
ngramsMetrics :: PhyloExport -> PhyloExport
ngramsMetrics export =
over ( export_groups
. traverse )
(\g -> g & phylo_groupMeta %~ insert "genericity"
(map (\n -> genericity (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
& phylo_groupMeta %~ insert "specificity"
(map (\n -> specificity (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
& phylo_groupMeta %~ insert "inclusion"
(map (\n -> inclusion (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
) export
branchDating :: PhyloExport -> PhyloExport
branchDating export =
over ( export_branches
. traverse )
(\b ->
let groups = sortOn fst
$ foldl' (\acc g -> if (g ^. phylo_groupBranchId == b ^. branch_id)
then acc ++ [g ^. phylo_groupPeriod]
else acc ) [] $ export ^. export_groups
birth = fst $ head' "birth" groups
age = (snd $ last' "age" groups) - birth
in b & branch_meta %~ insert "birth" [fromIntegral birth]
& branch_meta %~ insert "age" [fromIntegral age]
& branch_meta %~ insert "size" [fromIntegral $ length groups] ) export
processMetrics :: PhyloExport -> PhyloExport
processMetrics export = ngramsMetrics
$ branchDating export
-----------------
-- | Taggers | --
-----------------
getNthMostMeta :: Int -> [Double] -> [Int] -> [Int]
getNthMostMeta nth meta ns = map (\(idx,_) -> (ns !! idx))
$ take nth
$ reverse
$ sortOn snd $ zip [0..] meta
mostInclusive :: Int -> Vector Ngrams -> PhyloExport -> PhyloExport
mostInclusive nth foundations export =
over ( export_branches
. traverse )
(\b ->
let groups = filter (\g -> g ^. phylo_groupBranchId == b ^. branch_id) $ export ^. export_groups
cooc = foldl (\acc g -> unionWith (+) acc (g ^. phylo_groupCooc)) empty groups
ngrams = sort $ foldl (\acc g -> union acc (g ^. phylo_groupNgrams)) [] groups
inc = map (\n -> inclusion cooc (ngrams \\ [n]) n) ngrams
lbl = ngramsToLabel foundations $ getNthMostMeta nth inc ngrams
in b & branch_label .~ lbl ) export
mostEmergentInclusive :: Int -> Vector Ngrams -> PhyloExport -> PhyloExport
mostEmergentInclusive nth foundations export =
over ( export_groups
. traverse )
(\g ->
let lbl = ngramsToLabel foundations
$ take nth
$ map (\(_,(_,idx)) -> idx)
$ concat
$ map (\groups -> sortOn (fst . snd) groups)
$ groupBy ((==) `on` fst) $ reverse $ sortOn fst
$ zip ((g ^. phylo_groupMeta) ! "inclusion")
$ zip ((g ^. phylo_groupMeta) ! "dynamics") (g ^. phylo_groupNgrams)
in g & phylo_groupLabel .~ lbl ) export
processLabels :: [PhyloLabel] -> Vector Ngrams -> PhyloExport -> PhyloExport
processLabels labels foundations export =
foldl (\export' label ->
case label of
GroupLabel tagger nth ->
case tagger of
MostEmergentInclusive -> mostEmergentInclusive nth foundations export'
_ -> panic "[ERR][Viz.Phylo.PhyloExport] unknown tagger"
BranchLabel tagger nth ->
case tagger of
MostInclusive -> mostInclusive nth foundations export'
_ -> panic "[ERR][Viz.Phylo.PhyloExport] unknown tagger" ) export labels
------------------
-- | Dynamics | --
------------------
toDynamics :: Int -> [PhyloGroup] -> PhyloGroup -> Map Int (Date,Date) -> Double
toDynamics n parents group m =
let prd = group ^. phylo_groupPeriod
end = last' "dynamics" (sort $ map snd $ elems m)
in if (((snd prd) == (snd $ m ! n)) && (snd prd /= end))
-- | decrease
then 2
else if ((fst prd) == (fst $ m ! n))
-- | recombination
then 0
else if isNew
-- | emergence
then 1
else 3
where
--------------------------------------
isNew :: Bool
isNew = not $ elem n $ concat $ map _phylo_groupNgrams parents
processDynamics :: [PhyloGroup] -> [PhyloGroup]
processDynamics groups =
map (\g ->
let parents = filter (\g' -> (g ^. phylo_groupBranchId == g' ^. phylo_groupBranchId)
&& ((fst $ g ^. phylo_groupPeriod) > (fst $ g' ^. phylo_groupPeriod))) groups
in g & phylo_groupMeta %~ insert "dynamics" (map (\n -> toDynamics n parents g mapNgrams) $ g ^. phylo_groupNgrams) ) groups
where
--------------------------------------
mapNgrams :: Map Int (Date,Date)
mapNgrams = map (\dates ->
let dates' = sort dates
in (head' "dynamics" dates', last' "dynamics" dates'))
$ fromListWith (++)
$ foldl (\acc g -> acc ++ ( map (\n -> (n,[fst $ g ^. phylo_groupPeriod, snd $ g ^. phylo_groupPeriod]))
$ (g ^. phylo_groupNgrams))) [] groups
---------------------
-- | phyloExport | --
---------------------
toPhyloExport :: Phylo -> DotGraph DotId
toPhyloExport phylo = exportToDot phylo
$ processFilters (exportFilter $ getConfig phylo) (phyloQuality $ getConfig phylo)
$ processSort (exportSort $ getConfig phylo)
$ processLabels (exportLabel $ getConfig phylo) (getRoots phylo)
$ processMetrics export
where
export :: PhyloExport
export = PhyloExport groups branches
--------------------------------------
branches :: [PhyloBranch]
branches = traceExportBranches $ map (\bId -> PhyloBranch bId "" empty) $ nub $ map _phylo_groupBranchId groups
--------------------------------------
groups :: [PhyloGroup]
groups = processDynamics
$ getGroupsFromLevel (phyloLevel $ getConfig phylo) phylo
traceExportBranches :: [PhyloBranch] -> [PhyloBranch]
traceExportBranches branches = trace ("\n" <> "-- | Export " <> show(length branches) <> " branches") branches
...@@ -16,13 +16,15 @@ Portability : POSIX ...@@ -16,13 +16,15 @@ Portability : POSIX
module Gargantext.Viz.Phylo.PhyloMaker where module Gargantext.Viz.Phylo.PhyloMaker where
import Data.List (concat, nub, partition, sort, (++)) import Data.List (concat, nub, partition, sort, (++))
import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), filterWithKey, restrictKeys) import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys)
import Data.Set (size) import Data.Set (size)
import Data.Vector (Vector) import Data.Vector (Vector)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.Phylo.TemporalMatching (temporalMatching)
import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
import Gargantext.Text.Context (TermList) import Gargantext.Text.Context (TermList)
import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..)) import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
...@@ -41,7 +43,10 @@ import qualified Data.Set as Set ...@@ -41,7 +43,10 @@ import qualified Data.Set as Set
toPhylo :: [Document] -> TermList -> Config -> Phylo toPhylo :: [Document] -> TermList -> Config -> Phylo
toPhylo docs lst conf = phylo1 toPhylo docs lst conf = traceToPhylo (phyloLevel conf) $
if (phyloLevel conf) > 1
then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
else phylo1
where where
-------------------------------------- --------------------------------------
phylo1 :: Phylo phylo1 :: Phylo
...@@ -82,16 +87,18 @@ appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ co ...@@ -82,16 +87,18 @@ appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ co
fisToGroup :: PhyloFis -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup fisToGroup :: PhyloFis -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
fisToGroup fis pId lvl idx fdt coocs = fisToGroup fis pId lvl idx fdt coocs =
let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloFis_clique) fdt let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloFis_clique) fdt
in PhyloGroup pId lvl idx in PhyloGroup pId lvl idx ""
(fis ^. phyloFis_support) (fis ^. phyloFis_support)
ngrams ngrams
(ngramsToCooc ngrams coocs) (ngramsToCooc ngrams coocs)
(1,[]) (1,[0])
[] [] [] [] [] empty
[] [] [] []
toPhylo1 :: [Document] -> Phylo -> Phylo toPhylo1 :: [Document] -> Phylo -> Phylo
toPhylo1 docs phyloBase = appendGroups fisToGroup 1 phyloFis phyloBase toPhylo1 docs phyloBase = temporalMatching
$ appendGroups fisToGroup 1 phyloFis phyloBase
where where
-------------------------------------- --------------------------------------
phyloFis :: Map (Date,Date) [PhyloFis] phyloFis :: Map (Date,Date) [PhyloFis]
...@@ -164,14 +171,6 @@ toPhyloFis phyloDocs support clique = traceFis "Filtered Fis" ...@@ -164,14 +171,6 @@ toPhyloFis phyloDocs support clique = traceFis "Filtered Fis"
-------------------- --------------------
-- | To build the local cooc matrix of each phylogroup
ngramsToCooc :: [Int] -> [Cooc] -> Cooc
ngramsToCooc ngrams coocs =
let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
pairs = listToKeys ngrams
in filterWithKey (\k _ -> elem k pairs) cooc
-- | To transform the docs into a time map of coocurency matrix -- | To transform the docs into a time map of coocurency matrix
docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
docsToTimeScaleCooc docs fdt = docsToTimeScaleCooc docs fdt =
...@@ -232,4 +231,4 @@ toPhyloBase docs lst conf = ...@@ -232,4 +231,4 @@ toPhyloBase docs lst conf =
(docsToTimeScaleCooc docs (foundations ^. foundations_roots)) (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
(docsToTimeScaleNb docs) (docsToTimeScaleNb docs)
params params
(fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels (phyloLevel conf) prd))) periods) (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)
...@@ -17,13 +17,16 @@ Portability : POSIX ...@@ -17,13 +17,16 @@ Portability : POSIX
module Gargantext.Viz.Phylo.PhyloTools where module Gargantext.Viz.Phylo.PhyloTools where
import Data.Vector (Vector, elemIndex) import Data.Vector (Vector, elemIndex)
import Data.List (sort, concat, null, union, (++), tails, sortOn, nub) import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
import Data.Set (Set, size) import Data.Set (Set, size, disjoint)
import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey) import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
import Data.String (String) import Data.String (String)
import Data.Text (Text, unwords)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo import Gargantext.Viz.AdaptativePhylo
import Text.Printf
import Debug.Trace (trace) import Debug.Trace (trace)
import Control.Lens hiding (Level) import Control.Lens hiding (Level)
...@@ -32,14 +35,40 @@ import qualified Data.Vector as Vector ...@@ -32,14 +35,40 @@ import qualified Data.Vector as Vector
import qualified Data.List as List import qualified Data.List as List
import qualified Data.Set as Set import qualified Data.Set as Set
------------
-- | Io | --
------------
-- | To print an important message as an IO()
printIOMsg :: String -> IO ()
printIOMsg msg =
putStrLn ( "\n"
<> "------------"
<> "\n"
<> "-- | " <> msg <> "\n" )
-- | To print a comment as an IO()
printIOComment :: String -> IO ()
printIOComment cmt =
putStrLn ( "\n" <> cmt <> "\n" )
-------------- --------------
-- | Misc | -- -- | Misc | --
-------------- --------------
roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
roundToStr = printf "%0.*f"
countSup :: Double -> [Double] -> Int countSup :: Double -> [Double] -> Int
countSup s l = length $ filter (>s) l countSup s l = length $ filter (>s) l
dropByIdx :: Int -> [a] -> [a]
dropByIdx k l = take k l ++ drop (k+1) l
elemIndex' :: Eq a => a -> [a] -> Int elemIndex' :: Eq a => a -> [a] -> Int
elemIndex' e l = case (List.elemIndex e l) of elemIndex' e l = case (List.elemIndex e l) of
...@@ -60,6 +89,15 @@ isRoots n ns = Vector.elem n ns ...@@ -60,6 +89,15 @@ isRoots n ns = Vector.elem n ns
ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int] ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
-- | To transform a list of Ngrams Indexes into a Label
ngramsToLabel :: Vector Ngrams -> [Int] -> Text
ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
-- | To transform a list of Ngrams Indexes into a list of Text
ngramsToText :: Vector Ngrams -> [Int] -> [Text]
ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
-------------- --------------
-- | Time | -- -- | Time | --
...@@ -168,7 +206,6 @@ getFisSize unit = case unit of ...@@ -168,7 +206,6 @@ getFisSize unit = case unit of
-- | Cooc | -- -- | Cooc | --
-------------- --------------
listToCombi' :: [a] -> [(a,a)] listToCombi' :: [a] -> [(a,a)]
listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest] listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
...@@ -181,12 +218,24 @@ listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst) ...@@ -181,12 +218,24 @@ listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
listToMatrix :: [Int] -> Map (Int,Int) Double listToMatrix :: [Int] -> Map (Int,Int) Double
listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
listToSeq :: Eq a => [a] -> [(a,a)]
listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
sumCooc :: Cooc -> Cooc -> Cooc sumCooc :: Cooc -> Cooc -> Cooc
sumCooc cooc cooc' = unionWith (+) cooc cooc' sumCooc cooc cooc' = unionWith (+) cooc cooc'
getTrace :: Cooc -> Double getTrace :: Cooc -> Double
getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
-- | To build the local cooc matrix of each phylogroup
ngramsToCooc :: [Int] -> [Cooc] -> Cooc
ngramsToCooc ngrams coocs =
let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
pairs = listToKeys ngrams
in filterWithKey (\k _ -> elem k pairs) cooc
-------------------- --------------------
-- | PhyloGroup | -- -- | PhyloGroup | --
-------------------- --------------------
...@@ -194,6 +243,40 @@ getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc ...@@ -194,6 +243,40 @@ getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
getGroupId :: PhyloGroup -> PhyloGroupId getGroupId :: PhyloGroup -> PhyloGroupId
getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex) getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
getPeriodPointers fil group =
case fil of
ToChilds -> group ^. phylo_groupPeriodChilds
ToParents -> group ^. phylo_groupPeriodParents
filterProximity :: Proximity -> Double -> Double -> Bool
filterProximity proximity thr local =
case proximity of
WeightedLogJaccard _ _ _ -> local >= thr
Hamming -> undefined
getProximityName :: Proximity -> String
getProximityName proximity =
case proximity of
WeightedLogJaccard _ _ _ -> "WLJaccard"
Hamming -> "Hamming"
getProximityInit :: Proximity -> Double
getProximityInit proximity =
case proximity of
WeightedLogJaccard _ i _ -> i
Hamming -> undefined
getProximityStep :: Proximity -> Double
getProximityStep proximity =
case proximity of
WeightedLogJaccard _ _ s -> s
Hamming -> undefined
--------------- ---------------
-- | Phylo | -- -- | Phylo | --
--------------- ---------------
...@@ -202,11 +285,11 @@ addPointers :: PhyloGroup -> Filiation -> PointerType -> [Pointer] -> PhyloGroup ...@@ -202,11 +285,11 @@ addPointers :: PhyloGroup -> Filiation -> PointerType -> [Pointer] -> PhyloGroup
addPointers group fil pty pointers = addPointers group fil pty pointers =
case pty of case pty of
TemporalPointer -> case fil of TemporalPointer -> case fil of
ToChilds -> group & phylo_groupPeriodChilds %~ (++ pointers) ToChilds -> group & phylo_groupPeriodChilds .~ pointers
ToParents -> group & phylo_groupPeriodParents %~ (++ pointers) ToParents -> group & phylo_groupPeriodParents .~ pointers
LevelPointer -> case fil of LevelPointer -> case fil of
ToChilds -> group & phylo_groupLevelChilds %~ (++ pointers) ToChilds -> group & phylo_groupLevelChilds .~ pointers
ToParents -> group & phylo_groupLevelParents %~ (++ pointers) ToParents -> group & phylo_groupLevelParents .~ pointers
getPeriodIds :: Phylo -> [(Date,Date)] getPeriodIds :: Phylo -> [(Date,Date)]
...@@ -214,6 +297,19 @@ getPeriodIds phylo = sortOn fst ...@@ -214,6 +297,19 @@ getPeriodIds phylo = sortOn fst
$ keys $ keys
$ phylo ^. phylo_periods $ phylo ^. phylo_periods
getLevelParentId :: PhyloGroup -> PhyloGroupId
getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
getLastLevel :: Phylo -> Level
getLastLevel phylo = last' "lastLevel" $ getLevels phylo
getLevels :: Phylo -> [Level]
getLevels phylo = nub
$ map snd
$ keys $ view ( phylo_periods
. traverse
. phylo_periodLevels ) phylo
getConfig :: Phylo -> Config getConfig :: Phylo -> Config
getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
...@@ -222,6 +318,11 @@ getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config ...@@ -222,6 +318,11 @@ getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
getRoots :: Phylo -> Vector Ngrams getRoots :: Phylo -> Vector Ngrams
getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
phyloToLastBranches :: Phylo -> [[PhyloGroup]]
phyloToLastBranches phylo = elems
$ fromListWith (++)
$ map (\g -> (g ^. phylo_groupBranchId, [g]))
$ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup] getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
getGroupsFromLevel lvl phylo = getGroupsFromLevel lvl phylo =
...@@ -250,13 +351,38 @@ updatePhyloGroups lvl m phylo = ...@@ -250,13 +351,38 @@ updatePhyloGroups lvl m phylo =
else group ) phylo else group ) phylo
------------------ traceToPhylo :: Level -> Phylo -> Phylo
-- | Pointers | -- traceToPhylo lvl phylo =
------------------ trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
<> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
<> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
--------------------
-- | Clustering | --
--------------------
pointersToLinks :: PhyloGroupId -> [Pointer] -> [Link] relatedComponents :: Ord a => [[a]] -> [[a]]
pointersToLinks id pointers = map (\p -> ((id,fst p),snd p)) pointers relatedComponents graph = foldl' (\acc groups ->
if (null acc)
then acc ++ [groups]
else
let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
traceSynchronyEnd :: Phylo -> Phylo
traceSynchronyEnd phylo =
trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
<> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
<> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
<> "\n" ) phylo
traceSynchronyStart :: Phylo -> Phylo
traceSynchronyStart phylo =
trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
<> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
<> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
<> "\n" ) phylo
------------------- -------------------
...@@ -279,9 +405,77 @@ getThresholdStep proxi = case proxi of ...@@ -279,9 +405,77 @@ getThresholdStep proxi = case proxi of
Hamming -> undefined Hamming -> undefined
traceBranchMatching :: Proximity -> Double -> [PhyloGroup] -> [PhyloGroup]
traceBranchMatching proxi thr groups = case proxi of
WeightedLogJaccard _ i s -> trace (
roundToStr 2 thr <> " "
<> foldl (\acc _ -> acc <> ".") "." [(10*i),(10*i + 10*s)..(10*thr)]
<> " " <> show(length groups) <> " groups"
) groups
Hamming -> undefined
---------------- ----------------
-- | Branch | -- -- | Branch | --
---------------- ----------------
intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
intersectInit acc lst lst' =
if (null lst) || (null lst')
then acc
else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
else acc
branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
ngramsInBranches :: [[PhyloGroup]] -> [Int] ngramsInBranches :: [[PhyloGroup]] -> [Int]
ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
traceMatchSuccess thr qua qua' nextBranches =
trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
$ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
<> ",(1.." <> show (length nextBranches) <> ")]"
<> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
<> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
<> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
traceMatchFailure thr qua qua' branches =
trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
<> ",(1.." <> show (length branches) <> ")]"
<> " | " <> show (length $ concat branches) <> " groups" <> "\n"
<> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
) branches
traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
traceMatchNoSplit branches =
trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
<> ",(1.." <> show (length branches) <> ")]"
<> " | " <> show (length $ concat branches) <> " groups" <> "\n"
<> " - unable to split in smaller branches" <> "\n"
) branches
traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
traceMatchLimit branches =
trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
<> ",(1.." <> show (length branches) <> ")]"
<> " | " <> show (length $ concat branches) <> " groups" <> "\n"
<> " - unable to increase the threshold above 1" <> "\n"
) branches
traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
traceMatchEnd groups =
trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
<> " branches and " <> show (length groups) <> " groups" <> "\n") groups
traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
traceTemporalMatching groups =
trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
\ No newline at end of file
...@@ -16,22 +16,180 @@ Portability : POSIX ...@@ -16,22 +16,180 @@ Portability : POSIX
module Gargantext.Viz.Phylo.SynchronicClustering where module Gargantext.Viz.Phylo.SynchronicClustering where
import Gargantext.Prelude import Gargantext.Prelude
-- import Gargantext.Viz.AdaptativePhylo import Gargantext.Viz.AdaptativePhylo
-- import Gargantext.Viz.Phylo.PhyloTools import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.Phylo.TemporalMatching (weightedLogJaccard)
import Data.List ((++), null, intersect, nub, concat, sort, sortOn)
import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member)
import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parList, rdeepseq, using)
import qualified Data.Map as Map
-------------------------
-- | New Level Maker | --
-------------------------
toBranchId :: PhyloGroup -> PhyloBranchId
toBranchId child = ((child ^. phylo_groupLevel) + 1, snd (child ^. phylo_groupBranchId))
mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
mergeGroups coocs id mapIds childs =
let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
in PhyloGroup (fst $ fst id) (snd $ fst id) (snd id) ""
(sum $ map _phylo_groupSupport childs) ngrams
(ngramsToCooc ngrams coocs) (toBranchId (head' "mergeGroups" childs))
empty [] (map (\g -> (getGroupId g, 1)) childs)
(updatePointers $ concat $ map _phylo_groupPeriodParents childs)
(updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
where
updatePointers :: [Pointer] -> [Pointer]
updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
addPhyloLevel :: Level -> Phylo -> Phylo
addPhyloLevel lvl phylo =
over ( phylo_periods . traverse )
(\phyloPrd -> phyloPrd & phylo_periodLevels
%~ (insert (phyloPrd ^. phylo_periodPeriod, lvl) (PhyloLevel (phyloPrd ^. phylo_periodPeriod) lvl empty))) phylo
toNextLevel :: Phylo -> [PhyloGroup] -> Phylo
toNextLevel phylo groups =
let curLvl = getLastLevel phylo
oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
newGroups = fromListWith (++)
-- | 5) group the parents by periods
$ foldlWithKey (\acc id groups' ->
-- | 4) create the parent group
let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
in acc ++ [(parent ^. phylo_groupPeriod, [parent])]) []
-- | 3) group the current groups by parentId
$ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
in traceSynchronyEnd
$ over ( phylo_periods . traverse . phylo_periodLevels . traverse
-- | 6) update each period at curLvl + 1
. filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (curLvl + 1)))
-- | 7) by adding the parents
(\phyloLvl ->
if member (phyloLvl ^. phylo_levelPeriod) newGroups
then phyloLvl & phylo_levelGroups
.~ fromList (map (\g -> (getGroupId g, g)) $ newGroups ! (phyloLvl ^. phylo_levelPeriod))
else phyloLvl)
-- | 2) add the curLvl + 1 phyloLevel to the phylo
$ addPhyloLevel (curLvl + 1)
-- | 1) update the current groups (with level parent pointers) in the phylo
$ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
import Data.List (foldl', (++), null, intersect, (\\), union, nub, concat)
-------------------- --------------------
-- | Clustering | -- -- | Clustering | --
-------------------- --------------------
relatedComponents :: Eq a => [[a]] -> [[a]] toPairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
relatedComponents graphs = foldl' (\mem groups -> toPairs groups = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))
if (null mem) $ listToCombi' groups
then mem ++ [groups]
else
let related = filter (\groups' -> (not . null) $ intersect groups groups') mem toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
in if (null related) toDiamonds groups = foldl' (\acc groups' ->
then mem ++ [groups] acc ++ ( elems
else (mem \\ related) ++ [union groups (nub $ concat related)] ) [] graphs $ Map.filter (\v -> length v > 1)
\ No newline at end of file $ fromListWith (++)
$ foldl' (\acc' g ->
acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
$ elems
$ Map.filter (\v -> length v > 1)
$ fromListWith (++)
$ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
groupsToEdges :: Proximity -> Synchrony -> Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
groupsToEdges prox sync docs groups =
case sync of
ByProximityThreshold t s -> filter (\(_,w) -> w >= t)
$ toEdges s
$ toPairs groups
ByProximityDistribution s ->
let diamonds = sortOn snd
$ toEdges s $ concat
$ map toPairs $ toDiamonds groups
in take (div (length diamonds) 2) diamonds
where
toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
toEdges sens edges =
case prox of
WeightedLogJaccard _ _ _ -> map (\(g,g') ->
((g,g'), weightedLogJaccard sens docs
(g ^. phylo_groupCooc) (g' ^. phylo_groupCooc)
(g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
_ -> undefined
toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
toRelatedComponents nodes edges =
let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
toParentId :: PhyloGroup -> PhyloGroupId
toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupLevel + 1), child ^. phylo_groupIndex)
reduceBranch :: Proximity -> Synchrony -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
reduceBranch prox sync docs branch =
-- | 1) reduce a branch as a set of periods & groups
let periods = fromListWith (++)
$ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
in (concat . concat . elems)
$ mapWithKey (\prd groups ->
-- | 2) for each period, transform the groups as a proximity graph filtered by a threshold
let edges = groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) groups
in map (\comp ->
-- | 4) add to each groups their futur level parent group
let parentId = toParentId (head' "parentId" comp)
in map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId,1)]) ) comp )
-- |3) reduce the graph a a set of related components
$ toRelatedComponents groups edges) periods
synchronicClustering :: Phylo -> Phylo
synchronicClustering phylo =
let prox = phyloProximity $ getConfig phylo
sync = phyloSynchrony $ getConfig phylo
docs = phylo ^. phylo_timeDocs
branches = map (\branch -> reduceBranch prox sync docs branch)
$ phyloToLastBranches
$ traceSynchronyStart phylo
branches' = branches `using` parList rdeepseq
in toNextLevel phylo $ concat branches'
----------------
-- | probes | --
----------------
-- synchronicDistance :: Phylo -> Level -> String
-- synchronicDistance phylo lvl =
-- foldl' (\acc branch ->
-- acc <> (foldl' (\acc' period ->
-- acc' <> let prox = phyloProximity $ getConfig phylo
-- sync = phyloSynchrony $ getConfig phylo
-- docs = _phylo_timeDocs phylo
-- prd = _phylo_groupPeriod $ head' "distance" period
-- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
-- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
-- in foldl' (\mem (_,w) ->
-- mem <> show (prd)
-- <> "\t"
-- <> show (w)
-- <> "\n"
-- ) "" edges
-- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
-- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo
\ No newline at end of file
...@@ -15,19 +15,21 @@ Portability : POSIX ...@@ -15,19 +15,21 @@ Portability : POSIX
module Gargantext.Viz.Phylo.TemporalMatching where module Gargantext.Viz.Phylo.TemporalMatching where
import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, nub, union) import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile, partition, delete)
import Data.Map (Map, fromList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith, findWithDefault) import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, keys, (!), filterWithKey)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Viz.Phylo.SynchronicClustering
import Prelude (logBase) -- import Prelude (logBase)
import Control.Lens hiding (Level) import Control.Lens hiding (Level)
import Control.Parallel.Strategies (parList, rdeepseq, using)
import Debug.Trace (trace)
import qualified Data.Set as Set import qualified Data.Set as Set
------------------- -------------------
-- | Proximity | -- -- | Proximity | --
------------------- -------------------
...@@ -65,10 +67,11 @@ weightedLogJaccard sens docs cooc cooc' ngrams ngrams' ...@@ -65,10 +67,11 @@ weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
ngramsUnion = union ngrams ngrams' ngramsUnion = union ngrams ngrams'
-------------------------------------- --------------------------------------
coocInter :: [Double] coocInter :: [Double]
coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc' coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
-- coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
-------------------------------------- --------------------------------------
coocUnion :: [Double] coocUnion :: [Double]
coocUnion = elems $ map (/docs) $ unionWith (+) cooc cooc' coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
-------------------------------------- --------------------------------------
...@@ -79,13 +82,6 @@ pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of ...@@ -79,13 +82,6 @@ pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
Hamming -> undefined Hamming -> undefined
filterProximity :: Proximity -> Double -> Double -> Bool
filterProximity proximity thr local =
case proximity of
WeightedLogJaccard _ _ _ -> local >= thr
Hamming -> undefined
-- | To process the proximity between a current group and a pair of targets group -- | To process the proximity between a current group and a pair of targets group
toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
toProximity docs proximity ego target target' = toProximity docs proximity ego target target' =
...@@ -103,50 +99,88 @@ toProximity docs proximity ego target target' = ...@@ -103,50 +99,88 @@ toProximity docs proximity ego target target' =
-- | Local Matching | -- -- | Local Matching | --
------------------------ ------------------------
toLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
toLastPeriod fil periods = case fil of
ToParents -> head' "toLastPeriod" (sortOn fst periods)
ToChilds -> last' "toLastPeriod" (sortOn fst periods)
toLazyPairs :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)]
toLazyPairs pointers fil thr prox prd pairs =
if null pointers then pairs
else let rest = filterPointers prox thr pointers
in if null rest
then let prd' = toLastPeriod fil (map (fst . fst . fst) pointers)
in if prd' == prd
then []
else filter (\(g,g') ->
case fil of
ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst prd'))
|| ((fst $ g' ^. phylo_groupPeriod) < (fst prd'))
ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst prd'))
|| ((fst $ g' ^. phylo_groupPeriod) > (fst prd'))) pairs
else []
-- | Find pairs of valuable candidates to be matched -- | Find pairs of valuable candidates to be matched
makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> [(PhyloGroup,PhyloGroup)] makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> [(PhyloGroup,PhyloGroup)]
makePairs candidates periods = case null periods of makePairs' ego candidates periods pointers fil thr prox docs =
case null periods of
True -> [] True -> []
False -> toLazyPairs pointers fil thr prox lastPrd
-- | at least on of the pair candidates should be from the last added period -- | at least on of the pair candidates should be from the last added period
False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods) $ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd)
|| (inLastPeriod cdt' periods)) || ((g' ^. phylo_groupPeriod) == lastPrd))
$ listToKeys candidates $ listToKeys
$ filter (\g -> (g ^. phylo_groupPeriod == lastPrd)
|| ((toProximity docs prox ego ego g) >= thr)) candidates
where where
inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool lastPrd :: PhyloPeriodId
inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds) lastPrd = toLastPeriod fil periods
phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double-> PhyloGroup -> PhyloGroup filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
phyloGroupMatching candidates fil proxi docs thr ego = case pointers of filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
Nothing -> addPointers ego fil TemporalPointer []
Just pts -> addPointers ego fil TemporalPointer
phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
phyloGroupMatching candidates fil proxi docs thr ego =
case null nextPointers of
-- | let's find new pointers
True -> if null $ filterPointers proxi thr $ getPeriodPointers fil ego
then addPointers ego fil TemporalPointer []
-- | or keep the old ones
else addPointers ego fil TemporalPointer
$ filterPointers proxi thr $ getPeriodPointers fil ego
False -> addPointers ego fil TemporalPointer
$ head' "phyloGroupMatching" $ head' "phyloGroupMatching"
-- | Keep only the best set of pointers grouped by proximity -- | Keep only the best set of pointers grouped by proximity
$ groupBy (\pt pt' -> snd pt == snd pt') $ groupBy (\pt pt' -> snd pt == snd pt')
$ reverse $ sortOn snd pts $ reverse $ sortOn snd $ head' "pointers"
$ nextPointers
-- | Find the first time frame where at leats one pointer satisfies the proximity threshold -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
where where
pointers :: Maybe [Pointer] nextPointers :: [[Pointer]]
pointers = find (not . null) nextPointers = take 1
$ dropWhile (null)
-- | for each time frame, process the proximity on relevant pairs of targeted groups -- | for each time frame, process the proximity on relevant pairs of targeted groups
$ scanl (\acc groups -> $ scanl (\acc groups ->
let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups let periods = nub $ map _phylo_groupPeriod $ concat groups
pairs = makePairs (concat groups) periods docs' = (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods))
in acc ++ ( filter (\(_,proximity) -> filterProximity proxi thr proximity) pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs
in acc ++ ( filterPointers proxi thr
$ concat $ concat
$ map (\(c,c') -> $ map (\(c,c') ->
-- | process the proximity between the current group and a pair of candidates -- | process the proximity between the current group and a pair of candidates
let proximity = toProximity (filterDocs docs periods) proxi ego c c' let proximity = toProximity docs' proxi ego c c'
in if (c == c') in if (c == c')
then [(getGroupId c,proximity)] then [(getGroupId c,proximity)]
else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs) else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) []
) [] $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
-- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
$ inits candidates
-------------------------------------- filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double filterDocs d pds = restrictKeys d $ periodsToYears pds
filterDocs d pds = restrictKeys d $ periodsToYears pds
----------------------------- -----------------------------
...@@ -161,30 +195,40 @@ getNextPeriods fil max' pId pIds = ...@@ -161,30 +195,40 @@ getNextPeriods fil max' pId pIds =
ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
getCandidates :: Filiation -> PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [[PhyloGroup]] getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
getCandidates fil ego pIds targets = getCandidates fil ego targets =
case fil of case fil of
ToChilds -> targets' ToChilds -> targets'
ToParents -> reverse targets' ToParents -> reverse targets'
where where
targets' :: [[PhyloGroup]] targets' :: [[PhyloGroup]]
targets' = map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) groups') $ elems targets' =
$ filterWithKey (\k _ -> elem k pIds) map (\groups' ->
$ fromListWith (++) filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
$ sortOn (fst . fst) ) groups') targets
$ map (\g' -> (g' ^. phylo_groupPeriod,[g'])) targets
phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
processMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup] phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
processMatching max' periods proximity thr docs groups = -- $ matchByPeriods ToParents
map (\group -> -- $ groupByField _phylo_groupPeriod
let childs = getCandidates ToChilds group $ matchByPeriods
(getNextPeriods ToChilds max' (group ^. phylo_groupPeriod) periods) groups $ groupByField _phylo_groupPeriod branch
parents = getCandidates ToParents group where
(getNextPeriods ToParents max' (group ^. phylo_groupPeriod) periods) groups --------------------------------------
in phyloGroupMatching parents ToParents proximity docs thr matchByPeriods :: Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
$ phyloGroupMatching childs ToChilds proximity docs thr group matchByPeriods branch' = foldl' (\acc prd ->
) groups let periodsPar = getNextPeriods ToParents frame prd periods
periodsChi = getNextPeriods ToChilds frame prd periods
candidatesPar = map (\prd' -> findWithDefault [] prd' branch') periodsPar
candidatesChi = map (\prd' -> findWithDefault [] prd' branch') periodsChi
docsPar = filterDocs docs ([prd] ++ periodsPar)
docsChi = filterDocs docs ([prd] ++ periodsChi)
egos = map (\ego -> phyloGroupMatching (getCandidates ToParents ego candidatesPar) ToParents proximity docsPar thr
$ phyloGroupMatching (getCandidates ToChilds ego candidatesChi) ToChilds proximity docsChi thr ego)
$ findWithDefault [] prd branch'
egos' = egos `using` parList rdeepseq
in acc ++ egos' ) [] periods
----------------------- -----------------------
...@@ -192,35 +236,80 @@ processMatching max' periods proximity thr docs groups = ...@@ -192,35 +236,80 @@ processMatching max' periods proximity thr docs groups =
----------------------- -----------------------
termFreq :: Int -> [[PhyloGroup]] -> Double count :: Eq a => a -> [a] -> Int
termFreq term branches = (sum $ map (\g -> findWithDefault 0 (term,term) (g ^. phylo_groupCooc)) $ concat branches) count x = length . filter (== x)
/ (sum $ map (\g -> getTrace $ g ^. phylo_groupCooc) $ concat branches)
termFreq' :: Int -> [PhyloGroup] -> Double
termFreq' term groups =
let ngrams = concat $ map _phylo_groupNgrams groups
in log ((fromIntegral $ count term ngrams)
/ (fromIntegral $ length ngrams))
entropy :: [[PhyloGroup]] -> Double relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
entropy branches = relevantBranches term branches =
let terms = ngramsInBranches branches filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
in sum $ map (\term -> (1 / log (termFreq term branches))
/ (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches) branchCov' :: [PhyloGroup] -> [[PhyloGroup]] -> Double
* (sum $ map (\branch -> branchCov' branch branches =
let q = branchObs term (length $ concat branches) branch (fromIntegral $ length branch) / (fromIntegral $ length $ concat branches)
in q * logBase 2 q ) branches) ) terms
where
-- | Probability to observe a branch given a random term of the phylo
branchObs :: Int -> Int -> [PhyloGroup] -> Double
branchObs term total branch = (fromIntegral $ length $ filter (\g -> elem term $ g ^. phylo_groupNgrams) branch)
/ (fromIntegral total)
homogeneity :: [[PhyloGroup]] -> Double toRecall :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
homogeneity _ = undefined toRecall freq term border branches =
-- where -- | given a random term in a phylo
-- branchCov :: [PhyloGroup] -> Int -> Double freq
-- branchCov branch total = (fromIntegral $ length branch) / (fromIntegral total) -- | for each relevant branches
* (sum $ map (\branch ->
-- | given its local coverage
((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
-- | compute the local recall
* ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
/ ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) $ concat branches')
-- | with a ponderation from border branches
+ (fromIntegral border)) )) branches')
where
branches' :: [[PhyloGroup]]
branches' = relevantBranches term branches
toAccuracy :: Double -> Int -> [[PhyloGroup]] -> Double
toAccuracy freq term branches =
if (null branches)
then 0
else
-- | given a random term in a phylo
freq
-- | for each relevant branches
* (sum $ map (\branch ->
-- | given its local coverage
((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
-- | compute the local accuracy
* ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
/ (fromIntegral $ length branch))) branches')
where
branches' :: [[PhyloGroup]]
branches' = relevantBranches term branches
toPhyloQuality :: Double -> Map Int Double -> Int -> Double -> [[PhyloGroup]] -> Double
toPhyloQuality beta frequency border oldAcc branches =
-- trace (" rec : " <> show(recall)) $
-- trace (" acc : " <> show(accuracy)) $
if (null branches)
then 0
else ((1 + beta ** 2) * accuracy * recall)
/ (((beta ** 2) * accuracy + recall))
where
-- | for each term compute the global accuracy
accuracy :: Double
accuracy = oldAcc + (sum $ map (\term -> toAccuracy (frequency ! term) term branches) $ keys frequency)
-- | for each term compute the global recall
recall :: Double
recall = sum $ map (\term -> toRecall (frequency ! term) term border branches) $ keys frequency
toPhyloQuality :: [[PhyloGroup]] -> Double toBorderAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
toPhyloQuality branches = sqrt (homogeneity branches / entropy branches) toBorderAccuracy freq branches = sum $ map (\t -> toAccuracy (freq ! t) t branches) $ keys freq
----------------------------- -----------------------------
...@@ -231,54 +320,102 @@ toPhyloQuality branches = sqrt (homogeneity branches / entropy branches) ...@@ -231,54 +320,102 @@ toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]] groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
groupsToBranches groups = groupsToBranches groups =
-- | run the related component algorithm -- | run the related component algorithm
let graph = zip [1..] let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
$ relatedComponents $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
$ map (\group -> [getGroupId group] $ map (\group -> [getGroupId group]
++ (map fst $ group ^. phylo_groupPeriodParents) ++ (map fst $ group ^. phylo_groupPeriodParents)
++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
-- | first find the related components by inside each ego's period
graph' = map relatedComponents egos
-- | then run it for the all the periods
graph = zip [1..]
$ relatedComponents $ concat (graph' `using` parList rdeepseq)
-- | update each group's branch id -- | update each group's branch id
in map (\(bId,ids) -> in map (\(bId,ids) ->
map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId]))) let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
$ elems $ restrictKeys groups (Set.fromList ids) $ elems $ restrictKeys groups (Set.fromList ids)
) graph in groups' `using` parList rdeepseq ) graph
recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [PhyloGroup] -> [PhyloGroup] reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
recursiveMatching proximity thr max' periods docs quality groups = reduceFrequency frequency branches =
case quality < quality' of restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
-- | success : we localy improve the quality of the branch, let's go deeper
True -> concat
$ map (\branch -> alterBorder :: Int -> [[PhyloGroup]] -> [PhyloGroup] -> Int
recursiveMatching proximity (thr + (getThresholdStep proximity)) max' periods docs quality' branch alterBorder border branches branch = border + (length $ concat branches) - (length branch)
) branches
-- | failure : last step was the local maximum, let's validate it
False -> groups recursiveMatching :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> Int -> Double -> [PhyloGroup] -> [PhyloGroup]
recursiveMatching proximity beta minBranch frequency egoThr frame periods docs quality border oldAcc groups =
if ((egoThr >= 1) || (quality > quality') || ((length $ concat $ snd branches') == (length groups)))
then
trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality') <> "\n"
<> " |✓ " <> show(length $ fst branches') <> show(map length $ fst branches')
<> " |✗ " <> show(length $ snd branches') <> "[" <> show(length $ concat $ snd branches') <> "]") $
groups
else
let next = map (\b -> recursiveMatching proximity beta minBranch
(reduceFrequency frequency (fst branches'))
(egoThr + (getThresholdStep proximity))
frame periods docs quality'
(alterBorder border (fst branches') b)
(oldAcc + (toBorderAccuracy frequency (delete b ((fst branches') ++ (snd branches')))))
b ) (fst branches')
in trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality') <> "\n"
<> " |✓ " <> show(length $ fst branches') <> show(map length $ fst branches')
<> " |✗ " <> show(length $ snd branches') <> "[" <> show(length $ concat $ snd branches') <> "]") $
concat (next ++ (snd branches'))
where where
-- | 3) process a quality score on the local set of branches -- | 2) for each of the possible next branches process the phyloQuality score
quality' :: Double quality' :: Double
quality' = toPhyloQuality branches quality' = toPhyloQuality beta frequency border oldAcc ((fst branches') ++ (snd branches'))
-- | 2) group the new groups into branches -- | 1) for each local branch process a temporal matching then find the resulting branches
branches :: [[PhyloGroup]] branches' :: ([[PhyloGroup]],[[PhyloGroup]])
branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups' branches' =
-- | 1) process a temporal matching for each group let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
groups' :: [PhyloGroup] $ phyloBranchMatching frame periods proximity egoThr docs groups
groups' = processMatching max' periods proximity thr docs groups in partition (\b -> length b >= minBranch) (branches `using` parList rdeepseq)
temporalMatching :: Phylo -> Phylo temporalMatching :: Phylo -> Phylo
temporalMatching phylo = updatePhyloGroups 1 branches phylo temporalMatching phylo = updatePhyloGroups 1 branches' phylo
where where
-- | 2) run the recursive matching to find the best repartition among branches -- | 5) apply the recursive matching
branches :: Map PhyloGroupId PhyloGroup branches' :: Map PhyloGroupId PhyloGroup
branches = fromList branches' =
$ map (\g -> (getGroupId g, g)) let next = trace (" ✓ F(β) = " <> show(quality)
$ recursiveMatching (phyloProximity $ getConfig phylo) <> " |✓ " <> show(length $ fst branches) <> show(map length $ fst branches)
(getThresholdInit $ phyloProximity $ getConfig phylo) <> " |✗ " <> show(length $ snd branches) <> "[" <> show(length $ concat $ snd branches) <> "]")
$ map (\branch -> recursiveMatching (phyloProximity $ getConfig phylo)
(_qua_relevance $ phyloQuality $ getConfig phylo)
(_qua_minBranch $ phyloQuality $ getConfig phylo)
(reduceFrequency frequency (fst branches))
( (getThresholdInit $ phyloProximity $ getConfig phylo)
+ (getThresholdStep $ phyloProximity $ getConfig phylo))
(getTimeFrame $ timeUnit $ getConfig phylo) (getTimeFrame $ timeUnit $ getConfig phylo)
(getPeriodIds phylo) (getPeriodIds phylo)
(phylo ^. phylo_timeDocs) (toPhyloQuality [groups']) groups' (phylo ^. phylo_timeDocs) quality (alterBorder 0 (fst branches) branch)
(toBorderAccuracy frequency (delete branch ((fst branches) ++ (snd branches))))
branch
) (fst branches)
in fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd (concat (next ++ (snd branches)))
-- | 4) process the quality score
quality :: Double
quality = toPhyloQuality (_qua_relevance $ phyloQuality $ getConfig phylo) frequency 0 0 ((fst branches) ++ (snd branches))
-- | 3) process the constants of the quality score
frequency :: Map Int Double
frequency =
let terms = ngramsInBranches ((fst branches) ++ (snd branches))
freqs = map (\t -> termFreq' t $ concat ((fst branches) ++ (snd branches))) terms
in fromList $ map (\(t,freq) -> (t,freq/(sum freqs))) $ zip terms freqs
-- | 2) group into branches
branches :: ([[PhyloGroup]],[[PhyloGroup]])
branches = partition (\b -> length b >= (_qua_minBranch $ phyloQuality $ getConfig phylo))
$ groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
-- | 1) for each group process an initial temporal Matching -- | 1) for each group process an initial temporal Matching
groups' :: [PhyloGroup] groups' :: [PhyloGroup]
groups' = processMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo) groups' = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
(phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo) (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
(phylo ^. phylo_timeDocs) (getGroupsFromLevel 1 phylo) (phylo ^. phylo_timeDocs)
(traceTemporalMatching $ getGroupsFromLevel 1 phylo)
...@@ -815,7 +815,7 @@ getProximity cluster = case cluster of ...@@ -815,7 +815,7 @@ getProximity cluster = case cluster of
-- | To initialize all the Cluster / Proximity with their default parameters -- | To initialize all the Cluster / Proximity with their default parameters
initFis :: Maybe Bool -> Maybe Support -> Maybe Int -> FisParams initFis :: Maybe Bool -> Maybe Support -> Maybe Int -> FisParams
initFis (def True -> kmf) (def 2 -> min') (def 4 -> thr) = FisParams kmf min' thr initFis (def True -> kmf) (def 0 -> min') (def 0 -> thr) = FisParams kmf min' thr
initHamming :: Maybe Double -> HammingParams initHamming :: Maybe Double -> HammingParams
initHamming (def 0.01 -> sens) = HammingParams sens initHamming (def 0.01 -> sens) = HammingParams sens
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment