Commit eea8b990 authored by qlobbe's avatar qlobbe

some refactoring & adding generality, specifivity, etc for the ngrams

parent 8bdafc9f
Pipeline #500 canceled with stage
......@@ -17,7 +17,7 @@ Phylo binaries
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Strict #-}
module Main where
......@@ -25,7 +25,7 @@ module Main where
import System.Directory (doesFileExist)
import Data.Aeson
import Data.Text (Text, unwords)
import Data.Text (Text, unwords, unlines)
import Data.List ((++))
import GHC.Generics
import GHC.IO (FilePath)
......@@ -198,17 +198,13 @@ main = do
corpus <- parse (corpusType conf) (limit conf) (corpusPath conf) termList
putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
let roots = DL.nub $ DL.concat $ map text corpus
putStrLn $ ("\n" <> show (length roots) <> " parsed foundation roots")
putStrLn $ ("\n" <> show (length corpus) <> " parsed docs")
fis <- parseFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf)
putStrLn $ ("\n" <> show (length fis) <> " parsed fis")
let mFis = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
let fis' = DM.fromListWith (++) $ DL.sortOn (fst . fst) $ map (\f -> (getFisPeriod f,[f])) fis
let query = PhyloQueryBuild (phyloName conf) "" (timeGrain conf) (timeStep conf)
(Fis $ FisParams True (fisSupport conf) (fisClique conf)) [] [] (WeightedLogJaccard $ WLJParams (timeTh conf) (timeSens conf)) (timeFrame conf) (timeFrameTh conf)
......@@ -217,7 +213,7 @@ main = do
let queryView = PhyloQueryView (viewLevel conf) Merge False 1 [BranchAge] [SizeBranch $ SBParams (minSizeBranch conf)] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
let phylo = toPhylo query corpus roots termList mFis
let phylo = toPhylo query corpus termList fis'
writeFis (fisPath conf) (phyloName conf) (timeGrain conf) (timeStep conf) (fisSupport conf) (fisClique conf) (getPhyloFis phylo)
......
......@@ -153,6 +153,7 @@ data PhyloGroup =
PhyloGroup { _phylo_groupId :: PhyloGroupId
, _phylo_groupLabel :: Text
, _phylo_groupNgrams :: [Int]
, _phylo_groupNgramsMeta :: Map Text [Double]
, _phylo_groupMeta :: Map Text Double
, _phylo_groupBranchId :: Maybe PhyloBranchId
, _phylo_groupCooc :: Map (Int,Int) Double
......
......@@ -32,6 +32,7 @@ import Gargantext.API.Types
import Gargantext.Database.Types.Node (PhyloId, ListId, CorpusId)
import Gargantext.Prelude
import Gargantext.Viz.Phylo
import Gargantext.Viz.Phylo.Aggregates
import Gargantext.Viz.Phylo.Example
import Gargantext.Viz.Phylo.Tools
import Gargantext.Viz.Phylo.View.ViewMaker
......@@ -105,7 +106,7 @@ postPhylo _n _lId q = do
vrs = Just ("1" :: Text)
sft = Just (Software "Gargantext" "4")
prm = initPhyloParam vrs sft (Just q)
pure (toPhyloBase q prm corpus actants termList empty)
pure (toPhyloBase q prm (parseDocs (initFoundationsRoots actants) corpus) termList empty)
------------------------------------------------------------------------
......
{-|
Module : Gargantext.Viz.Phylo.Tools
Description : Phylomemy Tools to build/manage it
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Viz.Phylo.Aggregates.Cluster
where
import Control.Parallel.Strategies
import Data.List (null,concat,sort,intersect,(++))
import Data.Map (Map)
import Data.Tuple (fst)
import Gargantext.Prelude
import Gargantext.Viz.Phylo
import Gargantext.Viz.Phylo.Tools
import Gargantext.Viz.Phylo.Metrics.Proximity
import Gargantext.Viz.Phylo.Metrics.Clustering
import Gargantext.Viz.Phylo.LinkMaker
import qualified Data.Map as Map
import qualified Data.Vector.Storable as VS
import Debug.Trace (trace)
import Numeric.Statistics (percentile)
-- | Optimisation to filter only relevant candidates
getCandidates :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
getCandidates gs = filter (\(g,g') -> (not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
$ filter (\(g,g') -> g /= g')
$ listToDirectedCombi gs
-- | To transform a Graph into Clusters
graphToClusters :: Cluster -> GroupGraph -> [PhyloCluster]
graphToClusters clust (nodes,edges) = case clust of
Louvain (LouvainParams _) -> undefined
RelatedComponents (RCParams _) -> relatedComp $ ((map (\((g,g'),_) -> [g,g']) edges) ++ (map (\g -> [g]) nodes))
_ -> panic "[ERR][Viz.Phylo.Aggregates.Cluster.graphToClusters] not implemented"
-- | To transform a list of PhyloGroups into a Graph of Proximity
groupsToGraph :: Double -> Proximity -> [PhyloGroup] -> ([GroupNode],[GroupEdge])
groupsToGraph nbDocs prox gs = case prox of
WeightedLogJaccard (WLJParams _ sens) -> (gs, let candidates = map (\(x,y) -> ((x,y), weightedLogJaccard sens nbDocs (getGroupCooc x) (getGroupCooc y) (getGroupNgrams x) (getGroupNgrams y)))
$ getCandidates gs
candidates' = candidates `using` parList rdeepseq
in candidates' )
Hamming (HammingParams _) -> (gs, map (\(x,y) -> ((x,y), hamming (getGroupCooc x) (getGroupCooc y))) $ getCandidates gs)
_ -> undefined
-- | To filter a Graph of Proximity using a given threshold
filterGraph :: Proximity -> ([GroupNode],[GroupEdge]) -> ([GroupNode],[GroupEdge])
filterGraph prox (ns,es) = case prox of
WeightedLogJaccard (WLJParams thr _) -> (ns, filter (\(_,v) -> v >= thr) es)
Hamming (HammingParams thr) -> (ns, filter (\(_,v) -> v <= thr) es)
_ -> undefined
-- | To clusterise a Phylo
phyloToClusters :: Level -> Cluster -> Phylo -> Map (Date,Date) [PhyloCluster]
phyloToClusters lvl clus p = Map.fromList
$ zip periods
$ map (\g -> if null (fst g)
then []
else graphToClusters clus g) graphs'
where
--------------------------------------
graphs' :: [([GroupNode],[GroupEdge])]
graphs' = traceGraphFiltered lvl
$ map (\g -> filterGraph prox g) graphs
--------------------------------------
graphs :: [([GroupNode],[GroupEdge])]
graphs = traceGraph lvl (getThreshold prox)
$ let gs = map (\prd -> groupsToGraph (periodsToNbDocs [prd] p) prox (getGroupsWithFilters lvl prd p)) periods
gs' = gs `using` parList rdeepseq
in gs'
--------------------------------------
prox :: Proximity
prox = getProximity clus
--------------------------------------
periods :: [PhyloPeriodId]
periods = getPhyloPeriods p
--------------------------------------
----------------
-- | Tracer | --
----------------
traceGraph :: Level -> Double -> [([GroupNode],[GroupEdge])] -> [([GroupNode],[GroupEdge])]
traceGraph lvl thr g = trace ( "----\nUnfiltered clustering in Phylo" <> show (lvl) <> " :\n"
<> "count : " <> show (length lst) <> " potential edges (" <> show (length $ filter (>= thr) lst) <> " >= " <> show (thr) <> ")\n"
<> "similarity : " <> show (percentile 25 (VS.fromList lst)) <> " (25%) "
<> show (percentile 50 (VS.fromList lst)) <> " (50%) "
<> show (percentile 75 (VS.fromList lst)) <> " (75%) "
<> show (percentile 90 (VS.fromList lst)) <> " (90%)\n") g
where
lst = sort $ map snd $ concat $ map snd g
traceGraphFiltered :: Level -> [([GroupNode],[GroupEdge])] -> [([GroupNode],[GroupEdge])]
traceGraphFiltered lvl g = trace ( "----\nClustering in Phylo" <> show (lvl) <> " :\n"
<> "count : " <> show (length lst) <> " edges\n"
<> "similarity : " <> show (percentile 25 (VS.fromList lst)) <> " (25%) "
<> show (percentile 50 (VS.fromList lst)) <> " (50%) "
<> show (percentile 75 (VS.fromList lst)) <> " (75%) "
<> show (percentile 90 (VS.fromList lst)) <> " (90%)\n") g
where
lst = sort $ map snd $ concat $ map snd g
{-|
Module : Gargantext.Viz.Phylo.Tools
Description : Phylomemy Tools to build/manage it
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Viz.Phylo.Aggregates.Cooc
where
import Data.List (union,concat,nub,sort, sortOn)
import Data.Map (Map,elems,adjust,filterWithKey,fromListWith,fromList,restrictKeys)
import Data.Set (Set)
import Data.Vector (Vector)
import Gargantext.Prelude
import Gargantext.Viz.Phylo
import Gargantext.Viz.Phylo.Tools
import qualified Data.Map as Map
import qualified Data.Set as Set
-- import Debug.Trace (trace)
-- | To transform the Fis into a full coocurency Matrix in a Phylo
fisToCooc :: Map (Date, Date) [PhyloFis] -> Phylo -> Map (Int, Int) Double
fisToCooc m p = map (/docs)
$ foldl (\mem x -> adjust (+1) x mem) cooc
$ concat
$ map (\x -> listToDirectedCombiWith (\y -> getIdxInRoots y p) $ (Set.toList . getClique) x)
$ (concat . elems) m
where
--------------------------------------
fisNgrams :: [Ngrams]
fisNgrams = foldl (\mem x -> union mem $ (Set.toList . getClique) x) [] $ (concat . elems) m
--------------------------------------
docs :: Double
docs = fromIntegral $ foldl (\mem x -> mem + (getSupport x)) 0 $ (concat . elems) m
--------------------------------------
cooc :: Map (Int, Int) (Double)
cooc = Map.fromList $ map (\x -> (x,0)) (listToDirectedCombiWith (\y -> getIdxInRoots y p) fisNgrams)
--------------------------------------
-- | To transform a tuple of group's information into a coocurency Matrix
toCooc :: [([Int],Double)] -> Map (Int, Int) Double
toCooc l = map (/docs)
$ foldl (\mem x -> adjust (+1) x mem) cooc
$ concat
$ map (\x -> listToFullCombi $ fst x) l
where
--------------------------------------
idx :: [Int]
idx = nub $ concat $ map fst l
--------------------------------------
docs :: Double
docs = sum $ map snd l
--------------------------------------
cooc :: Map (Int, Int) (Double)
cooc = Map.fromList $ map (\x -> (x,0)) $ listToFullCombi idx
--------------------------------------
-- | To reduce a coocurency Matrix to some keys
getSubCooc :: [Int] -> Map (Int, Int) Double -> Map (Int, Int) Double
getSubCooc idx cooc = filterWithKey (\k _ -> (elem (fst k) idx)
&& (elem (snd k) idx)) cooc
-- | To get a coocurency Matrix related to a given list of Periods
getCooc :: [PhyloPeriodId] -> Phylo -> Map (Int, Int) Double
getCooc prds p = toCooc $ map (\g -> (getGroupNgrams g,getGroupMeta "support" g)) gs
where
--------------------------------------
-- | Here we need to go back to the level 1 (aka : the Fis level)
gs :: [PhyloGroup]
gs = filter (\g -> elem (getGroupPeriod g) prds ) $ getGroupsWithLevel 1 p
--------------------------------------
-- | To transform a list of index into a cooc matrix
listToCooc :: [Int] -> Map (Int,Int) Double
listToCooc lst = fromList $ map (\combi -> (combi,1)) $ listToFullCombi lst
-- | To transform a list of ngrams into a list of indexes
ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
ngramsToIdx ns v = sort $ map (\n -> getIdxInVector n v) ns
-- | To build the cooc matrix by years out of the corpus
docsToCooc :: [Document] -> Vector Ngrams -> Map Date (Map (Int,Int) Double)
docsToCooc docs fdt = fromListWith sumCooc
$ map (\(d,l) -> (d, listToCooc l))
$ map (\doc -> (date doc, ngramsToIdx (text doc) fdt)) docs
-- | To sum all the docs produced during a list of years
sumDocsByYears :: Set Date -> Map Date Double -> Double
sumDocsByYears years m = sum $ elems $ restrictKeys m years
-- | To get the cooc matrix of a group
groupToCooc :: PhyloGroup -> Phylo -> Map (Int,Int) Double
groupToCooc g p = getMiniCooc (listToFullCombi $ getGroupNgrams g) (periodsToYears [getGroupPeriod g]) (getPhyloCooc p)
-- | To get the union of the cooc matrix of two groups
unionOfCooc :: PhyloGroup -> PhyloGroup -> Phylo -> Map (Int,Int) Double
unionOfCooc g g' p = sumCooc (groupToCooc g p) (groupToCooc g' p)
-- | To get the nth most occurent elems in a coocurency matrix
getNthMostOcc :: Int -> Map (Int,Int) Double -> [Int]
getNthMostOcc nth cooc = (nub . concat)
$ map (\((idx,idx'),_) -> [idx,idx'])
$ take nth
$ reverse
$ sortOn snd $ Map.toList cooc
-- phyloCooc :: Map (Int, Int) Double
-- phyloCooc = fisToCooc phyloFis phylo1_0_1
{-|
Module : Gargantext.Viz.Phylo.Tools
Description : Phylomemy Tools to build/manage it
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Viz.Phylo.Aggregates.Document
where
import Data.Map (Map,fromListWith)
import Data.Text (Text)
import Data.Tuple (fst)
import Data.Vector (Vector)
import Gargantext.Prelude
import Gargantext.Text.Terms.Mono (monoTexts)
import Gargantext.Viz.Phylo
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Vector as Vector
import Debug.Trace (trace)
-- | To init a list of Periods framed by a starting Date and an ending Date
initPeriods :: (Eq date, Enum date) => Grain -> Step -> (date, date) -> [(date, date)]
initPeriods g s (start,end) = map (\l -> (head' "Doc" l, last' "Doc" l))
$ chunkAlong g s [start .. end]
-- | To group a list of Documents by fixed periods
groupDocsByPeriod :: (Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.Example.docsToPeriods] Empty [Documents] can not have any periods"
groupDocsByPeriod f pds es = trace ("----\nGroup docs by periods\n") $ Map.fromList $ zip pds $ map (inPeriode f es) pds
where
--------------------------------------
inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
inPeriode f' h (start,end) =
fst $ List.partition (\d -> f' d >= start && f' d <= end) h
--------------------------------------
-- | To parse a list of Documents by filtering on a Vector of Ngrams
parseDocs :: Vector Ngrams -> [(Date,Text)] -> [Document]
parseDocs roots c = map (\(d,t)
-> Document d ( filter (\x -> Vector.elem x roots)
$ monoTexts t)) c
-- | To count the number of documents by year
countDocs :: [(Date,a)] -> Map Date Double
countDocs corpus = fromListWith (+) $ map (\(d,_) -> (d,1)) corpus
{-|
Module : Gargantext.Viz.Phylo.Tools
Description : Phylomemy Tools to build/manage it
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Viz.Phylo.Aggregates.Fis
where
import Control.Lens hiding (makeLenses, both, Level)
import Data.List (null,concat,sort,(++))
import Data.Map (Map,elems,mapWithKey,unionWith,fromList,keys)
import Data.Tuple (fst, snd)
import Data.Set (size)
import Gargantext.Prelude
import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
import Gargantext.Viz.Phylo
import Gargantext.Viz.Phylo.Tools
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Vector.Storable as Vector
import Numeric.Statistics (percentile)
import Debug.Trace (trace)
-- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
filterFis keep thr f m = case keep of
False -> Map.map (\l -> f thr l) m
True -> Map.map (\l -> keepFilled (f) thr l) m
-- | To filter Fis with small Support
filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
filterFisBySupport thr l = filter (\fis -> getSupport fis >= thr) l
-- | To filter Fis with small Clique size
filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
filterFisByClique thr l = filter (\fis -> (size $ getClique fis) >= thr) l
-- | To filter nested Fis
filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
filterFisByNested = map (\l -> let cliqueMax = filterNestedSets (head' "Fis" $ map getClique l) (map getClique l) []
in filter (\fis -> elem (getClique fis) cliqueMax) l)
docsToFis' :: Map (Date,Date) [Document] -> Phylo -> Phylo
docsToFis' m p = if (null $ getPhyloFis p)
then trace("----\nRebuild the Fis from scratch\n")
$ p & phylo_fis .~ mapWithKey (\k docs -> let fis = Map.toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
in map (\f -> PhyloFis (fst f) (snd f) k) fis) m
else trace("----\nUse Fis from an existing file\n")
$ p & phylo_fis %~ (unionWith (++) (fromList $ map (\k -> (k,[])) $ keys m))
toPhyloFis' :: Map (Date, Date) [PhyloFis] -> Bool -> Support -> Int -> Map (Date, Date) [PhyloFis]
toPhyloFis' fis k s t = traceFis "----\nFiltered Fis by clique size :\n"
$ filterFis k t (filterFisByClique)
$ traceFis "----\nFiltered Fis by nested :\n"
$ filterFisByNested
$ traceFis "----\nFiltered Fis by support :\n"
$ filterFis k s (filterFisBySupport)
$ traceFis "----\nUnfiltered Fis :\n" fis
-----------------
-- | Tracers | --
-----------------
traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
traceFis lbl m = trace (lbl <> "count : " <> show (sum $ map length $ elems m) <> " Fis\n"
<> "support : " <> show (percentile 25 (Vector.fromList supps)) <> " (25%) "
<> show (percentile 50 (Vector.fromList supps)) <> " (50%) "
<> show (percentile 75 (Vector.fromList supps)) <> " (75%) "
<> show (percentile 90 (Vector.fromList supps)) <> " (90%) "
<> show (percentile 100 (Vector.fromList supps)) <> " (100%)\n"
<> " " <> show (countSup 1 supps) <> " (>1) "
<> show (countSup 2 supps) <> " (>2) "
<> show (countSup 3 supps) <> " (>3) "
<> show (countSup 4 supps) <> " (>4) "
<> show (countSup 5 supps) <> " (>5) "
<> show (countSup 6 supps) <> " (>6)\n"
<> "clique size : " <> show (percentile 25 (Vector.fromList ngrms)) <> " (25%) "
<> show (percentile 50 (Vector.fromList ngrms)) <> " (50%) "
<> show (percentile 75 (Vector.fromList ngrms)) <> " (75%) "
<> show (percentile 90 (Vector.fromList ngrms)) <> " (90%) "
<> show (percentile 100 (Vector.fromList ngrms)) <> " (100%)\n"
<> " " <> show (countSup 1 ngrms) <> " (>1) "
<> show (countSup 2 ngrms) <> " (>2) "
<> show (countSup 3 ngrms) <> " (>3) "
<> show (countSup 4 ngrms) <> " (>4) "
<> show (countSup 5 ngrms) <> " (>5) "
<> show (countSup 6 ngrms) <> " (>6)\n"
) m
where
--------------------------------------
countSup :: Double -> [Double] -> Int
countSup s l = length $ filter (>s) l
--------------------------------------
supps :: [Double]
supps = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems m
--------------------------------------
ngrms :: [Double]
ngrms = sort $ map (\f -> fromIntegral $ Set.size $ _phyloFis_clique f) $ concat $ elems m
--------------------------------------
\ No newline at end of file
......@@ -19,13 +19,14 @@ module Gargantext.Viz.Phylo.BranchMaker
import Control.Parallel.Strategies
import Control.Lens hiding (both, Level)
import Data.List (concat,nub,(++),tail,sortOn,take,reverse,sort,null,intersect,union,delete)
import Data.List (concat,nub,(++),sortOn,reverse,sort,null,intersect,union,delete)
import Data.Map (Map,(!), fromListWith, elems)
import Data.Tuple (fst, snd)
import Gargantext.Prelude
import Gargantext.Viz.Phylo
import Gargantext.Viz.Phylo.Metrics.Clustering
import Gargantext.Viz.Phylo.Aggregates.Cooc
import Gargantext.Viz.Phylo.Cluster
import Gargantext.Viz.Phylo.Aggregates
import Gargantext.Viz.Phylo.Metrics
import Gargantext.Viz.Phylo.Tools
import Gargantext.Viz.Phylo.LinkMaker
......
......@@ -29,19 +29,16 @@ TODO:
module Gargantext.Viz.Phylo.Example where
import Data.GraphViz.Types.Generalised (DotGraph)
import Data.Text (Text)
import Data.List ((++), last)
import Data.Text (Text, toLower)
import Data.List ((++))
import Data.Map (Map,empty)
import Data.Tuple (fst)
import Data.Tuple.Extra
import Data.Vector (Vector)
import Gargantext.Prelude
import Gargantext.Text.Context (TermList)
import Gargantext.Viz.Phylo
import Gargantext.Viz.Phylo.Aggregates.Cluster
import Gargantext.Viz.Phylo.Aggregates.Document
import Gargantext.Viz.Phylo.Aggregates.Cooc
import Gargantext.Viz.Phylo.Aggregates.Fis
import Gargantext.Viz.Phylo.Cluster
import Gargantext.Viz.Phylo.Aggregates
import Gargantext.Viz.Phylo.BranchMaker
import Gargantext.Viz.Phylo.LevelMaker
import Gargantext.Viz.Phylo.LinkMaker
......@@ -78,7 +75,7 @@ queryViewEx = "level=3"
phyloQueryView :: PhyloQueryView
phyloQueryView = PhyloQueryView 2 Merge False 1 [BranchAge] [] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
phyloQueryView = PhyloQueryView 2 Merge False 2 [BranchAge] [] [BranchPeakFreq,GroupLabelCooc] (Just (ByBranchAge,Asc)) Json Flat True
--------------------------------------------------
......@@ -87,7 +84,7 @@ phyloQueryView = PhyloQueryView 2 Merge False 1 [BranchAge] [] [BranchPeakFreq,G
phyloFromQuery :: Phylo
phyloFromQuery = toPhylo (queryParser queryEx) corpus actants termList empty
phyloFromQuery = toPhylo phyloQueryBuild docs termList empty
-- | To do : create a request handler and a query parser
queryParser :: [Char] -> PhyloQueryBuild
......@@ -105,7 +102,7 @@ queryEx = "title=Cesar et Cleôpatre"
phyloQueryBuild :: PhyloQueryBuild
phyloQueryBuild = PhyloQueryBuild "Cesar et Cleôpatre" "An example of Phylomemy (french without accent)"
5 3 defaultFis [] [] (WeightedLogJaccard $ WLJParams 0.6 20) 5 0.8 0.5 4 2 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.4 0)
3 1 defaultFis [] [] (WeightedLogJaccard $ WLJParams 0.6 20) 5 0.8 0.5 4 2 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.4 0)
......@@ -155,7 +152,7 @@ phylo2 = addPhyloLevel 2 phyloCluster phyloBranch1
phyloCluster :: Map (Date,Date) [PhyloCluster]
phyloCluster = phyloToClusters 3 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.05 10) phyloBranch1
phyloCluster = phyloToClusters 2 (RelatedComponents $ RCParams $ WeightedLogJaccard $ WLJParams 0.05 10) phyloBranch1
----------------------------------
......@@ -186,30 +183,27 @@ phylo1_p = interTempoMatching Ascendant 1 defaultWeightedLogJaccard phylo1_0_1
phylo1_0_1 :: Phylo
phylo1_0_1 = setLevelLinks (0,1) phylo1_1_0
phylo1_0_1 = setLevelLinks (0,1) phylo1
phylo1_1_0 :: Phylo
phylo1_1_0 = setLevelLinks (1,0) phylo1
-- phylo1_1_0 :: Phylo
-- phylo1_1_0 = setLevelLinks (1,0) phylo1
phylo1 :: Phylo
phylo1 = addPhyloLevel (1) phyloFis phylo
phylo1 = addPhyloLevel (1) phyloFis phylo'
-------------------------------------------------------------------
-- | STEP 5 | -- Create lists of Frequent Items Set and filter them
-------------------------------------------------------------------
phyloFis :: Map (Date, Date) [PhyloFis]
phyloFis = filterFis True 1 (filterFisByClique)
$ filterFisByNested
$ filterFis True 1 (filterFisBySupport) (getPhyloFis phylo')
phyloFis = refineFis (getPhyloFis phylo') True 1 1
phylo' :: Phylo
phylo' = docsToFis' phyloDocs phylo
phylo' = docsToFis phyloDocs phylo
----------------------------------------
-- | STEP 2 | -- Init a Phylo of level 0
......@@ -221,7 +215,7 @@ phylo = addPhyloLevel 0 phyloDocs phyloBase
phyloDocs :: Map (Date, Date) [Document]
phyloDocs = corpusToDocs corpus phyloBase
phyloDocs = groupDocsByPeriod date (getPhyloPeriods phyloBase) docs
------------------------------------------------------------------------
......@@ -229,32 +223,25 @@ phyloDocs = corpusToDocs corpus phyloBase
------------------------------------------------------------------------
phyloBase :: Phylo
phyloBase = initPhyloBase periods (PhyloFoundations foundationsRoots termList) nbDocs cooc empty defaultPhyloParam
cooc :: Map Date (Map (Int,Int) Double)
cooc = docsToCooc (parseDocs foundationsRoots corpus) foundationsRoots
phyloBase :: Phylo
phyloBase = toPhyloBase phyloQueryBuild phyloParam docs termList empty
nbDocs :: Map Date Double
nbDocs = countDocs corpus
phyloParam :: PhyloParam
phyloParam = (initPhyloParam (Just defaultPhyloVersion) (Just defaultSoftware) (Just phyloQueryBuild))
periods :: [(Date,Date)]
periods = initPeriods 5 3
$ both fst (head' "Example" corpus,last corpus)
docs :: [Document]
docs = parseDocs foundationsRoots corpus
foundationsRoots :: Vector Ngrams
foundationsRoots = initFoundationsRoots actants
foundationsRoots = initFoundationsRoots (termListToNgrams termList)
--------------------------------------------
-- | STEP 0 | -- Let's start with an example
--------------------------------------------
-- this is a comment
termList :: TermList
termList = []
termList = map (\a -> ([toLower a],[])) actants
actants :: [Ngrams]
actants = [ "Cleopatre" , "Ptolemee", "Ptolemee-XIII", "Ptolemee-XIV"
......
This diff is collapsed.
......@@ -19,13 +19,13 @@ module Gargantext.Viz.Phylo.LinkMaker
import Control.Parallel.Strategies
import Control.Lens hiding (both, Level)
import Data.List ((++), sortOn, null, tail, splitAt, elem, concat, delete, intersect, nub, groupBy, union, inits, scanl, find)
import Data.List ((++), sortOn, null, tail, splitAt, elem, concat, delete, intersect, groupBy, union, inits, scanl, find)
import Data.Tuple.Extra
import Data.Map (Map,(!),fromListWith,elems,restrictKeys,unionWith,member)
import Gargantext.Prelude
import Gargantext.Viz.Phylo
import Gargantext.Viz.Phylo.Tools
import Gargantext.Viz.Phylo.Metrics.Proximity
import Gargantext.Viz.Phylo.Metrics
import qualified Data.List as List
import qualified Data.Maybe as Maybe
import qualified Data.Map as Map
......
{-|
Module : Gargantext.Viz.Phylo.Tools
Description : Phylomemy Tools to build/manage it
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Viz.Phylo.Metrics.Clustering
where
import Data.Graph.Clustering.Louvain.CplusPlus
import Data.List (concat,null,nub,(++),elemIndex,groupBy,(!!), (\\), union, intersect)
import Data.Map (fromList,mapKeys)
import Gargantext.Prelude
import Gargantext.Viz.Phylo
-- import Gargantext.Viz.Phylo.Tools
-- import Debug.Trace (trace)
relatedComp :: Eq a => [[a]] -> [[a]]
relatedComp graphs = foldl' (\mem groups ->
if (null mem)
then mem ++ [groups]
else
let related = filter (\groups' -> (not . null) $ intersect groups groups') mem
in if (null related)
then mem ++ [groups]
else (mem \\ related) ++ [union groups (nub $ concat related)] ) [] graphs
louvain :: ([GroupNode],[GroupEdge]) -> IO [[PhyloGroup]]
louvain (nodes,edges) = map (\community -> map (\node -> nodes !! (l_node_id node)) community)
<$> groupBy (\a b -> (l_community_id a) == (l_community_id b))
<$> (cLouvain $ mapKeys (\(x,y) -> (idx x, idx y)) $ fromList edges)
where
--------------------------------------
idx :: PhyloGroup -> Int
idx e = case elemIndex e nodes of
Nothing -> panic "[ERR][Gargantext.Viz.Phylo.Metrics.Clustering] a node is missing"
Just i -> i
--------------------------------------
{-|
Module : Gargantext.Viz.Phylo.Tools
Description : Phylomemy Tools to build/manage it
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Viz.Phylo.Metrics.Proximity
where
import Data.List (null,union,intersect)
import Data.Map (Map,elems,unionWith,intersectionWith,intersection,size,filterWithKey)
import Gargantext.Prelude
-- import Debug.Trace (trace)
sumInvLog :: Double -> [Double] -> Double
sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
sumLog :: Double -> [Double] -> Double
sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
-- -- | To process WeighedLogJaccard distance between to coocurency matrix
-- weightedLogJaccard :: Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> Double -> Double
-- weightedLogJaccard sens cooc cooc' nbDocs
-- | null union' = 0
-- | union' == inter' = 1
-- | sens == 0 = (fromIntegral $ length $ keysInter) / (fromIntegral $ length $ keysUnion)
-- | sens > 0 = (sumInvLog sens $ elems wInter) / (sumInvLog sens $ elems wUnion)
-- | otherwise = (sumLog sens $ elems wInter) / (sumLog sens $ elems wUnion)
-- where
-- --------------------------------------
-- keysInter :: [Int]
-- keysInter = nub $ concat $ map (\(x,x') -> [x,x']) $ keys inter'
-- --------------------------------------
-- keysUnion :: [Int]
-- keysUnion = nub $ concat $ map (\(x,x') -> [x,x']) $ keys union'
-- --------------------------------------
-- wInter :: Map (Int,Int) Double
-- wInter = map (/nbDocs) inter'
-- --------------------------------------
-- wUnion :: Map (Int,Int) Double
-- wUnion = map (/nbDocs) union'
-- --------------------------------------
-- inter' :: Map (Int, Int) Double
-- inter' = intersectionWith (+) cooc cooc'
-- --------------------------------------
-- union' :: Map (Int, Int) Double
-- union' = unionWith (+) cooc cooc'
-- --------------------------------------
-- | To compute a jaccard similarity between two lists
jaccard :: [Int] -> [Int] -> Double
jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
-- | To get the diagonal of a matrix
toDiago :: Map (Int, Int) Double -> [Double]
toDiago cooc = elems $ filterWithKey (\(x,x') _ -> x == x') cooc
-- | To process WeighedLogJaccard distance between to coocurency matrix
weightedLogJaccard :: Double -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
| null gInter = 0
| gInter == gUnion = 1
| sens == 0 = jaccard gInter gUnion
| sens > 0 = (sumInvLog sens wInter) / (sumInvLog sens wUnion)
| otherwise = (sumLog sens wInter) / (sumLog sens wUnion)
where
--------------------------------------
gInter :: [Int]
gInter = intersect ngrams ngrams'
--------------------------------------
gUnion :: [Int]
gUnion = union ngrams ngrams'
--------------------------------------
wInter :: [Double]
wInter = toDiago $ map (/nbDocs) $ intersectionWith (+) cooc cooc'
--------------------------------------
wUnion :: [Double]
wUnion = toDiago $ map (/nbDocs) $ unionWith (+) cooc cooc'
--------------------------------------
-- | To process the Hamming distance between two PhyloGroup fields
hamming :: Map (Int, Int) Double -> Map (Int, Int) Double -> Double
hamming f1 f2 = fromIntegral $ max ((size inter) - (size f1)) ((size inter) - (size f2))
where
--------------------------------------
inter :: Map (Int, Int) Double
inter = intersection f1 f2
--------------------------------------
......@@ -139,7 +139,7 @@ listToUnDirectedCombiWith f l = [ (f x, f y) | (x:rest) <- tails l, y <- rest ]
-- | To transform a list of Ngrams Indexes into a Label
ngramsToLabel :: Vector Ngrams -> [Int] -> Text
ngramsToLabel ngrams l = unwords $ ngramsToText ngrams l
ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
-- | To transform a list of Ngrams Indexes into a list of Text
......@@ -147,6 +147,11 @@ ngramsToText :: Vector Ngrams -> [Int] -> [Text]
ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
-- | To transform a list of ngrams into a list of indexes
ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
ngramsToIdx ns v = sort $ map (\n -> getIdxInVector n v) ns
-- | To unify the keys (x,y) that Map 1 share with Map 2 such as: (x,y) <=> (y,x)
unifySharedKeys :: Eq a => Ord a => Map (a,a) b -> Map (a,a) b -> Map (a,a) b
unifySharedKeys m1 m2 = mapKeys (\(x,y) -> if member (y,x) m2
......@@ -439,6 +444,7 @@ initGroup ngrams lbl idx lvl from' to' p = PhyloGroup
lbl
idxs
(Map.empty)
(Map.empty)
Nothing
(getMiniCooc (listToFullCombi idxs) (periodsToYears [(from', to')]) (getPhyloCooc p))
[] [] [] []
......
......@@ -18,17 +18,17 @@ module Gargantext.Viz.Phylo.View.Taggers
where
import Control.Lens hiding (makeLenses, both, Level)
import Data.List (concat,nub,groupBy,sortOn,sort)
import Data.List (concat,nub,groupBy,sortOn,sort, (!!), take)
import Data.Text (Text)
import Data.Tuple (fst, snd)
import Data.Vector (Vector)
import Data.Map (Map)
import Data.Map (Map, (!))
import Gargantext.Prelude
import Gargantext.Viz.Phylo
import Gargantext.Viz.Phylo.Tools
import Gargantext.Viz.Phylo.BranchMaker
import qualified Data.Map as Map
-- import Debug.Trace (trace)
import Debug.Trace (trace)
-- | To get the nth most frequent Ngrams in a list of PhyloGroups
......@@ -82,14 +82,20 @@ branchPeakCooc v nth p = foldl (\v' (id,lbl) -> alterBranchPeak (id,lbl) v') v
$ getNodesByBranches v
getNthMostMeta :: Int -> Text -> PhyloGroup -> [Int]
getNthMostMeta nth meta g = map (\(idx,_) -> (getGroupNgrams g !! idx))
$ take nth
$ sortOn snd $ zip [0..]
$ (g ^. phylo_groupNgramsMeta) ! meta
-- | To set the label of a PhyloNode as the nth most coocurent terms of its PhyloNodes
nodeLabelCooc :: PhyloView -> Int -> Phylo -> PhyloView
nodeLabelCooc v thr p = over (pv_nodes
. traverse)
(\n -> let lbl = ngramsToLabel (getFoundationsRoots p)
$ mostOccNgrams thr
$ head' "nodeLabelCooc" $ getGroupsFromIds [getNodeId n] p
in n & pn_label .~ lbl) v
(\n -> let g = head' "nodeLabelCooc" $ getGroupsFromIds [getNodeId n] p
lbl' = ngramsToLabel (getFoundationsRoots p) $ getNthMostMeta thr "coverage" g
in trace (show (lbl')) $ n & pn_label .~ lbl') v
-- | To process a sorted list of Taggers to a PhyloView
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment