Commit 3f949532 authored by qlobbe's avatar qlobbe

add the phyloBase, Fis and Cooc

parent 349ed2a2
Pipeline #542 failed with stage
......@@ -36,6 +36,7 @@ import Gargantext.Text.Corpus.Parsers (FileFormat(..),parseFile)
import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloMaker (toPhylo)
import GHC.IO (FilePath)
import Prelude (Either(..))
......@@ -164,4 +165,7 @@ main = do
corpus <- fileToDocs (corpusParser config) (corpusLimit config) (corpusPath config) mapList
printIOComment (show (length corpus) <> " parsed docs from the corpus")
\ No newline at end of file
printIOMsg "Reconstruct the Phylo"
let phylo = toPhylo corpus mapList config
printIOMsg "End of reconstruction"
\ No newline at end of file
......@@ -69,6 +69,7 @@ library:
- Gargantext.Viz.Graph.Index
- Gargantext.Viz.Phylo
- Gargantext.Viz.AdaptativePhylo
- Gargantext.Viz.Phylo.PhyloMaker
- Gargantext.Viz.Phylo.Tools
- Gargantext.Viz.Phylo.Example
- Gargantext.Viz.Phylo.LevelMaker
......
......@@ -33,7 +33,7 @@ import Data.Aeson.TH (deriveJSON)
import Data.Text (Text, pack)
import Data.Vector (Vector)
import Data.Map (Map)
import Data.Matrix (Matrix)
import Data.Set (Set)
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
......@@ -60,6 +60,7 @@ data Config =
, corpusLimit :: Int
, phyloName :: Text
, phyloLevel :: Int
, timeUnit :: Int
, timePeriod :: Int
, timeStep :: Int
, fisSupport :: Int
......@@ -67,6 +68,7 @@ data Config =
, branchSize :: Int
} deriving (Show,Generic,Eq)
defaultConfig :: Config
defaultConfig =
Config { corpusPath = ""
, listPath = ""
......@@ -75,6 +77,7 @@ defaultConfig =
, corpusLimit = 1000
, phyloName = pack "Default Phylo"
, phyloLevel = 2
, timeUnit = 1
, timePeriod = 3
, timeStep = 1
, fisSupport = 2
......@@ -94,6 +97,7 @@ data Software =
, _software_version :: Text
} deriving (Generic, Show, Eq)
defaultSoftware :: Software
defaultSoftware =
Software { _software_name = pack "Gargantext"
, _software_version = pack "v4" }
......@@ -106,6 +110,7 @@ data PhyloParam =
, _phyloParam_config :: Config
} deriving (Generic, Show, Eq)
defaultPhyloParam :: PhyloParam
defaultPhyloParam =
PhyloParam { _phyloParam_version = pack "v2.adaptative"
, _phyloParam_software = defaultSoftware
......@@ -147,8 +152,8 @@ data PhyloFoundations = PhyloFoundations
---------------------------
-- | Cooc : a weighted (Double) coocurency matrix
type Cooc = Matrix Double
-- | Cooc : a coocurency matrix between two ngrams
type Cooc = Map (Int,Int) Double
-------------------
......@@ -161,15 +166,80 @@ type Cooc = Matrix Double
-- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
-- timeDocs : a Map with the numbers of docs by minimal unit of time (ex: by year)
-- param : the parameters of the phylomemy (with the user's configuration)
-- periods : the temporal steps of a phylomemy
data Phylo =
Phylo { _phylo_foundations :: PhyloFoundations
, _phylo_timeCooc :: !(Map Date Cooc)
, _phylo_timeDocs :: !(Map Date Double)
, _phylo_param :: PhyloParam
, _phylo_periods :: [PhyloPeriod]
}
deriving (Generic, Show, Eq)
-- | PhyloPeriodId : the id of a given period
type PhyloPeriodId = (Date,Date)
-- | PhyloPeriod : steps of a phylomemy on a temporal axis
-- id: tuple (start date, end date) of the temporal step of the phylomemy
-- levels: levels of granularity
data PhyloPeriod =
PhyloPeriod { _phylo_periodId :: PhyloPeriodId
, _phylo_periodLevels :: [PhyloLevel]
}
deriving (Generic, Show, Eq)
-- | Level : a level of clustering
type Level = Int
-- | PhyloLevelId : the id of a level of clustering in a given period
type PhyloLevelId = (PhyloPeriodId,Level)
-- | PhyloLevel : levels of phylomemy on a synchronic axis
-- Levels description:
-- Level 0: The foundations and the base of the phylo
-- Level 1: First level of clustering (the Fis)
-- Level [2..N]: Nth level of synchronic clustering (cluster of Fis)
data PhyloLevel =
PhyloLevel { _phylo_levelId :: PhyloLevelId
, _phylo_levelGroups :: [PhyloGroup]
}
deriving (Generic, Show, Eq)
--------------------
-- | PhyloGroup | --
--------------------
type Index = Int
type PhyloGroupId = (PhyloLevelId, Index)
-- | PhyloGroup : group of ngrams at each level and period
data PhyloGroup =
PhyloGroup { _phylo_groupId :: PhyloGroupId
}
deriving (Generic, Show, Eq)
---------------------------
-- | Frequent Item Set | --
---------------------------
-- | Clique : Set of ngrams cooccurring in the same Document
type Clique = Set Ngrams
-- | Support : Number of Documents where a Clique occurs
type Support = Int
-- | Fis : Frequent Items Set (ie: the association between a Clique and a Support)
data PhyloFis = PhyloFis
{ _phyloFis_clique :: Clique
, _phyloFis_support :: Support
, _phyloFis_period :: (Date,Date)
} deriving (Generic,NFData,Show,Eq)
----------------
-- | Lenses | --
......@@ -177,6 +247,12 @@ data Phylo =
makeLenses ''Config
makeLenses ''PhyloFoundations
makeLenses ''PhyloFis
makeLenses ''Phylo
makeLenses ''PhyloPeriod
makeLenses ''PhyloLevel
makeLenses ''PhyloGroup
makeLenses ''PhyloParam
------------------------
-- | JSON instances | --
......
......@@ -19,7 +19,7 @@ Portability : POSIX
module Gargantext.Viz.Phylo.PhyloExample where
import Data.List (sortOn)
import Data.List (sortOn, nub, sort)
import Data.Map (Map)
import Data.Text (Text, toLower)
......@@ -35,16 +35,38 @@ import Control.Lens
import qualified Data.Vector as Vector
---------------------------------------------
-- | STEP 2 | -- Build the frequent items set
---------------------------------------------
phyloFis :: Map (Date,Date) [PhyloFis]
phyloFis = toPhyloFis docsByPeriods (fisSupport config) (fisSize config)
docsByPeriods :: Map (Date,Date) [Document]
docsByPeriods = groupDocsByPeriod date periods docs
--------------------------------------------
-- | STEP 1 | -- Init the Base of the Phylo
--------------------------------------------
-- cooc et phyloBase
phyloBase :: Phylo
phyloBase = toPhyloBase docs mapList config
phyloCooc :: Map Date Cooc
phyloCooc = docsToCoocByYear docs (foundations ^. foundations_roots) config
periods :: [(Date,Date)]
periods = toPeriods (sort $ nub $ map date docs) (timePeriod config) (timeStep config)
nbDocsByYear :: Map Date Double
nbDocsByYear = nbDocsByTime docs 1
nbDocsByYear = nbDocsByTime docs (timeUnit config)
config :: Config
......
......@@ -15,22 +15,173 @@ Portability : POSIX
module Gargantext.Viz.Phylo.PhyloMaker where
import Data.Map (Map, fromListWith, keys, unionWith, fromList)
import Data.List (concat, nub, partition, sort, (++))
import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, mapWithKey, toList, elems)
import Data.Set (size)
import Data.Vector (Vector)
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import Gargantext.Viz.Phylo.PhyloTools
import Gargantext.Text.Context (TermList)
import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
import Control.DeepSeq (NFData)
import Control.Parallel.Strategies (parList, rdeepseq, using)
import Debug.Trace (trace)
import Control.Lens
import qualified Data.Vector as Vector
import qualified Data.Set as Set
------------------
-- | To Phylo | --
------------------
toPhylo :: [Document] -> TermList -> Config -> Phylo
toPhylo docs lst conf = phyloBase
where
--------------------------------------
_phylo1 :: Phylo
_phylo1 = toPhylo1 docs phyloBase
--------------------------------------
phyloBase :: Phylo
phyloBase = toPhyloBase docs lst conf
--------------------------------------
--------------------
-- | To Phylo 1 | --
--------------------
toPhylo1 :: [Document] -> Phylo -> Phylo
toPhylo1 docs phyloBase = undefined
where
--------------------------------------
_mFis :: Map (Date,Date) [PhyloFis]
_mFis = toPhyloFis _docs' (fisSupport $ getConfig phyloBase) (fisSize $ getConfig phyloBase)
--------------------------------------
_docs' :: Map (Date,Date) [Document]
_docs' = groupDocsByPeriod date (getPeriodIds phyloBase) docs
--------------------------------------
---------------------------
-- | Frequent Item Set | --
---------------------------
-- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
filterFis keep thr f m = case keep of
False -> map (\l -> f thr l) m
True -> map (\l -> keepFilled (f) thr l) m
-- | To filter Fis with small Support
filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
filterFisBySupport thr l = filter (\fis -> (fis ^. phyloFis_support) >= thr) l
-- | To filter Fis with small Clique size
filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
filterFisByClique thr l = filter (\fis -> (size $ fis ^. phyloFis_clique) >= thr) l
-- | To filter nested Fis
filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
filterFisByNested m =
let fis = map (\l ->
foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloFis_clique) (Set.toList $ f ^. phyloFis_clique)) mem)
then mem
else
let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloFis_clique) (Set.toList $ f' ^. phyloFis_clique)) mem
in fMax ++ [f] ) [] l)
$ elems m
fis' = fis `using` parList rdeepseq
in fromList $ zip (keys m) fis'
-- | To transform a time map of docs innto a time map of Fis with some filters
toPhyloFis :: Map (Date, Date) [Document] -> Int -> Int -> Map (Date,Date) [PhyloFis]
toPhyloFis mDocs support clique = traceFis "Filtered Fis"
$ filterFisByNested
$ traceFis "Filtered by clique size"
$ filterFis True clique (filterFisByClique)
$ traceFis "Filtered by support"
$ filterFis True support (filterFisBySupport)
$ traceFis "Unfiltered Fis" mFis
where
--------------------------------------
-- | create the fis from the docs for each period
mFis :: Map (Date,Date) [PhyloFis]
mFis = mapWithKey (\prd docs -> let fis = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
in map (\f -> PhyloFis (fst f) (snd f) prd) fis ) mDocs
--------------------------------------
--------------------
-- | to Phylo 0 | --
-- | Coocurency | --
--------------------
-- | To transform the docs into a time map of coocurency matrix
docsToCoocByYear :: [Document] -> Vector Ngrams -> Config -> Map Date Cooc
docsToCoocByYear docs fdt conf =
let mCooc = fromListWith sumCooc
$ map (\(_d,l) -> (_d, listToMatrix l))
$ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
mCooc' = fromList
$ map (\t -> (t,empty))
$ toTimeScale (map date docs) (timeUnit conf)
in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
$ unionWith sumCooc mCooc mCooc'
-----------------------
-- | to Phylo Base | --
-----------------------
-- | To group a list of Documents by fixed periods
groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
groupDocsByPeriod f pds es =
let periods = map (inPeriode f es) pds
periods' = periods `using` parList rdeepseq
in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
$ fromList $ zip pds periods'
where
--------------------------------------
inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
inPeriode f' h (start,end) =
fst $ partition (\d -> f' d >= start && f' d <= end) h
--------------------------------------
-- | To count the number of docs by unit of time (like a year)
nbDocsByTime :: [Document] -> Int -> Map Date Double
nbDocsByTime docs step =
let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') step
in unionWith (+) time docs'
in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
$ unionWith (+) time docs'
-- | To init the basic elements of a Phylo
toPhyloBase :: [Document] -> TermList -> Config -> Phylo
toPhyloBase docs lst conf =
let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
params = defaultPhyloParam { _phyloParam_config = conf }
periods = toPeriods (sort $ nub $ map date docs) (timePeriod conf) (timeStep conf)
in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
$ Phylo foundations
(docsToCoocByYear docs (foundations ^. foundations_roots) conf)
(nbDocsByTime docs $ timeUnit conf)
params
(map (\prd -> PhyloPeriod prd []) periods)
......@@ -16,18 +16,28 @@ Portability : POSIX
module Gargantext.Viz.Phylo.PhyloTools where
import Data.Maybe (Maybe, fromMaybe)
import Data.Text (Text)
import Data.Vector (Vector)
import Data.List (sort)
import Data.Vector (Vector, elemIndex)
import Data.List (sort, concat, null, union, (++), tails, sortOn)
import Data.Set (size)
import Data.Map (Map, elems, fromList, unionWith)
import Data.String (String)
import Gargantext.Prelude
import Gargantext.Viz.AdaptativePhylo
import GHC.IO (FilePath)
import Debug.Trace (trace)
import Control.Lens
import qualified Data.Vector as Vector
--------------
-- | Misc | --
--------------
countSup :: Double -> [Double] -> Int
countSup s l = length $ filter (>s) l
---------------------
-- | Foundations | --
......@@ -38,14 +48,110 @@ import qualified Data.Vector as Vector
isRoots :: Ngrams -> Vector Ngrams -> Bool
isRoots n ns = Vector.elem n ns
-- | To transform a list of nrams into a list of foundation's index
ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
--------------
-- | Time | --
--------------
findBounds :: [Date] -> (Date,Date)
findBounds dates =
let dates' = sort dates
in (head' "findBounds" dates', last' "findBounds" dates')
toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
toPeriods dates p s =
let (start,end) = findBounds dates
in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
$ chunkAlong p s [start .. end]
-- | Get a regular & ascendante timeScale from a given list of dates
toTimeScale :: [Date] -> Int -> [Date]
toTimeScale dates step =
let dates' = sort dates
in [head' "toTimeScale" dates', ((head' "toTimeScale" dates') + step) .. last' "toTimeScale" dates']
\ No newline at end of file
let (start,end) = findBounds dates
in [start, (start + step) .. end]
-------------
-- | Fis | --
-------------
-- | To find if l' is nested in l
isNested :: Eq a => [a] -> [a] -> Bool
isNested l l'
| null l' = True
| length l' > length l = False
| (union l l') == l = True
| otherwise = False
-- | To filter Fis with small Support but by keeping non empty Periods
keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
keepFilled f thr l = if (null $ f thr l) && (not $ null l)
then keepFilled f (thr - 1) l
else f thr l
traceClique :: Map (Date, Date) [PhyloFis] -> String
traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
where
--------------------------------------
cliques :: [Double]
cliques = sort $ map (fromIntegral . size . _phyloFis_clique) $ concat $ elems mFis
--------------------------------------
traceSupport :: Map (Date, Date) [PhyloFis] -> String
traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
where
--------------------------------------
supports :: [Double]
supports = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems mFis
--------------------------------------
traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
<> "Support : " <> (traceSupport mFis) <> "\n"
<> "Clique : " <> (traceClique mFis) <> "\n" ) mFis
--------------
-- | Cooc | --
--------------
listToCombi' :: [a] -> [(a,a)]
listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
listToEqual' :: Eq a => [a] -> [(a,a)]
listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
listToKeys :: [Int] -> [(Int,Int)]
listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
listToMatrix :: [Int] -> Map (Int,Int) Double
listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
sumCooc :: Cooc -> Cooc -> Cooc
sumCooc cooc cooc' = unionWith (+) cooc cooc'
---------------
-- | Phylo | --
---------------
getPeriodIds :: Phylo -> [(Date,Date)]
getPeriodIds phylo = sortOn fst
$ map (\prd -> prd ^. phylo_periodId)
$ phylo ^. phylo_periods
getConfig :: Phylo -> Config
getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment