Commit 22578326 authored by qlobbe's avatar qlobbe Committed by Alexandre Delanoë

1 click phylo v1 is ok

parent 4ee73701
......@@ -149,6 +149,7 @@ seaToLabel :: PhyloConfig -> [Char]
seaToLabel config = case (seaElevation config) of
Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
Adaptative granularity -> ("sea_adapt" <> (show granularity))
Evolving _ -> ("sea_evolv")
sensToLabel :: PhyloConfig -> [Char]
......
......@@ -31,7 +31,6 @@ import Control.Lens (makeLenses)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Map (Map)
import Data.Set (Set)
import Data.Swagger
import Data.Text (Text, pack)
import Data.Vector (Vector)
......@@ -66,6 +65,8 @@ data SeaElevation =
, _cons_gap :: Double }
| Adaptative
{ _adap_steps :: Double }
| Evolving
{ _evol_neighborhood :: Bool }
deriving (Show,Generic,Eq)
instance ToSchema SeaElevation
......@@ -180,6 +181,7 @@ data PhyloConfig =
, phyloScale :: Int
, similarity :: Similarity
, seaElevation :: SeaElevation
, defaultMode :: Bool
, findAncestors :: Bool
, phyloSynchrony :: Synchrony
, phyloQuality :: Quality
......@@ -215,7 +217,7 @@ subConfig2config subConfig = defaultConfig { similarity = WeightedLogJaccard
------------------------------------------------------------------------
defaultConfig :: PhyloConfig
defaultConfig =
PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
, listPath = "list.csv" -- useful for commandline only
, outputPath = "data/"
, corpusParser = Csv 100000
......@@ -224,6 +226,7 @@ defaultConfig =
, phyloScale = 2
, similarity = WeightedLogJaccard 0.5 1
, seaElevation = Constante 0.1 0.1
, defaultMode = True
, findAncestors = False
, phyloSynchrony = ByProximityThreshold 0.5 0 AllBranches MergeAllGroups
, phyloQuality = Quality 0.5 1
......@@ -365,14 +368,21 @@ data PhyloFoundations = PhyloFoundations
, _foundations_rootsInGroups :: Map Int [PhyloGroupId] -- map of roots associated to groups
} deriving (Generic, Show, Eq)
instance ToSchema PhyloFoundations where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
data PhyloCounts = PhyloCounts
{ coocByDate :: !(Map Date Cooc)
, docsByDate :: !(Map Date Double)
, rootsCount :: !(Map Int Double)
, rootsFreq :: !(Map Int Double)
, lastRootsFreq :: !(Map Int Double)
} deriving (Generic, Show, Eq)
data PhyloSources = PhyloSources
{ _sources :: !(Vector Text) } deriving (Generic, Show, Eq)
instance ToSchema PhyloFoundations where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_foundations_")
instance ToSchema PhyloCounts where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
instance ToSchema PhyloSources where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
......@@ -396,6 +406,8 @@ type Period = (Date,Date)
type PeriodStr = (DateStr,DateStr)
-- | Phylo datatype of a phylomemy
-- foundations : the foundations of the phylo
-- timeCooc : a Map of coocurency by minimal unit of time (ex: by year)
......@@ -405,14 +417,12 @@ type PeriodStr = (DateStr,DateStr)
data Phylo =
Phylo { _phylo_foundations :: PhyloFoundations
, _phylo_sources :: PhyloSources
, _phylo_timeCooc :: !(Map Date Cooc)
, _phylo_timeDocs :: !(Map Date Double)
, _phylo_termFreq :: !(Map Int Double)
, _phylo_lastTermFreq :: !(Map Int Double)
, _phylo_diaSimScan :: Set Double
, _phylo_counts :: PhyloCounts
, _phylo_seaLadder :: [Double]
, _phylo_param :: PhyloParam
, _phylo_periods :: Map Period PhyloPeriod
, _phylo_quality :: Double
, _phylo_level :: Double
}
deriving (Generic, Show, Eq)
......@@ -620,6 +630,9 @@ instance ToJSON PhyloSources
instance FromJSON PhyloParam
instance ToJSON PhyloParam
instance FromJSON PhyloCounts
instance ToJSON PhyloCounts
instance FromJSON PhyloPeriod
instance ToJSON PhyloPeriod
......
......@@ -30,7 +30,6 @@ import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
import Gargantext.Core.Viz.Phylo.TemporalMatching (temporalMatching)
import Gargantext.Prelude
import qualified Data.Vector as Vector
import qualified Data.Set as Set
import qualified Data.Map as Map
---------------------------------
......@@ -55,15 +54,11 @@ phyloCleopatre = synchronicClustering $ toHorizon flatPhylo
-----------------------------------------------
flatPhylo :: Phylo
flatPhylo = case (getSeaElevation emptyPhylo) of
Constante s g -> temporalMatching (constDiachronicLadder s g Set.empty)
$ scanSimilarity 1
$ appendGroups clusterToGroup 1 seriesOfClustering emptyPhylo
Adaptative s -> temporalMatching (adaptDiachronicLadder s (emptyPhylo' ^. phylo_diaSimScan) Set.empty) emptyPhylo'
flatPhylo = temporalMatching (getLadder emptyPhylo') emptyPhylo'
emptyPhylo' :: Phylo
emptyPhylo' = scanSimilarity 1
$ joinRootsToGroups
emptyPhylo' = joinRoots
$ findSeaLadder
$ appendGroups clusterToGroup 1 seriesOfClustering emptyPhylo
---------------------------------------------
......@@ -83,11 +78,9 @@ docsByPeriods = groupDocsByPeriod date periods docs
-- | STEP 1 | -- Init the Phylo
---------------------------------
emptyPhylo :: Phylo
emptyPhylo = initPhylo docs config
phyloCooc :: Map Date Cooc
phyloCooc = docsToTimeScaleCooc docs (foundations ^. foundations_roots)
......@@ -106,7 +99,7 @@ config :: PhyloConfig
config =
defaultConfig { phyloName = "Cesar et Cleopatre"
, phyloScale = 2
, seaElevation = Adaptative 4
, seaElevation = Evolving True
, exportFilter = [ByBranchSize 0]
, clique = MaxClique 0 15 ByNeighbours }
......
......@@ -214,13 +214,13 @@ exportToDot phylo export =
{-- home made attributes -}
<> [(toAttr (fromStrict "phyloFoundations") $ pack $ show (length $ Vector.toList $ getRoots phylo))
,(toAttr (fromStrict "phyloTerms") $ pack $ show (length $ nub $ concat $ map (\g -> g ^. phylo_groupNgrams) $ export ^. export_groups))
,(toAttr (fromStrict "phyloDocs") $ pack $ show (sum $ elems $ phylo ^. phylo_timeDocs))
,(toAttr (fromStrict "phyloDocs") $ pack $ show (sum $ elems $ getDocsByDate phylo))
,(toAttr (fromStrict "phyloPeriods") $ pack $ show (length $ elems $ phylo ^. phylo_periods))
,(toAttr (fromStrict "phyloBranches") $ pack $ show (length $ export ^. export_branches))
,(toAttr (fromStrict "phyloGroups") $ pack $ show (length $ export ^. export_groups))
,(toAttr (fromStrict "phyloSources") $ pack $ show (Vector.toList $ getSources phylo))
,(toAttr (fromStrict "phyloTimeScale") $ pack $ getTimeScale phylo)
,(toAttr (fromStrict "PhyloScale") $ pack $ show (_qua_granularity $ phyloQuality $ getConfig phylo))
,(toAttr (fromStrict "PhyloScale") $ pack $ show (getLevel phylo))
,(toAttr (fromStrict "phyloQuality") $ pack $ show (phylo ^. phylo_quality))
,(toAttr (fromStrict "phyloSeaRiseStart") $ pack $ show (getPhyloSeaRiseStart phylo))
,(toAttr (fromStrict "phyloSeaRiseSteps") $ pack $ show (getPhyloSeaRiseSteps phylo))
......@@ -375,7 +375,8 @@ processSort sort' elev export = case sort' of
ByBirthDate o -> sortByBirthDate o export
ByHierarchy _ -> case elev of
Constante s s' -> export & export_branches .~ (branchToIso' s s' $ sortByHierarchy 0 (export ^. export_branches))
Adaptative _ -> export & export_branches .~ (branchToIso $ sortByHierarchy 0 (export ^. export_branches))
Adaptative _ -> export & export_branches .~ (branchToIso $ sortByHierarchy 0 (export ^. export_branches))
Evolving _ -> export & export_branches .~ (branchToIso $ sortByHierarchy 0 (export ^. export_branches))
-----------------
-- | Metrics | --
......@@ -416,7 +417,7 @@ ngramsMetrics phylo export =
& phylo_groupMeta %~ insert "inclusion"
(map (\n -> inclusion (g ^. phylo_groupCooc) ((g ^. phylo_groupNgrams) \\ [n]) n) $ g ^. phylo_groupNgrams)
& phylo_groupMeta %~ insert "frequence"
(map (\n -> getInMap n (phylo ^. phylo_lastTermFreq)) $ g ^. phylo_groupNgrams)
(map (\n -> getInMap n (getLastRootsFreq phylo)) $ g ^. phylo_groupNgrams)
) export
......@@ -643,12 +644,13 @@ toHorizon phylo =
heads = filter (\g -> (not . null) $ (g ^. phylo_groupPeriodChilds))
$ filter (\g -> null (g ^. phylo_groupPeriodParents) && (notElem (getGroupId g) childs)) groups
noHeads = groups \\ heads
nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) [prd]
diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) [prd]
nbDocs = sum $ elems $ filterDocs (getDocsByDate phylo) [prd]
diago = reduceDiagos $ filterDiago (getCoocByDate phylo) [prd]
sim = (similarity $ getConfig phylo)
step = case getSeaElevation phylo of
Constante _ s -> s
Adaptative _ -> 0
Evolving _ -> 0
-- in headsToAncestors nbDocs diago Similarity heads groups []
in map (\ego -> toAncestor nbDocs diago sim step noHeads ego)
$ headsToAncestors nbDocs diago sim step heads []
......@@ -671,7 +673,7 @@ toPhyloExport :: Phylo -> DotGraph DotId
toPhyloExport phylo = exportToDot phylo
$ processFilters (exportFilter $ getConfig phylo) (phyloQuality $ getConfig phylo)
$ processSort (exportSort $ getConfig phylo) (getSeaElevation phylo)
$ processLabels (exportLabel $ getConfig phylo) (getRoots phylo) (_phylo_lastTermFreq phylo)
$ processLabels (exportLabel $ getConfig phylo) (getRoots phylo) (getLastRootsFreq phylo)
$ processMetrics phylo export
where
export :: PhyloExport
......@@ -711,7 +713,7 @@ tracePhyloAncestors groups = trace ( "-- | Found " <> show(length $ concat $ map
tracePhyloInfo :: Phylo -> Phylo
tracePhyloInfo phylo = trace ("\n" <> "##########################" <> "\n\n" <> "-- | Phylo with λ = "
<> show(_qua_granularity $ phyloQuality $ getConfig phylo) <> " applied to "
<> show(getLevel phylo) <> " applied to "
<> show(length $ Vector.toList $ getRoots phylo) <> " foundations"
) phylo
......
This diff is collapsed.
......@@ -13,7 +13,7 @@ Portability : POSIX
module Gargantext.Core.Viz.Phylo.PhyloTools where
import Control.Lens hiding (Level)
import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, group, notElem)
import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, iterate, transpose, partition, tails, nubBy, group, notElem, (!!))
import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
import Data.Set (Set, disjoint)
import Data.String (String)
......@@ -28,6 +28,7 @@ import qualified Data.List as List
import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified Data.Vector as Vector
import qualified Data.Map as Map
------------
-- | Io | --
......@@ -301,6 +302,9 @@ getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
coocToDiago :: Cooc -> Cooc
coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
coocToAdjacency :: Cooc -> Cooc
coocToAdjacency cooc = Map.map (\_ -> 1) cooc
-- | To build the local cooc matrix of each phylogroup
ngramsToCooc :: [Int] -> [Cooc] -> Cooc
ngramsToCooc ngrams coocs =
......@@ -309,6 +313,75 @@ ngramsToCooc ngrams coocs =
in filterWithKey (\k _ -> elem k pairs) cooc
------------------
-- | Defaults | --
------------------
-- | find the local maxima in a list of values
findMaxima :: [(Double,Double)] -> [Bool]
findMaxima lst = map (hasMax) $ toChunk 3 lst
where
------
hasMax :: [(Double,Double)] -> Bool
hasMax chunk =
if (length chunk) /= 3
then False
else (snd(chunk !! 0) < snd(chunk !! 1)) && (snd(chunk !! 2) < snd(chunk !! 1))
-- | split a list into chunks of size n
toChunk :: Int -> [a] -> [[a]]
toChunk n = takeWhile ((== n) . length) . transpose . take n . iterate tail
-- | To compute the average degree from a cooc matrix
-- http://networksciencebook.com/chapter/2#degree
toAverageDegree :: Cooc -> Vector Ngrams -> Double
toAverageDegree cooc roots = 2 * (fromIntegral $ Map.size cooc) / (fromIntegral $ Vector.length roots)
-- | Use the giant component regime to estimate the default level
-- http://networksciencebook.com/chapter/3#networks-supercritical
regimeToDefaultLevel :: Cooc -> Vector Ngrams -> Double
regimeToDefaultLevel cooc roots
| avg == 0 = 1
| avg < 1 = avg * 0.6
| avg == 1 = 0.6
| avg < lnN = (avg * 0.2) / lnN
| otherwise = 0.2
where
avg :: Double
avg = toAverageDegree cooc roots
lnN :: Double
lnN = log (fromIntegral $ Vector.length roots)
coocToConfidence :: Phylo -> Cooc
coocToConfidence phylo =
let count = getRootsCount phylo
cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty
$ elems $ getCoocByDate phylo
in Map.mapWithKey (\(a,b) w -> confidence a b w count) cooc
where
----
-- confidence
confidence :: Int -> Int -> Double -> Map Int Double -> Double
confidence a b inter card = maximum [(inter / card ! a),(inter / card ! b)]
sumtest :: [Int] -> [Int] -> Int
sumtest l1 l2 = (head' "test" l1) + (head' "test" $ reverse l2)
findDefaultLevel :: Phylo -> Phylo
findDefaultLevel phylo =
let confidence = Map.filterWithKey (\(a,b) _ -> a /= b)
$ Map.filter (> 0.01)
$ coocToConfidence phylo
roots = getRoots phylo
level = regimeToDefaultLevel confidence roots
in updateLevel level phylo
--------------------
-- | PhyloGroup | --
--------------------
......@@ -401,21 +474,46 @@ getScales phylo = nub
getSeaElevation :: Phylo -> SeaElevation
getSeaElevation phylo = seaElevation (getConfig phylo)
getSimilarity :: Phylo -> Similarity
getSimilarity phylo = similarity (getConfig phylo)
getPhyloSeaRiseStart :: Phylo -> Double
getPhyloSeaRiseStart phylo = case (getSeaElevation phylo) of
Constante s _ -> s
Adaptative _ -> 0
Evolving _ -> 0
getPhyloSeaRiseSteps :: Phylo -> Double
getPhyloSeaRiseSteps phylo = case (getSeaElevation phylo) of
Constante _ s -> s
Adaptative s -> s
Adaptative s -> s
Evolving _ -> 0.1
getConfig :: Phylo -> PhyloConfig
getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
getLevel :: Phylo -> Double
getLevel phylo = _phylo_level phylo
getLadder :: Phylo -> [Double]
getLadder phylo = phylo ^. phylo_seaLadder
getCoocByDate :: Phylo -> Map Date Cooc
getCoocByDate phylo = coocByDate (phylo ^. phylo_counts)
getDocsByDate :: Phylo -> Map Date Double
getDocsByDate phylo = docsByDate (phylo ^. phylo_counts)
getRootsCount :: Phylo -> Map Int Double
getRootsCount phylo = rootsCount (phylo ^. phylo_counts)
getRootsFreq :: Phylo -> Map Int Double
getRootsFreq phylo = rootsFreq (phylo ^. phylo_counts)
getLastRootsFreq :: Phylo -> Map Int Double
getLastRootsFreq phylo = lastRootsFreq (phylo ^. phylo_counts)
setConfig :: PhyloConfig -> Phylo -> Phylo
setConfig config phylo = phylo
......@@ -501,8 +599,10 @@ updatePeriods periods' phylo =
) phylo
updateQuality :: Double -> Phylo -> Phylo
updateQuality quality phylo = phylo { _phylo_quality = quality }
updateQuality quality phylo = phylo { _phylo_quality = quality }
updateLevel :: Double -> Phylo -> Phylo
updateLevel level phylo = phylo { _phylo_level = level }
traceToPhylo :: Scale -> Phylo -> Phylo
traceToPhylo lvl phylo =
......
......@@ -76,7 +76,7 @@ toNextScale phylo groups =
$ fromList $ map (\g -> (getGroupId g, g))
$ foldlWithKey (\acc id groups' ->
-- 4) create the parent group
let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
let parent = mergeGroups (elems $ restrictKeys (getCoocByDate phylo) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
in acc ++ [parent]) []
-- 3) group the current groups by parentId
$ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
......@@ -199,8 +199,8 @@ synchronicClustering :: Phylo -> Phylo
synchronicClustering phylo =
let prox = similarity $ getConfig phylo
sync = phyloSynchrony $ getConfig phylo
docs = phylo ^. phylo_timeDocs
diagos = map coocToDiago $ phylo ^. phylo_timeCooc
docs = getDocsByDate phylo
diagos = map coocToDiago $ getCoocByDate phylo
newBranches = map (\branch -> reduceGroups prox sync docs diagos branch)
$ map processDynamics
$ chooseClusteringStrategy sync
......
......@@ -56,7 +56,7 @@ sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/ tan (s * pi
-- process the sumLog
-}
sumLog' :: Double -> Double -> [Double] -> Double
sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2)))) 0 diago
sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1 / tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2)))) 0 diago
{-
......@@ -695,14 +695,14 @@ temporalMatching ladder phylo = updatePhyloGroups 1
sea :: ([(Branch,ShouldTry)],FinalQuality)
sea = seaLevelRise (fromIntegral $ Vector.length $ getRoots phylo)
(similarity $ getConfig phylo)
(_qua_granularity $ phyloQuality $ getConfig phylo)
(getLevel phylo)
(_qua_minBranch $ phyloQuality $ getConfig phylo)
(phylo ^. phylo_termFreq)
(getRootsFreq phylo)
ladder 1
(getTimeFrame $ timeUnit $ getConfig phylo)
(getPeriodIds phylo)
(phylo ^. phylo_timeDocs)
(phylo ^. phylo_timeCooc)
(getDocsByDate phylo)
(getCoocByDate phylo)
((phylo ^. phylo_foundations) ^. foundations_rootsInGroups)
(reverse $ sortOn (length . fst) seabed)
......@@ -714,7 +714,7 @@ temporalMatching ladder phylo = updatePhyloGroups 1
(getPeriodIds phylo)
(similarity $ getConfig phylo)
(List.head ladder)
(phylo ^. phylo_timeDocs)
(phylo ^. phylo_timeCooc)
(getDocsByDate phylo)
(getCoocByDate phylo)
((phylo ^. phylo_foundations) ^. foundations_rootsInGroups)
(traceTemporalMatching $ getGroupsFromScale 1 phylo)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment