[phylo] slight refactoring

parent 7f8b7680
......@@ -66,7 +66,7 @@ data SeaElevation =
| Adaptative
{ _adap_steps :: Double }
| Evolving
{ _evol_neighborhood :: Bool }
{ _evol_neighborhood :: Bool }
deriving (Show,Generic,Eq)
instance ToSchema SeaElevation
......@@ -78,8 +78,8 @@ data PhyloSimilarity =
| WeightedLogSim
{ _wls_sensibility :: Double
, _wls_minSharedNgrams :: Int }
| Hamming
{ _hmg_sensibility :: Double
| Hamming
{ _hmg_sensibility :: Double
, _hmg_minSharedNgrams :: Int}
deriving (Show,Generic,Eq)
......@@ -207,7 +207,7 @@ data PhyloSubConfig =
subConfig2config :: PhyloSubConfig -> PhyloConfig
subConfig2config subConfig = defaultConfig { similarity = WeightedLogJaccard (_sc_phyloProximity subConfig) 1
subConfig2config subConfig = defaultConfig { similarity = WeightedLogJaccard (_sc_phyloProximity subConfig) 1
, phyloSynchrony = ByProximityThreshold (_sc_phyloSynchrony subConfig) 0 AllBranches MergeAllGroups
, phyloQuality = Quality (_sc_phyloQuality subConfig) 1
, timeUnit = _sc_timeUnit subConfig
......@@ -430,7 +430,6 @@ data Phylo =
instance ToSchema Phylo where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_phylo_")
----------------
-- | Period | --
----------------
......
......@@ -188,4 +188,3 @@ instance ToParamSchema Metric
instance ToParamSchema Order
instance ToParamSchema Sort
instance ToSchema Order
......@@ -66,50 +66,58 @@ toAttr :: AttributeName -> Lazy.Text -> CustomAttribute
toAttr k v = customAttribute k v
metaToAttr :: Map Text.Text [Double] -> [CustomAttribute]
metaToAttr meta = map (\(k,v) -> toAttr (fromStrict k) $ (pack . unwords) $ map show v) $ toList meta
metaToAttr meta = map (\(k, v) -> toAttr (fromStrict k) $ (pack . unwords) $ map show v) $ toList meta
groupIdToDotId :: PhyloGroupId -> DotId
groupIdToDotId (((d,d'),lvl),idx) = (fromStrict . Text.pack) $ ("group" <> (show d) <> (show d') <> (show lvl) <> (show idx))
groupIdToDotId (((d, d'), lvl), idx) =
(fromStrict . Text.pack) $ "group" <> show d <> show d' <> show lvl <> show idx
branchIdToDotId :: PhyloBranchId -> DotId
branchIdToDotId bId = (fromStrict . Text.pack) $ ("branch" <> show (snd bId))
branchIdToDotId bId = (fromStrict . Text.pack) $ "branch" <> show (snd bId)
periodIdToDotId :: Period -> DotId
periodIdToDotId prd = (fromStrict . Text.pack) $ ("period" <> show (fst prd) <> show (snd prd))
periodIdToDotId prd = (fromStrict . Text.pack) $ "period" <> show (fst prd) <> show (snd prd)
groupToTable :: Vector Ngrams -> PhyloGroup -> H.Label
groupToTable fdt g = H.Table H.HTable
{ H.tableFontAttrs = Just [H.PointSize 14, H.Align H.HLeft]
, H.tableAttrs = [H.Border 0, H.CellBorder 0, H.BGColor (toColor White)]
, H.tableRows = [header]
<> [H.Cells [H.LabelCell [H.Height 10] $ H.Text [H.Str $ fromStrict ""]]]
<> ( map ngramsToRow $ splitEvery 4
$ reverse $ sortOn (snd . snd)
$ zip (ngramsToText fdt (g ^. phylo_groupNgrams))
$ zip ((g ^. phylo_groupMeta) ! "dynamics") ((g ^. phylo_groupMeta) ! "inclusion"))}
groupToTable fdt g =
H.Table H.HTable
{ H.tableFontAttrs = Just [H.PointSize 14, H.Align H.HLeft]
, H.tableAttrs = [H.Border 0, H.CellBorder 0, H.BGColor (toColor White)]
, H.tableRows = [header]
<> [H.Cells [H.LabelCell [H.Height 10] $ H.Text [H.Str $ fromStrict ""]]]
<> ( map ngramsToRow $ splitEvery 4
$ reverse $ sortOn (snd . snd)
$ zip (ngramsToText fdt (g ^. phylo_groupNgrams))
$ zip ((g ^. phylo_groupMeta) ! "dynamics") ((g ^. phylo_groupMeta) ! "inclusion"))}
where
--------------------------------------
ngramsToRow :: [(Ngrams, (Double, Double))] -> H.Row
ngramsToRow ns = H.Cells $ map (\(n,(d,_)) ->
H.LabelCell [ H.Align H.HLeft
, dynamicToColor $ floor d] $ H.Text [H.Str $ fromStrict n]) ns
--------------------------------------
header :: H.Row
header =
H.Cells [ H.LabelCell [pickLabelColor $ floor <$> ((g ^. phylo_groupMeta) ! "dynamics")]
$ H.Text [H.Str $ ((fromStrict . Text.toUpper) $ g ^. phylo_groupLabel)
<> fromStrict " ( "
<> (pack $ show (fst $ g ^. phylo_groupPeriod))
<> fromStrict " , "
<> (pack $ show (snd $ g ^. phylo_groupPeriod))
<> fromStrict " ) "
<> (pack $ show (getGroupId g))]]
--------------------------------------
ngramsToRow :: [(Ngrams, (Double, Double))] -> H.Row
ngramsToRow ns =
H.Cells $ map (\(n, (d, _)) ->
H.LabelCell [ H.Align H.HLeft
, dynamicToColor $ floor d] $ H.Text [H.Str $ fromStrict n]) ns
--------------------------------------
header :: H.Row
header =
H.Cells [ H.LabelCell [pickLabelColor $ floor <$> ((g ^. phylo_groupMeta) ! "dynamics")]
$ H.Text [H.Str $ ((fromStrict . Text.toUpper) $ g ^. phylo_groupLabel)
<> fromStrict " ( "
<> (pack $ show (fst $ g ^. phylo_groupPeriod))
<> fromStrict " , "
<> (pack $ show (snd $ g ^. phylo_groupPeriod))
<> fromStrict " ) "
<> (pack $ show (getGroupId g))]]
--------------------------------------
branchToDotNode :: PhyloBranch -> Int -> Dot DotId
branchToDotNode b bId =
node (branchIdToDotId $ b ^. branch_id)
([FillColor [toWColor CornSilk], FontName "Arial", FontSize 40, Shape Egg, Style [SItem Bold []], Label (toDotLabel $ b ^. branch_label)]
( [ FillColor [toWColor CornSilk]
, FontName "Arial"
, FontSize 40
, Shape Egg
, Style [SItem Bold []]
, Label (toDotLabel $ b ^. branch_label) ]
<> (metaToAttr $ b ^. branch_meta)
<> [ toAttr "nodeType" "branch"
, toAttr "bId" (pack $ show bId)
......@@ -121,37 +129,42 @@ branchToDotNode b bId =
periodToDotNode :: (Date,Date) -> (Text.Text,Text.Text) -> Dot DotId
periodToDotNode prd prd' =
node (periodIdToDotId prd)
([Shape BoxShape, FontSize 50, Label (toDotLabel $ Text.pack (show (fst prd) <> " " <> show (snd prd)))]
node (periodIdToDotId prd) $
[ Shape BoxShape
, FontSize 50
, Label $ toDotLabel $ Text.pack $ show (fst prd) <> " " <> show (snd prd) ]
<> [ toAttr "nodeType" "period"
, toAttr "strFrom" (fromStrict $ Text.pack $ (show $ fst prd'))
, toAttr "strTo" (fromStrict $ Text.pack $ (show $ snd prd'))
, toAttr "from" (fromStrict $ Text.pack $ (show $ fst prd))
, toAttr "to" (fromStrict $ Text.pack $ (show $ snd prd))])
, toAttr "strFrom" $ fromStrict $ Text.pack $ show $ fst prd'
, toAttr "strTo" $ fromStrict $ Text.pack $ show $ snd prd'
, toAttr "from" $ fromStrict $ Text.pack $ show $ fst prd
, toAttr "to" $ fromStrict $ Text.pack $ show $ snd prd ]
groupToDotNode :: Vector Ngrams -> PhyloGroup -> Int -> Dot DotId
groupToDotNode fdt g bId =
node (groupIdToDotId $ getGroupId g)
([FontName "Arial", Shape Square, penWidth 4, toLabel (groupToTable fdt g)]
<> [ toAttr "nodeType" "group"
, toAttr "gid" (groupIdToDotId $ getGroupId g)
, toAttr "from" (pack $ show (fst $ g ^. phylo_groupPeriod))
, toAttr "to" (pack $ show (snd $ g ^. phylo_groupPeriod))
, toAttr "strFrom" (pack $ show (fst $ g ^. phylo_groupPeriod'))
, toAttr "strTo" (pack $ show (snd $ g ^. phylo_groupPeriod'))
, toAttr "branchId" (pack $ unwords (init $ map show $ snd $ g ^. phylo_groupBranchId))
, toAttr "bId" (pack $ show bId)
, toAttr "support" (pack $ show (g ^. phylo_groupSupport))
, toAttr "weight" (pack $ show (g ^. phylo_groupWeight))
, toAttr "source" (pack $ show (nub $ g ^. phylo_groupSources))
, toAttr "sourceFull" (pack $ show (g ^. phylo_groupSources))
, toAttr "lbl" (pack $ show (ngramsToLabel fdt (g ^. phylo_groupNgrams)))
, toAttr "foundation" (pack $ show (idxToLabel (g ^. phylo_groupNgrams)))
, toAttr "role" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "dynamics")))
, toAttr "frequence" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "frequence")))
, toAttr "seaLvl" (pack $ show ((g ^. phylo_groupMeta) ! "seaLevels"))
])
( [ FontName "Arial"
, Shape Square
, penWidth 4
, toLabel (groupToTable fdt g) ]
<> [ toAttr "nodeType" "group"
, toAttr "gid" (groupIdToDotId $ getGroupId g)
, toAttr "from" (pack $ show (fst $ g ^. phylo_groupPeriod))
, toAttr "to" (pack $ show (snd $ g ^. phylo_groupPeriod))
, toAttr "strFrom" (pack $ show (fst $ g ^. phylo_groupPeriod'))
, toAttr "strTo" (pack $ show (snd $ g ^. phylo_groupPeriod'))
, toAttr "branchId" (pack $ unwords (init $ map show $ snd $ g ^. phylo_groupBranchId))
, toAttr "bId" (pack $ show bId)
, toAttr "support" (pack $ show (g ^. phylo_groupSupport))
, toAttr "weight" (pack $ show (g ^. phylo_groupWeight))
, toAttr "source" (pack $ show (nub $ g ^. phylo_groupSources))
, toAttr "sourceFull" (pack $ show (g ^. phylo_groupSources))
, toAttr "lbl" (pack $ show (ngramsToLabel fdt (g ^. phylo_groupNgrams)))
, toAttr "foundation" (pack $ show (idxToLabel (g ^. phylo_groupNgrams)))
, toAttr "role" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "dynamics")))
, toAttr "frequence" (pack $ show (idxToLabel' ((g ^. phylo_groupMeta) ! "frequence")))
, toAttr "seaLvl" (pack $ show ((g ^. phylo_groupMeta) ! "seaLevels"))
])
toDotEdge' :: DotId -> DotId -> [Char] -> [Char] -> EdgeType -> Dot DotId
......@@ -598,7 +611,13 @@ getGroupThr step g =
breaks = (g ^. phylo_groupMeta) ! "breaks"
in (last' "export" (take (round $ (last' "export" breaks) + 1) seaLvl)) - step
toAncestor :: Double -> Map Int Double -> PhyloSimilarity -> Double -> [PhyloGroup] -> PhyloGroup -> PhyloGroup
toAncestor :: Double
-> Map Int Double
-> PhyloSimilarity
-> Double
-> [PhyloGroup]
-> PhyloGroup
-> PhyloGroup
toAncestor nbDocs diago similarity step candidates ego =
let curr = ego ^. phylo_groupAncestors
in ego & phylo_groupAncestors .~ (curr ++ (map (\(g,w) -> (getGroupId g,w))
......@@ -607,7 +626,13 @@ toAncestor nbDocs diago similarity step candidates ego =
$ filter (\g -> g ^. phylo_groupBranchId /= ego ^. phylo_groupBranchId ) candidates))
headsToAncestors :: Double -> Map Int Double -> PhyloSimilarity -> Double -> [PhyloGroup] -> [PhyloGroup] -> [PhyloGroup]
headsToAncestors :: Double
-> Map Int Double
-> PhyloSimilarity
-> Double
-> [PhyloGroup]
-> [PhyloGroup]
-> [PhyloGroup]
headsToAncestors nbDocs diago similarity step heads acc =
if (null heads)
then acc
......
......@@ -48,8 +48,8 @@ data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
toPhylo' (PhyloN phylo) = toPhylo'
toPhylo' (PhyloBase phylo) = toPhylo
toPhylo' (PhyloN phylo) = toPhylo'
toPhylo' (PhyloBase phylo) = toPhylo
-}
-- TODO an adaptative synchronic clustering with a slider
......@@ -58,11 +58,11 @@ toPhylo :: Phylo -> Phylo
toPhylo phylowithoutLink = traceToPhylo (phyloScale $ getConfig phylowithoutLink) $
if (phyloScale $ getConfig phylowithoutLink) > 1
then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloScale $ getConfig phylowithoutLink)]
else phyloAncestors
else phyloAncestors
where
--------------------------------------
phyloAncestors :: Phylo
phyloAncestors =
phyloAncestors =
if (findAncestors $ getConfig phylowithoutLink)
then toHorizon phyloWithLinks
else phyloWithLinks
......@@ -77,44 +77,44 @@ toPhylo phylowithoutLink = traceToPhylo (phyloScale $ getConfig phylowithoutLink
-----------------------------
{-
{-
-- create a square ladder
-}
squareLadder :: [Double] -> [Double]
squareLadder ladder = List.map (\x -> x * x) ladder
{-
{-
-- create an adaptative 'sea elevation' ladder
-}
adaptSeaLadder :: Double -> Set Double -> Set Double -> [Double]
adaptSeaLadder curr similarities ladder =
adaptSeaLadder curr similarities ladder =
if curr <= 0 || Set.null similarities
then Set.toList ladder
else
else
let idx = ((Set.size similarities) `div` (floor curr)) - 1
thr = Set.elemAt idx similarities
-- we use a sliding methods 1/10, then 1/9, then ... 1/2
in adaptSeaLadder (curr -1) (Set.filter (> thr) similarities) (Set.insert thr ladder)
in adaptSeaLadder (curr -1) (Set.filter (> thr) similarities) (Set.insert thr ladder)
{-
{-
-- create a constante 'sea elevation' ladder
-}
constSeaLadder :: Double -> Double -> Set Double -> [Double]
constSeaLadder curr step ladder =
constSeaLadder curr step ladder =
if curr > 1
then Set.toList ladder
else constSeaLadder (curr + step) step (Set.insert curr ladder)
{-
{-
-- create an evolving 'sea elevation' ladder based on estimated & local quality maxima
-}
-}
evolvSeaLadder :: Double -> Double -> Map Int Double -> Set Double -> [((PhyloGroup,PhyloGroup),Double)] -> [Double]
evolvSeaLadder nbFdt lambda freq similarities graph = map snd
$ filter fst
evolvSeaLadder nbFdt lambda freq similarities graph = map snd
$ filter fst
$ zip maxima (map fst qua')
-- 3) find the corresponding measures of similarity and create the ladder
where
......@@ -125,7 +125,7 @@ evolvSeaLadder nbFdt lambda freq similarities graph = map snd
--------
-- 1.2)
qua' :: [(Double,Double)]
qua' = foldl (\acc (s,q) ->
qua' = foldl (\acc (s,q) ->
if length acc == 0
then [(s,q)]
else if (snd (List.last acc)) == q
......@@ -135,7 +135,7 @@ evolvSeaLadder nbFdt lambda freq similarities graph = map snd
--------
-- 1.1) for each measure of similarity, prune the flat phylo, compute the branches and estimate the quality
qua :: [Double]
qua = map (\thr ->
qua = map (\thr ->
let edges = filter (\edge -> snd edge >= thr) graph
nodes = nub $ concat $ map (\((n,n'),_) -> [n,n']) edges
branches = toRelatedComponents nodes edges
......@@ -143,46 +143,46 @@ evolvSeaLadder nbFdt lambda freq similarities graph = map snd
) $ (Set.toList similarities)
{-
{-
-- find a similarity ladder regarding the "sea elevation" strategy
-}
findSeaLadder :: Phylo -> Phylo
findSeaLadder phylo = case getSeaElevation phylo of
findSeaLadder phylo = case getSeaElevation phylo of
Constante start gap -> phylo & phylo_seaLadder .~ (constSeaLadder start gap Set.empty)
Adaptative steps -> phylo & phylo_seaLadder .~ (squareLadder $ adaptSeaLadder steps similarities Set.empty)
Evolving _ -> let ladder = evolvSeaLadder
Evolving _ -> let ladder = evolvSeaLadder
(fromIntegral $ Vector.length $ getRoots phylo)
(getLevel phylo)
(getRootsFreq phylo)
similarities simGraph
in phylo & phylo_seaLadder .~ (if length ladder > 0
then ladder
then ladder
-- if we don't find any local maxima with the evolving strategy
else constSeaLadder 0.1 0.1 Set.empty)
where
--------
-- 2) extract the values of the kinship links
-- 2) extract the values of the kinship links
similarities :: Set Double
similarities = Set.fromList $ sort $ map snd simGraph
--------
-- 1) we process an initial calculation of the kinship links
-- this initial calculation is used to estimate the real sea ladder
simGraph :: [((PhyloGroup,PhyloGroup),Double)]
simGraph = foldl' (\acc period ->
simGraph = foldl' (\acc period ->
-- 1.1) process period by period
let sources = getGroupsFromScalePeriods 1 [period] phylo
next = getNextPeriods ToParents 3 period (keys $ phylo ^. phylo_periods)
next = getNextPeriods ToParents 3 period (keys $ phylo ^. phylo_periods)
targets = getGroupsFromScalePeriods 1 next phylo
docs = filterDocs (getDocsByDate phylo) ([period] ++ next)
diagos = filterDiago (getCoocByDate phylo) ([period] ++ next)
-- 1.2) compute the kinship similarities between pairs of source & target in parallel
pairs = map (\source ->
let candidates = filter (\target -> (> 2) $ length
pairs = map (\source ->
let candidates = filter (\target -> (> 2) $ length
$ intersect (getGroupNgrams source) (getGroupNgrams target)) targets
in map (\target ->
in map (\target ->
let nbDocs = (sum . elems)
$ filterDocs docs ([idToPrd (getGroupId source), idToPrd (getGroupId target)])
diago = reduceDiagos
diago = reduceDiagos
$ filterDiago diagos ([idToPrd (getGroupId source), idToPrd (getGroupId target)])
in ((source,target),toSimilarity nbDocs diago (getSimilarity phylo) (getGroupNgrams source) (getGroupNgrams target) (getGroupNgrams target))
) candidates
......@@ -202,15 +202,15 @@ appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ co
let pId = phyloLvl ^. phylo_scalePeriod
pId' = phyloLvl ^. phylo_scalePeriodStr
phyloCUnit = m ! pId
in phyloLvl
in phyloLvl
& phylo_scaleGroups .~ (fromList $ foldl (\groups obj ->
groups ++ [ (((pId,lvl),length groups)
, f obj pId pId' lvl (length groups)
(elems $ restrictKeys (getCoocByDate phylo) $ periodsToYears [pId]))
] ) [] phyloCUnit)
else
else
phyloLvl )
phylo
phylo
clusterToGroup :: Clustering -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup
......@@ -227,16 +227,16 @@ clusterToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
-----------------------
-- | To Phylo Step | --
-----------------------
-----------------------
indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
indexDates' m = map (\docs ->
indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
indexDates' m = map (\docs ->
let ds = map (\d -> date' d) docs
f = if (null ds)
then ""
else toFstDate ds
l = if (null ds)
l = if (null ds)
then ""
else toLstDate ds
in (f,l)) m
......@@ -250,8 +250,8 @@ joinRoots phylo = set (phylo_foundations . foundations_rootsInGroups) rootsMap p
rootsMap :: Map Int [PhyloGroupId]
rootsMap = fromListWith (++)
$ concat -- flatten
$ map (\g ->
map (\n -> (n,[getGroupId g])) $ _phylo_groupNgrams g)
$ map (\g ->
map (\n -> (n,[getGroupId g])) $ _phylo_groupNgrams g)
$ getGroupsFromScale 1 phylo
......@@ -263,10 +263,9 @@ maybeDefaultParams phylo = if (defaultMode (getConfig phylo))
-- To build the first phylo step from docs and terms
-- QL: backend entre phyloBase et Clustering
-- tophylowithoutLink
toPhyloWithoutLink :: [Document] -> PhyloConfig -> Phylo
toPhyloWithoutLink docs conf = joinRoots
$ findSeaLadder
$ findSeaLadder
$ maybeDefaultParams
$ appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
where
......@@ -306,23 +305,23 @@ filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. clustering_roots) >=
-- To filter nested Fis
filterCliqueByNested :: Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
filterCliqueByNested m =
let clq = map (\l ->
filterCliqueByNested m =
let clq = map (\l ->
foldl (\mem f -> if (any (\f' -> isNested (f' ^. clustering_roots) (f ^. clustering_roots)) mem)
then mem
else
else
let fMax = filter (\f' -> not $ isNested (f ^. clustering_roots) (f' ^. clustering_roots)) mem
in fMax ++ [f] ) [] l)
$ elems m
$ elems m
clq' = clq `using` parList rdeepseq
in fromList $ zip (keys m) clq'
in fromList $ zip (keys m) clq'
-- | To transform a time map of docs into a time map of Fis with some filters
toSeriesOfClustering :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [Clustering]
toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
Fis s s' -> -- traceFis "Filtered Fis"
filterCliqueByNested
filterCliqueByNested
{- \$ traceFis "Filtered by clique size" -}
$ filterClique True s' (filterCliqueBySize)
{- \$ traceFis "Filtered by support" -}
......@@ -332,33 +331,33 @@ toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
seriesOfClustering
where
--------------------------------------
--------------------------------------
seriesOfClustering :: Map (Date,Date) [Clustering]
seriesOfClustering = case (clique $ getConfig phylo) of
Fis _ _ ->
let fis = map (\(prd,docs) ->
seriesOfClustering = case (clique $ getConfig phylo) of
Fis _ _ ->
let fis = map (\(prd,docs) ->
case (corpusParser $ getConfig phylo) of
Csv' _ -> let lst = toList
Csv' _ -> let lst = toList
$ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
in (prd, map (\f -> Clustering (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
_ -> let lst = toList
_ -> let lst = toList
$ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
in (prd, map (\f -> Clustering (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
)
$ toList phyloDocs
fis' = fis `using` parList rdeepseq
in fromList fis'
MaxClique _ thr filterType ->
let mcl = map (\(prd,docs) ->
MaxClique _ thr filterType ->
let mcl = map (\(prd,docs) ->
let cooc = map round
$ foldl sumCooc empty
$ map listToMatrix
$ map listToMatrix
$ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
in (prd, map (\cl -> Clustering cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
in (prd, map (\cl -> Clustering cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
$ toList phyloDocs
mcl' = mcl `using` parList rdeepseq
in fromList mcl'
--------------------------------------
mcl' = mcl `using` parList rdeepseq
in fromList mcl'
--------------------------------------
-- dev viz graph maxClique getMaxClique
......@@ -368,9 +367,9 @@ toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
--------------------
-- To transform the docs into a time map of coocurency matrix
-- To transform the docs into a time map of coocurency matrix
docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
docsToTimeScaleCooc docs fdt =
docsToTimeScaleCooc docs fdt =
let mCooc = fromListWith sumCooc
$ map (\(_d,l) -> (_d, listToMatrix l))
$ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
......@@ -389,8 +388,8 @@ docsToTimeScaleCooc docs fdt =
groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
groupDocsByPeriodRec f prds docs acc =
if ((null prds) || (null docs))
then acc
else
then acc
else
let prd = head' "groupBy" prds
docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
......@@ -402,7 +401,7 @@ groupDocsByPeriod' f pds docs =
let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
periods = map (inPeriode f docs') pds
periods' = periods `using` parList rdeepseq
in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
$ fromList $ zip pds periods'
where
--------------------------------------
......@@ -419,14 +418,14 @@ groupDocsByPeriod f pds es =
let periods = map (inPeriode f es) pds
periods' = periods `using` parList rdeepseq
in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
$ fromList $ zip pds periods'
where
--------------------------------------
inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
inPeriode f' h (start,end) =
fst $ partition (\d -> f' d >= start && f' d <= end) h
--------------------------------------
--------------------------------------
docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
......@@ -434,7 +433,7 @@ docsToTermFreq docs fdt =
let nbDocs = fromIntegral $ length docs
freqs = map (/(nbDocs))
$ fromList
$ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
$ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
$ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
sumFreqs = sum $ elems freqs
in map (/sumFreqs) freqs
......@@ -442,39 +441,39 @@ docsToTermFreq docs fdt =
docsToTermCount :: [Document] -> Vector Ngrams -> Map Int Double
docsToTermCount docs roots = fromList
$ map (\lst -> (head' "docsToTermCount" lst, fromIntegral $ length lst))
$ map (\lst -> (head' "docsToTermCount" lst, fromIntegral $ length lst))
$ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) roots) docs
docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
docsToLastTermFreq n docs fdt =
docsToLastTermFreq n docs fdt =
let last = take n $ reverse $ sort $ map date docs
nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
freqs = map (/(nbDocs))
$ fromList
$ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
$ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
$ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
sumFreqs = sum $ elems freqs
in map (/sumFreqs) freqs
in map (/sumFreqs) freqs
-- To count the number of docs by unit of time
docsToTimeScaleNb :: [Document] -> Map Date Double
docsToTimeScaleNb docs =
docsToTimeScaleNb docs =
let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
$ unionWith (+) time docs'
initPhyloScales :: Int -> Period -> Map PhyloScaleId PhyloScale
initPhyloScales lvlMax pId =
initPhyloScales lvlMax pId =
fromList $ map (\lvl -> ((pId,lvl),PhyloScale pId ("","") lvl empty)) [1..lvlMax]
setDefault :: PhyloConfig -> PhyloConfig
setDefault conf = conf {
setDefault conf = conf {
phyloScale = 2,
similarity = WeightedLogJaccard 0.5 2,
findAncestors = True,
......@@ -491,7 +490,7 @@ setDefault conf = conf {
-- Init the basic elements of a Phylo
--
initPhylo :: [Document] -> PhyloConfig -> Phylo
initPhylo docs conf =
initPhylo docs conf =
let roots = Vector.fromList $ nub $ concat $ map text docs
foundations = PhyloFoundations roots empty
docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
......@@ -499,12 +498,12 @@ initPhylo docs conf =
(docsToTimeScaleNb docs)
(docsToTermCount docs (foundations ^. foundations_roots))
(docsToTermFreq docs (foundations ^. foundations_roots))
(docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
(docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
params = if (defaultMode conf)
then defaultPhyloParam { _phyloParam_config = setDefault conf }
else defaultPhyloParam { _phyloParam_config = conf }
periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
in trace ("\n" <> "-- | Init a phylo out of " <> show(length docs) <> " docs \n")
in trace ("\n" <> "-- | Init a phylo out of " <> show(length docs) <> " docs \n")
$ Phylo foundations
docsSources
docsCounts
......
......@@ -172,11 +172,11 @@ toLstDate ds = snd
getTimeScale :: Phylo -> [Char]
getTimeScale p = case (timeUnit $ getConfig p) of
Epoch _ _ _ -> "epoch"
Year _ _ _ -> "year"
Month _ _ _ -> "month"
Week _ _ _ -> "week"
Day _ _ _ -> "day"
Epoch {} -> "epoch"
Year {} -> "year"
Month {} -> "month"
Week {} -> "week"
Day {} -> "day"
-- | Get a regular & ascendante timeScale from a given list of dates
......@@ -188,27 +188,27 @@ toTimeScale dates step =
getTimeStep :: TimeUnit -> Int
getTimeStep time = case time of
Epoch _ s _ -> s
Year _ s _ -> s
Month _ s _ -> s
Week _ s _ -> s
Day _ s _ -> s
Epoch { .. } -> _epoch_step
Year { .. } -> _year_step
Month { .. } -> _month_step
Week { .. } -> _week_step
Day { .. } -> _day_step
getTimePeriod :: TimeUnit -> Int
getTimePeriod time = case time of
Epoch p _ _ -> p
Year p _ _ -> p
Month p _ _ -> p
Week p _ _ -> p
Day p _ _ -> p
Epoch { .. } -> _epoch_period
Year { .. } -> _year_period
Month { .. } -> _month_period
Week { .. } -> _week_period
Day { .. } -> _day_period
getTimeFrame :: TimeUnit -> Int
getTimeFrame time = case time of
Epoch _ _ f -> f
Year _ _ f -> f
Month _ _ f -> f
Week _ _ f -> f
Day _ _ f -> f
Epoch { .. } -> _epoch_matchingFrame
Year { .. } -> _year_matchingFrame
Month { .. } -> _month_matchingFrame
Week { .. } -> _week_matchingFrame
Day { .. } -> _day_matchingFrame
-------------
-- | Fis | --
......@@ -220,7 +220,7 @@ isNested :: Eq a => [a] -> [a] -> Bool
isNested l l'
| null l' = True
| length l' > length l = False
| (union l l') == l = True
| union l l' == l = True
| otherwise = False
......@@ -251,8 +251,8 @@ traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> "
traceFis :: [Char] -> Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
<> "Support : " <> (traceSupport mFis) <> "\n"
<> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
<> "Support : " <> traceSupport mFis <> "\n"
<> "Nb Ngrams : " <> traceClique mFis <> "\n" ) mFis
----------------
......@@ -323,7 +323,7 @@ findMaxima lst = map (hasMax) $ toChunk 3 lst
where
------
hasMax :: [(Double,Double)] -> Bool
hasMax chunk =
hasMax chunk =
if (length chunk) /= 3
then False
else (snd(chunk !! 0) < snd(chunk !! 1)) && (snd(chunk !! 2) < snd(chunk !! 1))
......@@ -331,7 +331,7 @@ findMaxima lst = map (hasMax) $ toChunk 3 lst
-- | split a list into chunks of size n
toChunk :: Int -> [a] -> [[a]]
toChunk n = takeWhile ((== n) . length) . transpose . take n . iterate tail
toChunk n = takeWhile ((== n) . length) . transpose . take n . iterate tail
-- | To compute the average degree from a cooc matrix
......@@ -343,7 +343,7 @@ toAverageDegree cooc roots = 2 * (fromIntegral $ Map.size cooc) / (fromIntegral
-- | Use the giant component regime to estimate the default level
-- http://networksciencebook.com/chapter/3#networks-supercritical
regimeToDefaultLevel :: Cooc -> Vector Ngrams -> Double
regimeToDefaultLevel cooc roots
regimeToDefaultLevel cooc roots
| avg == 0 = 1
| avg < 1 = avg * 0.6
| avg == 1 = 0.6
......@@ -356,26 +356,26 @@ regimeToDefaultLevel cooc roots
lnN = log (fromIntegral $ Vector.length roots)
coocToConfidence :: Phylo -> Cooc
coocToConfidence phylo =
coocToConfidence phylo =
let count = getRootsCount phylo
cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty
cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty
$ elems $ getCoocByDate phylo
in Map.mapWithKey (\(a,b) w -> confidence a b w count) cooc
where
where
----
-- confidence
confidence :: Int -> Int -> Double -> Map Int Double -> Double
confidence a b inter card = maximum [(inter / card ! a),(inter / card ! b)]
sumtest :: [Int] -> [Int] -> Int
sumtest :: [Int] -> [Int] -> Int
sumtest l1 l2 = (head' "test" l1) + (head' "test" $ reverse l2)
findDefaultLevel :: Phylo -> Phylo
findDefaultLevel phylo =
let confidence = Map.filterWithKey (\(a,b) _ -> a /= b)
$ Map.filter (> 0.01)
findDefaultLevel phylo =
let confidence = Map.filterWithKey (\(a,b) _ -> a /= b)
$ Map.filter (> 0.01)
$ coocToConfidence phylo
roots = getRoots phylo
level = regimeToDefaultLevel confidence roots
......@@ -488,7 +488,7 @@ getPhyloSeaRiseSteps :: Phylo -> Double
getPhyloSeaRiseSteps phylo = case (getSeaElevation phylo) of
Constante _ s -> s
Adaptative s -> s
Evolving _ -> 0.1
Evolving _ -> 0.1
getConfig :: Phylo -> PhyloConfig
......@@ -501,10 +501,10 @@ getLadder :: Phylo -> [Double]
getLadder phylo = phylo ^. phylo_seaLadder
getCoocByDate :: Phylo -> Map Date Cooc
getCoocByDate phylo = coocByDate (phylo ^. phylo_counts)
getCoocByDate phylo = coocByDate (phylo ^. phylo_counts)
getDocsByDate :: Phylo -> Map Date Double
getDocsByDate phylo = docsByDate (phylo ^. phylo_counts)
getDocsByDate phylo = docsByDate (phylo ^. phylo_counts)
getRootsCount :: Phylo -> Map Int Double
getRootsCount phylo = rootsCount (phylo ^. phylo_counts)
......@@ -599,10 +599,10 @@ updatePeriods periods' phylo =
) phylo
updateQuality :: Double -> Phylo -> Phylo
updateQuality quality phylo = phylo { _phylo_quality = quality }
updateQuality quality phylo = phylo { _phylo_quality = quality }
updateLevel :: Double -> Phylo -> Phylo
updateLevel level phylo = phylo { _phylo_level = level }
updateLevel level phylo = phylo { _phylo_level = level }
traceToPhylo :: Scale -> Phylo -> Phylo
traceToPhylo lvl phylo =
......@@ -697,7 +697,7 @@ getMinSharedNgrams :: PhyloSimilarity -> Int
getMinSharedNgrams proxi = case proxi of
WeightedLogJaccard _ m -> m
WeightedLogSim _ m -> m
Hamming _ _ -> undefined
Hamming _ _ -> undefined
----------------
-- | Branch | --
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment