Commit 15511563 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[WORKFLOW] clean, issue in map2mat: diagonal == 0.

parent e079af35
......@@ -30,10 +30,10 @@ import Gargantext.Prelude
import Prelude (print, seq)
import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional)
import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
import Gargantext.Viz.Graph.Index (Index)
import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
import Gargantext.Text.Metrics.Count (cooc, removeApax)
import Gargantext.Text.Metrics.Count (cooc)
import Gargantext.Text.Metrics
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
......@@ -51,19 +51,80 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
-}
workflow lang path = do
-- Text <- IO Text <- FilePath
text <- readFile path
-- context :: Text -> [Text]
let contexts = splitBy (Sentences 5) text
myterms <- extractTerms Mono lang contexts
-- myterms <- extractTerms (Mono lang) contexts # filter (\t -> not . elem t stopList)
-- # groupBy (Stem|GroupList)
printDebug "myterms" (sum $ map length myterms)
-- Bulding the map list
let myCooc1 = cooc myterms
printDebug "myCooc1" (M.size myCooc1)
-- Remove Apax: appears one time only => lighting the matrix
let myCooc2 = M.filter (>1) myCooc1
printDebug "myCooc2" (M.size myCooc2)
-- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 20 )
(InclusionSize 1000 )
(SampleBins 10 )
(Clusters 3 )
(DefaultValue (-1))
) myCooc2
printDebug "myCooc3" $ M.size myCooc3
-- Cooc -> Matrix
let (ti, fi) = createIndices myCooc3
printDebug "ti" $ M.size ti
let myCooc4 = toIndex ti myCooc3
printDebug "myCooc4" $ M.size myCooc4
let matCooc = map2mat (-2) (M.size ti) myCooc4
printDebug "matCooc" matCooc
pure matCooc
-- Matrix -> Clustering
--let distanceMat = conditional matCooc
-- let distanceMat = distributional matCooc
-- printDebug "distanceMat" $ A.arrayShape distanceMat
-- printDebug "distanceMat" distanceMat
--
-- let distanceMap = mat2map distanceMat
-- printDebug "distanceMap" $ M.size distanceMap
--{-
-- let distance = fromIndex fi distanceMap
-- printDebug "distance" $ M.size distance
---}
-- partitions <- cLouvain distanceMap
------ | Building : -> Graph -> JSON
-- printDebug "partitions" $ length partitions
-- pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
-----------------------------------------------------------
-- distance should not be a map since we just "toList" it (same as cLouvain)
data2graph :: [(Label, Int)] -> Map (Int, Int) Int -> Map (Int, Int) Double -> [LouvainNode] -> Graph
data2graph :: [(Label, Int)] -> Map (Int, Int) Int
-> Map (Int, Int) Double
-> [LouvainNode]
-> Graph
data2graph labels coocs distance partitions = Graph nodes edges
where
community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
nodes = [ Node { n_size = coocs M.! (n, n) -- TODO lookup with default ?
nodes = [ Node { n_size = maybe 0 identity (M.lookup (n,n) coocs)
, n_type = Terms -- or Unknown
, n_id = cs (show n)
, n_label = T.unwords l
, n_attributes =
-- TODO lookup with default ?
Attributes { clust_default = community_id_by_node_id M.! n } }
, n_attributes =
Attributes { clust_default = maybe 0 identity
(M.lookup n community_id_by_node_id) } }
| (l, n) <- labels ]
edges = [ Edge { e_source = s
, e_target = t
......@@ -72,42 +133,9 @@ data2graph labels coocs distance partitions = Graph nodes edges
| (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
-----------------------------------------------------------
-- printDebug msg x = putStrLn $ msg <> " " <> show x
printDebug _ _ = pure ()
printDebug msg x = putStrLn $ msg <> " " <> show x
--printDebug _ _ = pure ()
workflow lang path = do
-- Text <- IO Text <- FilePath
text <- readFile path
let contexts = splitBy (Sentences 5) text
myterms <- extractTerms Mono lang contexts
printDebug "myterms" $ sum $ map length myterms
-- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList)
let myCooc1 = cooc myterms
printDebug "myCooc1" $ M.size myCooc1
let myCooc2 = removeApax myCooc1
printDebug "myCooc2" $ M.size myCooc2
let myCooc3 = filterCooc myCooc2
printDebug "myCooc3" $ M.size myCooc3
-- Cooc -> Matrix
let (ti, fi) = createIndices myCooc3
printDebug "ti" $ M.size ti
let myCooc4 = toIndex ti myCooc3
printDebug "myCooc4" $ M.size myCooc4
let matCooc = map2mat 0 (M.size ti) myCooc4
-- Matrix -> Clustering
let distanceMat = conditional matCooc
printDebug "distanceMat" $ A.arrayShape distanceMat
let distanceMap = mat2map distanceMat
printDebug "distanceMap" $ M.size distanceMap
{-
let distance = fromIndex fi distanceMap
printDebug "distance" $ M.size distance
-}
partitions <- cLouvain distanceMap
---- | Building : -> Graph -> JSON
printDebug "partitions" $ length partitions
pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
......@@ -58,48 +58,58 @@ import GHC.Real (round)
import Debug.Trace
import Prelude (seq)
filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
filterCooc cc = filterCooc' ts cc
data MapListSize = MapListSize Int
data InclusionSize = InclusionSize Int
data SampleBins = SampleBins Double
data Clusters = Clusters Int
data DefaultValue = DefaultValue Int
data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
, fc_inclusionSize :: InclusionSize
, fc_sampleBins :: SampleBins
, fc_clusters :: Clusters
, fc_defaultValue :: DefaultValue
}
filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc
where
ts = map _scored_terms $ takeSome 350 5 2 $ coocScored cc
ts = map _scored_terms $ takeSome fc $ coocScored cc
filterCooc' :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' ts m = -- trace ("coocScored " <> show (length ts)) $
foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m')
filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
M.empty selection
where
errMessage = panic "Filter cooc: no key"
selection = [(x,y) | x <- ts, y <- ts, x > y]
type MapListSize = Int
type SampleBins = Double
type Clusters = Int
-- | Map list creation
-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
-- each parts is then ordered by Inclusion/Exclusion
-- take n scored terms in each parts where n * SampleBins = MapListSize.
takeSome :: Ord t => MapListSize -> SampleBins -> Clusters -> [Scored t] -> [Scored t]
takeSome l s k scores = L.take l
takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
$ takeSample n m
$ splitKmeans k scores
$ L.take l' $ L.reverse $ L.sortOn _scored_incExc scores
-- $ splitKmeans k scores
where
-- TODO: benchmark with accelerate-example kmeans version
splitKmeans x xs = elements
$ V.head
splitKmeans x xs = L.concat $ map elements
$ V.take (k-1)
$ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
euclidSq x xs
n = round ((fromIntegral l)/s)
m = round $ (fromIntegral $ length scores) / (s)
takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
L.concat $ map (L.take n)
$ L.reverse $ map (L.sortOn _scored_incExc)
$ map (reverse . (L.sortOn _scored_incExc))
-- TODO use kmeans s instead of splitEvery
-- in order to split in s heteregenous parts
-- without homogeneous order hypothesis
$ splitEvery m
$ splitEvery m
$ L.reverse $ L.sortOn _scored_speGen xs
......
......@@ -75,9 +75,9 @@ type Grouped = Stems
type Occs = Int
type Coocs = Int
removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
removeApax = DMS.filter (> 1)
type Threshold = Int
removeApax :: Threshold -> Map (Label, Label) Int -> Map (Label, Label) Int
removeApax t = DMS.filter (> t)
cooc :: [[Terms]] -> Map (Label, Label) Int
cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
......
......@@ -148,8 +148,6 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
crossProduct m = zipWith (*) (cross m ) (cross (transpose m))
cross mat = zipWith (-) (mkSum n mat) (mat)
-----------------------------------------------------------------------
-----------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment