Commit e0f3433b authored by Alexandre Delanoë's avatar Alexandre Delanoë

[GRAPH] Working on terms selection to complete balanced graph.

parent bb2042f3
......@@ -36,7 +36,7 @@ import Gargantext.Core.Types (CorpusId)
--import Gargantext.Database.Types.Node
import Gargantext.Prelude
--import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
--import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
--import Gargantext.Text.Metrics.Count (coocOn)
--import Gargantext.Text.Parsers.CSV
--import Gargantext.Text.Terms (TermType, extractTerms)
......@@ -120,21 +120,21 @@ cooc2graph :: (Map (Text, Text) Int) -> IO Graph
cooc2graph myCooc = do
--printDebug "myCooc" myCooc
-- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
{-
let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
(InclusionSize 500 )
(SampleBins 10 )
(Clusters 3 )
(DefaultValue 0 )
) myCooc
--printDebug "myCooc3 size" $ M.size myCooc3
--printDebug "myCooc3" myCooc3
--} --printDebug "myCooc3 size" $ M.size myCooc3
-- Cooc -> Matrix
let (ti, _) = createIndices myCooc3
let (ti, _) = createIndices myCooc
--printDebug "ti size" $ M.size ti
--printDebug "ti" ti
let myCooc4 = toIndex ti myCooc3
let myCooc4 = toIndex ti myCooc
--printDebug "myCooc4 size" $ M.size myCooc4
--printDebug "myCooc4" myCooc4
......@@ -153,7 +153,7 @@ cooc2graph myCooc = do
--printDebug "distanceMap size" $ M.size distanceMap
--printDebug "distanceMap" distanceMap
-- let distance = fromIndex fi distanceMap
--let distance = fromIndex fi distanceMap
--printDebug "distance" $ M.size distance
partitions <- case Map.size distanceMap > 0 of
......
......@@ -106,8 +106,8 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs
ys = take b $ drop a ns
zs = drop b $ drop a ns
a = 100
b = 1000
a = 1
b = 10000
isStopTerm :: Text -> Bool
isStopTerm x = Text.length x < 3
......
......@@ -38,28 +38,25 @@ data SampleBins = SampleBins Double
data Clusters = Clusters Int
data DefaultValue = DefaultValue Int
data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
, fc_inclusionSize :: InclusionSize
, fc_sampleBins :: SampleBins
, fc_clusters :: Clusters
, fc_defaultValue :: DefaultValue
}
data FilterConfig = FilterConfig
{ fc_mapListSize :: MapListSize
, fc_inclusionSize :: InclusionSize
, fc_sampleBins :: SampleBins
, fc_clusters :: Clusters
, fc_defaultValue :: DefaultValue
}
filterCooc :: (Show t, Ord t) => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc
where
ts = map _scored_terms $ takeSome fc $ coocScored cc
filterCooc' :: (Show t, Ord t) => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m =
-- trace ("coocScored " <> show ts) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
M.empty selection
where
selection = [(x,y) | x <- ts
, y <- ts
, x > y
]
M.empty (listToCombi identity ts)
-- | Map list creation
......@@ -70,7 +67,8 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m =
takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
$ takeSample n m
$ L.take l' $ reverse $ sortWith (Down . _scored_incExc) scores
$ L.take l'
$ reverse $ sortWith (Down . _scored_incExc) scores
-- splitKmeans k scores
where
-- TODO: benchmark with accelerate-example kmeans version
......@@ -90,16 +88,17 @@ takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Cluste
$ sortWith (Down . _scored_speGen) xs
data Scored ts = Scored { _scored_terms :: !ts
, _scored_incExc :: !InclusionExclusion
, _scored_speGen :: !SpecificityGenericity
} deriving (Show)
data Scored ts = Scored
{ _scored_terms :: !ts
, _scored_incExc :: !InclusionExclusion
, _scored_speGen :: !SpecificityGenericity
} deriving (Show)
-- TODO in the textflow we end up needing these indices, it might be better
-- to compute them earlier and pass them around.
coocScored :: Ord t => Map (t,t) Int -> [Scored t]
coocScored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
where
(ti,fi) = createIndices m
(ti, fi) = createIndices m
(is, ss) = incExcSpeGen $ cooc2mat ti m
scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment