Commit c44c07c7 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[COOC FILTERING] workflow done but optimization issue.

parent e5419936
...@@ -46,27 +46,23 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain) ...@@ -46,27 +46,23 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
-} -}
pipeline path = do workflow lang path = do
-- Text <- IO Text <- FilePath -- Text <- IO Text <- FilePath
text <- readFile path text <- readFile path
let contexts = splitBy (Sentences 5) text let contexts = splitBy (Sentences 5) text
myterms <- extractTerms Multi FR contexts myterms <- extractTerms Multi lang contexts
-- TODO filter (\t -> not . elem t stopList) myterms -- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList) -- TODO groupBy (Stem | GroupList)
let myCooc = removeApax $ cooc myterms let myCooc = filterCooc $ removeApax $ cooc myterms
--let (ti, fi) = createIndices myCooc
pure True
--pure $ incExcSpeGen myCooc
-- Cooc -> Matrix -- Cooc -> Matrix
--let (ti, fi) = createIndices myCooc
-- -- filter by spec/gen (dynmaic programming) -- @np FIXME optimization issue of filterCooc (too much memory consumed)
-- let theScores = M.filter (>0) $ score conditional myCoocFiltered pure myCooc
---- -- Matrix -> Clustering
------ -- Matrix -> Clustering -- pure $ bestpartition False $ map2graph $ toIndex ti myCooc
------ pure $ bestpartition False $ map2graph $ toIndex ti theScores --partitions <- cLouvain $ toIndex ti $ M.map (\v -> (fromIntegral v) :: Double) myCooc
-- partitions <- cLouvain theScores --pure partitions
-- pure partitions
---- | Building : -> Graph -> JSON ---- | Building : -> Graph -> JSON
...@@ -16,6 +16,7 @@ noApax m = M.filter (>1) m ...@@ -16,6 +16,7 @@ noApax m = M.filter (>1) m
-} -}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
...@@ -53,16 +54,21 @@ import qualified Data.Array.Accelerate as DAA ...@@ -53,16 +54,21 @@ import qualified Data.Array.Accelerate as DAA
import GHC.Real (round) import GHC.Real (round)
--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m = filterCooc cc = filterCooc' ts cc
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection where
----(ti, fi) = createIndices m ts = map _scored_terms $ takeSome 350 5 2 $ coocScored cc
-- . fromIndex fi $ filterMat $ cooc2mat ti m
filterCooc' :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' ts m = foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m') M.empty selection
where
errMessage = panic "Filter cooc: no key"
selection = [(x,y) | x <- ts, y <- ts, x > y]
type MapListSize = Int type MapListSize = Int
type SampleBins = Double type SampleBins = Double
type Clusters = Int type Clusters = Int
-- | Map list creation -- | Map list creation
-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score) -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
...@@ -83,23 +89,44 @@ takeSome l s k scores = L.take l ...@@ -83,23 +89,44 @@ takeSome l s k scores = L.take l
m = round $ (fromIntegral $ length scores) / (s) m = round $ (fromIntegral $ length scores) / (s)
takeSample n m xs = L.concat $ map (L.take n) takeSample n m xs = L.concat $ map (L.take n)
$ L.reverse $ map (L.sortOn _scored_incExc) $ L.reverse $ map (L.sortOn _scored_incExc)
$ splitEvery m -- TODO use kmeans s instead of splitEvery
-- in order to split in s heteregenous parts
-- without homogeneous order hypothesis
$ splitEvery m
$ L.reverse $ L.sortOn _scored_speGen xs $ L.reverse $ L.sortOn _scored_speGen xs
data Scored t = Scored { _scored_terms :: t data Scored t = Scored { _scored_terms :: !t
, _scored_incExc :: InclusionExclusion , _scored_incExc :: !InclusionExclusion
, _scored_speGen :: SpecificityGenericity , _scored_speGen :: !SpecificityGenericity
} deriving (Show) } deriving (Show)
incExcSpeGen_sorted' :: Ord t => Map (t,t) Int -> [Scored t] coocScored :: Ord t => Map (t,t) Int -> [Scored t]
incExcSpeGen_sorted' m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
where where
(ti,fi) = createIndices m (ti,fi) = createIndices m
(is, ss) = incExcSpeGen $ cooc2mat ti m (is, ss) = incExcSpeGen $ cooc2mat ti m
scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss) scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)]) incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m) incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
where where
...@@ -108,7 +135,6 @@ incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m) ...@@ -108,7 +135,6 @@ incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
metrics_text :: Text metrics_text :: Text
metrics_text = T.intercalate " " metrics_sentences metrics_text = T.intercalate " " metrics_sentences
......
...@@ -55,8 +55,6 @@ data Graph = Graph { g_nodes :: [Node] ...@@ -55,8 +55,6 @@ data Graph = Graph { g_nodes :: [Node]
} }
deriving (Show, Generic) deriving (Show, Generic)
$(deriveJSON (unPrefix "g_") ''Graph) $(deriveJSON (unPrefix "g_") ''Graph)
----------------------------------------------------------- -----------------------------------------------------------
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment