Commit c44c07c7 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[COOC FILTERING] workflow done but optimization issue.

parent e5419936
......@@ -46,27 +46,23 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
-}
pipeline path = do
workflow lang path = do
-- Text <- IO Text <- FilePath
text <- readFile path
let contexts = splitBy (Sentences 5) text
myterms <- extractTerms Multi FR contexts
myterms <- extractTerms Multi lang contexts
-- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList)
let myCooc = removeApax $ cooc myterms
--let (ti, fi) = createIndices myCooc
pure True
--pure $ incExcSpeGen myCooc
let myCooc = filterCooc $ removeApax $ cooc myterms
-- Cooc -> Matrix
-- -- filter by spec/gen (dynmaic programming)
-- let theScores = M.filter (>0) $ score conditional myCoocFiltered
----
------ -- Matrix -> Clustering
------ pure $ bestpartition False $ map2graph $ toIndex ti theScores
-- partitions <- cLouvain theScores
-- pure partitions
--let (ti, fi) = createIndices myCooc
-- @np FIXME optimization issue of filterCooc (too much memory consumed)
pure myCooc
-- Matrix -> Clustering
-- pure $ bestpartition False $ map2graph $ toIndex ti myCooc
--partitions <- cLouvain $ toIndex ti $ M.map (\v -> (fromIntegral v) :: Double) myCooc
--pure partitions
---- | Building : -> Graph -> JSON
......@@ -16,6 +16,7 @@ noApax m = M.filter (>1) m
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
......@@ -53,16 +54,21 @@ import qualified Data.Array.Accelerate as DAA
import GHC.Real (round)
--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m =
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
----(ti, fi) = createIndices m
-- . fromIndex fi $ filterMat $ cooc2mat ti m
filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
filterCooc cc = filterCooc' ts cc
where
ts = map _scored_terms $ takeSome 350 5 2 $ coocScored cc
filterCooc' :: Ord t => [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' ts m = foldl' (\m' k -> M.insert k (maybe errMessage identity $ M.lookup k m) m') M.empty selection
where
errMessage = panic "Filter cooc: no key"
selection = [(x,y) | x <- ts, y <- ts, x > y]
type MapListSize = Int
type SampleBins = Double
type Clusters = Int
type MapListSize = Int
type SampleBins = Double
type Clusters = Int
-- | Map list creation
-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
......@@ -83,23 +89,44 @@ takeSome l s k scores = L.take l
m = round $ (fromIntegral $ length scores) / (s)
takeSample n m xs = L.concat $ map (L.take n)
$ L.reverse $ map (L.sortOn _scored_incExc)
$ splitEvery m
-- TODO use kmeans s instead of splitEvery
-- in order to split in s heteregenous parts
-- without homogeneous order hypothesis
$ splitEvery m
$ L.reverse $ L.sortOn _scored_speGen xs
data Scored t = Scored { _scored_terms :: t
, _scored_incExc :: InclusionExclusion
, _scored_speGen :: SpecificityGenericity
data Scored t = Scored { _scored_terms :: !t
, _scored_incExc :: !InclusionExclusion
, _scored_speGen :: !SpecificityGenericity
} deriving (Show)
incExcSpeGen_sorted' :: Ord t => Map (t,t) Int -> [Scored t]
incExcSpeGen_sorted' m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
coocScored :: Ord t => Map (t,t) Int -> [Scored t]
coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
where
(ti,fi) = createIndices m
(is, ss) = incExcSpeGen $ cooc2mat ti m
scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
where
......@@ -108,7 +135,6 @@ incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
metrics_text :: Text
metrics_text = T.intercalate " " metrics_sentences
......
......@@ -55,8 +55,6 @@ data Graph = Graph { g_nodes :: [Node]
}
deriving (Show, Generic)
$(deriveJSON (unPrefix "g_") ''Graph)
-----------------------------------------------------------
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment