Commit 8a838c7f authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch 'pipeline'

parents a630946f 45c3bb43
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-|
Module : Data.ByteString.Extended
Description : Short description
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Example showing how to extend existing base libraries.
-}
module Data.ByteString.Extended ( module Data.ByteString
, replace
) where
import Data.ByteString
replace :: ByteString -> ByteString -> ByteString -> ByteString
replace = undefined
-- instance (Binary k, Binary v) => Binary (HaskMap k v) where
-- ...
......@@ -42,18 +42,16 @@ main = do
let q = ["gratuit", "gratuité", "culture", "culturel"]
(h,csvDocs) <- readCsv rPath
putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
putStrLn $ "Mean size of docs:" <> show ( docsSize csvDocs)
let docs = toDocs csvDocs
let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine (map pack q)
let docs' = fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn $ "Number of documents after:" <> show (V.length docs')
putStrLn $ "Mean size of docs:" <> show (docsSize docs')
writeCsv wPath (h, docs')
writeCsv wPath (h, docs')
......@@ -24,6 +24,7 @@ library:
# - -Werror
exposed-modules:
- Gargantext
- Gargantext.Pipeline
- Gargantext.Prelude
- Gargantext.Core
- Gargantext.Core.Types
......@@ -122,25 +123,38 @@ library:
- zlib
# - utc
executable:
main: Main.hs
source-dirs: app
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
executables:
gargantext:
main: Main.hs
source-dirs: app
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
- base
- containers
- gargantext
- vector
- cassava
- ini
- optparse-generic
- unordered-containers
- full-text-search
gargantext-workflow:
main: Main.hs
source-dirs: app-workflow
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
dependencies:
- base
- containers
- gargantext
- vector
- cassava
- ini
- optparse-generic
- unordered-containers
- full-text-search
tests:
garg-test:
......
......@@ -14,26 +14,31 @@ Portability : POSIX
module Gargantext.Pipeline
where
import qualified Data.Text as T
import Data.Text.IO (readFile)
import Control.Arrow ((***))
import Data.Map.Strict (Map)
import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M
import qualified Data.List as L
import Data.Tuple.Extra (both)
----------------------------------------------
import Gargantext.Core (Lang(FR))
import Gargantext.Core.Types (Label)
import Gargantext.Prelude
import Prelude (print, seq)
import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional)
import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (conditional', conditional, distributional)
import Gargantext.Viz.Graph.Index (Index)
import Gargantext.Text.Metrics.Count (cooc, removeApax)
import Gargantext.Viz.Graph (Graph(..), Node(..), Edge(..), Attributes(..), TypeNode(..))
import Gargantext.Text.Metrics.Count (cooc)
import Gargantext.Text.Metrics
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
{-
......@@ -46,27 +51,94 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
-}
pipeline path = do
workflow lang path = do
-- Text <- IO Text <- FilePath
text <- readFile path
let contexts = splitBy (Sentences 5) text
myterms <- extractTerms Multi FR contexts
-- TODO filter (\t -> not . elem t stopList) myterms
-- TODO groupBy (Stem | GroupList)
-- Context :: Text -> [Text]
-- Contexts = Paragraphs n | Sentences n | Chars n
myterms <- extractTerms (Mono lang) contexts
-- myterms # filter (\t -> not . elem t stopList)
-- # groupBy (Stem|GroupList)
printDebug "myterms" (sum $ map length myterms)
-- Bulding the map list
-- compute copresences of terms
-- Cooc = Map (Term, Term) Int
let myCooc1 = cooc myterms
printDebug "myCooc1" (M.size myCooc1)
-- Remove Apax: appears one time only => lighting the matrix
let myCooc2 = M.filter (>1) myCooc1
printDebug "myCooc2" (M.size myCooc2)
let myCooc = removeApax $ cooc myterms
--let (ti, fi) = createIndices myCooc
pure True
--pure $ incExcSpeGen myCooc
-- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 20 )
(InclusionSize 1000 )
(SampleBins 10 )
(Clusters 3 )
(DefaultValue (-1))
) myCooc2
printDebug "myCooc3" $ M.size myCooc3
-- Cooc -> Matrix
-- -- filter by spec/gen (dynmaic programming)
-- let theScores = M.filter (>0) $ score conditional myCoocFiltered
----
------ -- Matrix -> Clustering
------ pure $ bestpartition False $ map2graph $ toIndex ti theScores
-- partitions <- cLouvain theScores
-- pure partitions
---- | Building : -> Graph -> JSON
let (ti, fi) = createIndices myCooc3
printDebug "ti" $ M.size ti
let myCooc4 = toIndex ti myCooc3
printDebug "myCooc4" $ M.size myCooc4
let matCooc = map2mat (-2) (M.size ti) myCooc4
printDebug "matCooc" matCooc
pure matCooc
-- Matrix -> Clustering
--let distanceMat = conditional matCooc
-- let distanceMat = distributional matCooc
-- printDebug "distanceMat" $ A.arrayShape distanceMat
-- printDebug "distanceMat" distanceMat
--
-- let distanceMap = mat2map distanceMat
-- printDebug "distanceMap" $ M.size distanceMap
--{-
-- let distance = fromIndex fi distanceMap
-- printDebug "distance" $ M.size distance
---}
-- partitions <- cLouvain distanceMap
------ | Building : -> Graph -> JSON
-- printDebug "partitions" $ length partitions
-- pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
-----------------------------------------------------------
-- distance should not be a map since we just "toList" it (same as cLouvain)
data2graph :: [(Label, Int)] -> Map (Int, Int) Int
-> Map (Int, Int) Double
-> [LouvainNode]
-> Graph
data2graph labels coocs distance partitions = Graph nodes edges
where
community_id_by_node_id = M.fromList [ (n, c) | LouvainNode n c <- partitions ]
nodes = [ Node { n_size = maybe 0 identity (M.lookup (n,n) coocs)
, n_type = Terms -- or Unknown
, n_id = cs (show n)
, n_label = T.unwords l
, n_attributes =
Attributes { clust_default = maybe 0 identity
(M.lookup n community_id_by_node_id) } }
| (l, n) <- labels ]
edges = [ Edge { e_source = s
, e_target = t
, e_weight = w
, e_id = i }
| (i, ((s,t), w)) <- zip [0..] (M.toList distance) ]
-----------------------------------------------------------
printDebug msg x = putStrLn $ msg <> " " <> show x
--printDebug _ _ = pure ()
......@@ -50,6 +50,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, undefined
, IO()
, compare
, on
)
-- TODO import functions optimized in Utils.Count
......@@ -109,7 +110,7 @@ ma = movingAverage 3
-- | splitEvery n == chunkAlong n n
splitEvery :: Int -> [a] -> [[a]]
splitEvery _ [] = L.cycle [[]]
splitEvery _ [] = []
splitEvery n xs =
let (h,t) = L.splitAt n xs
in h : splitEvery n t
......@@ -235,5 +236,5 @@ unMaybe :: [Maybe a] -> [a]
unMaybe = map fromJust . L.filter isJust
-- maximumWith
maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))
maximumWith f = L.maximumBy (compare `on` f)
......@@ -16,6 +16,7 @@ noApax m = M.filter (>1) m
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
......@@ -50,55 +51,99 @@ import Gargantext.Viz.Graph.Index
import qualified Data.Array.Accelerate.Interpreter as DAA
import qualified Data.Array.Accelerate as DAA
-- import Data.Array.Accelerate ((:.)(..), Z(..))
import GHC.Real (round)
--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m =
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
----(ti, fi) = createIndices m
-- . fromIndex fi $ filterMat $ cooc2mat ti m
import Debug.Trace
import Prelude (seq)
data MapListSize = MapListSize Int
data InclusionSize = InclusionSize Int
data SampleBins = SampleBins Double
data Clusters = Clusters Int
data DefaultValue = DefaultValue Int
type ListSize = Int
type BinSize = Double
data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
, fc_inclusionSize :: InclusionSize
, fc_sampleBins :: SampleBins
, fc_clusters :: Clusters
, fc_defaultValue :: DefaultValue
}
-- Map list creation
-- Kmean split into 2 main clusters with Inclusion/Exclusion (relevance score)
-- Sample the main cluster ordered by specificity/genericity in s parts
filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc
where
ts = map _scored_terms $ takeSome fc $ coocScored cc
filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
M.empty selection
where
selection = [(x,y) | x <- ts, y <- ts, x > y]
-- | Map list creation
-- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
-- each parts is then ordered by Inclusion/Exclusion
-- take n scored terms in each parts where n * s = l
takeSome :: Ord t => ListSize -> BinSize -> [Scored t] -> [Scored t]
takeSome l s scores = L.take l
-- take n scored terms in each parts where n * SampleBins = MapListSize.
takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
$ takeSample n m
$ splitKmeans 2 scores
$ L.take l' $ L.reverse $ L.sortOn _scored_incExc scores
-- $ splitKmeans k scores
where
-- (TODO: benchmark with accelerate-example kmeans version)
splitKmeans x xs = elements
$ V.head
-- TODO: benchmark with accelerate-example kmeans version
splitKmeans x xs = L.concat $ map elements
$ V.take (k-1)
$ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
euclidSq x xs
n = round ((fromIntegral l)/s)
m = round $ (fromIntegral $ length scores) / (s)
takeSample n m xs = L.concat $ map (L.take n)
$ L.reverse $ map (L.sortOn _scored_incExc)
takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
L.concat $ map (L.take n)
$ map (reverse . (L.sortOn _scored_incExc))
-- TODO use kmeans s instead of splitEvery
-- in order to split in s heteregenous parts
-- without homogeneous order hypothesis
$ splitEvery m
$ L.reverse $ L.sortOn _scored_speGen xs
data Scored t = Scored { _scored_terms :: t
, _scored_incExc :: InclusionExclusion
, _scored_speGen :: SpecificityGenericity
data Scored t = Scored { _scored_terms :: !t
, _scored_incExc :: !InclusionExclusion
, _scored_speGen :: !SpecificityGenericity
} deriving (Show)
incExcSpeGen_sorted' :: Ord t => Map (t,t) Int -> [Scored t]
incExcSpeGen_sorted' m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
coocScored :: Ord t => Map (t,t) Int -> [Scored t]
coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
where
(ti,fi) = createIndices m
(is, ss) = incExcSpeGen $ cooc2mat ti m
scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
where
......@@ -107,7 +152,6 @@ incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
metrics_text :: Text
metrics_text = T.intercalate " " metrics_sentences
......@@ -136,7 +180,7 @@ metrics_sentences_Test = metrics_sentences == metrics_sentences'
-}
metrics_terms :: IO [[Terms]]
metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
-- | Occurrences
{-
......
......@@ -75,14 +75,15 @@ type Grouped = Stems
type Occs = Int
type Coocs = Int
removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
removeApax = DMS.filter (> 1)
type Threshold = Int
removeApax :: Threshold -> Map (Label, Label) Int -> Map (Label, Label) Int
removeApax t = DMS.filter (> t)
cooc :: [[Terms]] -> Map (Label, Label) Int
cooc tss = coocOnWithLabel _terms_stem (labelPolicy terms_occs) tss
cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
where
terms_occs = occurrencesOn _terms_stem (List.concat tss)
label_policy = mkLabelPolicy terms_occs
coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
......@@ -93,10 +94,21 @@ coocOnWithLabel on policy tss =
delta f = f *** f
mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped Label
mkLabelPolicy = DMS.map f where
f = _terms_label . fst . maximumWith snd . DMS.toList
-- TODO use the Foldable instance of Map instead of building a list
useLabelPolicy :: Map Grouped Label -> Grouped -> Label
useLabelPolicy m g = case DMS.lookup g m of
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
{-
labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
-}
coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as
......
......@@ -42,23 +42,23 @@ import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoterms')
data TermType = Mono | Multi | MonoMulti
data TermType lang = Mono lang | Multi lang | MonoMulti lang
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------
-- | Sugar to extract terms from text (hiddeng mapM from end user).
extractTerms :: Traversable t => TermType -> Lang -> t Text -> IO (t [Terms])
extractTerms termType lang = mapM (terms termType lang)
extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms termTypeLang = mapM (terms termTypeLang)
------------------------------------------------------------------------
-- | Terms from Text
-- Mono : mono terms
-- Multi : multi terms
-- MonoMulti : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType -> Lang -> Text -> IO [Terms]
terms Mono lang txt = pure $ monoterms' lang txt
terms Multi lang txt = multiterms lang txt
terms MonoMulti lang txt = terms Multi lang txt
terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoterms' lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
------------------------------------------------------------------------
......@@ -19,11 +19,14 @@ module Gargantext.Viz.Graph
import GHC.Generics (Generic)
import Data.Aeson.TH (deriveJSON)
import Data.Text (Text)
import Data.Map (Map)
import Gargantext.Prelude
import Gargantext.Core.Utils.Prefix (unPrefix)
-----------------------------------------------------------
import Data.Graph.Clustering.Louvain.CplusPlus (LouvainNode)
data TypeNode = Terms | Unknown
deriving (Show, Generic)
......@@ -55,8 +58,6 @@ data Graph = Graph { g_nodes :: [Node]
}
deriving (Show, Generic)
$(deriveJSON (unPrefix "g_") ''Graph)
-----------------------------------------------------------
......@@ -109,7 +109,6 @@ conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m)
conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
where
ie :: Acc (Matrix Double) -> Acc (Matrix Double)
ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
sg :: Acc (Matrix Double) -> Acc (Matrix Double)
......@@ -149,8 +148,6 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
crossProduct m = zipWith (*) (cross m ) (cross (transpose m))
cross mat = zipWith (-) (mkSum n mat) (mat)
-----------------------------------------------------------------------
-----------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment