Commit c34120e3 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] fix cooc behavior.

parent 09cf2917
...@@ -16,19 +16,44 @@ module Gargantext.Pipeline ...@@ -16,19 +16,44 @@ module Gargantext.Pipeline
where where
import Data.Text.IO (readFile) import Data.Text.IO (readFile)
import Control.Arrow ((***))
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M import qualified Data.Map.Strict as M
import qualified Data.Set as S
import qualified Data.List as L
import Data.Tuple.Extra (both)
---------------------------------------------- ----------------------------------------------
import Gargantext.Core (Lang(FR)) import Gargantext.Core (Lang(FR))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (score, createIndices, toIndex) import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (conditional) import Gargantext.Viz.Graph.Distances.Matrice (incExcSpeGen, conditional)
import Gargantext.Viz.Graph.Index (Index)
import Gargantext.Text.Metrics.Occurrences (cooc, removeApax) import Gargantext.Text.Metrics.Occurrences (cooc, removeApax)
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms) import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain) import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m =
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
----(ti, fi) = createIndices m
-- . fromIndex fi $ filterMat $ cooc2mat ti m
import Data.Array.Accelerate (Matrix)
filterMat :: Matrix Int -> [(Index, Index)]
filterMat m = S.toList $ S.take n $ S.fromList $ (L.take nIe incExc') <> (L.take nSg speGen')
where
(incExc', speGen') = both ( map fst . L.sortOn snd . M.toList . mat2map) (incExcSpeGen m)
n = nIe + nSg
nIe = 30
nSg = 70
pipeline path = do pipeline path = do
-- Text <- IO Text <- FilePath -- Text <- IO Text <- FilePath
text <- readFile path text <- readFile path
...@@ -39,12 +64,17 @@ pipeline path = do ...@@ -39,12 +64,17 @@ pipeline path = do
-- TODO groupBy (Stem | GroupList) -- TODO groupBy (Stem | GroupList)
let myCooc = removeApax $ cooc myterms let myCooc = removeApax $ cooc myterms
let (ti, fi) = createIndices myCooc
pure ti
-- Cooc -> Matrix -- Cooc -> Matrix
let theScores = M.take 350 $ M.filter (>0) $ score conditional myCooc
let (ti, _) = createIndices theScores -- -- filter by spec/gen (dynmaic programming)
-- -- let theScores = M.filter (>0) $ score conditional myCoocFiltered
---- -- Matrix -> Clustering -> Graph -> JSON ----
---- pure $ bestpartition False $ map2graph $ toIndex ti theScores ------ -- Matrix -> Clustering
partitions <- cLouvain $ toIndex ti theScores ------ pure $ bestpartition False $ map2graph $ toIndex ti theScores
pure partitions -- partitions <- cLouvain theScores
-- pure partitions
---- | Building : -> Graph -> JSON
...@@ -237,5 +237,3 @@ unMaybe = map fromJust . L.filter isJust ...@@ -237,5 +237,3 @@ unMaybe = map fromJust . L.filter isJust
-- maximumWith -- maximumWith
maximumWith f = L.maximumBy (\x y -> compare (f x) (f y)) maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))
...@@ -14,12 +14,69 @@ Mainly reexport functions in @Data.Text.Metrics@ ...@@ -14,12 +14,69 @@ Mainly reexport functions in @Data.Text.Metrics@
module Gargantext.Text.Metrics where module Gargantext.Text.Metrics where
--import Data.Text (Text) import Data.Text (Text, pack)
import Data.List (concat)
--import GHC.Real (Ratio) --import GHC.Real (Ratio)
--import qualified Data.Text.Metrics as DTM --import qualified Data.Text.Metrics as DTM
--
--import Gargantext.Prelude import Gargantext.Prelude
--
import Gargantext.Text.Metrics.Occurrences (occurrences, cooc)
import Gargantext.Text.Terms (TermType(Multi), terms)
import Gargantext.Core (Lang(EN))
import Gargantext.Core.Types (Terms(..))
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
--noApax :: Ord a => Map a Occ -> Map a Occ --noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m --noApax m = M.filter (>1) m
metrics_text :: Text
metrics_text = "A table is an object. A glas is an object. The glas is on the table. The spoon is an object. The spoon is on the table."
-- | Sentences
metrics_sentences :: [Text]
metrics_sentences = [ "A table is an object."
, "A glas is an object."
, "The glas is on the table."
, "The spoon is an object."
, "The spoon is on the table."
]
metrics_sentences_Test = splitBy (Sentences 0) metrics_text == metrics_sentences
-- | Terms reordered to visually check occurrences
metrics_terms :: [[[Text]]]
metrics_terms = [[["table"],["object"] ]
,[ ["object"],["glas"] ]
,[["table"], ["glas"] ]
,[ ["object"], ["spoon"]]
,[["table"], ["spoon"]]
]
--metrics_terms_Test = (mapM (terms Multi EN) $ splitBy (Sentences 0) metrics_text) == metrics_terms
-- | Occurrences
{-
fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
, (fromList ["object"],fromList [(["object"], 3 )])
, (fromList ["glas"] ,fromList [(["glas"] , 2 )])
, (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
-}
metrics_occ = occurrences <$> concat <$> (mapM (terms Multi EN) $ splitBy (Sentences 0) metrics_text)
{-
-- fromList [((["glas"],["object"]),6)
,((["glas"],["spoon"]),4)
,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
-}
metrics_cooc = cooc <$> (mapM (terms Multi EN) $ splitBy (Sentences 0) metrics_text)
metrics_cooc' = (mapM (terms Multi EN) $ splitBy (Sentences 0) "The table object. The table object.")
...@@ -82,9 +82,9 @@ removeApax = DMS.filter (> 1) ...@@ -82,9 +82,9 @@ removeApax = DMS.filter (> 1)
cooc :: [[Terms]] -> Map (Label, Label) Int cooc :: [[Terms]] -> Map (Label, Label) Int
cooc tss = cooc tss =
mapKeys (delta $ labelPolicy terms_occs) $ cooc' (map (Set.fromList . map _terms_stem) tss) mapKeys (delta $ labelPolicy terms_occs) $ coocOn _terms_stem tss
where where
terms_occs = occurrences (List.concat tss) terms_occs = occurrencesOn _terms_stem (List.concat tss)
delta f = f *** f delta f = f *** f
...@@ -93,24 +93,26 @@ labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList ...@@ -93,24 +93,26 @@ labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList
Just label -> label Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g) Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
cooc' :: Ord b => [Set b] -> Map (b, b) Coocs coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
cooc' tss = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as
coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Coocs
coocOn' f ts = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
where where
ts' = List.nub $ map f ts
xs = [ ((x, y), 1) xs = [ ((x, y), 1)
| xs <- tss | x <- ts'
, ys <- tss , y <- ts'
, x <- Set.toList xs
, y <- Set.toList ys
, x < y , x < y
] ]
-- | Compute the grouped occurrences (occ) -- | Compute the grouped occurrences (occ)
occurrences :: [Terms] -> Map Grouped (Map Terms Int) occurrences :: [Terms] -> Map Grouped (Map Terms Int)
occurrences = occurrences' _terms_stem occurrences = occurrencesOn _terms_stem
occurrences' :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int) occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
occurrences' f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
-- TODO add groups and filter stops -- TODO add groups and filter stops
sumOcc :: Ord a => [Occ a] -> Occ a sumOcc :: Ord a => [Occ a] -> Occ a
......
...@@ -96,10 +96,10 @@ data Properties = Properties { _propertiesAnnotators :: Text ...@@ -96,10 +96,10 @@ data Properties = Properties { _propertiesAnnotators :: Text
$(deriveJSON (unPrefix "_properties") ''Properties) $(deriveJSON (unPrefix "_properties") ''Properties)
data Sentences = Sentences { _sentences :: [Sentence]} data PosSentences = PosSentences { _sentences :: [Sentence]}
deriving (Show, Generic) deriving (Show, Generic)
$(deriveJSON (unPrefix "_") ''Sentences) $(deriveJSON (unPrefix "_") ''PosSentences)
-- request = -- request =
...@@ -134,7 +134,7 @@ corenlpRaw lang txt = do ...@@ -134,7 +134,7 @@ corenlpRaw lang txt = do
pure (getResponseBody response) pure (getResponseBody response)
corenlp :: Lang -> Text -> IO Sentences corenlp :: Lang -> Text -> IO PosSentences
corenlp lang txt = do corenlp lang txt = do
response <- corenlp' lang txt response <- corenlp' lang txt
pure (getResponseBody response) pure (getResponseBody response)
......
...@@ -102,8 +102,22 @@ conditional m = run (miniMax $ proba r $ map fromIntegral $ use m) ...@@ -102,8 +102,22 @@ conditional m = run (miniMax $ proba r $ map fromIntegral $ use m)
r = rank' m r = rank' m
conditional' :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity) {-
conditional' m = (run $ ie (use m), run $ sg (use m)) Metric Specificity and genericty: select terms
Compute genericity/specificity:
P(j|i) = N(ij) / N(ii)
P(i|j) = N(ij) / N(jj)
Gen(i) = Mean{j} P(j_k|i)
Spec(i) = Mean{j} P(i|j_k)
Gen-clusion(i) = (Spec(i) + Gen(i)) / 2
Spec-clusion(i) = (Spec(i) - Gen(i)) / 2
-}
incExcSpeGen :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
incExcSpeGen m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
where where
ie :: Matrix' Double -> Matrix' Double ie :: Matrix' Double -> Matrix' Double
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment