Commit c34120e3 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] fix cooc behavior.

parent 09cf2917
......@@ -16,19 +16,44 @@ module Gargantext.Pipeline
where
import Data.Text.IO (readFile)
import Control.Arrow ((***))
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as M
import qualified Data.Set as S
import qualified Data.List as L
import Data.Tuple.Extra (both)
----------------------------------------------
import Gargantext.Core (Lang(FR))
import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (score, createIndices, toIndex)
import Gargantext.Viz.Graph.Distances.Matrice (conditional)
import Gargantext.Viz.Graph.Index (score, createIndices, toIndex, fromIndex, cooc2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (incExcSpeGen, conditional)
import Gargantext.Viz.Graph.Index (Index)
import Gargantext.Text.Metrics.Occurrences (cooc, removeApax)
import Gargantext.Text.Terms (TermType(Multi, Mono), extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
--filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
--filterCooc m =
---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
----(ti, fi) = createIndices m
-- . fromIndex fi $ filterMat $ cooc2mat ti m
import Data.Array.Accelerate (Matrix)
filterMat :: Matrix Int -> [(Index, Index)]
filterMat m = S.toList $ S.take n $ S.fromList $ (L.take nIe incExc') <> (L.take nSg speGen')
where
(incExc', speGen') = both ( map fst . L.sortOn snd . M.toList . mat2map) (incExcSpeGen m)
n = nIe + nSg
nIe = 30
nSg = 70
pipeline path = do
-- Text <- IO Text <- FilePath
text <- readFile path
......@@ -39,12 +64,17 @@ pipeline path = do
-- TODO groupBy (Stem | GroupList)
let myCooc = removeApax $ cooc myterms
let (ti, fi) = createIndices myCooc
pure ti
-- Cooc -> Matrix
let theScores = M.take 350 $ M.filter (>0) $ score conditional myCooc
let (ti, _) = createIndices theScores
--
---- -- Matrix -> Clustering -> Graph -> JSON
---- pure $ bestpartition False $ map2graph $ toIndex ti theScores
partitions <- cLouvain $ toIndex ti theScores
pure partitions
-- -- filter by spec/gen (dynmaic programming)
-- let theScores = M.filter (>0) $ score conditional myCoocFiltered
----
------ -- Matrix -> Clustering
------ pure $ bestpartition False $ map2graph $ toIndex ti theScores
-- partitions <- cLouvain theScores
-- pure partitions
---- | Building : -> Graph -> JSON
......@@ -237,5 +237,3 @@ unMaybe = map fromJust . L.filter isJust
-- maximumWith
maximumWith f = L.maximumBy (\x y -> compare (f x) (f y))
......@@ -14,12 +14,69 @@ Mainly reexport functions in @Data.Text.Metrics@
module Gargantext.Text.Metrics where
--import Data.Text (Text)
import Data.Text (Text, pack)
import Data.List (concat)
--import GHC.Real (Ratio)
--import qualified Data.Text.Metrics as DTM
--
--import Gargantext.Prelude
--
import Gargantext.Prelude
import Gargantext.Text.Metrics.Occurrences (occurrences, cooc)
import Gargantext.Text.Terms (TermType(Multi), terms)
import Gargantext.Core (Lang(EN))
import Gargantext.Core.Types (Terms(..))
import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
metrics_text :: Text
metrics_text = "A table is an object. A glas is an object. The glas is on the table. The spoon is an object. The spoon is on the table."
-- | Sentences
metrics_sentences :: [Text]
metrics_sentences = [ "A table is an object."
, "A glas is an object."
, "The glas is on the table."
, "The spoon is an object."
, "The spoon is on the table."
]
metrics_sentences_Test = splitBy (Sentences 0) metrics_text == metrics_sentences
-- | Terms reordered to visually check occurrences
metrics_terms :: [[[Text]]]
metrics_terms = [[["table"],["object"] ]
,[ ["object"],["glas"] ]
,[["table"], ["glas"] ]
,[ ["object"], ["spoon"]]
,[["table"], ["spoon"]]
]
--metrics_terms_Test = (mapM (terms Multi EN) $ splitBy (Sentences 0) metrics_text) == metrics_terms
-- | Occurrences
{-
fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
, (fromList ["object"],fromList [(["object"], 3 )])
, (fromList ["glas"] ,fromList [(["glas"] , 2 )])
, (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
-}
metrics_occ = occurrences <$> concat <$> (mapM (terms Multi EN) $ splitBy (Sentences 0) metrics_text)
{-
-- fromList [((["glas"],["object"]),6)
,((["glas"],["spoon"]),4)
,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
-}
metrics_cooc = cooc <$> (mapM (terms Multi EN) $ splitBy (Sentences 0) metrics_text)
metrics_cooc' = (mapM (terms Multi EN) $ splitBy (Sentences 0) "The table object. The table object.")
......@@ -82,9 +82,9 @@ removeApax = DMS.filter (> 1)
cooc :: [[Terms]] -> Map (Label, Label) Int
cooc tss =
mapKeys (delta $ labelPolicy terms_occs) $ cooc' (map (Set.fromList . map _terms_stem) tss)
mapKeys (delta $ labelPolicy terms_occs) $ coocOn _terms_stem tss
where
terms_occs = occurrences (List.concat tss)
terms_occs = occurrencesOn _terms_stem (List.concat tss)
delta f = f *** f
......@@ -93,24 +93,26 @@ labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
cooc' :: Ord b => [Set b] -> Map (b, b) Coocs
cooc' tss = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as
coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Coocs
coocOn' f ts = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
where
ts' = List.nub $ map f ts
xs = [ ((x, y), 1)
| xs <- tss
, ys <- tss
, x <- Set.toList xs
, y <- Set.toList ys
| x <- ts'
, y <- ts'
, x < y
]
-- | Compute the grouped occurrences (occ)
occurrences :: [Terms] -> Map Grouped (Map Terms Int)
occurrences = occurrences' _terms_stem
occurrences = occurrencesOn _terms_stem
occurrences' :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
occurrences' f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
-- TODO add groups and filter stops
sumOcc :: Ord a => [Occ a] -> Occ a
......
......@@ -96,10 +96,10 @@ data Properties = Properties { _propertiesAnnotators :: Text
$(deriveJSON (unPrefix "_properties") ''Properties)
data Sentences = Sentences { _sentences :: [Sentence]}
data PosSentences = PosSentences { _sentences :: [Sentence]}
deriving (Show, Generic)
$(deriveJSON (unPrefix "_") ''Sentences)
$(deriveJSON (unPrefix "_") ''PosSentences)
-- request =
......@@ -134,7 +134,7 @@ corenlpRaw lang txt = do
pure (getResponseBody response)
corenlp :: Lang -> Text -> IO Sentences
corenlp :: Lang -> Text -> IO PosSentences
corenlp lang txt = do
response <- corenlp' lang txt
pure (getResponseBody response)
......
......@@ -102,8 +102,22 @@ conditional m = run (miniMax $ proba r $ map fromIntegral $ use m)
r = rank' m
conditional' :: Matrix Double -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
conditional' m = (run $ ie (use m), run $ sg (use m))
{-
Metric Specificity and genericty: select terms
Compute genericity/specificity:
P(j|i) = N(ij) / N(ii)
P(i|j) = N(ij) / N(jj)
Gen(i) = Mean{j} P(j_k|i)
Spec(i) = Mean{j} P(i|j_k)
Gen-clusion(i) = (Spec(i) + Gen(i)) / 2
Spec-clusion(i) = (Spec(i) - Gen(i)) / 2
-}
incExcSpeGen :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
incExcSpeGen m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
where
ie :: Matrix' Double -> Matrix' Double
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment