Commit bb989318 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[DOC] distances.

parent 6b880dc3
...@@ -29,10 +29,10 @@ import Gargantext.Core (Lang) ...@@ -29,10 +29,10 @@ import Gargantext.Core (Lang)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map) import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (conditional) import Gargantext.Viz.Graph.Distances.Matrice (distributional)
import Gargantext.Viz.Graph (Graph(..), data2graph) import Gargantext.Viz.Graph (Graph(..), data2graph)
import Gargantext.Text.Metrics.Count (cooc) import Gargantext.Text.Metrics.Count (cooc)
import Gargantext.Text.Metrics import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
import Gargantext.Text.Terms (TermType, extractTerms) import Gargantext.Text.Terms (TermType, extractTerms)
import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
...@@ -40,7 +40,6 @@ import Gargantext.Text.Parsers.CSV ...@@ -40,7 +40,6 @@ import Gargantext.Text.Parsers.CSV
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain) import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
{- {-
____ _ _ ____ _ _
/ ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_ / ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
...@@ -93,7 +92,7 @@ textFlow' termType contexts = do ...@@ -93,7 +92,7 @@ textFlow' termType contexts = do
-- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 ) let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 )
(InclusionSize 400 ) (InclusionSize 900 )
(SampleBins 10 ) (SampleBins 10 )
(Clusters 3 ) (Clusters 3 )
(DefaultValue 0 ) (DefaultValue 0 )
...@@ -109,19 +108,20 @@ textFlow' termType contexts = do ...@@ -109,19 +108,20 @@ textFlow' termType contexts = do
printDebug "myCooc4" $ M.size myCooc4 printDebug "myCooc4" $ M.size myCooc4
let matCooc = map2mat (0) (M.size ti) myCooc4 let matCooc = map2mat (0) (M.size ti) myCooc4
-- printDebug "matCooc" matCooc printDebug "matCooc" matCooc
-- Matrix -> Clustering -- Matrix -> Clustering
let distanceMat = conditional matCooc --let distanceMat = conditional' matCooc
-- let distanceMat = distributional matCooc let distanceMat = distributional matCooc
printDebug "distanceMat" $ A.arrayShape distanceMat printDebug "distanceMat" $ A.arrayShape distanceMat
-- printDebug "distanceMat" distanceMat printDebug "distanceMat" distanceMat
-- --
let distanceMap = mat2map distanceMat let distanceMap = mat2map distanceMat
printDebug "distanceMap" $ M.size distanceMap printDebug "distanceMap" $ M.size distanceMap
--{-
-- let distance = fromIndex fi distanceMap -- let distance = fromIndex fi distanceMap
-- printDebug "distance" $ M.size distance -- printDebug "distance" $ M.size distance
---}
partitions <- cLouvain distanceMap partitions <- cLouvain distanceMap
-- Building : -> Graph -> JSON -- Building : -> Graph -> JSON
printDebug "partitions" $ length partitions printDebug "partitions" $ length partitions
......
...@@ -9,6 +9,7 @@ Portability : POSIX ...@@ -9,6 +9,7 @@ Portability : POSIX
2 main measures are actually implemented in order to compute the proximity of two terms. 2 main measures are actually implemented in order to compute the proximity of two terms.
- Conditional measure is an absolute measure which reflects interactions of 2 terms in the corpus. - Conditional measure is an absolute measure which reflects interactions of 2 terms in the corpus.
- Distributional measure is a relative measure which depends on the selected list, it represents structural equivalence. - Distributional measure is a relative measure which depends on the selected list, it represents structural equivalence.
...@@ -99,7 +100,6 @@ miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m ...@@ -99,7 +100,6 @@ miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
miniMax' = (the $ minimum $ maximum m) miniMax' = (the $ minimum $ maximum m)
-- | Conditional distance (basic version) -- | Conditional distance (basic version)
conditional :: Matrix Int -> Matrix Double conditional :: Matrix Int -> Matrix Double
conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m) conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m)
...@@ -134,10 +134,18 @@ conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegr ...@@ -134,10 +134,18 @@ conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegr
-- The distributional measure \[P_c\] of @i@ and @j@ terms is: -- The distributional measure \[P_c\] of @i@ and @j@ terms is:
-- \[ S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik}, MI_{jk})}{\sum_{k \neq i,j ; MI_{ik} >0}^{} MI_{ik}} -- \[ S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik}, MI_{jk})}{\sum_{k \neq i,j ; MI_{ik} >0}^{} MI_{ik}}
-- \] -- \]
-- \[S{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\] is mutual information --
-- \[C{ij}\] is number of cooccurrences of @i@ and @j@ in the same context of text -- Mutual information
-- \[E_{ij} = \frac {S_{i} S_{j}} {N}\] is the expected value of the cooccurrences -- \[MI_{ij} = \log(\frac{C_{ij}}{E_{ij}})\]
-- \[N_{i} = \sum_{i}^{} S_{i}\] is the total cooccurrences of @i@ term --
-- Number of cooccurrences of @i@ and @j@ in the same context of text
-- \[C_{ij}\]
--
-- The expected value of the cooccurrences
-- \[E_{ij} = \frac {S_{i} S_{j}} {N}\]
--
-- Total number of cooccurrences, summed over all terms @i@
-- \[N = \sum_{i}^{} S_{i}\]
distributional :: Matrix Int -> Matrix Double distributional :: Matrix Int -> Matrix Double
distributional m = run $ miniMax $ ri (map fromIntegral $ use m) distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
where where
...@@ -160,25 +168,21 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m) ...@@ -160,25 +168,21 @@ distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
----------------------------------------------------------------------- -----------------------------------------------------------------------
----------------------------------------------------------------------- -----------------------------------------------------------------------
{- | Metric Specificity and genericity: select terms
{- - Let N be the number of terms and \[N_{i}\] the number of occurrences of i
Metric Specificity and genericity: select terms
let N termes - Cooccurrences of i and j \[N_{ij}\]
Ni : occ de i - Probability to get i given j : \[P(i|j) = N_{ij} / N_{j}\]
Nij : cooc i et j
Probability to get i given j : P(i|j)=Nij/Nj
Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
Inclusion (i) = Gen(i)+Spec(i)
Genericity score = Gen(i)- Spec(i)
- Genericity of i \[Gen(i) = \frac{\sum_{j \neq i} P(i|j)}{N-1}\]
- Specificity of i \[Spec(i) = \frac{\sum_{j \neq i} P(j|i)}{N-1}\]
References: - \[Inclusion(i) = Gen(i) + Spec(i)\]
* Science mapping with asymmetrical paradigmatic proximity Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276, arXiv:0803.2315 [cs.OH] - \[GenericityScore = Gen(i)- Spec(i)\]
- References: Science mapping with asymmetrical paradigmatic proximity Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276, arXiv:0803.2315 [cs.OH]
-} -}
type InclusionExclusion = Double type InclusionExclusion = Double
type SpecificityGenericity = Double type SpecificityGenericity = Double
...@@ -195,7 +199,7 @@ incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m) ...@@ -195,7 +199,7 @@ incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
-- | Inclusion (i) = Gen(i)+Spec(i) -- | Inclusion (i) = Gen(i)+Spec(i)
inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double) inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
inclusionExclusion mat = zipWith (+) (pV mat) (pH mat) inclusionExclusion mat = zipWith (+) (pV mat) (pH mat)
--
-- | Genericity score = Gen(i)- Spec(i) -- | Genericity score = Gen(i)- Spec(i)
specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double) specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
specificityGenericity mat = zipWith (-) (pV mat) (pH mat) specificityGenericity mat = zipWith (-) (pV mat) (pH mat)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment