From 6f66c3dcc0b6398ae41314ab32a8366c7bef76d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20Delano=C3=AB?= <devel+git@delanoe.org> Date: Thu, 23 Aug 2018 10:35:04 +0200 Subject: [PATCH] [DOC] adding a file for examples, text ngrams extraction and some metrics (begin). --- package.yaml | 1 + src/Gargantext/Text/Metrics.hs | 100 --------------------- src/Gargantext/Text/Metrics/Examples.hs | 115 ++++++++++++++++++++++++ src/Gargantext/Text/Metrics/Freq.hs | 4 +- 4 files changed, 117 insertions(+), 103 deletions(-) create mode 100644 src/Gargantext/Text/Metrics/Examples.hs diff --git a/package.yaml b/package.yaml index 1211e618..904cfad6 100644 --- a/package.yaml +++ b/package.yaml @@ -33,6 +33,7 @@ library: - Gargantext.Text.Context - Gargantext.Text.List.CSV - Gargantext.Text.Metrics + - Gargantext.Text.Metrics.Examples - Gargantext.Text.Metrics.Count - Gargantext.Text.Metrics.CharByChar - Gargantext.Text.Parsers.CSV diff --git a/src/Gargantext/Text/Metrics.hs b/src/Gargantext/Text/Metrics.hs index ca564625..65fe3f01 100644 --- a/src/Gargantext/Text/Metrics.hs +++ b/src/Gargantext/Text/Metrics.hs @@ -24,22 +24,9 @@ import qualified Data.List as L import Data.Map (Map) import qualified Data.Map as M -import Data.Text (Text) -import qualified Data.Text as T - -import Data.Tuple.Extra (both) ---import GHC.Real (Ratio) ---import qualified Data.Text.Metrics as DTM -import Data.Array.Accelerate (toList, Matrix) --import Math.KMeans (kmeans, euclidSq, elements) import Gargantext.Prelude -import Gargantext.Text.Metrics.Count (occurrences, cooc) -import Gargantext.Text.Terms (TermType(MonoMulti), terms) -import Gargantext.Core (Lang(EN)) -import Gargantext.Core.Types (Terms(..), Label) -import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) -import Gargantext.Text.Metrics.Count (Grouped) import Gargantext.Viz.Graph.Distances.Matrice import Gargantext.Viz.Graph.Index @@ -121,90 +108,3 @@ coocScored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scor (ti,fi) = createIndices m (is, ss) = incExcSpeGen $ cooc2mat ti m scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss) - - - - - - - - - - - - - - - - - - -incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)]) -incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m) - where - (ti,fi) = createIndices m - ordonne x = sortWith (Down . snd) $ zip (map snd $ M.toList fi) (toList x) - - - -metrics_text :: Text -metrics_text = T.intercalate " " metrics_sentences - -metrics_sentences' :: [Text] -metrics_sentences' = splitBy (Sentences 0) metrics_text - --- | Sentences -metrics_sentences :: [Text] -metrics_sentences = [ "There is a table with a glass of wine and a spoon." - , "I can see the glass on the table." - , "There was only a spoon on that table." - , "The glass just fall from the table, pouring wine everywhere." - , "I wish the glass did not contain wine." - ] - -metrics_sentences_Test :: Bool -metrics_sentences_Test = metrics_sentences == metrics_sentences' - --- | Terms reordered to visually check occurrences --- >>> metrics_terms --- [[["table"],["glass"],["wine"],["spoon"]],[["glass"],["table"]],[["spoon"],["table"]],[["glass"],["table"],["wine"]],[["glass"],["wine"]]] -metrics_terms :: IO [[Terms]] -metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text - --- | Occurrences -{- -fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])] - , (fromList ["object"],fromList [(["object"], 3 )]) - , (fromList ["glas"] ,fromList [(["glas"] , 2 )]) - , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )]) --} -metrics_occ :: IO (Map Grouped (Map Terms Int)) -metrics_occ = occurrences <$> L.concat <$> metrics_terms - -{- --- fromList [((["glas"],["object"]),6) - ,((["glas"],["spoon"]),4) - ,((["glas"],["table"]),6) - ,((["object"],["spoon"]),6) - ,((["object"],["table"]),9) - ,((["spoon"],["table"]),6)] - --} - -metrics_cooc :: IO (Map (Label, Label) Int) -metrics_cooc = cooc <$> metrics_terms - -metrics_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity)) -metrics_cooc_mat = do - m <- metrics_cooc - let (ti,_) = createIndices m - let mat_cooc = cooc2mat ti m - pure ( ti - , mat_cooc - , incExcSpeGen_proba mat_cooc - , incExcSpeGen mat_cooc - ) - -metrics_incExcSpeGen :: IO ([(Label, Double)], [(Label, Double)]) -metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc - diff --git a/src/Gargantext/Text/Metrics/Examples.hs b/src/Gargantext/Text/Metrics/Examples.hs new file mode 100644 index 00000000..091a770d --- /dev/null +++ b/src/Gargantext/Text/Metrics/Examples.hs @@ -0,0 +1,115 @@ +{-| +Module : Gargantext.Text.Metrics.Examples +Description : Minimal Examples to test behavior of the functions. +Copyright : (c) CNRS, 2017 - present +License : AGPL + CECILL v3 +Maintainer : team@gargantext.org +Stability : experimental +Portability : POSIX + +This file is intended for these purposes: + +- documentation for teaching and research +- behavioral tests (that should be completed with uni-tests and scale-tests + +-} + +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE OverloadedStrings #-} + +module Gargantext.Text.Metrics.Examples + where + +import Data.Ord (Down(..)) +import qualified Data.List as L + +import Data.Map (Map) +import qualified Data.Map as M + +import Data.Text (Text) +import qualified Data.Text as T + +import Data.Tuple.Extra (both) +import Data.Array.Accelerate (toList, Matrix) + +import Gargantext.Prelude +import Gargantext.Text.Metrics.Count (occurrences, cooc) +import Gargantext.Text.Terms (TermType(MonoMulti), terms) +import Gargantext.Core (Lang(EN)) +import Gargantext.Core.Types (Terms(..), Label) +import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) +import Gargantext.Text.Metrics.Count (Grouped) +import Gargantext.Viz.Graph.Distances.Matrice +import Gargantext.Viz.Graph.Index + +import qualified Data.Array.Accelerate as DAA + + +-- | From list to simple text +-- +-- >>> metrics_text +-- "There is a table with a glass of wine and a spoon. I can see the glass on the table. There was only a spoon on that table. The glass just fall from the table, pouring wine everywhere. I wish the glass did not contain wine." +metrics_text :: Text +metrics_text = T.intercalate " " metrics_sentences + + +-- | Sentences +-- +-- >>> metrics_sentences +-- ["There is a table with a glass of wine and a spoon.","I can see the glass on the table.","There was only a spoon on that table.","The glass just fall from the table, pouring wine everywhere.","I wish the glass did not contain wine."] +metrics_sentences :: [Text] +metrics_sentences = [ "There is a table with a glass of wine and a spoon." + , "I can see the glass on the table." + , "There was only a spoon on that table." + , "The glass just fall from the table, pouring wine everywhere." + , "I wish the glass did not contain wine." + ] + +metrics_sentences_Test :: Bool +metrics_sentences_Test = metrics_sentences == splitBy (Sentences 0) metrics_text + +-- | Terms reordered to visually check occurrences +-- Split text by sentence and then extract ngrams. +-- +-- >>> metrics_terms +-- [[["table"],["glass"],["wine"],["spoon"]],[["glass"],["table"]],[["spoon"],["table"]],[["glass"],["table"],["wine"]],[["glass"],["wine"]]] +metrics_terms :: IO [[Terms]] +metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text + +-- | Test the Occurrences +-- +-- >>> metrics_occ +-- fromList [(fromList ["glass"],fromList [(["glass"],4)]),(fromList ["spoon"],fromList [(["spoon"],2)]),(fromList ["tabl"],fromList [(["table"],4)]),(fromList ["wine"],fromList [(["wine"],3)])] +metrics_occ :: IO (Map Grouped (Map Terms Int)) +metrics_occ = occurrences <$> L.concat <$> metrics_terms + +-- | Test the cooccurrences +-- +-- >>> metrics_cooc +-- fromList [((["glass"],["glass"]),4),((["spoon"],["glass"]),1),((["spoon"],["spoon"]),2),((["table"],["glass"]),3),((["table"],["spoon"]),2),((["table"],["table"]),4),((["wine"],["glass"]),3),((["wine"],["spoon"]),1),((["wine"],["table"]),2),((["wine"],["wine"]),3)] +metrics_cooc :: IO (Map (Label, Label) Int) +metrics_cooc = cooc <$> metrics_terms + +-- | Tests +metrics_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity)) +metrics_cooc_mat = do + m <- metrics_cooc + let (ti,_) = createIndices m + let mat_cooc = cooc2mat ti m + pure ( ti + , mat_cooc + , incExcSpeGen_proba mat_cooc + , incExcSpeGen mat_cooc + ) + +metrics_incExcSpeGen :: IO ([(Label, Double)], [(Label, Double)]) +metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc + +incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)]) +incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m) + where + (ti,fi) = createIndices m + ordonne x = sortWith (Down . snd) $ zip (map snd $ M.toList fi) (toList x) + + diff --git a/src/Gargantext/Text/Metrics/Freq.hs b/src/Gargantext/Text/Metrics/Freq.hs index c6a145fe..fb8f8d35 100644 --- a/src/Gargantext/Text/Metrics/Freq.hs +++ b/src/Gargantext/Text/Metrics/Freq.hs @@ -1,14 +1,12 @@ {-| Module : Gargantext.Text.Metrics.Freq -Description : +Description : Some functions to count. Copyright : (c) CNRS, 2017-Present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX -Here is a longer description of this module, containing some -commentary with @some markup@. -} {-# LANGUAGE NoImplicitPrelude #-} -- 2.21.0