Commit 16a85fb4 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[DOC+TESTS] contexts of texts.

parent 198b4ef8
...@@ -25,26 +25,29 @@ library: ...@@ -25,26 +25,29 @@ library:
exposed-modules: exposed-modules:
- Gargantext - Gargantext
- Gargantext.API - Gargantext.API
- Gargantext.API.Auth
- Gargantext.API.Count
- Gargantext.API.FrontEnd
- Gargantext.API.Orchestrator - Gargantext.API.Orchestrator
- Gargantext.API.Search - Gargantext.API.Search
- Gargantext.API.FrontEnd
- Gargantext.API.Count
- Gargantext.API.Auth
- Gargantext.API.Settings - Gargantext.API.Settings
- Gargantext.Core - Gargantext.Core
- Gargantext.Core.Types - Gargantext.Core.Types
- Gargantext.Core.Types.Main
- Gargantext.Core.Types.Node
- Gargantext.Core.Utils.Prefix
- Gargantext.Database - Gargantext.Database
- Gargantext.Prelude - Gargantext.Prelude
- Gargantext.Text - Gargantext.Text
- Gargantext.Text.Context - Gargantext.Text.Context
- Gargantext.Text.Examples
- Gargantext.Text.List.CSV - Gargantext.Text.List.CSV
- Gargantext.Text.Metrics - Gargantext.Text.Metrics
- Gargantext.Text.Metrics.Examples
- Gargantext.Text.Metrics.Count
- Gargantext.Text.Metrics.CharByChar - Gargantext.Text.Metrics.CharByChar
- Gargantext.Text.Metrics.Count
- Gargantext.Text.Parsers.CSV - Gargantext.Text.Parsers.CSV
- Gargantext.Text.Parsers.WOS
- Gargantext.Text.Parsers.Date - Gargantext.Text.Parsers.Date
- Gargantext.Text.Parsers.WOS
- Gargantext.Text.Search - Gargantext.Text.Search
- Gargantext.Text.Terms - Gargantext.Text.Terms
- Gargantext.Text.Terms.WithList - Gargantext.Text.Terms.WithList
......
...@@ -2,5 +2,5 @@ import System.FilePath.Glob ...@@ -2,5 +2,5 @@ import System.FilePath.Glob
import Test.DocTest import Test.DocTest
main :: IO () main :: IO ()
main = glob "src/Gargantext/Text/Metrics.hs" >>= doctest main = glob "src/Gargantext/Text/" >>= doctest
...@@ -30,6 +30,7 @@ module Gargantext.Prelude ...@@ -30,6 +30,7 @@ module Gargantext.Prelude
import GHC.Exts (sortWith) import GHC.Exts (sortWith)
import Control.Monad.IO.Class (MonadIO)
import Data.Maybe (isJust, fromJust, maybe) import Data.Maybe (isJust, fromJust, maybe)
import Protolude ( Bool(True, False), Int, Int64, Double, Integer import Protolude ( Bool(True, False), Int, Int64, Double, Integer
, Fractional, Num, Maybe(Just,Nothing) , Fractional, Num, Maybe(Just,Nothing)
...@@ -73,6 +74,11 @@ import Text.Read (Read()) ...@@ -73,6 +74,11 @@ import Text.Read (Read())
import Data.String.Conversions (cs) import Data.String.Conversions (cs)
printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
printDebug msg x = putStrLn $ msg <> " " <> show x
-- printDebug _ _ = pure ()
map2 :: (t -> b) -> [[t]] -> [[b]] map2 :: (t -> b) -> [[t]] -> [[b]]
map2 fun = map (map fun) map2 fun = map (map fun)
......
{-| {-|
Module : Gargantext.Text.Context Module : Gargantext.Text.Context
Description : Description : How to manage contexts of texts ?
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
Context of text management tool, here are logic of main types. Context of text management tool, here are logic of main types:
- Term
- Multi-term
- Label
- Sentence
- Corpus
How to split contexts is describes in this module.
-} -}
...@@ -24,9 +32,7 @@ import Text.HTML.TagSoup (parseTags, isTagText, Tag(..)) ...@@ -24,9 +32,7 @@ import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
import Gargantext.Text import Gargantext.Text
import Gargantext.Prelude hiding (length) import Gargantext.Prelude hiding (length)
------------------------------------------------------------------------ ------------------------------------------------------------------------
type Term = Text type Term = Text
type MultiTerm = [Term] type MultiTerm = [Term]
type Label = MultiTerm type Label = MultiTerm
...@@ -38,21 +44,25 @@ type Corpus a = [Sentence a] -- a list of sentences ...@@ -38,21 +44,25 @@ type Corpus a = [Sentence a] -- a list of sentences
-- type ConText a = [Sentence a] -- type ConText a = [Sentence a]
-- type Corpus a = [ConText a] -- type Corpus a = [ConText a]
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Contexts definition to build/unbuild contexts.
data SplitContext = Chars Int | Sentences Int | Paragraphs Int data SplitContext = Chars Int | Sentences Int | Paragraphs Int
tag :: Text -> [Tag Text] tag :: Text -> [Tag Text]
tag = parseTags tag = parseTags
-- | splitBy contexts of Chars or Sentences or Paragraphs -- | splitBy contexts of Chars or Sentences or Paragraphs
-- >> splitBy (Chars 0) "abcde" -- To see some examples at a higher level (sentences and paragraph), see
-- 'Gargantext.Text.Examples.ex_terms'
--
-- >>> splitBy (Chars 0) (pack "abcde")
-- ["a","b","c","d","e"] -- ["a","b","c","d","e"]
-- >> splitBy (Chars 1) "abcde" --
-- >>> splitBy (Chars 1) (pack "abcde")
-- ["ab","bc","cd","de"] -- ["ab","bc","cd","de"]
-- >> splitBy (Chars 2) "abcde" --
-- >>> splitBy (Chars 2) (pack "abcde")
-- ["abc","bcd","cde"] -- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text] splitBy :: SplitContext -> Text -> [Text]
splitBy (Chars n) = map pack . chunkAlong (n+1) 1 . unpack splitBy (Chars n) = map pack . chunkAlong (n+1) 1 . unpack
......
{-| {-|
Module : Gargantext.Text.Metrics.Examples Module : Gargantext.Text.Examples
Description : Minimal Examples to test behavior of the functions. Description : Minimal Examples to test behavior of the functions.
Copyright : (c) CNRS, 2017 - present Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
...@@ -10,15 +10,23 @@ Portability : POSIX ...@@ -10,15 +10,23 @@ Portability : POSIX
This file is intended for these purposes: This file is intended for these purposes:
- documentation for teaching and research - documentation for teaching and research
- learn basics of Haskell which is a scientific programming language
- behavioral tests (that should be completed with uni-tests and scale-tests - behavioral tests (that should be completed with uni-tests and scale-tests
This documents defines basic of Text definitions according to Gargantext..
- What is a term ?
- What is a sentence ?
- What is a paragraph ?
-} -}
{-# LANGUAGE BangPatterns #-} {-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Metrics.Examples module Gargantext.Text.Examples
where where
import Data.Ord (Down(..)) import Data.Ord (Down(..))
...@@ -45,56 +53,61 @@ import Gargantext.Viz.Graph.Index ...@@ -45,56 +53,61 @@ import Gargantext.Viz.Graph.Index
import qualified Data.Array.Accelerate as DAA import qualified Data.Array.Accelerate as DAA
-- | From list to simple text
--
-- >>> metrics_text
-- "There is a table with a glass of wine and a spoon. I can see the glass on the table. There was only a spoon on that table. The glass just fall from the table, pouring wine everywhere. I wish the glass did not contain wine."
metrics_text :: Text
metrics_text = T.intercalate " " metrics_sentences
-- | Sentences -- | Sentences
-- Let be a list of Texts: ['Data.Text.Text']. Each text in this example is a sentence.
-- --
-- >>> metrics_sentences -- >>> ex_sentences
-- ["There is a table with a glass of wine and a spoon.","I can see the glass on the table.","There was only a spoon on that table.","The glass just fall from the table, pouring wine everywhere.","I wish the glass did not contain wine."] -- ["There is a table with a glass of wine and a spoon.","I can see the glass on the table.","There was only a spoon on that table.","The glass just fall from the table, pouring wine everywhere.","I wish the glass did not contain wine."]
metrics_sentences :: [Text] ex_sentences :: [Text]
metrics_sentences = [ "There is a table with a glass of wine and a spoon." ex_sentences = [ "There is a table with a glass of wine and a spoon."
, "I can see the glass on the table." , "I can see the glass on the table."
, "There was only a spoon on that table." , "There was only a spoon on that table."
, "The glass just fall from the table, pouring wine everywhere." , "The glass just fall from the table, pouring wine everywhere."
, "I wish the glass did not contain wine." , "I wish the glass did not contain wine."
] ]
-- | From list to simple text as paragraph.
-- Let 'Data.Text.intercalate' each sentence with a space. Result is a paragraph.
--
-- >>> T.intercalate (T.pack " ") ex_sentences
-- "There is a table with a glass of wine and a spoon. I can see the glass on the table. There was only a spoon on that table. The glass just fall from the table, pouring wine everywhere. I wish the glass did not contain wine."
ex_paragraph :: Text
ex_paragraph = T.intercalate " " ex_sentences
metrics_sentences_Test :: Bool -- | Let split sentences by Contexts of text.
metrics_sentences_Test = metrics_sentences == splitBy (Sentences 0) metrics_text -- More about 'Gargantext.Text.Context'
--
-- >>> ex_sentences == splitBy (Sentences 0) ex_paragraph
-- True
-- | Terms reordered to visually check occurrences -- | Terms reordered to visually check occurrences
-- Split text by sentence and then extract ngrams. -- Split text by sentence and then extract ngrams.
-- --
-- >>> metrics_terms -- >>> mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) ex_paragraph
-- [[["table"],["glass"],["wine"],["spoon"]],[["glass"],["table"]],[["spoon"],["table"]],[["glass"],["table"],["wine"]],[["glass"],["wine"]]] -- [[["table"],["glass"],["wine"],["spoon"]],[["glass"],["table"]],[["spoon"],["table"]],[["glass"],["table"],["wine"]],[["glass"],["wine"]]]
metrics_terms :: IO [[Terms]] ex_terms :: IO [[Terms]]
metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text ex_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) ex_paragraph
-- | Test the Occurrences -- | Test the Occurrences
-- --
-- >>> metrics_occ -- >>> occurrences <$> L.concat <$> ex_terms
-- fromList [(fromList ["glass"],fromList [(["glass"],4)]),(fromList ["spoon"],fromList [(["spoon"],2)]),(fromList ["tabl"],fromList [(["table"],4)]),(fromList ["wine"],fromList [(["wine"],3)])] -- fromList [(fromList ["glass"],fromList [(["glass"],4)]),(fromList ["spoon"],fromList [(["spoon"],2)]),(fromList ["tabl"],fromList [(["table"],4)]),(fromList ["wine"],fromList [(["wine"],3)])]
metrics_occ :: IO (Map Grouped (Map Terms Int)) ex_occ :: IO (Map Grouped (Map Terms Int))
metrics_occ = occurrences <$> L.concat <$> metrics_terms ex_occ = occurrences <$> L.concat <$> ex_terms
-- | Test the cooccurrences -- | Test the cooccurrences
-- Use the 'Gargantext.Text.Metrics.Count.cooc' function.
-- --
-- >>> metrics_cooc -- >>> cooc <$> ex_terms
-- fromList [((["glass"],["glass"]),4),((["spoon"],["glass"]),1),((["spoon"],["spoon"]),2),((["table"],["glass"]),3),((["table"],["spoon"]),2),((["table"],["table"]),4),((["wine"],["glass"]),3),((["wine"],["spoon"]),1),((["wine"],["table"]),2),((["wine"],["wine"]),3)] -- fromList [((["glass"],["glass"]),4),((["spoon"],["glass"]),1),((["spoon"],["spoon"]),2),((["table"],["glass"]),3),((["table"],["spoon"]),2),((["table"],["table"]),4),((["wine"],["glass"]),3),((["wine"],["spoon"]),1),((["wine"],["table"]),2),((["wine"],["wine"]),3)]
metrics_cooc :: IO (Map (Label, Label) Int) ex_cooc :: IO (Map (Label, Label) Int)
metrics_cooc = cooc <$> metrics_terms ex_cooc = cooc <$> ex_terms
-- | Tests -- | Tests
metrics_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity)) ex_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity))
metrics_cooc_mat = do ex_cooc_mat = do
m <- metrics_cooc m <- ex_cooc
let (ti,_) = createIndices m let (ti,_) = createIndices m
let mat_cooc = cooc2mat ti m let mat_cooc = cooc2mat ti m
pure ( ti pure ( ti
...@@ -103,8 +116,8 @@ metrics_cooc_mat = do ...@@ -103,8 +116,8 @@ metrics_cooc_mat = do
, incExcSpeGen mat_cooc , incExcSpeGen mat_cooc
) )
metrics_incExcSpeGen :: IO ([(Label, Double)], [(Label, Double)]) ex_incExcSpeGen :: IO ([(Label, Double)], [(Label, Double)])
metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc ex_incExcSpeGen = incExcSpeGen_sorted <$> ex_cooc
incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)]) incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m) incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
......
...@@ -21,7 +21,6 @@ import GHC.IO (FilePath) ...@@ -21,7 +21,6 @@ import GHC.IO (FilePath)
import qualified Data.Text as T import qualified Data.Text as T
import Data.Text.IO (readFile) import Data.Text.IO (readFile)
import Control.Monad.IO.Class (MonadIO)
import Data.Map.Strict (Map) import Data.Map.Strict (Map)
import qualified Data.Array.Accelerate as A import qualified Data.Array.Accelerate as A
...@@ -53,9 +52,6 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..)) ...@@ -53,9 +52,6 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
|___/ |___/
-} -}
printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
printDebug msg x = putStrLn $ msg <> " " <> show x
-- printDebug _ _ = pure ()
data TextFlow = CSV FilePath data TextFlow = CSV FilePath
| FullText FilePath | FullText FilePath
...@@ -97,7 +93,7 @@ textFlow' termType contexts = do ...@@ -97,7 +93,7 @@ textFlow' termType contexts = do
let myCooc2 = M.filter (>1) myCooc1 let myCooc2 = M.filter (>1) myCooc1
printDebug "myCooc2" (M.size myCooc2) printDebug "myCooc2" (M.size myCooc2)
-- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 ) let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 )
(InclusionSize 400 ) (InclusionSize 400 )
(SampleBins 10 ) (SampleBins 10 )
...@@ -105,7 +101,7 @@ textFlow' termType contexts = do ...@@ -105,7 +101,7 @@ textFlow' termType contexts = do
(DefaultValue 0 ) (DefaultValue 0 )
) myCooc2 ) myCooc2
printDebug "myCooc3" $ M.size myCooc3 printDebug "myCooc3" $ M.size myCooc3
putStrLn $ show myCooc3 -- putStrLn $ show myCooc3
-- Cooc -> Matrix -- Cooc -> Matrix
let (ti, _) = createIndices myCooc3 let (ti, _) = createIndices myCooc3
...@@ -115,12 +111,12 @@ textFlow' termType contexts = do ...@@ -115,12 +111,12 @@ textFlow' termType contexts = do
printDebug "myCooc4" $ M.size myCooc4 printDebug "myCooc4" $ M.size myCooc4
let matCooc = map2mat (0) (M.size ti) myCooc4 let matCooc = map2mat (0) (M.size ti) myCooc4
--printDebug "matCooc" matCooc -- printDebug "matCooc" matCooc
-- Matrix -> Clustering -- Matrix -> Clustering
let distanceMat = conditional matCooc let distanceMat = conditional matCooc
-- let distanceMat = distributional matCooc -- let distanceMat = distributional matCooc
printDebug "distanceMat" $ A.arrayShape distanceMat printDebug "distanceMat" $ A.arrayShape distanceMat
--printDebug "distanceMat" distanceMat -- printDebug "distanceMat" distanceMat
-- --
let distanceMap = mat2map distanceMat let distanceMap = mat2map distanceMat
printDebug "distanceMap" $ M.size distanceMap printDebug "distanceMap" $ M.size distanceMap
...@@ -135,7 +131,8 @@ textFlow' termType contexts = do ...@@ -135,7 +131,8 @@ textFlow' termType contexts = do
pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
----------------------------------------------------------- -----------------------------------------------------------
-- distance should not be a map since we just "toList" it (same as cLouvain) -- | From data to Graph
-- FIXME: distance should not be a map since we just "toList" it (same as cLouvain)
data2graph :: [(Label, Int)] -> Map (Int, Int) Int data2graph :: [(Label, Int)] -> Map (Int, Int) Int
-> Map (Int, Int) Double -> Map (Int, Int) Double
-> [LouvainNode] -> [LouvainNode]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment