Commit 16a85fb4 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[DOC+TESTS] contexts of texts.

parent 198b4ef8
......@@ -25,26 +25,29 @@ library:
exposed-modules:
- Gargantext
- Gargantext.API
- Gargantext.API.Auth
- Gargantext.API.Count
- Gargantext.API.FrontEnd
- Gargantext.API.Orchestrator
- Gargantext.API.Search
- Gargantext.API.FrontEnd
- Gargantext.API.Count
- Gargantext.API.Auth
- Gargantext.API.Settings
- Gargantext.Core
- Gargantext.Core.Types
- Gargantext.Core.Types.Main
- Gargantext.Core.Types.Node
- Gargantext.Core.Utils.Prefix
- Gargantext.Database
- Gargantext.Prelude
- Gargantext.Text
- Gargantext.Text.Context
- Gargantext.Text.Examples
- Gargantext.Text.List.CSV
- Gargantext.Text.Metrics
- Gargantext.Text.Metrics.Examples
- Gargantext.Text.Metrics.Count
- Gargantext.Text.Metrics.CharByChar
- Gargantext.Text.Metrics.Count
- Gargantext.Text.Parsers.CSV
- Gargantext.Text.Parsers.WOS
- Gargantext.Text.Parsers.Date
- Gargantext.Text.Parsers.WOS
- Gargantext.Text.Search
- Gargantext.Text.Terms
- Gargantext.Text.Terms.WithList
......
......@@ -2,5 +2,5 @@ import System.FilePath.Glob
import Test.DocTest
main :: IO ()
main = glob "src/Gargantext/Text/Metrics.hs" >>= doctest
main = glob "src/Gargantext/Text/" >>= doctest
......@@ -30,6 +30,7 @@ module Gargantext.Prelude
import GHC.Exts (sortWith)
import Control.Monad.IO.Class (MonadIO)
import Data.Maybe (isJust, fromJust, maybe)
import Protolude ( Bool(True, False), Int, Int64, Double, Integer
, Fractional, Num, Maybe(Just,Nothing)
......@@ -73,6 +74,11 @@ import Text.Read (Read())
import Data.String.Conversions (cs)
printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
printDebug msg x = putStrLn $ msg <> " " <> show x
-- printDebug _ _ = pure ()
map2 :: (t -> b) -> [[t]] -> [[b]]
map2 fun = map (map fun)
......
{-|
Module : Gargantext.Text.Context
Description :
Description : How to manage contexts of texts ?
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Context of text management tool, here are logic of main types.
Context of text management tool, here are logic of main types:
- Term
- Multi-term
- Label
- Sentence
- Corpus
How to split contexts is describes in this module.
-}
......@@ -24,9 +32,7 @@ import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
import Gargantext.Text
import Gargantext.Prelude hiding (length)
------------------------------------------------------------------------
type Term = Text
type MultiTerm = [Term]
type Label = MultiTerm
......@@ -38,21 +44,25 @@ type Corpus a = [Sentence a] -- a list of sentences
-- type ConText a = [Sentence a]
-- type Corpus a = [ConText a]
------------------------------------------------------------------------
-- | Contexts definition to build/unbuild contexts.
data SplitContext = Chars Int | Sentences Int | Paragraphs Int
tag :: Text -> [Tag Text]
tag = parseTags
-- | splitBy contexts of Chars or Sentences or Paragraphs
-- >> splitBy (Chars 0) "abcde"
-- To see some examples at a higher level (sentences and paragraph), see
-- 'Gargantext.Text.Examples.ex_terms'
--
-- >>> splitBy (Chars 0) (pack "abcde")
-- ["a","b","c","d","e"]
-- >> splitBy (Chars 1) "abcde"
--
-- >>> splitBy (Chars 1) (pack "abcde")
-- ["ab","bc","cd","de"]
-- >> splitBy (Chars 2) "abcde"
--
-- >>> splitBy (Chars 2) (pack "abcde")
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy (Chars n) = map pack . chunkAlong (n+1) 1 . unpack
......
{-|
Module : Gargantext.Text.Metrics.Examples
Module : Gargantext.Text.Examples
Description : Minimal Examples to test behavior of the functions.
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
......@@ -10,15 +10,23 @@ Portability : POSIX
This file is intended for these purposes:
- documentation for teaching and research
- learn basics of Haskell which is a scientific programming language
- behavioral tests (that should be completed with uni-tests and scale-tests
This documents defines basic of Text definitions according to Gargantext..
- What is a term ?
- What is a sentence ?
- What is a paragraph ?
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Metrics.Examples
module Gargantext.Text.Examples
where
import Data.Ord (Down(..))
......@@ -45,56 +53,61 @@ import Gargantext.Viz.Graph.Index
import qualified Data.Array.Accelerate as DAA
-- | From list to simple text
--
-- >>> metrics_text
-- "There is a table with a glass of wine and a spoon. I can see the glass on the table. There was only a spoon on that table. The glass just fall from the table, pouring wine everywhere. I wish the glass did not contain wine."
metrics_text :: Text
metrics_text = T.intercalate " " metrics_sentences
-- | Sentences
-- Let be a list of Texts: ['Data.Text.Text']. Each text in this example is a sentence.
--
-- >>> metrics_sentences
-- >>> ex_sentences
-- ["There is a table with a glass of wine and a spoon.","I can see the glass on the table.","There was only a spoon on that table.","The glass just fall from the table, pouring wine everywhere.","I wish the glass did not contain wine."]
metrics_sentences :: [Text]
metrics_sentences = [ "There is a table with a glass of wine and a spoon."
, "I can see the glass on the table."
, "There was only a spoon on that table."
, "The glass just fall from the table, pouring wine everywhere."
, "I wish the glass did not contain wine."
]
ex_sentences :: [Text]
ex_sentences = [ "There is a table with a glass of wine and a spoon."
, "I can see the glass on the table."
, "There was only a spoon on that table."
, "The glass just fall from the table, pouring wine everywhere."
, "I wish the glass did not contain wine."
]
-- | From list to simple text as paragraph.
-- Let 'Data.Text.intercalate' each sentence with a space. Result is a paragraph.
--
-- >>> T.intercalate (T.pack " ") ex_sentences
-- "There is a table with a glass of wine and a spoon. I can see the glass on the table. There was only a spoon on that table. The glass just fall from the table, pouring wine everywhere. I wish the glass did not contain wine."
ex_paragraph :: Text
ex_paragraph = T.intercalate " " ex_sentences
metrics_sentences_Test :: Bool
metrics_sentences_Test = metrics_sentences == splitBy (Sentences 0) metrics_text
-- | Let split sentences by Contexts of text.
-- More about 'Gargantext.Text.Context'
--
-- >>> ex_sentences == splitBy (Sentences 0) ex_paragraph
-- True
-- | Terms reordered to visually check occurrences
-- Split text by sentence and then extract ngrams.
--
-- >>> metrics_terms
-- >>> mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) ex_paragraph
-- [[["table"],["glass"],["wine"],["spoon"]],[["glass"],["table"]],[["spoon"],["table"]],[["glass"],["table"],["wine"]],[["glass"],["wine"]]]
metrics_terms :: IO [[Terms]]
metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
ex_terms :: IO [[Terms]]
ex_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) ex_paragraph
-- | Test the Occurrences
--
-- >>> metrics_occ
-- >>> occurrences <$> L.concat <$> ex_terms
-- fromList [(fromList ["glass"],fromList [(["glass"],4)]),(fromList ["spoon"],fromList [(["spoon"],2)]),(fromList ["tabl"],fromList [(["table"],4)]),(fromList ["wine"],fromList [(["wine"],3)])]
metrics_occ :: IO (Map Grouped (Map Terms Int))
metrics_occ = occurrences <$> L.concat <$> metrics_terms
ex_occ :: IO (Map Grouped (Map Terms Int))
ex_occ = occurrences <$> L.concat <$> ex_terms
-- | Test the cooccurrences
-- Use the 'Gargantext.Text.Metrics.Count.cooc' function.
--
-- >>> metrics_cooc
-- >>> cooc <$> ex_terms
-- fromList [((["glass"],["glass"]),4),((["spoon"],["glass"]),1),((["spoon"],["spoon"]),2),((["table"],["glass"]),3),((["table"],["spoon"]),2),((["table"],["table"]),4),((["wine"],["glass"]),3),((["wine"],["spoon"]),1),((["wine"],["table"]),2),((["wine"],["wine"]),3)]
metrics_cooc :: IO (Map (Label, Label) Int)
metrics_cooc = cooc <$> metrics_terms
ex_cooc :: IO (Map (Label, Label) Int)
ex_cooc = cooc <$> ex_terms
-- | Tests
metrics_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity))
metrics_cooc_mat = do
m <- metrics_cooc
ex_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity))
ex_cooc_mat = do
m <- ex_cooc
let (ti,_) = createIndices m
let mat_cooc = cooc2mat ti m
pure ( ti
......@@ -103,8 +116,8 @@ metrics_cooc_mat = do
, incExcSpeGen mat_cooc
)
metrics_incExcSpeGen :: IO ([(Label, Double)], [(Label, Double)])
metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
ex_incExcSpeGen :: IO ([(Label, Double)], [(Label, Double)])
ex_incExcSpeGen = incExcSpeGen_sorted <$> ex_cooc
incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
......
......@@ -21,7 +21,6 @@ import GHC.IO (FilePath)
import qualified Data.Text as T
import Data.Text.IO (readFile)
import Control.Monad.IO.Class (MonadIO)
import Data.Map.Strict (Map)
import qualified Data.Array.Accelerate as A
......@@ -53,9 +52,6 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
|___/
-}
printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
printDebug msg x = putStrLn $ msg <> " " <> show x
-- printDebug _ _ = pure ()
data TextFlow = CSV FilePath
| FullText FilePath
......@@ -97,7 +93,7 @@ textFlow' termType contexts = do
let myCooc2 = M.filter (>1) myCooc1
printDebug "myCooc2" (M.size myCooc2)
-- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
-- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 )
(InclusionSize 400 )
(SampleBins 10 )
......@@ -105,7 +101,7 @@ textFlow' termType contexts = do
(DefaultValue 0 )
) myCooc2
printDebug "myCooc3" $ M.size myCooc3
putStrLn $ show myCooc3
-- putStrLn $ show myCooc3
-- Cooc -> Matrix
let (ti, _) = createIndices myCooc3
......@@ -115,12 +111,12 @@ textFlow' termType contexts = do
printDebug "myCooc4" $ M.size myCooc4
let matCooc = map2mat (0) (M.size ti) myCooc4
--printDebug "matCooc" matCooc
-- printDebug "matCooc" matCooc
-- Matrix -> Clustering
let distanceMat = conditional matCooc
-- let distanceMat = distributional matCooc
printDebug "distanceMat" $ A.arrayShape distanceMat
--printDebug "distanceMat" distanceMat
-- printDebug "distanceMat" distanceMat
--
let distanceMap = mat2map distanceMat
printDebug "distanceMap" $ M.size distanceMap
......@@ -135,7 +131,8 @@ textFlow' termType contexts = do
pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
-----------------------------------------------------------
-- distance should not be a map since we just "toList" it (same as cLouvain)
-- | From data to Graph
-- FIXME: distance should not be a map since we just "toList" it (same as cLouvain)
data2graph :: [(Label, Int)] -> Map (Int, Int) Int
-> Map (Int, Int) Double
-> [LouvainNode]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment