Commit fcec7770 authored by Alexandre Delanoë

[DOC] adding haddock documentation (compiles). Commenting src-test. Focusing on doctest in documentation.
parent c1004ce9
 name: gargantext
 version: '0.1.0.0'
-synopsis: Deep (Collaborative) Text mining project
+synopsis: Search, map, share
 description: Please see README.md
 category: Data
 author: Gargantext Team
@@ -27,17 +27,22 @@ library:
 - Gargantext.API
 - Gargantext.Core
 - Gargantext.Core.Types
+- Gargantext.Database
 - Gargantext.Prelude
 - Gargantext.Text
 - Gargantext.Text.Context
 - Gargantext.Text.List.CSV
+- Gargantext.Text.Metrics
 - Gargantext.Text.Metrics.Count
+- Gargantext.Text.Metrics.CharByChar
 - Gargantext.Text.Parsers.CSV
+- Gargantext.Text.Parsers.Date
 - Gargantext.Text.Search
 - Gargantext.Text.Terms
 - Gargantext.Text.Terms.WithList
 - Gargantext.TextFlow
 - Gargantext.Viz.Graph.Distances.Matrice
+- Gargantext.Viz.Graph.Index
 dependencies:
 - QuickCheck
 - accelerate
@@ -175,23 +180,23 @@ executables:
 - full-text-search
 tests:
-  garg-test:
-    main: Main.hs
-    source-dirs: src-test
-    ghc-options:
-    - -threaded
-    - -rtsopts
-    - -with-rtsopts=-N
-    dependencies:
-    - base
-    - gargantext
-    - hspec
-    - QuickCheck
-    - quickcheck-instances
-    - time
-    - parsec
-    - duckling
-    - text
+#  garg-test:
+#    main: Main.hs
+#    source-dirs: src-test
+#    ghc-options:
+#    - -threaded
+#    - -rtsopts
+#    - -with-rtsopts=-N
+#    dependencies:
+#    - base
+#    - gargantext
+#    - hspec
+#    - QuickCheck
+#    - quickcheck-instances
+#    - time
+#    - parsec
+#    - duckling
+#    - text
   garg-doctest:
     main: Main.hs
     source-dirs: src-doctest
...
@@ -2,5 +2,5 @@ import System.FilePath.Glob
 import Test.DocTest
 main :: IO ()
-main = glob "src/**/*.hs" >>= doctest
+main = glob "src/Gargantext/Text/Metrics.hs" >>= doctest
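The doctest runner now globs only src/Gargantext/Text/Metrics.hs, so the `-- >>>` examples in that module's Haddock comments are executed as tests (presumably via the garg-doctest target declared in package.yaml, e.g. something like `stack test gargantext:garg-doctest`). A hedged sketch of the kind of example it picks up; the function below is hypothetical and assumes Data.Text's Text and words are in scope together with OverloadedStrings:

-- | Count the words of a chunk of text.
--
-- >>> wordCount "glass of wine on the table"
-- 6
wordCount :: Text -> Int
wordCount = length . words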
 {-|
 Module : Gargantext
-Description : Textmining platform
+Description : Textmining Collaborative Platform
 Copyright : (c) CNRS, 2017-Present
 License : AGPL + CECILL v3
 Maintainer : team@gargantext.org
@@ -10,18 +10,17 @@ Portability : POSIX
 @Gargantext@: search, map, share
 -}
-module Gargantext (
-    module Gargantext.Database
---  module Gargantext.Ngrams,
---  module Gargantext.Utils,
-  ) where
+module Gargantext ( module Gargantext.API
+                  , module Gargantext.Core
+                  , module Gargantext.Database
+                  , module Gargantext.Prelude
+                  , module Gargantext.Text
+--                , module Gargantext.Viz
+                  ) where
+import Gargantext.API
+import Gargantext.Core
 import Gargantext.Database
--- import Gargantext.Ngrams
--- import Gargantext.Utils
---import Gargantext.Viz
+import Gargantext.Prelude
+import Gargantext.Text
 {-|
 Module : Gargantext.Core
-Description : Which Natural language is supported ?
+Description : Supported Natural language
 Copyright : (c) CNRS, 2017-Present
 License : AGPL + CECILL v3
 Maintainer : team@gargantext.org
@@ -15,8 +15,6 @@ module Gargantext.Core
 ------------------------------------------------------------------------
 -- | Language of a Text
 -- For simplicity, we suppose text has an homogenous language
-data Lang = EN | FR
 -- | DE | IT | SP
 -- EN == english
 -- FR == french
@@ -24,3 +22,4 @@ data Lang = EN | FR
 -- IT == italian (not implemented yet)
 -- SP == spanish (not implemented yet)
 -- ... add your language and help us to implement it (:
+data Lang = EN | FR
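A minimal sketch (not part of the commit) of how the Lang tag can be dispatched on; the function name and the stop-word values below are hypothetical and assume Data.Text's Text with OverloadedStrings:

stopWords :: Lang -> [Text]
stopWords EN = ["the", "of", "and"]
stopWords FR = ["le", "la", "de"]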
@@ -28,7 +28,7 @@ import Data.Text (Text, words)
 import Database.PostgreSQL.Simple
 import Database.PostgreSQL.Simple.ToField
-import Gargantext (connectGargandb)
+import Gargantext.Database.Utils (connectGargandb)
 import Gargantext.Prelude
 newtype TSQuery = UnsafeTSQuery [Text]
...
@@ -9,18 +9,13 @@ Portability : POSIX
 Mainly reexport functions in @Data.Text.Metrics@
-TODO
-noApax :: Ord a => Map a Occ -> Map a Occ
-noApax m = M.filter (>1) m
 -}
 {-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
 module Gargantext.Text.Metrics
   where
 import Data.Ord (Down(..))
@@ -74,7 +69,6 @@ filterCooc fc cc = (filterCooc' fc) ts cc
   where
     ts = map _scored_terms $ takeSome fc $ coocScored cc
 filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
   foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
@@ -95,7 +89,7 @@ takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
   $ takeSample n m
   $ L.take l' $ sortWith (Down . _scored_incExc) scores
--- $ splitKmeans k scores
+-- splitKmeans k scores
   where
     -- TODO: benchmark with accelerate-example kmeans version
     --splitKmeans x xs = L.concat $ map elements
@@ -172,15 +166,8 @@ metrics_sentences_Test :: Bool
 metrics_sentences_Test = metrics_sentences == metrics_sentences'
 -- | Terms reordered to visually check occurrences
--- >>>
-{- [ [["table"],["glass"],["wine"],["spoon"]]
-   , [["glass"],["table"]]
-   , [["spoon"],["table"]]
-   , [["glass"],["table"],["wine"]]
-   , [["glass"],["wine"]]
-   ]
--}
+-- >>> metrics_terms
+-- [[["table"],["glass"],["wine"],["spoon"]],[["glass"],["table"]],[["spoon"],["table"]],[["glass"],["table"],["wine"]],[["glass"],["wine"]]]
 metrics_terms :: IO [[Terms]]
 metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
@@ -197,9 +184,13 @@ metrics_occ = occurrences <$> L.concat <$> metrics_terms
 {-
 -- fromList [((["glas"],["object"]),6)
             ,((["glas"],["spoon"]),4)
-            ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
+            ,((["glas"],["table"]),6)
+            ,((["object"],["spoon"]),6)
+            ,((["object"],["table"]),9)
+            ,((["spoon"],["table"]),6)]
 -}
 metrics_cooc :: IO (Map (Label, Label) Int)
 metrics_cooc = cooc <$> metrics_terms
...
@@ -37,7 +37,8 @@ import Data.Attoparsec.ByteString (parseOnly, Parser)
 import Data.Text (Text)
 import qualified Data.Text as DT
--- | Activate Async for to parse in parallel
+-- Activate Async for to parse in parallel
 import Control.Concurrent.Async as CCA (mapConcurrently)
 import Data.Text.Encoding (decodeUtf8)
...
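A minimal sketch (not from the commit) of the parallel parsing that mapConcurrently enables; parseOneFile is a hypothetical stand-in for the module's real parser:

parseAllConcurrently :: (FilePath -> IO Text) -> [FilePath] -> IO [Text]
parseAllConcurrently parseOneFile paths = CCA.mapConcurrently parseOneFile paths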
@@ -28,12 +28,12 @@ group [] = []
 group ntags = group2 NP NP
             $ group2 NP VB
             $ group2 NP IN
--- $ group2 IN DT
+-- - $ group2 IN DT
             $ group2 VB NP
             $ group2 JJ NP
             $ group2 NP JJ
             $ group2 JJ JJ
--- $ group2 JJ CC
+-- - $ group2 JJ CC
             $ ntags
 ------------------------------------------------------------------------
...
@@ -55,15 +55,15 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
 printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
 printDebug msg x = putStrLn $ msg <> " " <> show x
---printDebug _ _ = pure ()
+-- printDebug _ _ = pure ()
 data TextFlow = CSV FilePath
               | FullText FilePath
               | Contexts [T.Text]
               | SQL Int
               | Database T.Text
--- | ExtDatabase Query
--- | IntDatabase NodeId
+-- ExtDatabase Query
+-- IntDatabase NodeId
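A hedged usage sketch of the workflow entry point defined just below, combining the Contexts constructor with the MonoMulti term type used elsewhere in this commit (the values are illustrative only, and OverloadedStrings is assumed):

contextsGraph :: IO Graph
contextsGraph = textFlow (MonoMulti EN) (Contexts ["glass of wine on the table", "spoon and glass"])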
 textFlow :: TermType Lang -> TextFlow -> IO Graph
 textFlow termType workType = do
@@ -88,7 +88,7 @@ textFlow' termType contexts = do
   printDebug "myterms" (sum $ map length myterms)
   -- Bulding the map list
-  -- compute copresences of terms
+  -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
   -- Cooc = Map (Term, Term) Int
   let myCooc1 = cooc myterms
   printDebug "myCooc1" (M.size myCooc1)
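For illustration only (hypothetical values, not part of the diff), and assuming the term keys are lists of Text as in the metrics_cooc example above, a cooccurrence map of that shape could look like:

M.fromList [ ((["glass"], ["table"]), 2)
           , ((["glass"], ["wine"]) , 1) ] :: Map ([Text], [Text]) Int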
@@ -98,13 +98,14 @@ printDebug "myCooc2" (M.size myCooc2)
   printDebug "myCooc2" (M.size myCooc2)
   -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
-  let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
-                                          (InclusionSize 4000 )
+  let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 )
+                                          (InclusionSize 400 )
                                           (SampleBins 10 )
                                           (Clusters 3 )
                                           (DefaultValue 0 )
                            ) myCooc2
   printDebug "myCooc3" $ M.size myCooc3
+  putStrLn $ show myCooc3
   -- Cooc -> Matrix
   let (ti, _) = createIndices myCooc3
...
{-|
Module : Gargantext.Viz
Description : Viz tools
Copyright : (c) CNRS, 2018
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Vizualisation of text stats
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Viz
where
@@ -7,6 +7,11 @@ Maintainer : team@gargantext.org
 Stability : experimental
 Portability : POSIX
+2 main measures are actually implemented in order to compute the proximity of two terms.
+- Conditional measure is an absolute measure which reflects interactions of 2 terms in the corpus.
+- Distributional measure is a relative measure which depends on the selected list, it represents structural equivalence.
 Motivation and definition of the @Conditional@ distance.
 Implementation use Accelerate library :
@@ -44,7 +49,7 @@ import qualified Gargantext.Prelude as P
 -----------------------------------------------------------------------
--- Test perf.
+-- | Test perf.
 distriTest :: Matrix Double
 distriTest = distributional $ myMat 100
 -----------------------------------------------------------------------
@@ -67,11 +72,11 @@ rank m = arrayRank $ arrayShape m
 -- How to force use with SquareMatrix ?
 type Dim = Int
-dim :: (Matrix a) -> Dim
+dim :: Matrix a -> Dim
 dim m = n
   where
     Z :. _ :. n = arrayShape m
-    -- == indexTail (arrayShape m)
+    -- indexTail (arrayShape m)
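Since the commit focuses on doctests, a hedged doctest-style sketch for dim (assuming Data.Array.Accelerate's fromList is in scope); dim returns the column dimension of the matrix:

-- >>> dim (fromList (Z :. 2 :. 3) [1..6] :: Matrix Double)
-- 3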
 -----------------------------------------------------------------------
 proba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
@@ -80,7 +85,7 @@ proba r mat = zipWith (/) mat (mkSum r mat)
 mkSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 mkSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum mat
--- divByDiag
+-- | divByDiag
 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
   where
@@ -94,11 +99,16 @@ miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
     miniMax' = (the $ minimum $ maximum m)
 -- | Conditional distance (basic version)
 conditional :: Matrix Int -> Matrix Double
 conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m)
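A hedged usage sketch mirroring distriTest above (it assumes, as that test suggests, that myMat builds an Int matrix of the given size):

condiTest :: Matrix Double
condiTest = conditional $ myMat 100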
 -- | Conditional distance (advanced version)
+-- The conditional measure \[P_c\] of 2 terms @i@ and @j@, also called "confidence",
+-- is the maximum conditional probability between @i@ and @j@. If \[n_i\] (resp. \[n_j\])
+-- is the number of occurrences of @i@ (resp. @j@) in the corpus and \[n_{ij}\] the number
+-- of their cooccurrences, we get:
+-- \[P_c = max(\frac{n_{ij}}{n_i}, \frac{n_{ij}}{n_j})\]
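A hedged scalar illustration of the confidence measure above (hypothetical helper, not part of the module): with n_i = 10, n_j = 5 and n_ij = 4, the result is max (4/10) (4/5) = 0.8.

confidence :: Double -> Double -> Double -> Double
confidence nij ni nj = max (nij / ni) (nij / nj)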
 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
   where
@@ -121,6 +131,13 @@ conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegr
 -----------------------------------------------------------------------
 -- | Distributional Distance
+-- The distributional measure \[P_c\] of @i@ and @j@ terms is:
+-- \[ S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} > 0}^{} \min(MI_{ik}, MI_{jk})}{\sum_{k \neq i,j ; MI_{ik} > 0}^{}} \]
+-- where \[S_{MI}(i,j) = \log(\frac{C_{ij}}{E_{ij}})\] is the mutual information,
+-- \[C_{ij}\] is the number of cooccurrences of @i@ and @j@ in the same context of text,
+-- \[E_{ij} = \frac {S_{i} S_{j}} {N}\] is the expected value of the cooccurrences, and
+-- \[N = \sum_{i}^{} S_{i}\] is the total number of cooccurrences.
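A hedged scalar illustration of the quantities above (hypothetical helper, not part of the module): the mutual information of a pair of terms from raw counts, with E_ij = S_i * S_j / N:

mutualInfo :: Double -> Double -> Double -> Double -> Double
mutualInfo cij si sj n = log (cij / (si * sj / n))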
 distributional :: Matrix Int -> Matrix Double
 distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
   where
...
@@ -36,7 +36,7 @@ extra-deps:
 - servant-server-0.13
 - servant-swagger-ui-0.2.3.2.2.8
 - stemmer-0.5.2
-- text-1.2.3.0
+# - text-1.2.3.0
 - text-show-3.6.2
 - servant-flatten-0.2
 - serialise-0.2.0.0 # imt-api-client
...