[DOC] adding haddock documentation (compiles). Commenting src-test. Focusing...

[DOC] adding haddock documentation (compiles). Commenting src-test. Focusing on doctest in documentation.

[DOC] adding haddock documentation (compiles). Commenting src-test. Focusing...
[DOC] adding haddock documentation (compiles). Commenting src-test. Focusing on doctest in documentation.
fcec7770 · Alexandre Delanoë · c1004ce9 · fcec7770 · fcec7770 · fcec7770
Commit fcec7770 authored Aug 22, 2018 by Alexandre Delanoë
12 changed files
--- a/package.yaml
+++ b/package.yaml
 name: gargantext
 version: '0.1.0.0'
-synopsis: Deep (Collaborative) Text mining project
+synopsis: Search, map, share
 description: Please see README.md
 category: Data
 author: Gargantext Team
@@ -27,17 +27,22 @@ library:
  - Gargantext.API
  - Gargantext.Core
  - Gargantext.Core.Types
+  - Gargantext.Database
  - Gargantext.Prelude
  - Gargantext.Text
  - Gargantext.Text.Context
  - Gargantext.Text.List.CSV
+  - Gargantext.Text.Metrics
  - Gargantext.Text.Metrics.Count
+  - Gargantext.Text.Metrics.CharByChar
  - Gargantext.Text.Parsers.CSV
+  - Gargantext.Text.Parsers.Date
  - Gargantext.Text.Search
  - Gargantext.Text.Terms
  - Gargantext.Text.Terms.WithList
  - Gargantext.TextFlow
  - Gargantext.Viz.Graph.Distances.Matrice
+  - Gargantext.Viz.Graph.Index
  dependencies:
  - QuickCheck
  - accelerate
@@ -175,23 +180,23 @@ executables:
      - full-text-search
 tests:
-  garg-test:
+#   garg-test:
-    main: Main.hs
+#     main: Main.hs
-    source-dirs: src-test
+#     source-dirs: src-test
-    ghc-options:
+#     ghc-options:
-    - -threaded
+#     - -threaded
-    - -rtsopts
+#     - -rtsopts
-    - -with-rtsopts=-N
+#     - -with-rtsopts=-N
-    dependencies:
+#     dependencies:
-    - base
+#     - base
-    - gargantext
+#     - gargantext
-    - hspec
+#     - hspec
-    - QuickCheck
+#     - QuickCheck
-    - quickcheck-instances 
+#     - quickcheck-instances 
-    - time
+#     - time
-    - parsec
+#     - parsec
-    - duckling
+#     - duckling
-    - text
+#     - text
  garg-doctest:
    main: Main.hs
    source-dirs: src-doctest

--- a/src-doctest/Main.hs
+++ b/src-doctest/Main.hs
@@ -2,5 +2,5 @@ import System.FilePath.Glob
 import Test.DocTest
 main :: IO ()
-main = glob "src/**/*.hs" >>= doctest
+main = glob "src/Gargantext/Text/Metrics.hs" >>= doctest
--- a/src/Gargantext.hs
+++ b/src/Gargantext.hs
 {-|
 Module      : Gargantext
-Description : Textmining platform
+Description : Textmining Collaborative Platform
 Copyright   : (c) CNRS, 2017-Present
 License     : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
@@ -10,18 +10,17 @@ Portability : POSIX
 @Gargantext@: search, map, share
 -}
-module Gargantext (
+module Gargantext ( module Gargantext.API
-                     module Gargantext.Database
+                  , module Gargantext.Core
--                     module Gargantext.Ngrams,
+                  , module Gargantext.Database
--                     module Gargantext.Utils,
+                  , module Gargantext.Prelude
-                    ) where
+                  , module Gargantext.Text
+--                  , module Gargantext.Viz
+                  ) where
+import Gargantext.API
+import Gargantext.Core
 import Gargantext.Database
-- import Gargantext.Ngrams
+import Gargantext.Prelude
-- import Gargantext.Utils
+import Gargantext.Text
+--import Gargantext.Viz
--- a/src/Gargantext/Core.hs
+++ b/src/Gargantext/Core.hs
 {-|
 Module      : Gargantext.Core
-Description : Which Natural language is supported ?
+Description : Supported Natural language
 Copyright   : (c) CNRS, 2017-Present
 License     : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
@@ -15,8 +15,6 @@ module Gargantext.Core
 ------------------------------------------------------------------------
 -- | Language of a Text
 -- For simplicity, we suppose text has an homogenous language
-data Lang = EN | FR
 -- | DE | IT | SP
 --  EN == english
 --  FR == french
@@ -24,3 +22,4 @@ data Lang = EN | FR
 --  IT == italian (not implemented yet)
 --  SP == spanish (not implemented yet)
 --  ... add your language and help us to implement it (:
+data Lang = EN | FR
--- a/src/Gargantext/Database/TextSearch.hs
+++ b/src/Gargantext/Database/TextSearch.hs
@@ -28,7 +28,7 @@ import Data.Text (Text, words)
 import Database.PostgreSQL.Simple
 import Database.PostgreSQL.Simple.ToField
-import Gargantext (connectGargandb)
+import Gargantext.Database.Utils (connectGargandb)
 import Gargantext.Prelude
 newtype TSQuery = UnsafeTSQuery [Text]

--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -9,18 +9,13 @@ Portability : POSIX
 Mainly reexport functions in @Data.Text.Metrics@
-TODO
-noApax :: Ord a => Map a Occ -> Map a Occ
-noApax m = M.filter (>1) m
 -}
 {-# LANGUAGE BangPatterns      #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
-module Gargantext.Text.Metrics 
+module Gargantext.Text.Metrics
  where
 import Data.Ord (Down(..))
@@ -74,7 +69,6 @@ filterCooc fc cc = (filterCooc' fc) ts cc
  where
    ts     = map _scored_terms $ takeSome fc $ coocScored cc
 filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
  foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
@@ -95,7 +89,7 @@ takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
                    $ takeSample n m
                    $ L.take l' $ sortWith (Down . _scored_incExc) scores
-                    -- $ splitKmeans k scores
+                    -- splitKmeans k scores
  where
    -- TODO: benchmark with accelerate-example kmeans version
    --splitKmeans x xs = L.concat $ map elements
@@ -172,15 +166,8 @@ metrics_sentences_Test :: Bool
 metrics_sentences_Test = metrics_sentences == metrics_sentences'
 -- | Terms reordered to visually check occurrences
-- >>> 
+-- >>> metrics_terms
-{- [ [["table"],["glass"],["wine"],["spoon"]]
+-- [[["table"],["glass"],["wine"],["spoon"]],[["glass"],["table"]],[["spoon"],["table"]],[["glass"],["table"],["wine"]],[["glass"],["wine"]]]
-   , [["glass"],["table"]]
-   , [["spoon"],["table"]]
-   , [["glass"],["table"],["wine"]]
-   , [["glass"],["wine"]]
-   ]
-}
 metrics_terms :: IO [[Terms]]
 metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
@@ -197,9 +184,13 @@ metrics_occ = occurrences <$> L.concat <$> metrics_terms
 {- 
 -- fromList [((["glas"],["object"]),6)
            ,((["glas"],["spoon"]),4)
-            ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
+            ,((["glas"],["table"]),6)
+            ,((["object"],["spoon"]),6)
+            ,((["object"],["table"]),9)
+            ,((["spoon"],["table"]),6)]
 -}
 metrics_cooc :: IO (Map (Label, Label) Int)
 metrics_cooc = cooc <$> metrics_terms

--- a/src/Gargantext/Text/Parsers.hs
+++ b/src/Gargantext/Text/Parsers.hs
@@ -37,7 +37,8 @@ import Data.Attoparsec.ByteString (parseOnly, Parser)
 import Data.Text (Text)
 import qualified Data.Text as DT
-- | Activate Async for to parse in parallel
+-- Activate Async to parse in parallel
 import Control.Concurrent.Async as CCA (mapConcurrently)
 import Data.Text.Encoding (decodeUtf8)

--- a/src/Gargantext/Text/Terms/Multi/Lang/Fr.hs
+++ b/src/Gargantext/Text/Terms/Multi/Lang/Fr.hs
@@ -28,12 +28,12 @@ group [] = []
 group ntags = group2 NP NP
            $ group2 NP VB
            $ group2 NP IN
-            -- $ group2 IN DT
+            -- - $ group2 IN DT
            $ group2 VB NP
            $ group2 JJ NP
            $ group2 NP JJ
            $ group2 JJ JJ
-            -- $ group2 JJ CC
+            -- - $ group2 JJ CC
            $ ntags
 ------------------------------------------------------------------------

--- a/src/Gargantext/TextFlow.hs
+++ b/src/Gargantext/TextFlow.hs
@@ -55,15 +55,15 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
 printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
 printDebug msg x = putStrLn $ msg <> " " <> show x
--printDebug _ _ = pure ()
+-- printDebug _ _ = pure ()
 data TextFlow = CSV FilePath
              | FullText FilePath
              | Contexts [T.Text]
              | SQL Int
              | Database T.Text
-                -- | ExtDatabase Query
+                -- ExtDatabase Query
-                -- | IntDatabase NodeId
+                -- IntDatabase NodeId
 textFlow :: TermType Lang -> TextFlow -> IO Graph
 textFlow termType workType = do
@@ -88,7 +88,7 @@ textFlow' termType contexts = do
  printDebug "myterms" (sum $ map length myterms)
  -- Bulding the map list
-  -- compute copresences of terms
+  -- compute copresences of terms, i.e. cooccurrences of terms in the same context of text
  -- Cooc = Map (Term, Term) Int
  let myCooc1 = cooc myterms
  printDebug "myCooc1" (M.size myCooc1)
@@ -98,13 +98,14 @@ textFlow' termType contexts = do
  printDebug "myCooc2" (M.size myCooc2)
  -- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
-  let myCooc3 = filterCooc ( FilterConfig (MapListSize   1000 )
+  let myCooc3 = filterCooc ( FilterConfig (MapListSize    100 )
-                                          (InclusionSize 4000 )
+                                          (InclusionSize  400 )
                                          (SampleBins      10 )
                                          (Clusters         3 )
                                          (DefaultValue     0 )
                           ) myCooc2
  printDebug "myCooc3" $ M.size myCooc3
+  putStrLn $ show myCooc3
  -- Cooc -> Matrix
  let (ti, _) = createIndices myCooc3

--- a/src/Gargantext/Viz.hs
+++ b/src/Gargantext/Viz.hs
+{-|
+Module      : Gargantext.Viz
+Description : Viz tools
+Copyright   : (c) CNRS, 2018
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+Visualization of text statistics
+-}
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}
+module Gargantext.Viz
+  where
--- a/src/Gargantext/Viz/Graph/Distances/Matrice.hs
+++ b/src/Gargantext/Viz/Graph/Distances/Matrice.hs
@@ -7,6 +7,11 @@ Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX
+Two main measures are currently implemented in order to compute the proximity of two terms.
+- The conditional measure is an absolute measure which reflects the interactions of 2 terms in the corpus.
+- The distributional measure is a relative measure which depends on the selected list; it represents structural equivalence.
 Motivation and definition of the @Conditional@ distance.
 Implementation use Accelerate library :
@@ -44,7 +49,7 @@ import qualified Gargantext.Prelude as P
 -----------------------------------------------------------------------
-- Test perf.
+-- | Test perf.
 distriTest :: Matrix Double
 distriTest = distributional $ myMat 100
 -----------------------------------------------------------------------
@@ -67,11 +72,11 @@ rank m = arrayRank $ arrayShape m
 -- How to force use with SquareMatrix ?
 type Dim = Int
-dim :: (Matrix a) -> Dim
+dim :: Matrix a -> Dim
 dim m = n
  where
    Z :. _ :. n = arrayShape m
-    -- == indexTail (arrayShape m)
+    -- indexTail (arrayShape m)
 -----------------------------------------------------------------------
 proba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
@@ -80,7 +85,7 @@ proba r mat = zipWith (/) mat (mkSum r mat)
 mkSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 mkSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum mat
-- divByDiag 
+-- | divByDiag 
 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
  where
@@ -94,11 +99,16 @@ miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
    miniMax' = (the $ minimum $ maximum m)
 -- | Conditional distance (basic version)
 conditional :: Matrix Int -> Matrix Double
 conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m)
 -- | Conditional distance (advanced version)
+-- The conditional measure \(P_c\) of 2 terms @i@ and @j@, also called "confidence",
+-- is the maximum of the conditional probabilities between @i@ and @j@. If \(n_i\) (resp.
+-- \(n_j\)) is the number of occurrences of @i@ (resp. @j@) in the corpus and \(n_{ij}\) the number of their cooccurrences, we get:
+-- \[P_c=\max(\frac{n_{ij}}{n_i},\frac{n_{ij}}{n_j} )\]
 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
  where
@@ -121,6 +131,13 @@ conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegr
 -----------------------------------------------------------------------
 -- | Distributional Distance
+-- The distributional measure \(S_{MI}\) of @i@ and @j@ terms is:
+-- \[ S_{MI}(i,j) = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik}, MI_{jk})}{\sum_{k \neq i,j ; MI_{ik} >0}^{} MI_{ik}}
+-- \]
+-- \[MI_{ij} = \log(\frac{C_{ij}}{E_{ij}})\] is the mutual information of @i@ and @j@
+-- \[C_{ij}\] is the number of cooccurrences of @i@ and @j@ in the same context of text
+-- \[E_{ij} = \frac {S_{i} S_{j}} {N}\] is the expected value of the cooccurrences
+-- \[N = \sum_{i}^{} S_{i}\] is the total number of cooccurrences, \(S_{i}\) being the total cooccurrences of the @i@ term
 distributional :: Matrix Int -> Matrix Double
 distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
  where

--- a/stack.yaml
+++ b/stack.yaml
@@ -36,7 +36,7 @@ extra-deps:
 - servant-server-0.13
 - servant-swagger-ui-0.2.3.2.2.8
 - stemmer-0.5.2
- text-1.2.3.0
+  # - text-1.2.3.0
 - text-show-3.6.2
 - servant-flatten-0.2
 - serialise-0.2.0.0  # imt-api-client