Commit fcec7770 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[DOC] adding haddock documentation (compiles). Commenting src-test. Focusing...

[DOC] adding haddock documentation (compiles). Commenting src-test. Focusing on doctest in documentation.
parent c1004ce9
name: gargantext
version: '0.1.0.0'
synopsis: Deep (Collaborative) Text mining project
synopsis: Search, map, share
description: Please see README.md
category: Data
author: Gargantext Team
......@@ -27,17 +27,22 @@ library:
- Gargantext.API
- Gargantext.Core
- Gargantext.Core.Types
- Gargantext.Database
- Gargantext.Prelude
- Gargantext.Text
- Gargantext.Text.Context
- Gargantext.Text.List.CSV
- Gargantext.Text.Metrics
- Gargantext.Text.Metrics.Count
- Gargantext.Text.Metrics.CharByChar
- Gargantext.Text.Parsers.CSV
- Gargantext.Text.Parsers.Date
- Gargantext.Text.Search
- Gargantext.Text.Terms
- Gargantext.Text.Terms.WithList
- Gargantext.TextFlow
- Gargantext.Viz.Graph.Distances.Matrice
- Gargantext.Viz.Graph.Index
dependencies:
- QuickCheck
- accelerate
......@@ -175,23 +180,23 @@ executables:
- full-text-search
tests:
garg-test:
main: Main.hs
source-dirs: src-test
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
dependencies:
- base
- gargantext
- hspec
- QuickCheck
- quickcheck-instances
- time
- parsec
- duckling
- text
# garg-test:
# main: Main.hs
# source-dirs: src-test
# ghc-options:
# - -threaded
# - -rtsopts
# - -with-rtsopts=-N
# dependencies:
# - base
# - gargantext
# - hspec
# - QuickCheck
# - quickcheck-instances
# - time
# - parsec
# - duckling
# - text
garg-doctest:
main: Main.hs
source-dirs: src-doctest
......
......@@ -2,5 +2,5 @@ import System.FilePath.Glob
import Test.DocTest
main :: IO ()
main = glob "src/**/*.hs" >>= doctest
main = glob "src/Gargantext/Text/Metrics.hs" >>= doctest
{-|
Module : Gargantext
Description : Textmining platform
Description : Textmining Collaborative Platform
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -10,18 +10,17 @@ Portability : POSIX
@Gargantext@: search, map, share
-}
module Gargantext (
module Gargantext.Database
-- module Gargantext.Ngrams,
-- module Gargantext.Utils,
) where
module Gargantext ( module Gargantext.API
, module Gargantext.Core
, module Gargantext.Database
, module Gargantext.Prelude
, module Gargantext.Text
-- , module Gargantext.Viz
) where
import Gargantext.API
import Gargantext.Core
import Gargantext.Database
-- import Gargantext.Ngrams
-- import Gargantext.Utils
import Gargantext.Prelude
import Gargantext.Text
--import Gargantext.Viz
{-|
Module : Gargantext.Core
Description : Which Natural language is supported ?
Description : Supported Natural language
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -15,8 +15,6 @@ module Gargantext.Core
------------------------------------------------------------------------
-- | Language of a Text
-- For simplicity, we suppose text has an homogenous language
data Lang = EN | FR
-- | DE | IT | SP
-- EN == english
-- FR == french
......@@ -24,3 +22,4 @@ data Lang = EN | FR
-- IT == italian (not implemented yet)
-- SP == spanish (not implemented yet)
-- ... add your language and help us to implement it (:
data Lang = EN | FR
......@@ -28,7 +28,7 @@ import Data.Text (Text, words)
import Database.PostgreSQL.Simple
import Database.PostgreSQL.Simple.ToField
import Gargantext (connectGargandb)
import Gargantext.Database.Utils (connectGargandb)
import Gargantext.Prelude
newtype TSQuery = UnsafeTSQuery [Text]
......
......@@ -9,18 +9,13 @@ Portability : POSIX
Mainly reexport functions in @Data.Text.Metrics@
TODO
noApax :: Ord a => Map a Occ -> Map a Occ
noApax m = M.filter (>1) m
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Metrics
module Gargantext.Text.Metrics
where
import Data.Ord (Down(..))
......@@ -74,7 +69,6 @@ filterCooc fc cc = (filterCooc' fc) ts cc
where
ts = map _scored_terms $ takeSome fc $ coocScored cc
filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
......@@ -95,7 +89,7 @@ takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
$ takeSample n m
$ L.take l' $ sortWith (Down . _scored_incExc) scores
-- $ splitKmeans k scores
-- splitKmeans k scores
where
-- TODO: benchmark with accelerate-example kmeans version
--splitKmeans x xs = L.concat $ map elements
......@@ -172,15 +166,8 @@ metrics_sentences_Test :: Bool
metrics_sentences_Test = metrics_sentences == metrics_sentences'
-- | Terms reordered to visually check occurrences
-- >>>
{- [ [["table"],["glass"],["wine"],["spoon"]]
, [["glass"],["table"]]
, [["spoon"],["table"]]
, [["glass"],["table"],["wine"]]
, [["glass"],["wine"]]
]
-}
-- >>> metrics_terms
-- [[["table"],["glass"],["wine"],["spoon"]],[["glass"],["table"]],[["spoon"],["table"]],[["glass"],["table"],["wine"]],[["glass"],["wine"]]]
metrics_terms :: IO [[Terms]]
metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
......@@ -197,9 +184,13 @@ metrics_occ = occurrences <$> L.concat <$> metrics_terms
{-
-- fromList [((["glas"],["object"]),6)
,((["glas"],["spoon"]),4)
,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
,((["glas"],["table"]),6)
,((["object"],["spoon"]),6)
,((["object"],["table"]),9)
,((["spoon"],["table"]),6)]
-}
metrics_cooc :: IO (Map (Label, Label) Int)
metrics_cooc = cooc <$> metrics_terms
......
......@@ -37,7 +37,8 @@ import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Text (Text)
import qualified Data.Text as DT
-- | Activate Async for to parse in parallel
-- Activate Async for to parse in parallel
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Text.Encoding (decodeUtf8)
......
......@@ -28,12 +28,12 @@ group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
-- $ group2 IN DT
-- - $ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 NP JJ
$ group2 JJ JJ
-- $ group2 JJ CC
-- - $ group2 JJ CC
$ ntags
------------------------------------------------------------------------
......
......@@ -55,15 +55,15 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, LouvainNode(..))
printDebug :: (Show a, MonadIO m) => [Char] -> a -> m ()
printDebug msg x = putStrLn $ msg <> " " <> show x
--printDebug _ _ = pure ()
-- printDebug _ _ = pure ()
data TextFlow = CSV FilePath
| FullText FilePath
| Contexts [T.Text]
| SQL Int
| Database T.Text
-- | ExtDatabase Query
-- | IntDatabase NodeId
-- ExtDatabase Query
-- IntDatabase NodeId
textFlow :: TermType Lang -> TextFlow -> IO Graph
textFlow termType workType = do
......@@ -88,7 +88,7 @@ textFlow' termType contexts = do
printDebug "myterms" (sum $ map length myterms)
-- Bulding the map list
-- compute copresences of terms
-- compute copresences of terms, i.e. cooccurrences of terms in same context of text
-- Cooc = Map (Term, Term) Int
let myCooc1 = cooc myterms
printDebug "myCooc1" (M.size myCooc1)
......@@ -98,13 +98,14 @@ textFlow' termType contexts = do
printDebug "myCooc2" (M.size myCooc2)
-- Filtering terms with inclusion/Exclusion and Specifity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 1000 )
(InclusionSize 4000 )
let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 )
(InclusionSize 400 )
(SampleBins 10 )
(Clusters 3 )
(DefaultValue 0 )
) myCooc2
printDebug "myCooc3" $ M.size myCooc3
putStrLn $ show myCooc3
-- Cooc -> Matrix
let (ti, _) = createIndices myCooc3
......
{-|
Module : Gargantext.Viz
Description : Viz tools
Copyright : (c) CNRS, 2018
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Vizualisation of text stats
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Viz
where
......@@ -7,6 +7,11 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
2 main measures are actually implemented in order to compute the proximity of two terms.
- Conditional measure is an absolute measure which reflects interactions of 2 terms in the corpus.
- Distributional measure is a relative measure which depends on the selected list, it represents structural equivalence.
Motivation and definition of the @Conditional@ distance.
Implementation use Accelerate library :
......@@ -44,7 +49,7 @@ import qualified Gargantext.Prelude as P
-----------------------------------------------------------------------
-- Test perf.
-- | Test perf.
distriTest :: Matrix Double
distriTest = distributional $ myMat 100
-----------------------------------------------------------------------
......@@ -67,11 +72,11 @@ rank m = arrayRank $ arrayShape m
-- How to force use with SquareMatrix ?
type Dim = Int
dim :: (Matrix a) -> Dim
dim :: Matrix a -> Dim
dim m = n
where
Z :. _ :. n = arrayShape m
-- == indexTail (arrayShape m)
-- indexTail (arrayShape m)
-----------------------------------------------------------------------
proba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
......@@ -80,7 +85,7 @@ proba r mat = zipWith (/) mat (mkSum r mat)
mkSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
mkSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum mat
-- divByDiag
-- | divByDiag
divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
where
......@@ -94,11 +99,16 @@ miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
miniMax' = (the $ minimum $ maximum m)
-- | Conditional distance (basic version)
conditional :: Matrix Int -> Matrix Double
conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m)
-- | Conditional distance (advanced version)
-- The conditional measure \[P_c\] of 2 terms @i@ and @j@, also called "confidence"
-- , is the maximum probability between @i@ and @j@. If \[n_i\] (resp.
-- \[n_j\]) is the number of occurrences of @i@ (resp. @j@) in the corpus and _[n_{ij}\] the number of its occurrences we get:
-- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
where
......@@ -121,6 +131,13 @@ conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegr
-----------------------------------------------------------------------
-- | Distributional Distance
-- The distributional measure \[P_c\] of @i@ and @j@ terms is:
-- \[ S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik}, MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}}
-- \]
-- \[S{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\] is mutual information
-- \[C{ij}\] is number of cooccurrences of @i@ and @j@ in the same context of text
-- \[E_{ij} = \frac {S_{i} S_{j}} {N}\] is the expected value of the cooccurrences
-- \[N_{i} = \sum_{i}^{} S_{i}\] is the total cooccurrences of @i@ term
distributional :: Matrix Int -> Matrix Double
distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
where
......
......@@ -36,7 +36,7 @@ extra-deps:
- servant-server-0.13
- servant-swagger-ui-0.2.3.2.2.8
- stemmer-0.5.2
- text-1.2.3.0
# - text-1.2.3.0
- text-show-3.6.2
- servant-flatten-0.2
- serialise-0.2.0.0 # imt-api-client
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment