Commit 72a1cfff authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TestFlow] seems good, need to add tests on it and fix distributional distance.

parent bb989318
...@@ -50,8 +50,12 @@ library: ...@@ -50,8 +50,12 @@ library:
- Gargantext.Text.Parsers.WOS - Gargantext.Text.Parsers.WOS
- Gargantext.Text.Search - Gargantext.Text.Search
- Gargantext.Text.Terms - Gargantext.Text.Terms
- Gargantext.Text.Terms.Mono
- Gargantext.Text.Terms.Multi.Lang.En
- Gargantext.Text.Terms.Multi.Lang.Fr
- Gargantext.Text.Terms.WithList - Gargantext.Text.Terms.WithList
- Gargantext.TextFlow - Gargantext.TextFlow
- Gargantext.Viz.Graph
- Gargantext.Viz.Graph.Distances.Matrice - Gargantext.Viz.Graph.Distances.Matrice
- Gargantext.Viz.Graph.Index - Gargantext.Viz.Graph.Index
dependencies: dependencies:
......
{-|PI/Application.hs {-|
API/Count.hs
API/FrontEnd.hs
API/Node.hs
API/Auth.hs
API.hs
Database/NodeNodeNgram.hs
Database/User.hs
Database/Queries.hs
Module : Gargantext.API.Settings Module : Gargantext.API.Settings
Description : Settings of the API (Server and Client) Description : Settings of the API (Server and Client)
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
......
{-| {-|
Module : Gargantext.Database Module : Gargantext.Database
Description : Main commands of BASHQL a Domain Specific Language to deal with Gargantext Database. Description : BASHQL to deal with Gargantext Database.
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
* BASHQL is a Domain Specific Language to deal with the Database
* BASHQL = functional (Bash * SQL) * BASHQL = functional (Bash * SQL)
* Which language to chose when working with a database ? To make it * Which language to chose when working with a database ? To make it
...@@ -42,6 +44,7 @@ write Haskell bash translations. ...@@ -42,6 +44,7 @@ write Haskell bash translations.
- FileSystem is now a NodeSystem where each File is a Node in a Directed Graph (DG). - FileSystem is now a NodeSystem where each File is a Node in a Directed Graph (DG).
* References * References
[0] MIT Press has published "Category theory for the sciences". The book [0] MIT Press has published "Category theory for the sciences". The book
can also be purchased on Amazon. Here are reviews by the MAA, by the can also be purchased on Amazon. Here are reviews by the MAA, by the
AMS, and by SIAM. AMS, and by SIAM.
...@@ -64,6 +67,7 @@ module Gargantext.Database ( module Gargantext.Database.Utils ...@@ -64,6 +67,7 @@ module Gargantext.Database ( module Gargantext.Database.Utils
, del , del' , del , del'
, tree, tree' , tree, tree'
, postCorpus, postAnnuaire , postCorpus, postAnnuaire
, Connection
) )
where where
......
...@@ -59,6 +59,9 @@ import qualified Data.Profunctor.Product as PP ...@@ -59,6 +59,9 @@ import qualified Data.Profunctor.Product as PP
-- | Types for Node Database Management -- | Types for Node Database Management
data PGTSVector data PGTSVector
------------------------------------------------------------------------
type CorpusId = Int
------------------------------------------------------------------------
instance FromField HyperdataCorpus where instance FromField HyperdataCorpus where
fromField = fromField' fromField = fromField'
......
...@@ -11,9 +11,9 @@ This file is intended for these purposes: ...@@ -11,9 +11,9 @@ This file is intended for these purposes:
- documentation for teaching and research - documentation for teaching and research
- learn basics of Haskell which is a scientific programming language - learn basics of Haskell which is a scientific programming language
- behavioral tests (that should be completed with uni-tests and scale-tests - behavioral tests (that should be completed with uni-tests and scale-tests)
This documents defines basic of Text definitions according to Gargantext.. This document defines basic of Text definitions according to Gargantext..
- What is a term ? - What is a term ?
- What is a sentence ? - What is a sentence ?
...@@ -104,7 +104,18 @@ ex_occ = occurrences <$> L.concat <$> ex_terms ...@@ -104,7 +104,18 @@ ex_occ = occurrences <$> L.concat <$> ex_terms
ex_cooc :: IO (Map (Label, Label) Int) ex_cooc :: IO (Map (Label, Label) Int)
ex_cooc = cooc <$> ex_terms ex_cooc = cooc <$> ex_terms
-- | Tests -- | Tests the specificity and genericity
--
-- >>> ex_cooc_mat
-- (fromList [(["glass"],0),(["spoon"],1),(["table"],2),(["wine"],3)],Matrix (Z :. 4 :. 4)
-- [ 4, 0, 0, 0,
-- 1, 2, 0, 0,
-- 3, 2, 4, 0,
-- 3, 1, 2, 3],Matrix (Z :. 4 :. 4)
-- [ 1.0, 0.25, 0.75, 0.75,
-- 0.0, 1.0, 1.0, 0.5,
-- 0.0, 0.0, 1.0, 0.5,
-- 0.0, 0.0, 0.0, 1.0],(Vector (Z :. 4) [0.5833333333333334,0.5833333333333334,0.75,0.5833333333333334],Vector (Z :. 4) [-0.5833333333333334,-0.4166666666666667,0.41666666666666674,0.5833333333333334]))
ex_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity)) ex_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity))
ex_cooc_mat = do ex_cooc_mat = do
m <- ex_cooc m <- ex_cooc
......
...@@ -36,7 +36,7 @@ import qualified Data.Array.Accelerate as DAA ...@@ -36,7 +36,7 @@ import qualified Data.Array.Accelerate as DAA
import GHC.Real (round) import GHC.Real (round)
--import Debug.Trace import Debug.Trace (trace)
data MapListSize = MapListSize Int data MapListSize = MapListSize Int
data InclusionSize = InclusionSize Int data InclusionSize = InclusionSize Int
...@@ -51,19 +51,19 @@ data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize ...@@ -51,19 +51,19 @@ data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
, fc_defaultValue :: DefaultValue , fc_defaultValue :: DefaultValue
} }
filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int filterCooc :: (Show t, Ord t) => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc filterCooc fc cc = (filterCooc' fc) ts cc
where where
ts = map _scored_terms $ takeSome fc $ coocScored cc ts = map _scored_terms $ takeSome fc $ coocScored cc
filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int filterCooc' :: (Show t, Ord t) => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = trace ("coocScored " <> show ts) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m') foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
M.empty selection M.empty selection
where where
selection = [(x,y) | x <- ts selection = [(x,y) | x <- ts
, y <- ts , y <- ts
-- , x >= y -- , x >= y
] ]
...@@ -75,7 +75,7 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScore ...@@ -75,7 +75,7 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScore
takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t] takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
$ takeSample n m $ takeSample n m
$ L.take l' $ sortWith (Down . _scored_incExc) scores $ L.take l' $ reverse $ sortWith (Down . _scored_incExc) scores
-- splitKmeans k scores -- splitKmeans k scores
where where
-- TODO: benchmark with accelerate-example kmeans version -- TODO: benchmark with accelerate-example kmeans version
...@@ -95,10 +95,10 @@ takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Cluste ...@@ -95,10 +95,10 @@ takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Cluste
$ sortWith (Down . _scored_speGen) xs $ sortWith (Down . _scored_speGen) xs
data Scored t = Scored { _scored_terms :: !t data Scored ts = Scored { _scored_terms :: !ts
, _scored_incExc :: !InclusionExclusion , _scored_incExc :: !InclusionExclusion
, _scored_speGen :: !SpecificityGenericity , _scored_speGen :: !SpecificityGenericity
} deriving (Show) } deriving (Show)
-- TODO in the textflow we end up needing these indices, it might be better -- TODO in the textflow we end up needing these indices, it might be better
-- to compute them earlier and pass them around. -- to compute them earlier and pass them around.
......
...@@ -17,7 +17,6 @@ module Gargantext.Text.Terms.Mono.Token.En ...@@ -17,7 +17,6 @@ module Gargantext.Text.Terms.Mono.Token.En
( EitherList(..) ( EitherList(..)
, Tokenizer , Tokenizer
, tokenize , tokenize
, run
, defaultTokenizer , defaultTokenizer
, whitespace , whitespace
, uris , uris
...@@ -64,8 +63,6 @@ import Gargantext.Prelude ...@@ -64,8 +63,6 @@ import Gargantext.Prelude
-- ,"Hyphen-words" -- ,"Hyphen-words"
-- ,"Yes/No questions" -- ,"Yes/No questions"
-- ] -- ]
---
type Tokenizer = Text -> EitherList Text Text type Tokenizer = Text -> EitherList Text Text
-- | The EitherList is a newtype-wrapped list of Eithers. -- | The EitherList is a newtype-wrapped list of Eithers.
......
...@@ -21,15 +21,22 @@ import GHC.IO (FilePath) ...@@ -21,15 +21,22 @@ import GHC.IO (FilePath)
import qualified Data.Text as T import qualified Data.Text as T
import Data.Text.IO (readFile) import Data.Text.IO (readFile)
import Data.Maybe (catMaybes)
import qualified Data.Set as DS
import qualified Data.Array.Accelerate as A import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M import qualified Data.Map.Strict as M
---------------------------------------------- ----------------------------------------------
import Gargantext.Database (Connection)
import Gargantext.Database.Node
import Gargantext.Core.Types.Node
import Gargantext.Core (Lang) import Gargantext.Core (Lang)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map) import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (distributional) import Gargantext.Viz.Graph.Distances.Matrice (distributional, measureConditional)
import Gargantext.Viz.Graph (Graph(..), data2graph) import Gargantext.Viz.Graph (Graph(..), data2graph)
import Gargantext.Text.Metrics.Count (cooc) import Gargantext.Text.Metrics.Count (cooc)
import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..)) import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
...@@ -38,7 +45,7 @@ import Gargantext.Text.Context (splitBy, SplitContext(Sentences)) ...@@ -38,7 +45,7 @@ import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Gargantext.Text.Parsers.CSV import Gargantext.Text.Parsers.CSV
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain) import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
{- {-
____ _ _ ____ _ _
...@@ -50,11 +57,27 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain) ...@@ -50,11 +57,27 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
-} -}
contextText :: [T.Text]
contextText = ["The dog is an animal."
,"The bird is an animal."
,"The bird is an animal."
,"The bird and the dog are an animal."
,"The table is an object."
,"The pen is an object."
,"This object is a pen or a table?"
,"The girl has a human body."
,"The girl has a human body."
,"The boy has a human body."
,"The boy has a human body."
]
data TextFlow = CSV FilePath data TextFlow = CSV FilePath
| FullText FilePath | FullText FilePath
| Contexts [T.Text] | Contexts [T.Text]
| SQL Int | DB Connection CorpusId
| Database T.Text | Query T.Text
-- ExtDatabase Query -- ExtDatabase Query
-- IntDatabase NodeId -- IntDatabase NodeId
...@@ -64,6 +87,7 @@ textFlow termType workType = do ...@@ -64,6 +87,7 @@ textFlow termType workType = do
FullText path -> splitBy (Sentences 5) <$> readFile path FullText path -> splitBy (Sentences 5) <$> readFile path
CSV path -> readCsvOn [csv_title, csv_abstract] path CSV path -> readCsvOn [csv_title, csv_abstract] path
Contexts ctxt -> pure ctxt Contexts ctxt -> pure ctxt
SQL con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (node_hyperdata n) <> hyperdataDocumentV3_abstract (node_hyperdata n))<$> getDocumentsV3WithParentId con corpusId
_ -> undefined _ -> undefined
textFlow' termType contexts textFlow' termType contexts
...@@ -78,53 +102,61 @@ textFlow' termType contexts = do ...@@ -78,53 +102,61 @@ textFlow' termType contexts = do
-- TermsType = Mono | Multi | MonoMulti -- TermsType = Mono | Multi | MonoMulti
-- myterms # filter (\t -> not . elem t stopList) -- myterms # filter (\t -> not . elem t stopList)
-- # groupBy (Stem|GroupList|Ontology) -- # groupBy (Stem|GroupList|Ontology)
printDebug "terms" myterms
printDebug "myterms" (sum $ map length myterms) printDebug "myterms" (sum $ map length myterms)
-- Bulding the map list -- Bulding the map list
-- compute copresences of terms, i.e. cooccurrences of terms in same context of text -- compute copresences of terms, i.e. cooccurrences of terms in same context of text
-- Cooc = Map (Term, Term) Int -- Cooc = Map (Term, Term) Int
let myCooc1 = cooc myterms let myCooc1 = cooc myterms
printDebug "myCooc1" (M.size myCooc1) printDebug "myCooc1 size" (M.size myCooc1)
-- Remove Apax: appears one time only => lighting the matrix -- Remove Apax: appears one time only => lighting the matrix
let myCooc2 = M.filter (>1) myCooc1 let myCooc2 = M.filter (>0) myCooc1
printDebug "myCooc2" (M.size myCooc2) printDebug "myCooc2 size" (M.size myCooc2)
printDebug "myCooc2" myCooc2
-- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores -- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 ) let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
(InclusionSize 900 ) (InclusionSize 500 )
(SampleBins 10 ) (SampleBins 10 )
(Clusters 3 ) (Clusters 3 )
(DefaultValue 0 ) (DefaultValue 0 )
) myCooc2 ) myCooc2
printDebug "myCooc3" $ M.size myCooc3 printDebug "myCooc3 size" $ M.size myCooc3
-- putStrLn $ show myCooc3 printDebug "myCooc3" myCooc3
-- Cooc -> Matrix -- Cooc -> Matrix
let (ti, _) = createIndices myCooc3 let (ti, _) = createIndices myCooc3
printDebug "ti" $ M.size ti printDebug "ti size" $ M.size ti
printDebug "ti" ti
let myCooc4 = toIndex ti myCooc3 let myCooc4 = toIndex ti myCooc3
printDebug "myCooc4" $ M.size myCooc4 printDebug "myCooc4 size" $ M.size myCooc4
printDebug "myCooc4" myCooc4
let matCooc = map2mat (0) (M.size ti) myCooc4 let matCooc = map2mat (0) (M.size ti) myCooc4
printDebug "matCooc shape" $ A.arrayShape matCooc
printDebug "matCooc" matCooc printDebug "matCooc" matCooc
-- Matrix -> Clustering -- Matrix -> Clustering
--let distanceMat = conditional' matCooc let distanceMat = measureConditional matCooc
let distanceMat = distributional matCooc --let distanceMat = distributional matCooc
printDebug "distanceMat" $ A.arrayShape distanceMat printDebug "distanceMat shape" $ A.arrayShape distanceMat
printDebug "distanceMat" distanceMat printDebug "distanceMat" distanceMat
-- --
let distanceMap = mat2map distanceMat --let distanceMap = M.filter (>0) $ mat2map distanceMat
printDebug "distanceMap" $ M.size distanceMap let distanceMap = M.map (\n -> 1) $ M.filter (>0) $ mat2map distanceMat
printDebug "distanceMap size" $ M.size distanceMap
printDebug "distanceMap" distanceMap
-- let distance = fromIndex fi distanceMap -- let distance = fromIndex fi distanceMap
-- printDebug "distance" $ M.size distance -- printDebug "distance" $ M.size distance
partitions <- cLouvain distanceMap partitions <- cLouvain distanceMap
-- Building : -> Graph -> JSON -- Building : -> Graph -> JSON
printDebug "partitions" $ length partitions printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
--printDebug "partitions" partitions --printDebug "partitions" partitions
pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment