Commit 72a1cfff authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TestFlow] seems good, need to add tests on it and fix distributional distance.

parent bb989318
......@@ -50,8 +50,12 @@ library:
- Gargantext.Text.Parsers.WOS
- Gargantext.Text.Search
- Gargantext.Text.Terms
- Gargantext.Text.Terms.Mono
- Gargantext.Text.Terms.Multi.Lang.En
- Gargantext.Text.Terms.Multi.Lang.Fr
- Gargantext.Text.Terms.WithList
- Gargantext.TextFlow
- Gargantext.Viz.Graph
- Gargantext.Viz.Graph.Distances.Matrice
- Gargantext.Viz.Graph.Index
dependencies:
......
{-|PI/Application.hs
API/Count.hs
API/FrontEnd.hs
API/Node.hs
API/Auth.hs
API.hs
Database/NodeNodeNgram.hs
Database/User.hs
Database/Queries.hs
{-|
Module : Gargantext.API.Settings
Description : Settings of the API (Server and Client)
Copyright : (c) CNRS, 2017-Present
......
{-|
Module : Gargantext.Database
Description : Main commands of BASHQL a Domain Specific Language to deal with Gargantext Database.
Description : BASHQL to deal with Gargantext Database.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
* BASHQL is a Domain Specific Language to deal with the Database
* BASHQL = functional (Bash * SQL)
* Which language to chose when working with a database ? To make it
......@@ -42,6 +44,7 @@ write Haskell bash translations.
- FileSystem is now a NodeSystem where each File is a Node in a Directed Graph (DG).
* References
[0] MIT Press has published "Category theory for the sciences". The book
can also be purchased on Amazon. Here are reviews by the MAA, by the
AMS, and by SIAM.
......@@ -64,6 +67,7 @@ module Gargantext.Database ( module Gargantext.Database.Utils
, del , del'
, tree, tree'
, postCorpus, postAnnuaire
, Connection
)
where
......
......@@ -59,6 +59,9 @@ import qualified Data.Profunctor.Product as PP
-- | Types for Node Database Management
data PGTSVector
------------------------------------------------------------------------
type CorpusId = Int
------------------------------------------------------------------------
instance FromField HyperdataCorpus where
fromField = fromField'
......
......@@ -11,9 +11,9 @@ This file is intended for these purposes:
- documentation for teaching and research
- learn basics of Haskell which is a scientific programming language
- behavioral tests (that should be completed with uni-tests and scale-tests
- behavioral tests (that should be completed with uni-tests and scale-tests)
This documents defines basic of Text definitions according to Gargantext..
This document defines basic of Text definitions according to Gargantext..
- What is a term ?
- What is a sentence ?
......@@ -104,7 +104,18 @@ ex_occ = occurrences <$> L.concat <$> ex_terms
ex_cooc :: IO (Map (Label, Label) Int)
ex_cooc = cooc <$> ex_terms
-- | Tests
-- | Tests the specificity and genericity
--
-- >>> ex_cooc_mat
-- (fromList [(["glass"],0),(["spoon"],1),(["table"],2),(["wine"],3)],Matrix (Z :. 4 :. 4)
-- [ 4, 0, 0, 0,
-- 1, 2, 0, 0,
-- 3, 2, 4, 0,
-- 3, 1, 2, 3],Matrix (Z :. 4 :. 4)
-- [ 1.0, 0.25, 0.75, 0.75,
-- 0.0, 1.0, 1.0, 0.5,
-- 0.0, 0.0, 1.0, 0.5,
-- 0.0, 0.0, 0.0, 1.0],(Vector (Z :. 4) [0.5833333333333334,0.5833333333333334,0.75,0.5833333333333334],Vector (Z :. 4) [-0.5833333333333334,-0.4166666666666667,0.41666666666666674,0.5833333333333334]))
ex_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity))
ex_cooc_mat = do
m <- ex_cooc
......
......@@ -36,7 +36,7 @@ import qualified Data.Array.Accelerate as DAA
import GHC.Real (round)
--import Debug.Trace
import Debug.Trace (trace)
data MapListSize = MapListSize Int
data InclusionSize = InclusionSize Int
......@@ -51,19 +51,19 @@ data FilterConfig = FilterConfig { fc_mapListSize :: MapListSize
, fc_defaultValue :: DefaultValue
}
filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc :: (Show t, Ord t) => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc
where
ts = map _scored_terms $ takeSome fc $ coocScored cc
filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
filterCooc' :: (Show t, Ord t) => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = trace ("coocScored " <> show ts) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
M.empty selection
where
selection = [(x,y) | x <- ts
, y <- ts
-- , x >= y
-- , x >= y
]
......@@ -75,7 +75,7 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScore
takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
$ takeSample n m
$ L.take l' $ sortWith (Down . _scored_incExc) scores
$ L.take l' $ reverse $ sortWith (Down . _scored_incExc) scores
-- splitKmeans k scores
where
-- TODO: benchmark with accelerate-example kmeans version
......@@ -95,10 +95,10 @@ takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Cluste
$ sortWith (Down . _scored_speGen) xs
data Scored t = Scored { _scored_terms :: !t
, _scored_incExc :: !InclusionExclusion
, _scored_speGen :: !SpecificityGenericity
} deriving (Show)
data Scored ts = Scored { _scored_terms :: !ts
, _scored_incExc :: !InclusionExclusion
, _scored_speGen :: !SpecificityGenericity
} deriving (Show)
-- TODO in the textflow we end up needing these indices, it might be better
-- to compute them earlier and pass them around.
......
......@@ -17,7 +17,6 @@ module Gargantext.Text.Terms.Mono.Token.En
( EitherList(..)
, Tokenizer
, tokenize
, run
, defaultTokenizer
, whitespace
, uris
......@@ -64,8 +63,6 @@ import Gargantext.Prelude
-- ,"Hyphen-words"
-- ,"Yes/No questions"
-- ]
---
type Tokenizer = Text -> EitherList Text Text
-- | The EitherList is a newtype-wrapped list of Eithers.
......
......@@ -21,15 +21,22 @@ import GHC.IO (FilePath)
import qualified Data.Text as T
import Data.Text.IO (readFile)
import Data.Maybe (catMaybes)
import qualified Data.Set as DS
import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M
----------------------------------------------
import Gargantext.Database (Connection)
import Gargantext.Database.Node
import Gargantext.Core.Types.Node
import Gargantext.Core (Lang)
import Gargantext.Prelude
import Gargantext.Viz.Graph.Index (createIndices, toIndex, map2mat, mat2map)
import Gargantext.Viz.Graph.Distances.Matrice (distributional)
import Gargantext.Viz.Graph.Distances.Matrice (distributional, measureConditional)
import Gargantext.Viz.Graph (Graph(..), data2graph)
import Gargantext.Text.Metrics.Count (cooc)
import Gargantext.Text.Metrics (filterCooc, FilterConfig(..), Clusters(..), SampleBins(..), DefaultValue(..), MapListSize(..), InclusionSize(..))
......@@ -38,7 +45,7 @@ import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
import Gargantext.Text.Parsers.CSV
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
{-
____ _ _
......@@ -50,11 +57,27 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain)
-}
contextText :: [T.Text]
contextText = ["The dog is an animal."
,"The bird is an animal."
,"The bird is an animal."
,"The bird and the dog are an animal."
,"The table is an object."
,"The pen is an object."
,"This object is a pen or a table?"
,"The girl has a human body."
,"The girl has a human body."
,"The boy has a human body."
,"The boy has a human body."
]
data TextFlow = CSV FilePath
| FullText FilePath
| Contexts [T.Text]
| SQL Int
| Database T.Text
| DB Connection CorpusId
| Query T.Text
-- ExtDatabase Query
-- IntDatabase NodeId
......@@ -64,6 +87,7 @@ textFlow termType workType = do
FullText path -> splitBy (Sentences 5) <$> readFile path
CSV path -> readCsvOn [csv_title, csv_abstract] path
Contexts ctxt -> pure ctxt
SQL con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (node_hyperdata n) <> hyperdataDocumentV3_abstract (node_hyperdata n))<$> getDocumentsV3WithParentId con corpusId
_ -> undefined
textFlow' termType contexts
......@@ -78,53 +102,61 @@ textFlow' termType contexts = do
-- TermsType = Mono | Multi | MonoMulti
-- myterms # filter (\t -> not . elem t stopList)
-- # groupBy (Stem|GroupList|Ontology)
printDebug "terms" myterms
printDebug "myterms" (sum $ map length myterms)
-- Bulding the map list
-- compute copresences of terms, i.e. cooccurrences of terms in same context of text
-- Cooc = Map (Term, Term) Int
let myCooc1 = cooc myterms
printDebug "myCooc1" (M.size myCooc1)
printDebug "myCooc1 size" (M.size myCooc1)
-- Remove Apax: appears one time only => lighting the matrix
let myCooc2 = M.filter (>1) myCooc1
printDebug "myCooc2" (M.size myCooc2)
let myCooc2 = M.filter (>0) myCooc1
printDebug "myCooc2 size" (M.size myCooc2)
printDebug "myCooc2" myCooc2
-- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 100 )
(InclusionSize 900 )
let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
(InclusionSize 500 )
(SampleBins 10 )
(Clusters 3 )
(DefaultValue 0 )
) myCooc2
printDebug "myCooc3" $ M.size myCooc3
-- putStrLn $ show myCooc3
printDebug "myCooc3 size" $ M.size myCooc3
printDebug "myCooc3" myCooc3
-- Cooc -> Matrix
let (ti, _) = createIndices myCooc3
printDebug "ti" $ M.size ti
printDebug "ti size" $ M.size ti
printDebug "ti" ti
let myCooc4 = toIndex ti myCooc3
printDebug "myCooc4" $ M.size myCooc4
printDebug "myCooc4 size" $ M.size myCooc4
printDebug "myCooc4" myCooc4
let matCooc = map2mat (0) (M.size ti) myCooc4
printDebug "matCooc shape" $ A.arrayShape matCooc
printDebug "matCooc" matCooc
-- Matrix -> Clustering
--let distanceMat = conditional' matCooc
let distanceMat = distributional matCooc
printDebug "distanceMat" $ A.arrayShape distanceMat
let distanceMat = measureConditional matCooc
--let distanceMat = distributional matCooc
printDebug "distanceMat shape" $ A.arrayShape distanceMat
printDebug "distanceMat" distanceMat
--
let distanceMap = mat2map distanceMat
printDebug "distanceMap" $ M.size distanceMap
--let distanceMap = M.filter (>0) $ mat2map distanceMat
let distanceMap = M.map (\n -> 1) $ M.filter (>0) $ mat2map distanceMat
printDebug "distanceMap size" $ M.size distanceMap
printDebug "distanceMap" distanceMap
-- let distance = fromIndex fi distanceMap
-- printDebug "distance" $ M.size distance
partitions <- cLouvain distanceMap
-- Building : -> Graph -> JSON
printDebug "partitions" $ length partitions
printDebug "partitions" $ DS.size $ DS.fromList $ map (l_community_id) partitions
--printDebug "partitions" partitions
pure $ data2graph (M.toList ti) myCooc4 distanceMap partitions
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment