[BREAKING phylo] refactor to use uniform TimeUnitCriteria

NOTE: Golden tests don't pass

This is a proposal

Should go together with

https://gitlab.iscpif.fr/gargantext/purescript-gargantext/tree/dev-phylo-document-dates-with-hh-mm-ss
parent f38fcc6a
Pipeline #7778 failed with stages
in 18 minutes and 48 seconds
......@@ -478,7 +478,7 @@ Sometimes, however, we genuinely want to modify the output so that it's the new
golden reference). To do so, it's enough to run the testsuite passing the `--accept` flag, for example:
```shell
cabal v2-test garg-test-tasty --test-show-details=streaming --flags 'test-crypto no-phylo-debug-logs' --test-option=--pattern='/Phylo/' --test-option=--accept"
cabal v2-test --test-show-details=streaming --flags 'test-crypto no-phylo-debug-logs' --test-option=--match='/Phylo/' --test-option=--accept
```
# Async workers <a name="async-workers"></a>
......
......@@ -45,7 +45,18 @@ phyloCLI (PhyloArgs configPath) = do
mapList <- fileToList (listParser config) (listPath config)
corpus <- if (defaultMode config)
then fileToDocsDefault (corpusParser config) (corpusPath config) [Year 3 1 5,Month 3 1 5,Week 4 2 5] mapList
then
fileToDocsDefault (corpusParser config)
(corpusPath config)
[Year $ TimeUnitCriteria { _tuc_step = 3
, _tuc_period = 1
, _tuc_matchingFrame = 5 }
, Month $ TimeUnitCriteria { _tuc_step = 3
, _tuc_period = 1
, _tuc_matchingFrame = 5 }
, Week $ TimeUnitCriteria { _tuc_step = 4
, _tuc_period = 2
, _tuc_matchingFrame = 5 } ] mapList
else fileToDocsAdvanced (corpusParser config) (corpusPath config) (timeUnit config) mapList
printIOComment (show (length corpus) <> " parsed docs from the corpus")
......
......@@ -157,11 +157,14 @@ fileToList parser path =
-- Config time parameters to label
timeToLabel :: PhyloConfig -> [Char]
timeToLabel config = case (timeUnit config) of
Epoch p s f -> ("time_epochs" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Month p s f -> ("time_months" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Epoch tuc -> "time_epochs" <> toLabel tuc
Year tuc -> "time_years" <> toLabel tuc
Month tuc -> "time_months" <> toLabel tuc
Week tuc -> "time_weeks" <> toLabel tuc
Day tuc -> "time_days" <> toLabel tuc
where
toLabel (TimeUnitCriteria { .. }) =
show _tuc_period <> "_" <> show _tuc_step <> "_" <> show _tuc_matchingFrame
seaToLabel :: PhyloConfig -> [Char]
......
......@@ -36,7 +36,9 @@ phyloConfig outdir = PhyloConfig {
, findAncestors = False
, phyloSynchrony = ByProximityThreshold {_bpt_threshold = 0.5, _bpt_sensibility = 0.0, _bpt_scope = AllBranches, _bpt_strategy = MergeAllGroups}
, phyloQuality = Quality {_qua_granularity = 0.8, _qua_minBranch = 3}
, timeUnit = Year {_year_period = 3, _year_step = 1, _year_matchingFrame = 5}
, timeUnit = Year $ TimeUnitCriteria { _tuc_period = 3
, _tuc_step = 1
, _tuc_matchingFrame = 5 }
, clique = MaxClique {_mcl_size = 5, _mcl_threshold = 1.0e-4, _mcl_filter = ByThreshold}
, exportLabel = [ BranchLabel {_branch_labelTagger = MostEmergentTfIdf, _branch_labelSize = 2}
, GroupLabel {_group_labelTagger = MostEmergentInclusive, _group_labelSize = 2}
......@@ -65,9 +67,22 @@ phyloProfileCLI = do
mapList <- liftIO $ fileToList (listParser config) (listPath config)
corpus <- liftIO $ if (defaultMode config)
then fileToDocsDefault (corpusParser config) (corpusPath config) [Year 3 1 5,Month 3 1 5,Week 4 2 5] mapList
else fileToDocsAdvanced (corpusParser config) (corpusPath config) (timeUnit config) mapList
corpus <- liftIO $
if (defaultMode config)
then
fileToDocsDefault (corpusParser config)
(corpusPath config)
[ Year $ TimeUnitCriteria { _tuc_period = 3
, _tuc_step = 1
, _tuc_matchingFrame = 5 }
, Month $ TimeUnitCriteria { _tuc_period = 3
, _tuc_step = 1
, _tuc_matchingFrame = 5 }
, Week $ TimeUnitCriteria { _tuc_period = 4
, _tuc_step = 2
, _tuc_matchingFrame = 5 } ] mapList
else
fileToDocsAdvanced (corpusParser config) (corpusPath config) (timeUnit config) mapList
liftIO $ do
printIOComment (show (length corpus) <> " parsed docs from the corpus")
......
......@@ -844,6 +844,7 @@ test-suite garg-test
Test.API.UpdateList
Test.API.UpdateList
Test.API.Worker
Test.Core.Aeson
Test.Core.LinearAlgebra
Test.Core.Notifications
Test.Core.Orchestrator
......
......@@ -21,19 +21,22 @@ in science evolution — the rise and fall of scientific fields. PloS
one 8, e54847.
-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeApplications #-}
module Gargantext.Core.Viz.Phylo where
import Control.Lens (over)
import Control.Monad.Fail (fail)
import Data.Aeson (object, withObject, (.=), (.:))
import Data.Aeson.Types qualified as JS
import Data.List.NonEmpty qualified as NE
import Data.Swagger (ToSchema(..), genericDeclareNamedSchema)
import Data.Text (pack)
import Data.Text (pack, unpack)
import Data.Text.Lazy qualified as TextLazy
import Data.TreeDiff (ToExpr (..))
import Data.Vector (Vector)
......@@ -109,33 +112,33 @@ data Synchrony =
| ByProximityDistribution
{ _bpd_sensibility :: Double
, _bpd_strategy :: SynchronyStrategy }
deriving (Show,Generic,Eq,ToExpr)
deriving (Show, Generic, Eq, ToExpr)
instance ToSchema Synchrony where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_")
data TimeUnitCriteria =
TimeUnitCriteria { _tuc_period :: Int
, _tuc_step :: Int
, _tuc_matchingFrame :: Int }
deriving (Show, Generic, Eq, NFData, ToExpr)
instance ToSchema TimeUnitCriteria where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_tuc")
defaultTimeUnitCriteria :: TimeUnitCriteria
defaultTimeUnitCriteria = TimeUnitCriteria { _tuc_period = 3
, _tuc_step = 1
, _tuc_matchingFrame = 5 }
data TimeUnit =
Epoch
{ _epoch_period :: Int
, _epoch_step :: Int
, _epoch_matchingFrame :: Int }
| Year
{ _year_period :: Int
, _year_step :: Int
, _year_matchingFrame :: Int }
| Month
{ _month_period :: Int
, _month_step :: Int
, _month_matchingFrame :: Int }
| Week
{ _week_period :: Int
, _week_step :: Int
, _week_matchingFrame :: Int }
| Day
{ _day_period :: Int
, _day_step :: Int
, _day_matchingFrame :: Int }
deriving (Show,Generic,Eq,NFData,ToExpr)
Epoch TimeUnitCriteria
| Year TimeUnitCriteria
| Month TimeUnitCriteria
| Week TimeUnitCriteria
| Day TimeUnitCriteria
deriving (Show, Generic, Eq, NFData, ToExpr)
instance ToSchema TimeUnit where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
......@@ -146,6 +149,13 @@ data MaxCliqueFilter = ByThreshold | ByNeighbours deriving (Show,Generic,Eq,ToEx
instance ToSchema MaxCliqueFilter where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "")
timeUnitTUC :: TimeUnit -> TimeUnitCriteria
timeUnitTUC (Epoch tuc) = tuc
timeUnitTUC (Year tuc) = tuc
timeUnitTUC (Month tuc) = tuc
timeUnitTUC (Week tuc) = tuc
timeUnitTUC (Day tuc) = tuc
data Cluster =
......@@ -237,7 +247,7 @@ defaultConfig =
, findAncestors = False
, phyloSynchrony = ByProximityThreshold 0.6 0 AllBranches MergeAllGroups
, phyloQuality = Quality 0.5 3
, timeUnit = Year 3 1 5
, timeUnit = Year defaultTimeUnitCriteria
, clique = Fis 2 3
, exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2]
, exportSort = ByHierarchy Desc
......@@ -266,8 +276,51 @@ instance ToJSON PhyloSimilarity
instance FromJSON SeaElevation
instance ToJSON SeaElevation
instance FromJSON TimeUnit
instance ToJSON TimeUnit
instance FromJSON TimeUnitCriteria where
parseJSON = withObject "TimeUnitCriteria" $ \o -> do
_tuc_period <- o .: "period"
_tuc_step <- o .: "step"
_tuc_matchingFrame <- o .: "matchingFrame"
pure $ TimeUnitCriteria { .. }
instance ToJSON TimeUnitCriteria where
toJSON (TimeUnitCriteria { .. }) =
object [ ("period" .= toJSON _tuc_period)
, ("step" .= toJSON _tuc_step)
, ("matchingFrame" .= toJSON _tuc_matchingFrame) ]
instance FromJSON TimeUnit where
parseJSON = withObject "TimeUnit" $ \o -> do
tag :: Text <- o .: "tag"
tuc :: TimeUnitCriteria <- o .: "time_unit_criteria"
case tag of
"Epoch" -> do
pure $ Epoch tuc
"Year" -> do
pure $ Year tuc
"Month" -> do
pure $ Month tuc
"Week" -> do
pure $ Week tuc
"Day" -> do
pure $ Day tuc
s -> fail $ "Unknown tag " <> unpack s
instance ToJSON TimeUnit where
toJSON (Epoch tuc) =
object [ ("tag" .= toJSON ("Epoch" :: Text))
, ("time_unit_criteria") .= toJSON tuc ]
toJSON (Year tuc) =
object [ ("tag" .= toJSON ("Year" :: Text))
, ("time_unit_criteria") .= toJSON tuc ]
toJSON (Month tuc) =
object [ ("tag" .= toJSON ("Month" :: Text))
, ("time_unit_criteria") .= toJSON tuc ]
toJSON (Week tuc) =
object [ ("tag" .= toJSON ("Week" :: Text))
, ("time_unit_criteria") .= toJSON tuc ]
toJSON (Day tuc) =
object [ ("tag" .= toJSON ("Day" :: Text))
, ("time_unit_criteria") .= toJSON tuc ]
instance FromJSON MaxCliqueFilter
instance ToJSON MaxCliqueFilter
......
......@@ -33,7 +33,7 @@ import Gargantext.Core.NodeStory.Types (HasNodeStory, NodeStoryEnv, HasNodeStory
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, termsInText)
import Gargantext.Core.Types.Main (ListType(MapTerm))
import Gargantext.Core.Viz.Phylo (TimeUnit(..), Date, Document(..), PhyloConfig(..), Phylo (_phylo_computeTime), trackComputeTime, ComputeTimeHistory)
import Gargantext.Core.Viz.Phylo (TimeUnit(..), Date, Document(..), PhyloConfig(..), Phylo (_phylo_computeTime), trackComputeTime, ComputeTimeHistory, defaultTimeUnitCriteria)
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
import Gargantext.Core.Viz.Phylo.PhyloTools ({-printIOMsg, printIOComment,-} setConfig)
......@@ -181,7 +181,7 @@ context2phyloDocument timeUnit context (ngs_terms, ngs_sources) = do
text' = maybe [] toText $ Map.lookup (nodeId2ContextId contextId) ngs_terms
sources' = maybe [] toText $ Map.lookup (nodeId2ContextId contextId) ngs_sources
pure $ Document date date' text' Nothing sources' (Year 3 1 5)
pure $ Document date date' text' Nothing sources' (Year defaultTimeUnitCriteria)
-- TODO better default date and log the errors to improve data quality
......
......@@ -110,7 +110,7 @@ docs = map (\(d,t)
(filter (\n -> isRoots n (foundations ^. foundations_roots)) $ monoTexts t)
Nothing
[]
(Year 3 1 5)
(Year defaultTimeUnitCriteria)
) corpus
......
......@@ -189,28 +189,13 @@ toTimeScale dates step =
getTimeStep :: TimeUnit -> Int
getTimeStep time = case time of
Epoch { .. } -> _epoch_step
Year { .. } -> _year_step
Month { .. } -> _month_step
Week { .. } -> _week_step
Day { .. } -> _day_step
getTimeStep time = _tuc_step $ timeUnitTUC time
getTimePeriod :: TimeUnit -> Int
getTimePeriod time = case time of
Epoch { .. } -> _epoch_period
Year { .. } -> _year_period
Month { .. } -> _month_period
Week { .. } -> _week_period
Day { .. } -> _day_period
getTimePeriod time = _tuc_period $ timeUnitTUC time
getTimeFrame :: TimeUnit -> Int
getTimeFrame time = case time of
Epoch { .. } -> _epoch_matchingFrame
Year { .. } -> _year_matchingFrame
Month { .. } -> _month_matchingFrame
Week { .. } -> _week_matchingFrame
Day { .. } -> _day_matchingFrame
getTimeFrame time = _tuc_matchingFrame $ timeUnitTUC time
-------------
-- | Fis | --
......
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE StandaloneDeriving #-}
{-|
Module : Gargantext.Core.Worker.Types
Description : Some useful worker types
......
......@@ -15,6 +15,7 @@ import Test.Database.Operations qualified as DB
import Test.Database.Transactions qualified as DBT
import Test.Hspec
import Test.Server.ReverseProxy qualified as ReverseProxy
import Test.Core.Aeson qualified as AesonTest
import Test.Core.LinearAlgebra qualified as LinearAlgebra
import Test.Core.Notifications qualified as Notifications
import Test.Core.Orchestrator qualified as Orchestrator
......@@ -93,6 +94,7 @@ main = do
DBT.tests
DB.nodeStoryTests
describe "Utils" $ Utils.test
describe "Aeson" $ AesonTest.tests
describe "Graph Clustering" $ Clustering.test
describe "Graph Distance" $ Distance.test
describe "Date split" $ PD.testDateSplit
......
{-|
Module : Graph.Clustering
Module : Test.Core.Text
Description : Basic tests to avoid quick regression
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......
......@@ -11,19 +11,15 @@ Portability : POSIX
module Test.Core.Worker where
import Data.Aeson qualified as Aeson
import Gargantext.Core.Worker.Jobs.Types (Job(..))
import Gargantext.Prelude
import Test.Instances ()
import Test.Hspec
import Test.Hspec.QuickCheck
tests :: Spec
tests = describe "worker unit tests" $
prop "Worker Job to/from JSON serialization is correct" $
\job -> Aeson.decode (Aeson.encode (job :: Job)) == Just job
tests =
describe "worker unit tests" $ pure ()
-- , testProperty "JobInfo to/from JSON serialization is correct" $
-- \ji -> Aeson.decode (Aeson.encode (ji :: JobInfo)) == Just ji
......@@ -273,6 +273,7 @@ instance Arbitrary Phylo.Synchrony where arbitrary = genericArbitrary
instance Arbitrary Phylo.SynchronyScope where arbitrary = genericArbitrary
instance Arbitrary Phylo.SynchronyStrategy where arbitrary = genericArbitrary
instance Arbitrary Phylo.Tagger where arbitrary = genericArbitrary
instance Arbitrary Phylo.TimeUnitCriteria where arbitrary = genericArbitrary
instance Arbitrary Phylo.TimeUnit where arbitrary = genericArbitrary
instance Arbitrary PhyloData where arbitrary = genericArbitrary
......
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE TypeApplications #-}
module Test.Offline.Phylo (tests) where
......@@ -45,7 +45,7 @@ phyloTestConfig = PhyloConfig {
, findAncestors = False
, phyloSynchrony = ByProximityThreshold {_bpt_threshold = 0.5, _bpt_sensibility = 0.0, _bpt_scope = AllBranches, _bpt_strategy = MergeAllGroups}
, phyloQuality = Quality {_qua_granularity = 0.8, _qua_minBranch = 3}
, timeUnit = Year {_year_period = 3, _year_step = 1, _year_matchingFrame = 5}
, timeUnit = Year defaultTimeUnitCriteria
, clique = MaxClique {_mcl_size = 5, _mcl_threshold = 1.0e-4, _mcl_filter = ByThreshold}
, exportLabel = [ BranchLabel {_branch_labelTagger = MostEmergentTfIdf, _branch_labelSize = 2}
, GroupLabel {_group_labelTagger = MostEmergentInclusive, _group_labelSize = 2}
......@@ -136,7 +136,15 @@ testNadalWithoutLinkExpectedOutput = do
mapList <- fileToList (listParser config) (listPath config)
corpus <- fileToDocsDefault (corpusParser config)
(corpusPath config)
[Year 3 1 5,Month 3 1 5,Week 4 2 5]
[ Year $ TimeUnitCriteria { _tuc_period = 3
, _tuc_step = 1
, _tuc_matchingFrame = 5 }
, Month $ TimeUnitCriteria { _tuc_period = 3
, _tuc_step = 1
, _tuc_matchingFrame = 5 }
, Week $ TimeUnitCriteria { _tuc_period = 4
, _tuc_step = 2
, _tuc_matchingFrame = 5 } ]
mapList
pure ( "test-data/phylo/nadal.golden.json"
, JSON.encodePretty $ setConfig phyloTestConfig $ toPhyloWithoutLink corpus config
......@@ -153,7 +161,15 @@ testSmallPhyloWithoutLinkExpectedOutput = do
mapList <- fileToList (listParser config) (listPath config)
corpus <- fileToDocsDefault (corpusParser config)
(corpusPath config)
[Year 3 1 5,Month 3 1 5,Week 4 2 5]
[ Year $ TimeUnitCriteria { _tuc_period = 3
, _tuc_step = 1
, _tuc_matchingFrame = 5 }
, Month $ TimeUnitCriteria { _tuc_period = 3
, _tuc_step = 1
, _tuc_matchingFrame = 5 }
, Week $ TimeUnitCriteria { _tuc_period = 4
, _tuc_step = 2
, _tuc_matchingFrame = 5 } ]
mapList
let actual = setConfig phyloTestConfig $ toPhyloWithoutLink corpus config
expected <- setConfig phyloTestConfig <$> (readPhylo =<< getDataFileName "test-data/phylo/small-phylo.golden.json")
......@@ -286,7 +302,15 @@ testToPhyloDeterminism = do
mapList <- tsvMapTermList (listPath config)
corpus <- fileToDocsDefault (corpusParser config)
(corpusPath config)
[Year 3 1 5,Month 3 1 5,Week 4 2 5]
[ Year $ TimeUnitCriteria { _tuc_period = 3
, _tuc_step = 1
, _tuc_matchingFrame = 5 }
, Month $ TimeUnitCriteria { _tuc_period = 3
, _tuc_step = 1
, _tuc_matchingFrame = 5 }
, Week $ TimeUnitCriteria { _tuc_period = 4
, _tuc_step = 2
, _tuc_matchingFrame = 5 } ]
mapList
let actual = setConfig phyloTestConfig $ toPhylo $ toPhyloWithoutLink corpus config
pure ( "test-data/phylo/187481.json"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment