Commit 15c6b984 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Add gargantext-phylo-profile executable

It also adds a semi-interesting phylo.
parent 988f1c3f
......@@ -38,3 +38,6 @@ tmp*repo*json
data
devops/docker/js-cache
cabal.project.local
gargantext_profile_out.dot
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
{
"corpusPath" : "Gargantext_DocsList-nodeId-185487.csv",
"listPath" : "Gargantext_NgramsList-185488.csv",
"outputPath" : "data",
"corpusParser" : {
"tag" : "Csv",
"_csv_limit" : 1500000
},
"listParser" : "V3",
"phyloName" : "bpa",
"phyloScale" : 2,
"similarity" : {
"tag" : "WeightedLogJaccard",
"_wlj_sensibility" : 0.5,
"_wlj_minSharedNgrams" : 1
},
"seaElevation" : {
"tag" : "Evolving",
"_evol_neighborhood" : true
},
"defaultMode" : false,
"findAncestors" : true,
"phyloSynchrony" : {
"tag" : "ByProximityThreshold",
"_bpt_threshold" : 0.6,
"_bpt_sensibility" : 0,
"_bpt_scope" : "AllBranches",
"_bpt_strategy" : "MergeAllGroups"
},
"phyloQuality" : {
"tag" : "Quality",
"_qua_granularity" : 0.1,
"_qua_minBranch" : 2
},
"timeUnit" : {
"tag" : "Week",
"_week_period" : 4,
"_week_step" : 2,
"_week_matchingFrame" : 5
},
"clique" : {
"tag" : "Fis",
"_fis_support" : 3,
"_fis_size" : 1
},
"exportLabel" : [
{
"tag" : "BranchLabel",
"_branch_labelTagger" : "MostEmergentTfIdf",
"_branch_labelSize" : 2
},
{
"tag" : "GroupLabel",
"_group_labelTagger" : "MostEmergentInclusive",
"_group_labelSize" : 2
}
],
"exportSort" : {
"tag" : "ByHierarchy",
"_sort_order" : "Desc"
},
"exportFilter" : [
{
"tag" : "ByBranchSize",
"_branch_size" : 2
}
]
}
{-# LANGUAGE OverloadedStrings #-}
module Main where
import Common
import Data.Aeson
import Data.List (nub)
import Gargantext.Core.Viz.Phylo
import Gargantext.Core.Viz.Phylo.API.Tools
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig, toPeriods, getTimePeriod, getTimeStep)
import GHC.IO.Encoding
import GHC.Stack
import Paths_gargantext
import Prelude
import qualified Data.Text as T
import Shelly
import System.Directory
--------------
-- | Main | --
--------------
phyloConfig :: FilePath -> PhyloConfig
phyloConfig outdir = PhyloConfig {
corpusPath = "corpus.csv"
, listPath = "list.csv"
, outputPath = outdir
, corpusParser = Csv {_csv_limit = 150000}
, listParser = V4
, phyloName = "phylo_profile_test"
, phyloScale = 2
, similarity = WeightedLogJaccard {_wlj_sensibility = 0.5, _wlj_minSharedNgrams = 2}
, seaElevation = Constante {_cons_start = 0.1, _cons_gap = 0.1}
, defaultMode = True
, findAncestors = True
, phyloSynchrony = ByProximityThreshold {_bpt_threshold = 0.5, _bpt_sensibility = 0.0, _bpt_scope = AllBranches, _bpt_strategy = MergeAllGroups}
, phyloQuality = Quality {_qua_granularity = 0.8, _qua_minBranch = 3}
, timeUnit = Year {_year_period = 3, _year_step = 1, _year_matchingFrame = 5}
, clique = MaxClique {_mcl_size = 5, _mcl_threshold = 1.0e-4, _mcl_filter = ByThreshold}
, exportLabel = [ BranchLabel {_branch_labelTagger = MostEmergentTfIdf, _branch_labelSize = 2}
, GroupLabel {_group_labelTagger = MostEmergentInclusive, _group_labelSize = 2}
]
, exportSort = ByHierarchy {_sort_order = Desc}
, exportFilter = [ByBranchSize {_branch_size = 3.0}]
}
main :: HasCallStack => IO ()
main = do
shelly $ escaping False $ withTmpDir $ \tdir -> do
curDir <- pwd
let output = curDir <> "/" <> "gargantext_profile_out.dot"
chdir tdir $ do
liftIO $ setLocaleEncoding utf8
bpaConfig <- liftIO $ getDataFileName "bench-data/phylo/bpa-config.json"
corpusPath' <- liftIO $ getDataFileName "bench-data/phylo/GarganText_DocsList-nodeId-185487.csv"
listPath' <- liftIO $ getDataFileName "bench-data/phylo/GarganText_NgramsList-185488.csv"
(Right config) <- fmap (\pcfg -> pcfg { outputPath = tdir
, corpusPath = corpusPath'
, listPath = listPath'
}) <$> liftIO (eitherDecodeFileStrict' bpaConfig)
mapList <- liftIO $ fileToList (listParser config) (listPath config)
corpus <- liftIO $ if (defaultMode config)
then fileToDocsDefault (corpusParser config) (corpusPath config) [Year 3 1 5,Month 3 1 5,Week 4 2 5] mapList
else fileToDocsAdvanced (corpusParser config) (corpusPath config) (timeUnit config) mapList
liftIO $ do
printIOComment (show (length corpus) <> " parsed docs from the corpus")
printIOComment (show (length $ nub $ concat $ map text corpus) <> " Size ngs_coterms")
printIOComment (show (length mapList) <> " Size ngs_terms List Map Ngrams")
printIOMsg "Reconstruct the phylo"
-- check the existing backup files
let backupPhyloWithoutLink = (outputPath config) <> "backupPhyloWithoutLink_" <> (configToSha BackupPhyloWithoutLink config) <> ".json"
let backupPhylo = (outputPath config) <> "backupPhylo_" <> (configToSha BackupPhylo config) <> ".json"
phyloWithoutLinkExists <- doesFileExist backupPhyloWithoutLink
phyloExists <- doesFileExist backupPhylo
-- reconstruct the phylo
phylo <- if phyloExists
then do
printIOMsg "Reconstruct the phylo from an existing file"
readPhylo backupPhylo
else do
if phyloWithoutLinkExists
then do
printIOMsg "Reconstruct the phylo from an existing file without links"
phyloWithoutLink <- readPhylo backupPhyloWithoutLink
writePhylo backupPhyloWithoutLink phyloWithoutLink
pure $ toPhylo (setConfig config phyloWithoutLink)
else do
printIOMsg "Reconstruct the phylo from scratch"
phyloWithoutLink <- pure $ toPhyloWithoutLink corpus config
writePhylo backupPhyloWithoutLink phyloWithoutLink
pure $ toPhylo (setConfig config phyloWithoutLink)
writePhylo backupPhylo phylo
printIOMsg "End of reconstruction, start the export"
let dot = toPhyloExport (setConfig config phylo)
dotToFile output dot
echo "Done."
......@@ -46,8 +46,7 @@ import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Prelude hiding (hash, replace)
import Prelude qualified
import System.Directory (listDirectory,doesFileExist)
data Backup = BackupPhyloWithoutLink | BackupPhylo deriving (Show)
import Common
---------------
-- | Tools | --
......@@ -145,87 +144,6 @@ fileToDocsDefault parser path timeUnits lst =
-- dans le phyloMaker si default est true alors dans le setDefault ou pense à utiliser la TimeUnit de la conf
---------------
-- | Label | --
---------------
-- Config time parameters to label
timeToLabel :: PhyloConfig -> [Char]
timeToLabel config = case (timeUnit config) of
Epoch p s f -> ("time_epochs" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Year p s f -> ("time_years" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Month p s f -> ("time_months" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Week p s f -> ("time_weeks" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
Day p s f -> ("time_days" <> "_" <> (show p) <> "_" <> (show s) <> "_" <> (show f))
seaToLabel :: PhyloConfig -> [Char]
seaToLabel config = case (seaElevation config) of
Constante start step -> ("sea_cst_" <> (show start) <> "_" <> (show step))
Adaptative granularity -> ("sea_adapt" <> (show granularity))
Evolving _ -> ("sea_evolv")
sensToLabel :: PhyloConfig -> [Char]
sensToLabel config = case (similarity config) of
Hamming _ _ -> Prelude.error "sensToLabel: unimplemented"
WeightedLogJaccard s _ -> ("WeightedLogJaccard_" <> show s)
WeightedLogSim s _ -> ( "WeightedLogSim-sens_" <> show s)
cliqueToLabel :: PhyloConfig -> [Char]
cliqueToLabel config = case (clique config) of
Fis s s' -> "fis_" <> (show s) <> "_" <> (show s')
MaxClique s t f -> "clique_" <> (show s)<> "_" <> (show f)<> "_" <> (show t)
syncToLabel :: PhyloConfig -> [Char]
syncToLabel config = case (phyloSynchrony config) of
ByProximityThreshold scl sync_sens scope _ -> ("scale_" <> (show scope) <> "_" <> (show sync_sens) <> "_" <> (show scl))
ByProximityDistribution _ _ -> "syncToLabel: unimplemented"
qualToConfig :: PhyloConfig -> [Char]
qualToConfig config = case (phyloQuality config) of
Quality g m -> "quality_" <> (show g) <> "_" <> (show m)
-- To set up the export file's label from the configuration
configToLabel :: PhyloConfig -> [Char]
configToLabel config = outputPath config
<> (unpack $ phyloName config)
<> "-" <> (timeToLabel config)
<> "-scale_" <> (show (phyloScale config))
<> "-" <> (seaToLabel config)
<> "-" <> (sensToLabel config)
<> "-" <> (cliqueToLabel config)
<> "-level_" <> (show (_qua_granularity $ phyloQuality config))
<> "-" <> (syncToLabel config)
<> ".dot"
-- To write a sha256 from a set of config's parameters
configToSha :: Backup -> PhyloConfig -> [Char]
configToSha stage config = unpack
$ replace "/" "-"
$ T.pack (show (hash $ C8.pack label))
where
label :: [Char]
label = case stage of
BackupPhyloWithoutLink -> (corpusPath config)
<> (listPath config)
<> (timeToLabel config)
<> (cliqueToLabel config)
BackupPhylo -> (corpusPath config)
<> (listPath config)
<> (timeToLabel config)
<> (cliqueToLabel config)
<> (sensToLabel config)
<> (seaToLabel config)
<> (syncToLabel config)
<> (qualToConfig config)
<> (show (phyloScale config))
readListV4 :: [Char] -> IO NgramsList
readListV4 path = do
......
This diff is collapsed.
......@@ -17,6 +17,9 @@ license: AGPL-3
license-file: LICENSE
build-type: Simple
data-files:
bench-data/phylo/bpa-config.json
bench-data/phylo/GarganText_DocsList-nodeId-185487.csv
bench-data/phylo/GarganText_NgramsList-185488.csv
bench-data/phylo/issue-290.json
bench-data/phylo/issue-290-small.json
devops/postgres/extensions.sql
......@@ -846,8 +849,9 @@ executable gargantext-phylo
main-is: Main.hs
other-modules:
Paths_gargantext
Common
hs-source-dirs:
bin/gargantext-phylo
bin/gargantext-phylo bin/gargantext-phylo/Phylo
default-extensions:
DataKinds
DeriveGeneric
......@@ -1199,3 +1203,27 @@ benchmark garg-bench
ghc-options: "-with-rtsopts=-T -A32m"
if impl(ghc >= 8.6)
ghc-options: "-with-rtsopts=--nonmoving-gc"
executable gargantext-phylo-profile
main-is: Main.hs
other-modules:
Paths_gargantext
Common
hs-source-dirs:
bin/gargantext-phylo-profile bin/gargantext-phylo/Phylo
default-extensions: GHC2021
ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N -O2 -Wmissing-signatures
build-depends:
base
, bytestring
, gargantext
, gargantext-prelude
, shelly
, text
, async
, cryptohash
, aeson
, split
, vector
, directory
default-language: Haskell2010
......@@ -64,10 +64,11 @@ instance ToNamedRecord CsvList where
]
------------------------------------------------------------------------
instance FromField CsvListType where
parseField "map" = pure CsvMap
parseField "main" = pure CsvCandidate
parseField "stop" = pure CsvStop
parseField _ = mzero
parseField "map" = pure CsvMap
parseField "main" = pure CsvCandidate
parseField "candidate" = pure CsvCandidate -- backward compat
parseField "stop" = pure CsvStop
parseField _ = mzero
instance ToField CsvListType where
toField CsvMap = "map"
......
......@@ -195,7 +195,7 @@ toPhyloDate' y m d _ = pack $ showGregorian $ fromGregorian (toIntege
-- Utils
writePhylo :: [Char] -> Phylo -> IO ()
writePhylo :: HasCallStack => [Char] -> Phylo -> IO ()
writePhylo path phylo = Lazy.writeFile path $ encode phylo
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment