Commit 1905b024 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Port phylo to CLI executable

parent 409c8423
{-|
Module : Phylo.hs
Description : Gargantext starter binary with Adaptative Phylo
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adaptative Phylo binaries
-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Strict #-}
module CLI.Phylo where
import CLI.Phylo.Common
import CLI.Types
import Data.Aeson (eitherDecodeFileStrict')
import Data.List (nub)
import Data.Text qualified as T
import GHC.IO.Encoding
import Gargantext.Core.Viz.Phylo
import Gargantext.Core.Viz.Phylo.API.Tools
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig)
import Gargantext.Prelude hiding (hash, replace)
import Options.Applicative
import System.Directory (doesFileExist)
phyloCLI :: PhyloArgs -> IO ()
phyloCLI (PhyloArgs configPath) = do
setLocaleEncoding utf8
config_e <- eitherDecodeFileStrict' configPath
case config_e of
Left err -> panicTrace $ T.pack err
Right config -> do
currentLocale <- getLocaleEncoding
printIOMsg $ "Machine locale: " <> show currentLocale
printIOMsg "Starting the reconstruction"
printIOMsg "Parse the corpus"
mapList <- fileToList (listParser config) (listPath config)
corpus <- if (defaultMode config)
then fileToDocsDefault (corpusParser config) (corpusPath config) [Year 3 1 5,Month 3 1 5,Week 4 2 5] mapList
else fileToDocsAdvanced (corpusParser config) (corpusPath config) (timeUnit config) mapList
printIOComment (show (length corpus) <> " parsed docs from the corpus")
printIOComment (show (length $ nub $ concat $ map text corpus) <> " Size ngs_coterms")
printIOComment (show (length mapList) <> " Size ngs_terms List Map Ngrams")
printIOMsg "Reconstruct the phylo"
-- check the existing backup files
let backupPhyloWithoutLink = (outputPath config) <> "backupPhyloWithoutLink_" <> (configToSha BackupPhyloWithoutLink config) <> ".json"
let backupPhylo = (outputPath config) <> "backupPhylo_" <> (configToSha BackupPhylo config) <> ".json"
phyloWithoutLinkExists <- doesFileExist backupPhyloWithoutLink
phyloExists <- doesFileExist backupPhylo
-- reconstruct the phylo
phylo <- if phyloExists
then do
printIOMsg "Reconstruct the phylo from an existing file"
readPhylo backupPhylo
else do
if phyloWithoutLinkExists
then do
printIOMsg "Reconstruct the phylo from an existing file without links"
phyloWithoutLink <- readPhylo backupPhyloWithoutLink
writePhylo backupPhyloWithoutLink phyloWithoutLink
pure $ toPhylo (setConfig config phyloWithoutLink)
else do
printIOMsg "Reconstruct the phylo from scratch"
phyloWithoutLink <- pure $ toPhyloWithoutLink corpus config
writePhylo backupPhyloWithoutLink phyloWithoutLink
pure $ toPhylo (setConfig config phyloWithoutLink)
writePhylo backupPhylo phylo
printIOMsg "End of reconstruction, start the export"
let dot = toPhyloExport (setConfig config phylo)
let output = configToLabel config
dotToFile output dot
phyloCmd :: HasCallStack => Mod CommandFields CLI
phyloCmd = command "phylo" (info (helper <*> fmap CLISub phylo_p) (progDesc "Phylo toolkit."))
phylo_p :: Parser CLICmd
phylo_p = fmap CCMD_phylo $ PhyloArgs
<$> ( strOption ( long "config"
<> metavar "FILEPATH"
<> help "Path to a file containing a JSON to be parsed into a PhyloConfig"
) )
{-# LANGUAGE OverloadedStrings #-}
module Common where
module CLI.Phylo.Common where
import Control.Concurrent.Async (mapConcurrently)
import Crypto.Hash.SHA256 (hash)
......@@ -99,6 +99,8 @@ tsvToDocs parser patterns time path =
(map (T.strip . pack) $ splitOn ";" (unpack $ (tsv'_source row)))
time
) <$> snd <$> Tsv.readWeightedTsv path
Csv _ -> panicTrace "CSV is currently not supported."
Csv' _ -> panicTrace "CSV is currently not supported."
-- To parse a file into a list of Document
......@@ -109,6 +111,8 @@ fileToDocsAdvanced parser path time lst = do
Wos limit -> wosToDocs limit patterns time path
Tsv _ -> tsvToDocs parser patterns time path
Tsv' _ -> tsvToDocs parser patterns time path
Csv _ -> panicTrace "CSV is currently not supported."
Csv' _ -> panicTrace "CSV is currently not supported."
fileToDocsDefault :: CorpusParser -> FilePath -> [TimeUnit] -> TermList -> IO [Document]
fileToDocsDefault parser path timeUnits lst =
......
......@@ -55,6 +55,10 @@ data InvitationsArgs = InvitationsArgs
, inv_email :: !Text
} deriving (Show, Eq)
data PhyloArgs = PhyloArgs
{ phylo_config :: !FilePath
} deriving (Show, Eq)
data CLICmd
= CCMD_clean_csv_corpus
| CCMD_filter_terms_and_cooc !CorpusFile !TermListFile !OutputFile
......@@ -63,6 +67,7 @@ data CLICmd
| CCMD_import !ImportArgs
| CCMD_init !InitArgs
| CCMD_invitations !InvitationsArgs
| CCMD_phylo !PhyloArgs
deriving (Show, Eq)
data CLI =
......
......@@ -27,6 +27,7 @@ import CLI.Admin (adminCLI, adminCmd)
import CLI.Import (importCLI, importCmd)
import CLI.Init (initCLI, initCmd)
import CLI.Invitations (invitationsCLI, invitationsCmd)
import CLI.Phylo (phyloCLI, phyloCmd)
runCLI :: CLI -> IO ()
runCLI = \case
......@@ -44,6 +45,8 @@ runCLI = \case
-> initCLI args
CLISub (CCMD_invitations args)
-> invitationsCLI args
CLISub (CCMD_phylo args)
-> phyloCLI args
main :: IO ()
main = runCLI =<< execParser opts
......@@ -60,5 +63,6 @@ allOptions = subparser (
adminCmd <>
importCmd <>
initCmd <>
invitationsCmd
invitationsCmd <>
phyloCmd
)
{-# LANGUAGE OverloadedStrings #-}
module Main where
import Common
import CLI.Phylo.Common
import Data.Aeson
import Data.List (nub)
import Gargantext.Core.Viz.Phylo
import Gargantext.Core.Viz.Phylo.API.Tools
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig, toPeriods, getTimePeriod, getTimeStep)
import Gargantext.Core.Viz.Phylo.PhyloTools
import GHC.IO.Encoding
import GHC.Stack
import Paths_gargantext
import Prelude
import qualified Data.Text as T
import Shelly
import System.Directory
......
{-|
Module : Main.hs
Description : Gargantext starter binary with Adaptative Phylo
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Adaptative Phylo binaries
-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Strict #-}
module Main where
import Control.Concurrent.Async (mapConcurrently)
import Crypto.Hash.SHA256 (hash)
import Data.Aeson
import Data.ByteString.Char8 qualified as C8
import Data.List (nub, isSuffixOf, tail)
import Data.List.Split
import Data.Maybe (fromJust)
import Data.Text (unpack, replace, pack)
import Data.Text qualified as T
import Data.Vector qualified as Vector
import GHC.IO.Encoding
import Gargantext.API.Ngrams.Prelude (toTermList)
import Gargantext.API.Ngrams.Types
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile)
import Gargantext.Core.Text.Corpus.Parsers.TSV (tsv_title, tsv_abstract, tsv_publication_year, tsv_publication_month, tsv_publication_day, tsv'_source, tsv'_title, tsv'_abstract, tsv'_publication_year, tsv'_publication_month, tsv'_publication_day, tsv'_weight)
import Gargantext.Core.Text.Corpus.Parsers.TSV qualified as Tsv
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Core.Viz.Phylo
import Gargantext.Core.Viz.Phylo.API.Tools
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
import Gargantext.Core.Viz.Phylo.PhyloMaker (toPhylo, toPhyloWithoutLink)
import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setConfig, toPeriods, getTimePeriod, getTimeStep)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude hiding (hash, replace)
import Prelude qualified
import System.Directory (listDirectory,doesFileExist)
import Common
main :: IO ()
main = do
setLocaleEncoding utf8
currentLocale <- getLocaleEncoding
printIOMsg $ "Machine locale: " <> show currentLocale
printIOMsg "Starting the reconstruction"
printIOMsg "Read the configuration file"
[args] <- getArgs
jsonArgs <- (eitherDecode <$> readJson args) :: IO (Either Prelude.String PhyloConfig)
case jsonArgs of
Left err -> putStrLn err
Right config -> do
printIOMsg "Parse the corpus"
mapList <- fileToList (listParser config) (listPath config)
corpus <- if (defaultMode config)
then fileToDocsDefault (corpusParser config) (corpusPath config) [Year 3 1 5,Month 3 1 5,Week 4 2 5] mapList
else fileToDocsAdvanced (corpusParser config) (corpusPath config) (timeUnit config) mapList
printIOComment (show (length corpus) <> " parsed docs from the corpus")
printIOComment (show (length $ nub $ concat $ map text corpus) <> " Size ngs_coterms")
printIOComment (show (length mapList) <> " Size ngs_terms List Map Ngrams")
printIOMsg "Reconstruct the phylo"
-- check the existing backup files
let backupPhyloWithoutLink = (outputPath config) <> "backupPhyloWithoutLink_" <> (configToSha BackupPhyloWithoutLink config) <> ".json"
let backupPhylo = (outputPath config) <> "backupPhylo_" <> (configToSha BackupPhylo config) <> ".json"
phyloWithoutLinkExists <- doesFileExist backupPhyloWithoutLink
phyloExists <- doesFileExist backupPhylo
-- reconstruct the phylo
phylo <- if phyloExists
then do
printIOMsg "Reconstruct the phylo from an existing file"
readPhylo backupPhylo
else do
if phyloWithoutLinkExists
then do
printIOMsg "Reconstruct the phylo from an existing file without links"
phyloWithoutLink <- readPhylo backupPhyloWithoutLink
writePhylo backupPhyloWithoutLink phyloWithoutLink
pure $ toPhylo (setConfig config phyloWithoutLink)
else do
printIOMsg "Reconstruct the phylo from scratch"
phyloWithoutLink <- pure $ toPhyloWithoutLink corpus config
writePhylo backupPhyloWithoutLink phyloWithoutLink
pure $ toPhylo (setConfig config phyloWithoutLink)
writePhylo backupPhylo phylo
printIOMsg "End of reconstruction, start the export"
let dot = toPhyloExport (setConfig config phylo)
let output = configToLabel config
dotToFile output dot
......@@ -705,6 +705,8 @@ executable gargantext-cli
CLI.Init
CLI.Invitations
CLI.ObfuscateDB
CLI.Phylo
CLI.Phylo.Common
CLI.Types
CLI.Utils
Paths_gargantext
......@@ -716,6 +718,8 @@ executable gargantext-cli
, bytestring ^>= 0.10.12.0
, cassava ^>= 0.5.2.0
, containers ^>= 0.6.5.1
, cryptohash ^>= 0.11.9
, directory ^>= 1.3.6.0
, extra ^>= 1.7.9
, full-text-search ^>= 0.2.1.4
, gargantext
......@@ -723,55 +727,11 @@ executable gargantext-cli
, ini ^>= 0.4.1
, optparse-applicative
, optparse-generic ^>= 1.4.7
, parallel ^>= 3.2.2.0
, postgresql-simple ^>= 0.6.4
, protolude ^>= 0.3.3
, split ^>= 0.2.3.4
, text ^>= 1.2.4.1
, unordered-containers ^>= 0.2.16.0
, vector ^>= 0.12.3.0
executable gargantext-phylo
import:
defaults
, optimized
main-is: Main.hs
other-modules:
Paths_gargantext
Common
hs-source-dirs:
bin/gargantext-phylo bin/gargantext-phylo/Phylo
default-extensions:
DataKinds
DeriveGeneric
FlexibleContexts
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
ImportQualifiedPost
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
OverloadedStrings
RankNTypes
RecordWildCards
StrictData
ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N -O2 -Wmissing-signatures
build-depends:
aeson ^>= 1.5.6.0
, async ^>= 2.2.4
, bytestring ^>= 0.10.12.0
, cassava ^>= 0.5.2.0
, containers ^>= 0.6.5.1
, cryptohash ^>= 0.11.9
, directory ^>= 1.3.6.0
, extra ^>= 1.7.9
, gargantext
, gargantext-prelude
, ini ^>= 0.4.1
, optparse-generic ^>= 1.4.7
, parallel ^>= 3.2.2.0
, split ^>= 0.2.3.4
, text ^>= 1.2.4.1
, time ^>= 1.9.3
, unordered-containers ^>= 0.2.16.0
, vector ^>= 0.7.3
......@@ -823,7 +783,7 @@ test-suite garg-test-tasty
main-is: drivers/tasty/Main.hs
other-modules:
Test.API.Routes
Common
CLI.Phylo.Common
Test.API.Setup
Test.Core.Similarity
Test.Core.Text
......@@ -859,7 +819,7 @@ test-suite garg-test-tasty
Test.Utils.Jobs
Paths_gargantext
hs-source-dirs:
test bin/gargantext-phylo/Phylo
test bin/gargantext-cli
ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N
build-depends:
QuickCheck ^>= 2.14.2
......@@ -1042,9 +1002,9 @@ executable gargantext-phylo-profile
main-is: Main.hs
other-modules:
Paths_gargantext
Common
CLI.Phylo.Common
hs-source-dirs:
bin/gargantext-phylo-profile bin/gargantext-phylo/Phylo
bin/gargantext-phylo-profile bin/gargantext-cli
ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N -O2 -Wmissing-signatures
build-depends:
base
......
......@@ -5,7 +5,7 @@
module Test.Offline.Phylo (tests) where
import Common
import CLI.Phylo.Common
import Data.Aeson as JSON
import Data.Aeson.Types qualified as JSON
import Data.GraphViz.Attributes.Complete qualified as Graphviz
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment