Commit 19c7c2db authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'origin/adinapoli/issue-355' into dev

parents 5589b7ae 7c1bc974
......@@ -12,7 +12,7 @@ compress the contexts around the main terms of the query.
-}
module CleanCsvCorpus where
module CLI.CleanCsvCorpus where
import Data.SearchEngine qualified as S
import Data.Set qualified as S
......
module CLI.FilterTermsAndCooc (
filterTermsAndCoocCmd
, filterTermsAndCoocCLI
-- * Testing functions
, testCorpus
, testTermList
) where
import CLI.Types
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Aeson ( encode )
import Data.Map.Strict qualified as DM
import Data.Text (pack)
import Data.Text qualified as DT
import Data.Text.Lazy qualified as DTL
import Data.Text.Lazy.Encoding qualified as TLE
import Data.Tuple.Extra (both)
import Data.Vector qualified as DV
import GHC.Generics
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, tsv_title, tsv_abstract, tsv_publication_year, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList )
import Gargantext.Prelude
import Options.Applicative
------------------------------------------------------------------------
-- OUTPUT format
data CoocByYear = CoocByYear { year :: Int
, nbContexts :: NbContexts
, coocurrences :: Map (Text, Text) Coocs
} deriving (Show, Generic)
data CoocByYears = CoocByYears { years :: [CoocByYear] }
deriving (Show, Generic)
type NbContexts = Int
instance ToJSON CoocByYear
instance ToJSON CoocByYears
------------------------------------------------------------------------
filterTermsAndCoocCLI :: CorpusFile -> TermListFile -> OutputFile -> IO ()
filterTermsAndCoocCLI (CorpusFile corpusFile) (TermListFile termListFile) (OutputFile outputFile) = do
--corpus :: IO (DM.IntMap [[Text]])
eCorpusFile <- readTSVFile corpusFile
case eCorpusFile of
Right cf -> do
let corpus = DM.fromListWith (<>)
. DV.toList
. DV.map (\n -> (fromMIntOrDec defaultYear $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)]))
. snd $ cf
-- termListMap :: [Text]
termList <- tsvMapTermList termListFile
putText $ show $ length termList
let patterns = buildPatterns termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
writeFile outputFile $ DTL.toStrict $ TLE.decodeUtf8 $ encode (CoocByYears r)
Left e -> panicTrace $ "Error: " <> e
filterTermsAndCooc
:: Patterns
-> (Int, [Text])
-> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc patterns (year, ts) = do
logWork "start"
r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
logWork "stop"
pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
where
logWork m = do
tid <- myThreadId
(p, _) <- threadCapability tid
putText . unwords $
["filterTermsAndCooc:", m, show year, "on proc", show p]
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms' :: Applicative f => Patterns -> Text -> f [[Text]]
terms' pats txt = pure $ concat $ extractTermsWithList pats txt
-- | TODO Minimal Example
--testCooc = do
-- let patterns = buildPatterns testTermList
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus :: [(Int, [Text])]
testCorpus = [ (1998, [pack "The beees"])
, (1999, [ pack "The bees and the flowers"
--, pack "The bees and the flowers"
])
]
testTermList :: TermList
testTermList = [ ([pack "bee"], [[pack "bees"]])
, ([pack "flower"], [[pack "flowers"]])
]
--
-- CLI API
--
filterTermsAndCoocCmd :: HasCallStack => Mod CommandFields CLI
filterTermsAndCoocCmd = command "filter-terms" (info (helper <*> fmap CLISub filterTerms) (progDesc "Filter Terms and Cooc."))
filterTerms :: Parser CLICmd
filterTerms = CCMD_filter_terms_and_cooc
<$> (option str (long "corpus-file" <> metavar "FILE"))
<*> (option str (long "terms-list-file" <> metavar "FILE"))
<*> (option str (long "output-file" <> metavar "FILE"))
......@@ -28,8 +28,10 @@ https://stackoverflow.com/questions/876522/creating-a-copy-of-a-database-in-post
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE Strict #-}
module Main where
module CLI.ObfuscateDB (
obfuscateDB
, obfuscateDBCmd
) where
import Data.Text qualified as T
import Database.PostgreSQL.Simple qualified as PSQL
......@@ -37,23 +39,16 @@ import Database.PostgreSQL.Simple.SqlQQ (sql)
import Gargantext.Core (toDBid)
import Gargantext.Database.Admin.Config ()
import Gargantext.Database.Admin.Types.Node (NodeType(..))
import Gargantext.Prelude hiding (option)
import Gargantext.Prelude
import Gargantext.Prelude.Database (runPGSExecute, runPGSQuery)
import Options.Applicative.Simple
import CLI.Types
import Options.Applicative
data Args = Args {
dbHost :: Text
, dbPort :: Int
, dbName :: Text
, dbUser :: Text
, dbPassword :: Text
} deriving (Show, Eq)
obfuscateDBCmd :: HasCallStack => Mod CommandFields CLI
obfuscateDBCmd = command "obfuscate-db" (info (helper <*> fmap CLISub obfuscateDB_p) (progDesc "Obfuscate a cloned Gargantext DB."))
args :: Parser Args
args = Args
obfuscateDB_p :: Parser CLICmd
obfuscateDB_p = fmap CCMD_obfuscate_db $ ObfuscateDBArgs
<$> ( strOption ( long "db-host"
<> metavar "db-host"
<> help "Location of the DB server"
......@@ -71,17 +66,9 @@ args = Args
<*> ( strOption ( long "db-password"
<> metavar "db-password"
<> value "" ))
main :: IO ()
main = do
(opts, ()) <-
simpleOptions "0.0.1"
"gargantext DB obfuscation"
"Obfuscates a cloned Gargantext DB"
args
empty
obfuscateDB :: ObfuscateDBArgs -> IO ()
obfuscateDB opts = do
putText $ show opts
let ci = PSQL.ConnectInfo { connectHost = T.unpack $ dbHost opts
......@@ -101,7 +88,7 @@ main = do
obfuscateNotes :: PSQL.Connection -> IO ()
obfuscateNotes c = do
let nt = toDBid Notes
_ <- runPGSExecute c [sql|UPDATE nodes SET name = concat('notes-', id) WHERE typename = ?;|] (PSQL.Only nt)
nsNew <- runPGSQuery c [sql|SELECT id, name FROM nodes WHERE typename = ?|] (PSQL.Only nt) :: IO [(Int, Text)]
......
module CLI.Types where
import Prelude
import Data.String
import Data.Text (Text)
newtype CorpusFile = CorpusFile { _CorpusFile :: FilePath }
deriving (Show, Eq, IsString)
newtype TermListFile = TermListFile { _TermsListFile :: FilePath }
deriving (Show, Eq, IsString)
newtype OutputFile = OutputFile { _OutputFile :: FilePath }
deriving (Show, Eq, IsString)
data ObfuscateDBArgs = ObfuscateDBArgs {
dbHost :: !Text
, dbPort :: !Int
, dbName :: !Text
, dbUser :: !Text
, dbPassword :: !Text
} deriving (Show, Eq)
data CLICmd
= CCMD_clean_csv_corpus
| CCMD_filter_terms_and_cooc !CorpusFile !TermListFile !OutputFile
| CCMD_obfuscate_db !ObfuscateDBArgs
deriving (Show, Eq)
data CLI =
CLISub CLICmd
deriving (Show, Eq)
module CLI.Utils (
mapMP
, mapConcurrentlyChunked
) where
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.List.Split (chunksOf)
import Gargantext.Prelude
import System.IO (hFlush)
------------------------------------------------------------------------
-- | Tools
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hPutStr stderr ("\rDone\n" :: Text)
pure bs
where
g c x = do
liftIO $ hPutStr stderr ['\r',c]
liftIO $ hFlush stderr
f x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
mapConcurrentlyChunked f ts = do
caps <- getNumCapabilities
let n = 1 `max` (length ts `div` caps)
concat <$> mapConcurrently (mapM f) (chunksOf n ts)
......@@ -12,129 +12,37 @@ Main specifications to index a corpus with a term list
-}
{-# LANGUAGE Strict #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE TypeApplications #-}
module Main where
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Aeson ( ToJSON, encode )
import Data.List.Split (chunksOf)
import Data.Map.Strict qualified as DM
import Data.Text (pack)
import Data.Text qualified as DT
import Data.Text.Lazy qualified as DTL
import Data.Text.Lazy.Encoding qualified as TLE
import Data.Tuple.Extra (both)
import Data.Vector qualified as DV
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, tsv_title, tsv_abstract, tsv_publication_year, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList )
import Gargantext.Prelude
import System.IO (hFlush)
import Prelude
------------------------------------------------------------------------
-- OUTPUT format
import CLI.FilterTermsAndCooc
import CLI.ObfuscateDB (obfuscateDB, obfuscateDBCmd)
import CLI.Types
import Options.Applicative
data CoocByYear = CoocByYear { year :: Int
, nbContexts :: NbContexts
, coocurrences :: Map (Text, Text) Coocs
} deriving (Show, Generic)
data CoocByYears = CoocByYears { years :: [CoocByYear] }
deriving (Show, Generic)
type NbContexts = Int
instance ToJSON CoocByYear
instance ToJSON CoocByYears
------------------------------------------------------------------------
filterTermsAndCooc
:: Patterns
-> (Int, [Text])
-> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc patterns (year, ts) = do
logWork "start"
r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
logWork "stop"
pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
where
logWork m = do
tid <- myThreadId
(p, _) <- threadCapability tid
putText . unwords $
["filterTermsAndCooc:", m, show year, "on proc", show p]
runCLI :: CLI -> IO ()
runCLI = \case
CLISub CCMD_clean_csv_corpus
-> putStrLn "TODO."
CLISub (CCMD_filter_terms_and_cooc corpusFile termListFile outputFile)
-> filterTermsAndCoocCLI corpusFile termListFile outputFile
CLISub (CCMD_obfuscate_db args)
-> obfuscateDB args
main :: IO ()
main = do
[corpusFile, termListFile, outputFile] <- getArgs
--corpus :: IO (DM.IntMap [[Text]])
eCorpusFile <- readTSVFile corpusFile
case eCorpusFile of
Right cf -> do
let corpus = DM.fromListWith (<>)
. DV.toList
. DV.map (\n -> (fromMIntOrDec defaultYear $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)]))
. snd $ cf
-- termListMap :: [Text]
termList <- tsvMapTermList termListFile
putText $ show $ length termList
let patterns = buildPatterns termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
writeFile outputFile $ DTL.toStrict $ TLE.decodeUtf8 $ encode (CoocByYears r)
Left e -> panicTrace $ "Error: " <> e
------------------------------------------------------------------------
-- | Tools
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hPutStr stderr ("\rDone\n" :: Text)
pure bs
main = runCLI =<< execParser opts
where
g c x = do
liftIO $ hPutStr stderr ['\r',c]
liftIO $ hFlush stderr
f x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
mapConcurrentlyChunked f ts = do
caps <- getNumCapabilities
let n = 1 `max` (length ts `div` caps)
concat <$> mapConcurrently (mapM f) (chunksOf n ts)
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms' :: Applicative f => Patterns -> Text -> f [[Text]]
terms' pats txt = pure $ concat $ extractTermsWithList pats txt
-- | TODO Minimal Example
--testCooc = do
-- let patterns = buildPatterns testTermList
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus :: [(Int, [Text])]
testCorpus = [ (1998, [pack "The beees"])
, (1999, [ pack "The bees and the flowers"
--, pack "The bees and the flowers"
])
]
testTermList :: TermList
testTermList = [ ([pack "bee"], [[pack "bees"]])
, ([pack "flower"], [[pack "flowers"]])
]
opts = info (helper <*> allOptions)
( fullDesc
<> progDesc "CLI for the gargantext-server"
<> header "gargantext-cli tools" )
allOptions :: Parser CLI
allOptions = subparser (
filterTermsAndCoocCmd <>
obfuscateDBCmd
)
......@@ -81,7 +81,7 @@ common optimized
-rtsopts
-with-rtsopts=-N
-Wmissing-signatures
-- When enabled, it swaps the hashing algorithm
-- with a quicker (and less secure) version, which
-- runs faster in tests.
......@@ -89,10 +89,6 @@ flag test-crypto
default: False
manual: True
flag disable-db-obfuscation-executable
default: False
manual: True
-- When enabled, it suppresses at compile time the
-- debug output for the phylo code, so that it doesn't
-- hinder its performance.
......@@ -710,7 +706,11 @@ executable gargantext-cli
, optimized
main-is: Main.hs
other-modules:
CleanCsvCorpus
CLI.CleanCsvCorpus
CLI.FilterTermsAndCooc
CLI.ObfuscateDB
CLI.Types
CLI.Utils
Paths_gargantext
hs-source-dirs:
bin/gargantext-cli
......@@ -720,38 +720,21 @@ executable gargantext-cli
, bytestring ^>= 0.10.12.0
, cassava ^>= 0.5.2.0
, containers ^>= 0.6.5.1
, extra
, extra ^>= 1.7.9
, full-text-search ^>= 0.2.1.4
, gargantext
, gargantext-prelude
, ini ^>= 0.4.1
, optparse-applicative
, optparse-generic ^>= 1.4.7
, postgresql-simple ^>= 0.6.4
, protolude ^>= 0.3.3
, split ^>= 0.2.3.4
, text ^>= 1.2.4.1
, unordered-containers ^>= 0.2.16.0
, vector ^>= 0.12.3.0
executable gargantext-db-obfuscation
import:
defaults
, optimized
main-is: Main.hs
other-modules:
Paths_gargantext
hs-source-dirs:
bin/gargantext-db-obfuscation
if flag(disable-db-obfuscation-executable)
buildable: False
else
build-depends:
extra
, gargantext
, gargantext-prelude
, optparse-simple
, postgresql-simple ^>= 0.6.4
, text
executable gargantext-import
import:
defaults
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment