Commit f789b2e4 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

CLI: Add the filter-terms command

parent 0087a0f8
......@@ -12,7 +12,7 @@ compress the contexts around the main terms of the query.
-}
module CleanCsvCorpus where
module CLI.CleanCsvCorpus where
import Data.SearchEngine qualified as S
import Data.Set qualified as S
......
module CLI.FilterTermsAndCooc (
filterTermsAndCoocCmd
, filterTermsAndCoocCLI
-- * Testing functions
, testCorpus
, testTermList
) where
import CLI.Types
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Aeson ( encode )
import Data.Map.Strict qualified as DM
import Data.Text (pack)
import Data.Text qualified as DT
import Data.Text.Lazy qualified as DTL
import Data.Text.Lazy.Encoding qualified as TLE
import Data.Tuple.Extra (both)
import Data.Vector qualified as DV
import GHC.Generics
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, tsv_title, tsv_abstract, tsv_publication_year, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList )
import Gargantext.Prelude
import Options.Applicative
------------------------------------------------------------------------
-- OUTPUT format
data CoocByYear = CoocByYear { year :: Int
, nbContexts :: NbContexts
, coocurrences :: Map (Text, Text) Coocs
} deriving (Show, Generic)
data CoocByYears = CoocByYears { years :: [CoocByYear] }
deriving (Show, Generic)
type NbContexts = Int
instance ToJSON CoocByYear
instance ToJSON CoocByYears
------------------------------------------------------------------------
filterTermsAndCoocCLI :: CorpusFile -> TermListFile -> OutputFile -> IO ()
filterTermsAndCoocCLI (CorpusFile corpusFile) (TermListFile termListFile) (OutputFile outputFile) = do
--corpus :: IO (DM.IntMap [[Text]])
eCorpusFile <- readTSVFile corpusFile
case eCorpusFile of
Right cf -> do
let corpus = DM.fromListWith (<>)
. DV.toList
. DV.map (\n -> (fromMIntOrDec defaultYear $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)]))
. snd $ cf
-- termListMap :: [Text]
termList <- tsvMapTermList termListFile
putText $ show $ length termList
let patterns = buildPatterns termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
writeFile outputFile $ DTL.toStrict $ TLE.decodeUtf8 $ encode (CoocByYears r)
Left e -> panicTrace $ "Error: " <> e
filterTermsAndCooc
:: Patterns
-> (Int, [Text])
-> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc patterns (year, ts) = do
logWork "start"
r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
logWork "stop"
pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
where
logWork m = do
tid <- myThreadId
(p, _) <- threadCapability tid
putText . unwords $
["filterTermsAndCooc:", m, show year, "on proc", show p]
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms' :: Applicative f => Patterns -> Text -> f [[Text]]
terms' pats txt = pure $ concat $ extractTermsWithList pats txt
-- | TODO Minimal Example
--testCooc = do
-- let patterns = buildPatterns testTermList
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus :: [(Int, [Text])]
testCorpus = [ (1998, [pack "The beees"])
, (1999, [ pack "The bees and the flowers"
--, pack "The bees and the flowers"
])
]
testTermList :: TermList
testTermList = [ ([pack "bee"], [[pack "bees"]])
, ([pack "flower"], [[pack "flowers"]])
]
--
-- CLI API
--
filterTermsAndCoocCmd :: HasCallStack => Mod CommandFields CLI
filterTermsAndCoocCmd = command "filter-terms" (info (helper <*> fmap CLISub filterTerms) (progDesc "Filter Terms and Cooc."))
filterTerms :: Parser CLICmd
filterTerms = CCMD_filter_terms_and_cooc
<$> (option str (long "corpus-file" <> metavar "FILE"))
<*> (option str (long "terms-list-file" <> metavar "FILE"))
<*> (option str (long "output-file" <> metavar "FILE"))
module CLI.Types where
import Prelude
import Data.String
newtype CorpusFile = CorpusFile { _CorpusFile :: FilePath }
deriving (Show, Eq, IsString)
newtype TermListFile = TermListFile { _TermsListFile :: FilePath }
deriving (Show, Eq, IsString)
newtype OutputFile = OutputFile { _OutputFile :: FilePath }
deriving (Show, Eq, IsString)
data CLICmd
= CCMD_clean_csv_corpus
| CCMD_filter_terms_and_cooc !CorpusFile !TermListFile !OutputFile
deriving (Show, Eq)
data CLI =
CLISub CLICmd
deriving (Show, Eq)
module CLI.Utils (
mapMP
, mapConcurrentlyChunked
) where
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.List.Split (chunksOf)
import Gargantext.Prelude
import System.IO (hFlush)
------------------------------------------------------------------------
-- | Tools
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hPutStr stderr ("\rDone\n" :: Text)
pure bs
where
g c x = do
liftIO $ hPutStr stderr ['\r',c]
liftIO $ hFlush stderr
f x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
mapConcurrentlyChunked f ts = do
caps <- getNumCapabilities
let n = 1 `max` (length ts `div` caps)
concat <$> mapConcurrently (mapM f) (chunksOf n ts)
......@@ -12,129 +12,33 @@ Main specifications to index a corpus with a term list
-}
{-# LANGUAGE Strict #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE TypeApplications #-}
module Main where
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Aeson ( ToJSON, encode )
import Data.List.Split (chunksOf)
import Data.Map.Strict qualified as DM
import Data.Text (pack)
import Data.Text qualified as DT
import Data.Text.Lazy qualified as DTL
import Data.Text.Lazy.Encoding qualified as TLE
import Data.Tuple.Extra (both)
import Data.Vector qualified as DV
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, tsv_title, tsv_abstract, tsv_publication_year, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList )
import Gargantext.Prelude
import System.IO (hFlush)
import Prelude
------------------------------------------------------------------------
-- OUTPUT format
import CLI.Types
import CLI.FilterTermsAndCooc
import Options.Applicative
data CoocByYear = CoocByYear { year :: Int
, nbContexts :: NbContexts
, coocurrences :: Map (Text, Text) Coocs
} deriving (Show, Generic)
data CoocByYears = CoocByYears { years :: [CoocByYear] }
deriving (Show, Generic)
type NbContexts = Int
instance ToJSON CoocByYear
instance ToJSON CoocByYears
------------------------------------------------------------------------
filterTermsAndCooc
:: Patterns
-> (Int, [Text])
-> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc patterns (year, ts) = do
logWork "start"
r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
logWork "stop"
pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
where
logWork m = do
tid <- myThreadId
(p, _) <- threadCapability tid
putText . unwords $
["filterTermsAndCooc:", m, show year, "on proc", show p]
runCLI :: CLI -> IO ()
runCLI = \case
CLISub CCMD_clean_csv_corpus
-> putStrLn "TODO."
CLISub (CCMD_filter_terms_and_cooc corpusFile termListFile outputFile)
-> filterTermsAndCoocCLI corpusFile termListFile outputFile
main :: IO ()
main = do
[corpusFile, termListFile, outputFile] <- getArgs
--corpus :: IO (DM.IntMap [[Text]])
eCorpusFile <- readTSVFile corpusFile
case eCorpusFile of
Right cf -> do
let corpus = DM.fromListWith (<>)
. DV.toList
. DV.map (\n -> (fromMIntOrDec defaultYear $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)]))
. snd $ cf
-- termListMap :: [Text]
termList <- tsvMapTermList termListFile
putText $ show $ length termList
let patterns = buildPatterns termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
writeFile outputFile $ DTL.toStrict $ TLE.decodeUtf8 $ encode (CoocByYears r)
Left e -> panicTrace $ "Error: " <> e
------------------------------------------------------------------------
-- | Tools
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hPutStr stderr ("\rDone\n" :: Text)
pure bs
main = runCLI =<< execParser opts
where
g c x = do
liftIO $ hPutStr stderr ['\r',c]
liftIO $ hFlush stderr
f x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
mapConcurrentlyChunked f ts = do
caps <- getNumCapabilities
let n = 1 `max` (length ts `div` caps)
concat <$> mapConcurrently (mapM f) (chunksOf n ts)
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms' :: Applicative f => Patterns -> Text -> f [[Text]]
terms' pats txt = pure $ concat $ extractTermsWithList pats txt
-- | TODO Minimal Example
--testCooc = do
-- let patterns = buildPatterns testTermList
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus :: [(Int, [Text])]
testCorpus = [ (1998, [pack "The beees"])
, (1999, [ pack "The bees and the flowers"
--, pack "The bees and the flowers"
])
]
testTermList :: TermList
testTermList = [ ([pack "bee"], [[pack "bees"]])
, ([pack "flower"], [[pack "flowers"]])
]
opts = info (helper <*> allOptions)
( fullDesc
<> progDesc "CLI for the gargantext-server"
<> header "gargantext-cli tools" )
allOptions :: Parser CLI
allOptions = subparser (
filterTermsAndCoocCmd
)
......@@ -710,7 +710,10 @@ executable gargantext-cli
, optimized
main-is: Main.hs
other-modules:
CleanCsvCorpus
CLI.CleanCsvCorpus
CLI.Types
CLI.FilterTermsAndCooc
CLI.Utils
Paths_gargantext
hs-source-dirs:
bin/gargantext-cli
......@@ -726,6 +729,7 @@ executable gargantext-cli
, gargantext-prelude
, ini ^>= 0.4.1
, optparse-generic ^>= 1.4.7
, optparse-applicative
, protolude ^>= 0.3.3
, split ^>= 0.2.3.4
, text ^>= 1.2.4.1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment