module CLI.FilterTermsAndCooc ( filterTermsAndCoocCmd , filterTermsAndCoocCLI -- * Testing functions , testCorpus , testTermList ) where import CLI.Types import Control.Concurrent.Async as CCA (mapConcurrently) import Data.Aeson ( encode ) import Data.Map.Strict qualified as DM import Data.Text (pack) import Data.Text qualified as DT import Data.Text.Lazy qualified as DTL import Data.Text.Lazy.Encoding qualified as TLE import Data.Tuple.Extra (both) import Data.Vector qualified as DV import GHC.Generics import Gargantext.Core.Text.Context (TermList) import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, tsv_title, tsv_abstract, tsv_publication_year, fromMIntOrDec, defaultYear) import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList) import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs) import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList ) import Gargantext.Prelude import Options.Applicative ------------------------------------------------------------------------ -- OUTPUT format data CoocByYear = CoocByYear { year :: Int , nbContexts :: NbContexts , coocurrences :: Map (Text, Text) Coocs } deriving (Show, Generic) data CoocByYears = CoocByYears { years :: [CoocByYear] } deriving (Show, Generic) type NbContexts = Int instance ToJSON CoocByYear instance ToJSON CoocByYears ------------------------------------------------------------------------ filterTermsAndCoocCLI :: CorpusFile -> TermListFile -> OutputFile -> IO () filterTermsAndCoocCLI (CorpusFile corpusFile) (TermListFile termListFile) (OutputFile outputFile) = do --corpus :: IO (DM.IntMap [[Text]]) eCorpusFile <- readTSVFile corpusFile case eCorpusFile of Right cf -> do let corpus = DM.fromListWith (<>) . DV.toList . DV.map (\n -> (fromMIntOrDec defaultYear $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)])) . snd $ cf -- termListMap :: [Text] termList <- tsvMapTermList termListFile putText $ show $ length termList let patterns = buildPatterns termList -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus) r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus) writeFile outputFile $ DTL.toStrict $ TLE.decodeUtf8 $ encode (CoocByYears r) Left e -> panicTrace $ "Error: " <> e filterTermsAndCooc :: Patterns -> (Int, [Text]) -> IO CoocByYear -- (Int, (Map (Text, Text) Coocs)) filterTermsAndCooc patterns (year, ts) = do logWork "start" r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts logWork "stop" pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r) where logWork m = do tid <- myThreadId (p, _) <- threadCapability tid putText . unwords $ ["filterTermsAndCooc:", m, show year, "on proc", show p] --terms' :: Patterns -> Text -> Corpus [[Text]] terms' :: Applicative f => Patterns -> Text -> f [[Text]] terms' pats txt = pure $ concat $ extractTermsWithList pats txt -- | TODO Minimal Example --testCooc = do -- let patterns = buildPatterns testTermList -- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus -- --mapConcurrently (filterTermsAndCooc patterns) testCorpus testCorpus :: [(Int, [Text])] testCorpus = [ (1998, [pack "The beees"]) , (1999, [ pack "The bees and the flowers" --, pack "The bees and the flowers" ]) ] testTermList :: TermList testTermList = [ ([pack "bee"], [[pack "bees"]]) , ([pack "flower"], [[pack "flowers"]]) ] -- -- CLI API -- filterTermsAndCoocCmd :: HasCallStack => Mod CommandFields CLI filterTermsAndCoocCmd = command "filter-terms" (info (helper <*> fmap CLISub filterTerms) (progDesc "Filter Terms and Cooc.")) filterTerms :: Parser CLICmd filterTerms = CCMD_filter_terms_and_cooc <$> (option str (long "corpus-file" <> metavar "FILE")) <*> (option str (long "terms-list-file" <> metavar "FILE")) <*> (option str (long "output-file" <> metavar "FILE"))