{-|
Module      : Main.hs
Description : Gargantext starter
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Main specifications to index a corpus with a term list

-}

{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators      #-}
{-# LANGUAGE Strict             #-}

module Main where

import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad.IO.Class
import Data.Aeson
import Data.List.Split (chunksOf)
import Data.Map.Strict qualified as DM
import Data.Text (pack)
import Data.Text qualified as DT
import Data.Text.Lazy qualified as DTL
import Data.Text.Lazy.Encoding qualified as TLE
import Data.Tuple.Extra (both)
import Data.Vector qualified as DV
import GHC.Generics
import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Corpus.Parsers.CSV (readCSVFile, csv_title, csv_abstract, csv_publication_year, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList
import Gargantext.Prelude hiding (show)
import Protolude
import System.IO (hFlush)

------------------------------------------------------------------------
-- OUTPUT format

data CoocByYear = CoocByYear { year          :: Int
                             , nbContexts    :: NbContexts
                             , coocurrences  :: Map (Text, Text) Coocs
                             } deriving (Show, Generic)

data CoocByYears = CoocByYears { years :: [CoocByYear] }
  deriving (Show, Generic)

type NbContexts = Int

instance ToJSON CoocByYear
instance ToJSON CoocByYears
------------------------------------------------------------------------

filterTermsAndCooc :: Patterns -> (Int, [Text]) -> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc patterns (year, ts) = do
  logWork "start"
  r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
  logWork "stop"
  pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
  where
    logWork m = do
      tid    <- myThreadId
      (p, _) <- threadCapability tid
      putText . unwords $ ["filterTermsAndCooc:", m, show year, "on proc", show p]

main :: IO ()
main = do
  [corpusFile, termListFile, outputFile] <- getArgs

  --corpus :: IO (DM.IntMap [[Text]])
  eCorpusFile <- readCSVFile corpusFile
  case eCorpusFile of
    Right cf -> do
      let corpus = DM.fromListWith (<>)
                 . DV.toList
                 . DV.map (\n -> (fromMIntOrDec defaultYear $ csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
                 . snd $ cf

      -- termListMap :: [Text]
      termList <- csvMapTermList termListFile
      putText $ show $ length termList

      let patterns = buildPatterns termList

      -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
      r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
      writeFile outputFile $ DTL.toStrict $ TLE.decodeUtf8 $ encode (CoocByYears r)
    Left e -> panic $ "Error: " <> e

------------------------------------------------------------------------
-- | Tools

mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
  bs <- zipWithM g (cycle "-\\|/") xs
  liftIO $ hPutStr stderr ("\rDone\n" :: Text)
  pure bs
  where
    g c x = do
      liftIO $ hPutStr stderr ['\r',c]
      liftIO $ hFlush stderr
      f x

-- | Optimisation that needs further development (not used yet)
mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
mapConcurrentlyChunked f ts = do
  caps <- getNumCapabilities
  let n = 1 `max` (length ts `div` caps)
  concat <$> mapConcurrently (mapM f) (chunksOf n ts)

--terms' :: Patterns -> Text -> Corpus [[Text]]
terms' :: Applicative f => Patterns -> Text -> f [[Text]]
terms' pats txt = pure $ concat $ extractTermsWithList pats txt

-- | TODO Minimal Example
--testCooc = do
--  let patterns = buildPatterns testTermList
--  mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
--  --mapConcurrently (filterTermsAndCooc patterns) testCorpus

testCorpus :: [(Int, [Text])]
testCorpus = [ (1998, [pack "The beees"])
             , (1999, [ pack "The bees and the flowers"
                      --, pack "The bees and the flowers"
                      ])
             ]

testTermList :: TermList
testTermList = [ ([pack "bee"], [[pack "bees"]])
               , ([pack "flower"], [[pack "flowers"]])
               ]
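
------------------------------------------------------------------------
-- A minimal sketch of the "Minimal Example" TODO above: build the patterns
-- from 'testTermList' and run the per-year extraction and co-occurrence step
-- over 'testCorpus'. This binding is illustrative only (the name 'testCooc'
-- is taken from the commented-out draft) and is not part of the module's API.
testCooc :: IO [CoocByYear]
testCooc = do
  let patterns = buildPatterns testTermList
  mapM (filterTermsAndCooc patterns) testCorpus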