
module CLI.FilterTermsAndCooc (
    filterTermsAndCoocCmd
  , filterTermsAndCoocCLI

  -- * Testing functions
  , testCorpus
  , testTermList
  ) where

import CLI.Types
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Aeson ( encode )
import Data.Map.Strict qualified as DM
import Data.Text (pack)
import Data.Text qualified as DT
import Data.Text.Lazy qualified as DTL
import Data.Text.Lazy.Encoding qualified as TLE
import Data.Tuple.Extra (both)
import Data.Vector qualified as DV
import GHC.Generics
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, tsv_title, tsv_abstract, tsv_publication_year, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList )
import Gargantext.Prelude
import Options.Applicative

------------------------------------------------------------------------
-- OUTPUT format

data CoocByYear = CoocByYear { year         :: Int
                             , nbContexts   :: NbContexts
                             , coocurrences :: Map (Text, Text) Coocs
                             } deriving (Show, Generic)

data CoocByYears = CoocByYears { years :: [CoocByYear] }
  deriving (Show, Generic)

type NbContexts = Int

instance ToJSON CoocByYear
instance ToJSON CoocByYears
------------------------------------------------------------------------

filterTermsAndCoocCLI :: CorpusFile -> TermListFile -> OutputFile -> IO ()
filterTermsAndCoocCLI (CorpusFile corpusFile) (TermListFile termListFile) (OutputFile outputFile) = do
  --corpus :: IO (DM.IntMap [[Text]])
  eCorpusFile <- readTSVFile corpusFile
  case eCorpusFile of
    Right cf -> do
      let corpus = DM.fromListWith (<>)
                   . DV.toList
                   . DV.map (\n -> (fromMIntOrDec defaultYear $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)]))
                   . snd $ cf

      -- termListMap :: [Text]
      termList <- tsvMapTermList termListFile

      putText $ show $ length termList

      let patterns = buildPatterns termList

      -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
      r <-  mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
      writeFile outputFile $ DTL.toStrict $ TLE.decodeUtf8 $ encode (CoocByYears r)
    Left e -> panicTrace $ "Error: " <> e




filterTermsAndCooc
  :: Patterns
     -> (Int, [Text])
     -> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc patterns (year, ts) = do
  logWork "start"
  r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
  logWork "stop"
  pure $ CoocByYear year (length ts) (DM.mapKeys (both DT.unwords) r)
  where

    logWork m = do
      tid    <- myThreadId
      (p, _) <- threadCapability tid
      putText . unwords $
        ["filterTermsAndCooc:", m, show year, "on proc", show p]

--terms' :: Patterns -> Text -> Corpus [[Text]]
terms' :: Applicative f => Patterns -> Text -> f [[Text]]
terms' pats txt = pure $ concat $ extractTermsWithList pats txt


-- | TODO Minimal Example
--testCooc = do
--  let patterns = buildPatterns testTermList
--  mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
--  --mapConcurrently (filterTermsAndCooc patterns) testCorpus


testCorpus :: [(Int, [Text])]
testCorpus = [ (1998, [pack "The beees"])
             , (1999, [ pack "The bees and the flowers"
                      --, pack "The bees and the flowers"
                      ])
             ]

testTermList :: TermList
testTermList = [ ([pack "bee"], [[pack "bees"]])
               , ([pack "flower"], [[pack "flowers"]])
               ]

--
-- CLI API
--

filterTermsAndCoocCmd :: HasCallStack => Mod CommandFields CLI
filterTermsAndCoocCmd = command "filter-terms" (info (helper <*> fmap CLISub filterTerms) (progDesc "Filter Terms and Cooc."))

filterTerms :: Parser CLICmd
filterTerms = CCMD_filter_terms_and_cooc
  <$> (option str (long "corpus-file" <> metavar "FILE"))
  <*> (option str (long "terms-list-file" <> metavar "FILE"))
  <*> (option str (long "output-file" <> metavar "FILE"))
