Commit d63df339 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[upload] fix csv upload job response

Now the job doesn't return 'succeeded' fields when it actually fails.
parent 8667dfeb
...@@ -16,11 +16,24 @@ Adaptative Phylo binaries ...@@ -16,11 +16,24 @@ Adaptative Phylo binaries
module Main where module Main where
import Control.Concurrent.Async (mapConcurrently)
import Crypto.Hash.SHA256 (hash)
import Data.Aeson import Data.Aeson
import Data.Either (Either(..))
import Data.List (concat, nub, isSuffixOf) import Data.List (concat, nub, isSuffixOf)
import Data.String (String) import Data.String (String)
import GHC.IO (FilePath)
import qualified Prelude as Prelude
import System.Environment
import System.Directory (listDirectory,doesFileExist)
import Data.Text (Text, unwords, unpack, replace, pack) import Data.Text (Text, unwords, unpack, replace, pack)
import Crypto.Hash.SHA256 (hash) import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
import qualified Data.Text as T
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
...@@ -36,20 +49,6 @@ import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setCon ...@@ -36,20 +49,6 @@ import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setCon
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile) import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
-- import Gargantext.API.Ngrams.Prelude (toTermList) -- import Gargantext.API.Ngrams.Prelude (toTermList)
import GHC.IO (FilePath)
import Prelude (Either(Left, Right),toInteger)
import System.Environment
import System.Directory (listDirectory,doesFileExist)
import Control.Concurrent.Async (mapConcurrently)
import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
import qualified Data.Text as T
-- import Debug.Trace (trace) -- import Debug.Trace (trace)
data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show) data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
...@@ -84,13 +83,13 @@ toDays y m d = fromIntegral ...@@ -84,13 +83,13 @@ toDays y m d = fromIntegral
toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date
toPhyloDate y m d tu = case tu of toPhyloDate y m d tu = case tu of
Year _ _ _ -> y Year _ _ _ -> y
Month _ _ _ -> toMonths (toInteger y) m d Month _ _ _ -> toMonths (Prelude.toInteger y) m d
Week _ _ _ -> div (toDays (toInteger y) m d) 7 Week _ _ _ -> div (toDays (Prelude.toInteger y) m d) 7
Day _ _ _ -> toDays (toInteger y) m d Day _ _ _ -> toDays (Prelude.toInteger y) m d
toPhyloDate' :: Int -> Int -> Int -> Text toPhyloDate' :: Int -> Int -> Int -> Text
toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (toInteger y) m d toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (Prelude.toInteger y) m d
-------------- --------------
...@@ -113,43 +112,53 @@ termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList p ...@@ -113,43 +112,53 @@ termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList p
-- | To transform a Wos file (or [file]) into a list of Docs -- | To transform a Wos file (or [file]) into a list of Docs
wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO ([Document]) wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
wosToDocs limit patterns time path = do wosToDocs limit patterns time path = do
files <- getFilesFromPath path files <- getFilesFromPath path
take limit let parseFile' file = do
<$> map (\d -> let title = fromJust $ _hd_title d eParsed <- parseFile WOS (path <> file)
abstr = if (isJust $ _hd_abstract d) case eParsed of
then fromJust $ _hd_abstract d Right ps -> pure ps
else "" Left e -> panic $ "Error: " <> (pack e)
in Document (toPhyloDate take limit
(fromIntegral $ fromJust $ _hd_publication_year d) <$> map (\d -> let title = fromJust $ _hd_title d
(fromJust $ _hd_publication_month d) abstr = if (isJust $ _hd_abstract d)
(fromJust $ _hd_publication_day d) time) then fromJust $ _hd_abstract d
(toPhyloDate' else ""
(fromIntegral $ fromJust $ _hd_publication_year d) in Document (toPhyloDate
(fromJust $ _hd_publication_month d) (fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_day d)) (fromJust $ _hd_publication_month d)
(termsInText patterns $ title <> " " <> abstr) Nothing []) (fromJust $ _hd_publication_day d) time)
<$> concat (toPhyloDate'
<$> mapConcurrently (\file -> (fromIntegral $ fromJust $ _hd_publication_year d)
filter (\d -> (isJust $ _hd_publication_year d) (fromJust $ _hd_publication_month d)
&& (isJust $ _hd_title d)) (fromJust $ _hd_publication_day d))
<$> parseFile WOS (path <> file) ) files (termsInText patterns $ title <> " " <> abstr) Nothing [])
<$> concat
<$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hd_publication_year d)
&& (isJust $ _hd_title d))
<$> parseFile' file) files
-- To transform a Csv file into a list of Document -- To transform a Csv file into a list of Document
csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO ([Document]) csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
csvToDocs parser patterns time path = csvToDocs parser patterns time path =
case parser of case parser of
Wos _ -> undefined Wos _ -> undefined
Csv limit -> Vector.toList Csv limit -> do
<$> Vector.take limit eR <- Csv.readFile path
<$> Vector.map (\row -> Document (toPhyloDate (csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time) case eR of
(toPhyloDate' (csv_publication_year row) (csv_publication_month row) (csv_publication_day row)) Right r ->
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row)) pure $ Vector.toList
Nothing $ Vector.take limit
[] $ Vector.map (\row -> Document (toPhyloDate (csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time)
) <$> snd <$> Csv.readFile path (toPhyloDate' (csv_publication_year row) (csv_publication_month row) (csv_publication_day row))
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
Nothing
[]
) $ snd r
Left e -> panic $ "Error: " <> (pack e)
Csv' limit -> Vector.toList Csv' limit -> Vector.toList
<$> Vector.take limit <$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time) <$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
......
...@@ -15,6 +15,7 @@ compress the contexts around the main terms of the query. ...@@ -15,6 +15,7 @@ compress the contexts around the main terms of the query.
module CleanCsvCorpus where module CleanCsvCorpus where
--import GHC.IO (FilePath) --import GHC.IO (FilePath)
import Data.Either (Either(..))
import Data.SearchEngine as S import Data.SearchEngine as S
import qualified Data.Set as S import qualified Data.Set as S
import Data.Text (pack) import Data.Text (pack)
...@@ -39,17 +40,19 @@ main = do ...@@ -39,17 +40,19 @@ main = do
--let q = ["water", "scarcity", "morocco", "shortage","flood"] --let q = ["water", "scarcity", "morocco", "shortage","flood"]
let q = ["gratuit", "gratuité", "culture", "culturel"] let q = ["gratuit", "gratuité", "culture", "culturel"]
(h,csvDocs) <- CSV.readFile rPath eDocs <- CSV.readFile rPath
case eDocs of
Right (h, csvDocs) -> do
putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
putStrLn $ "Mean size of docs:" <> show ( CSV.docsSize csvDocs)
putStrLn $ "Number of documents before:" <> show (V.length csvDocs) let docs = CSV.toDocs csvDocs
putStrLn $ "Mean size of docs:" <> show ( CSV.docsSize csvDocs) let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine (map pack q)
let docs' = CSV.fromDocs $ filterDocs docIds (V.fromList docs)
let docs = CSV.toDocs csvDocs putStrLn $ "Number of documents after:" <> show (V.length docs')
let engine = insertDocs docs initialDocSearchEngine putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs')
let docIds = S.query engine (map pack q)
let docs' = CSV.fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn $ "Number of documents after:" <> show (V.length docs') CSV.writeFile wPath (h, docs')
putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs') Left e -> panic $ "Error: " <> (pack e)
CSV.writeFile wPath (h, docs')
...@@ -17,30 +17,24 @@ Main specifications to index a corpus with a term list ...@@ -17,30 +17,24 @@ Main specifications to index a corpus with a term list
module Main where module Main where
import Data.ByteString.Lazy (writeFile) import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
import Data.Text (pack)
import qualified Data.Text as DT
import Data.Tuple.Extra (both)
import qualified Data.Vector as DV
import Control.Monad (zipWithM) import Control.Monad (zipWithM)
import Control.Monad.IO.Class import Control.Monad.IO.Class
import Data.Map (Map)
import qualified Data.Map as DM
import GHC.Generics
import Data.Aeson import Data.Aeson
import Data.ByteString.Lazy (writeFile)
import Data.Text (Text) import Data.Either (Either(..))
import Data.List (cycle, concat, unwords) import Data.List (cycle, concat, unwords)
import Data.List.Split (chunksOf) import Data.List.Split (chunksOf)
import Data.Map (Map)
import qualified Data.Map as DM
import Data.Text (pack, Text)
import qualified Data.Text as DT
import Data.Tuple.Extra (both)
import qualified Data.Vector as DV
import GHC.Generics
import System.IO (hPutStr, hFlush, stderr) import System.IO (hPutStr, hFlush, stderr)
import System.Environment import System.Environment
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core import Gargantext.Core
...@@ -92,22 +86,25 @@ main = do ...@@ -92,22 +86,25 @@ main = do
[corpusFile, termListFile, outputFile] <- getArgs [corpusFile, termListFile, outputFile] <- getArgs
--corpus :: IO (DM.IntMap [[Text]]) --corpus :: IO (DM.IntMap [[Text]])
corpus <- DM.fromListWith (<>) eCorpusFile <- readFile corpusFile
. DV.toList case eCorpusFile of
. DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)])) Right cf -> do
. snd let corpus = DM.fromListWith (<>)
<$> readFile corpusFile . DV.toList
. DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
. snd $ cf
-- termListMap :: [Text] -- termListMap :: [Text]
termList <- csvMapTermList termListFile termList <- csvMapTermList termListFile
putStrLn $ show $ length termList putStrLn $ show $ length termList
let patterns = buildPatterns termList let patterns = buildPatterns termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus) -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus) r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
writeFile outputFile $ encode (CoocByYears r) writeFile outputFile $ encode (CoocByYears r)
Left e -> panic $ "Error: " <> (pack e)
......
...@@ -24,6 +24,16 @@ import Data.Maybe ...@@ -24,6 +24,16 @@ import Data.Maybe
import Data.Text (Text, unwords) import Data.Text (Text, unwords)
import GHC.Generics import GHC.Generics
import GHC.IO (FilePath) import GHC.IO (FilePath)
import System.Directory (doesFileExist)
import System.Environment
import qualified Data.ByteString.Lazy as L
import qualified Data.List as DL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Vector as DV
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
import qualified Prelude as P
import Gargantext.Database.Admin.Types.Hyperdata import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text.Context (TermList) import Gargantext.Core.Text.Context (TermList)
...@@ -36,15 +46,6 @@ import Gargantext.Core.Viz.Phylo.LevelMaker ...@@ -36,15 +46,6 @@ import Gargantext.Core.Viz.Phylo.LevelMaker
import Gargantext.Core.Viz.Phylo.Tools import Gargantext.Core.Viz.Phylo.Tools
import Gargantext.Core.Viz.Phylo.View.Export import Gargantext.Core.Viz.Phylo.View.Export
import Gargantext.Core.Viz.Phylo.View.ViewMaker import Gargantext.Core.Viz.Phylo.View.ViewMaker
import System.Directory (doesFileExist)
import System.Environment
import qualified Data.ByteString.Lazy as L
import qualified Data.List as DL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Vector as DV
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
import qualified Prelude as P
-------------- --------------
......
...@@ -131,18 +131,6 @@ type ScrapersEnv = JobEnv JobLog JobLog ...@@ -131,18 +131,6 @@ type ScrapersEnv = JobEnv JobLog JobLog
type ScraperAPI = AsyncJobsAPI JobLog ScraperInput JobLog type ScraperAPI = AsyncJobsAPI JobLog ScraperInput JobLog
jobLogInit :: Int -> JobLog
jobLogInit n = JobLog { _scst_succeeded = Just n
, _scst_failed = Just 0
, _scst_remaining = Just 0
, _scst_events = Just [] }
jobLogSucc :: JobLog -> JobLog
jobLogSucc jl = over (scst_succeeded . _Just) (+ 1) $ over (scst_remaining . _Just) (\c -> c - 1) jl
jobLogErr :: JobLog -> JobLog
jobLogErr jl = over (scst_failed . _Just) (+ 1) $ over (scst_remaining . _Just) (\c -> c - 1) jl
------------------------------------------------------------------------ ------------------------------------------------------------------------
type AsyncJobs event ctI input output = type AsyncJobs event ctI input output =
AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
module Gargantext.API.Job where module Gargantext.API.Job where
import Control.Lens (over, _Just)
import Data.IORef import Data.IORef
import Data.Maybe import Data.Maybe
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..)) import Gargantext.API.Admin.Orchestrator.Types
jobLogInit :: Int -> JobLog jobLogInit :: Int -> JobLog
...@@ -16,25 +17,29 @@ jobLogInit rem = ...@@ -16,25 +17,29 @@ jobLogInit rem =
, _scst_events = Just [] } , _scst_events = Just [] }
jobLogSuccess :: JobLog -> JobLog jobLogSuccess :: JobLog -> JobLog
jobLogSuccess (JobLog { _scst_succeeded = mSucc jobLogSuccess jl = over (scst_succeeded . _Just) (+ 1) $
, _scst_remaining = mRem over (scst_remaining . _Just) (\x -> x - 1) jl
, _scst_failed = mFail
, _scst_events = evt }) =
JobLog { _scst_succeeded = (+ 1) <$> mSucc
, _scst_remaining = (\x -> x - 1) <$> mRem
, _scst_failed = mFail
, _scst_events = evt }
jobLogFail :: JobLog -> JobLog jobLogFail :: JobLog -> JobLog
jobLogFail (JobLog { _scst_succeeded = mSucc jobLogFail jl = over (scst_failed . _Just) (+ 1) $
, _scst_remaining = mRem over (scst_remaining . _Just) (\x -> x - 1) jl
, _scst_failed = mFail
, _scst_events = evt }) = jobLogFailTotal :: JobLog -> JobLog
jobLogFailTotal (JobLog { _scst_succeeded = mSucc
, _scst_remaining = mRem
, _scst_failed = mFail
, _scst_events = evt }) =
JobLog { _scst_succeeded = mSucc JobLog { _scst_succeeded = mSucc
, _scst_remaining = (\x -> x - 1) <$> mRem , _scst_remaining = newRem
, _scst_failed = (+ 1) <$> mFail , _scst_failed = newFail
, _scst_events = evt } , _scst_events = evt }
where
(newRem, newFail) = case mRem of
Nothing -> (Nothing, mFail)
Just rem -> (Just 0, (+ rem) <$> mFail)
jobLogEvt :: JobLog -> ScraperEvent -> JobLog
jobLogEvt jl evt = over (scst_events . _Just) (\evts -> (evt:evts)) jl
runJobLog :: MonadBase IO m => Int -> (JobLog -> m ()) -> m (m (), m (), m JobLog) runJobLog :: MonadBase IO m => Int -> (JobLog -> m ()) -> m (m (), m (), m JobLog)
runJobLog num logStatus = do runJobLog num logStatus = do
......
...@@ -27,6 +27,8 @@ import Data.Swagger ...@@ -27,6 +27,8 @@ import Data.Swagger
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as T import qualified Data.Text as T
import GHC.Generics (Generic) import GHC.Generics (Generic)
import qualified Prelude as Prelude
import Protolude (readFile)
import Servant import Servant
import Servant.Job.Utils (jsonOptions) import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart -- import Servant.Multipart
...@@ -35,8 +37,9 @@ import Test.QuickCheck.Arbitrary ...@@ -35,8 +37,9 @@ import Test.QuickCheck.Arbitrary
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, jobLogSucc) import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings) import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
import Gargantext.API.Node.Corpus.New.File import Gargantext.API.Node.Corpus.New.File
import Gargantext.API.Node.Corpus.Searx import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types import Gargantext.API.Node.Corpus.Types
...@@ -248,28 +251,41 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do ...@@ -248,28 +251,41 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
PresseRIS -> Parser.parseFormat Parser.RisPresse PresseRIS -> Parser.parseFormat Parser.RisPresse
-- TODO granularity of the logStatus -- TODO granularity of the logStatus
docs <- liftBase $ splitEvery 500 eDocs <- liftBase $ parse $ cs d
<$> take 1000000 case eDocs of
<$> parse (cs d) Right docs' -> do
let docs = splitEvery 500 $ take 1000000 docs'
printDebug "Parsing corpus finished : " cid
logStatus jobLog2 printDebug "Parsing corpus finished : " cid
logStatus jobLog2
printDebug "Starting extraction : " cid
-- TODO granularity of the logStatus printDebug "Starting extraction : " cid
_cid' <- flowCorpus user -- TODO granularity of the logStatus
(Right [cid]) _cid' <- flowCorpus user
(Multi $ fromMaybe EN l) (Right [cid])
(map (map toHyperdataDocument) docs) (Multi $ fromMaybe EN l)
(map (map toHyperdataDocument) docs)
printDebug "Extraction finished : " cid
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user
printDebug "Extraction finished : " cid logStatus jobLog3
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text) pure $ jobLog3
sendMail user Left e -> do
printDebug "Error" e
pure jobLog3 logStatus jobLogE
pure jobLogE
where where
jobLog2 = jobLogSucc jobLog jobLog2 = jobLogSuccess jobLog
jobLog3 = jobLogSucc jobLog2 jobLog3 = jobLogSuccess jobLog2
jobLogE = jobLogFailTotal jobLog
parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
parseCsvGargV3Path fp = do
contents <- readFile fp
Parser.parseFormat Parser.CsvGargV3 $ cs contents
{- {-
addToCorpusWithFile :: FlowCmdM env err m addToCorpusWithFile :: FlowCmdM env err m
......
...@@ -17,7 +17,8 @@ import Servant ...@@ -17,7 +17,8 @@ import Servant
import Servant.Job.Async import Servant.Job.Async
import Web.FormUrlEncoded (FromForm) import Web.FormUrlEncoded (FromForm)
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, jobLogInit, jobLogSucc, jobLogErr) import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Job (jobLogInit, jobLogSuccess, jobLogFail)
import Gargantext.API.Node.Corpus.New (addToCorpusWithForm) import Gargantext.API.Node.Corpus.New (addToCorpusWithForm)
import Gargantext.API.Node.Corpus.New.File (FileType(..)) import Gargantext.API.Node.Corpus.New.File (FileType(..))
import Gargantext.API.Node.Types (NewWithForm(..)) import Gargantext.API.Node.Types (NewWithForm(..))
...@@ -82,8 +83,8 @@ frameCalcUploadAsync uId nId _f logStatus jobLog = do ...@@ -82,8 +83,8 @@ frameCalcUploadAsync uId nId _f logStatus jobLog = do
-- printDebug "[frameCalcUploadAsync] mCId" mCId -- printDebug "[frameCalcUploadAsync] mCId" mCId
jobLog2 <- case mCId of jobLog2 <- case mCId of
Nothing -> pure $ jobLogErr jobLog Nothing -> pure $ jobLogFail jobLog
Just cId -> Just cId ->
addToCorpusWithForm (RootId (NodeId uId)) cId (NewWithForm CSV body Nothing "calc-upload.csv") logStatus jobLog addToCorpusWithForm (RootId (NodeId uId)) cId (NewWithForm CSV body Nothing "calc-upload.csv") logStatus jobLog
pure $ jobLogSucc jobLog2 pure $ jobLogSuccess jobLog2
...@@ -43,8 +43,8 @@ import qualified Gargantext.API.Public as Public ...@@ -43,8 +43,8 @@ import qualified Gargantext.API.Public as Public
import Gargantext.API.Admin.Auth.Types (AuthRequest, AuthResponse, AuthenticatedUser(..), PathId(..)) import Gargantext.API.Admin.Auth.Types (AuthRequest, AuthResponse, AuthenticatedUser(..), PathId(..))
import Gargantext.API.Admin.Auth (withAccess) import Gargantext.API.Admin.Auth (withAccess)
import Gargantext.API.Admin.FrontEnd (FrontEndAPI) import Gargantext.API.Admin.FrontEnd (FrontEndAPI)
import Gargantext.API.Admin.Orchestrator.Types (jobLogInit)
import Gargantext.API.Count (CountAPI, count, Query) import Gargantext.API.Count (CountAPI, count, Query)
import Gargantext.API.Job (jobLogInit)
import Gargantext.API.Ngrams (TableNgramsApi, apiNgramsTableDoc) import Gargantext.API.Ngrams (TableNgramsApi, apiNgramsTableDoc)
import Gargantext.API.Node import Gargantext.API.Node
import Gargantext.API.Prelude import Gargantext.API.Prelude
...@@ -285,7 +285,7 @@ addCorpusWithForm user cid = ...@@ -285,7 +285,7 @@ addCorpusWithForm user cid =
JobFunction (\i log' -> JobFunction (\i log' ->
let let
log'' x = do log'' x = do
printDebug "addToCorpusWithForm" x printDebug "[addToCorpusWithForm] " x
liftBase $ log' x liftBase $ log' x
in New.addToCorpusWithForm user cid i log'' (jobLogInit 3)) in New.addToCorpusWithForm user cid i log'' (jobLogInit 3))
...@@ -295,7 +295,7 @@ addCorpusWithFile user cid = ...@@ -295,7 +295,7 @@ addCorpusWithFile user cid =
JobFunction (\i log' -> JobFunction (\i log' ->
let let
log'' x = do log'' x = do
printDebug "addToCorpusWithFile" x printDebug "[addToCorpusWithFile]" x
liftBase $ log' x liftBase $ log' x
in New.addToCorpusWithFile user cid i log'') in New.addToCorpusWithFile user cid i log'')
......
...@@ -11,14 +11,17 @@ Portability : POSIX ...@@ -11,14 +11,17 @@ Portability : POSIX
module Gargantext.Core.Ext.IMT where module Gargantext.Core.Ext.IMT where
import Gargantext.Prelude import Data.Either (Either(..))
import Data.Text (Text, splitOn)
import Data.Map (Map) import Data.Map (Map)
import Data.Text (Text, splitOn)
import qualified Data.Set as S import qualified Data.Set as S
import qualified Data.List as DL import qualified Data.List as DL
import qualified Data.Vector as DV import qualified Data.Vector as DV
import qualified Data.Map as M import qualified Data.Map as M
import qualified Prelude as Prelude
import Gargantext.Prelude
import Gargantext.Core.Text.Metrics.Utils as Utils import Gargantext.Core.Text.Metrics.Utils as Utils
import Gargantext.Core.Text.Corpus.Parsers.CSV as CSV import Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
...@@ -98,8 +101,10 @@ schools = [ School ...@@ -98,8 +101,10 @@ schools = [ School
mapIdSchool :: Map Text Text mapIdSchool :: Map Text Text
mapIdSchool = M.fromList $ Gargantext.Prelude.map (\(School n _ i) -> (i,n)) schools mapIdSchool = M.fromList $ Gargantext.Prelude.map (\(School n _ i) -> (i,n)) schools
hal_data :: IO (DV.Vector CsvHal) hal_data :: IO (Either Prelude.String (DV.Vector CsvHal))
hal_data = snd <$> CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv" hal_data = do
r <- CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
pure $ snd <$> r
names :: S.Set Text names :: S.Set Text
names = S.fromList $ Gargantext.Prelude.map (\s -> school_id s) schools names = S.fromList $ Gargantext.Prelude.map (\s -> school_id s) schools
......
...@@ -16,15 +16,21 @@ Format Converter. ...@@ -16,15 +16,21 @@ Format Converter.
module Gargantext.Core.Text.Convert (risPress2csvWrite) module Gargantext.Core.Text.Convert (risPress2csvWrite)
where where
import Data.Either (Either(..))
import qualified Data.Text as T
import System.FilePath (FilePath()) -- , takeExtension) import System.FilePath (FilePath()) -- , takeExtension)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv) import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat(..)) import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat(..))
risPress2csvWrite :: FilePath -> IO () risPress2csvWrite :: FilePath -> IO ()
risPress2csvWrite f = parseFile RisPresse (f <> ".ris") risPress2csvWrite f = do
>>= \hs -> writeDocs2Csv (f <> ".csv") hs eContents <- parseFile RisPresse (f <> ".ris")
case eContents of
Right contents -> writeDocs2Csv (f <> ".csv") contents
Left e -> panic $ "Error: " <> (T.pack e)
...@@ -36,18 +36,20 @@ import Data.String() ...@@ -36,18 +36,20 @@ import Data.String()
import Data.Text (Text) import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8) import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second) import Data.Tuple.Extra (both, first, second)
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import System.FilePath (FilePath(), takeExtension) import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM import qualified Data.Map as DM
import qualified Data.Text as DT import qualified Data.Text as DT
import qualified Prelude as Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
...@@ -75,30 +77,40 @@ data FileFormat = WOS | RIS | RisPresse ...@@ -75,30 +77,40 @@ data FileFormat = WOS | RIS | RisPresse
-- | XML -- Not Implemented / see : -- | XML -- Not Implemented / see :
parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument] parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
parseFormat RisPresse bs = mapM (toDoc RIS) parseFormat RisPresse bs = do
<$> snd docs <- mapM (toDoc RIS)
<$> enrichWith RisPresse <$> snd
$ partitionEithers <$> enrichWith RisPresse
$ [runParser' RisPresse bs] $ partitionEithers
parseFormat WOS bs = mapM (toDoc WOS) $ [runParser' RisPresse bs]
<$> snd pure $ Right docs
<$> enrichWith WOS parseFormat WOS bs = do
$ partitionEithers docs <- mapM (toDoc WOS)
$ [runParser' WOS bs] <$> snd
<$> enrichWith WOS
$ partitionEithers
$ [runParser' WOS bs]
pure $ Right docs
parseFormat _ _ = undefined parseFormat _ _ = undefined
-- | Parse file into documents -- | Parse file into documents
-- TODO manage errors here -- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message -- TODO: to debug maybe add the filepath in error message
parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument] parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal p = parseHal p parseFile CsvHal p = parseHal p
parseFile CsvGargV3 p = parseCsv p parseFile CsvGargV3 p = parseCsv p
parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p parseFile RisPresse p = do
parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p pure $ Right docs
parseFile WOS p = do
docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
pure $ Right docs
parseFile ff p = do
docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
pure $ Right docs
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS -- TODO use language for RIS
......
...@@ -19,7 +19,7 @@ import qualified Data.ByteString as BS ...@@ -19,7 +19,7 @@ import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy as BL
import Data.Char (ord) import Data.Char (ord)
import Data.Csv import Data.Csv
import Data.Either (Either(Left, Right)) import Data.Either (Either(..))
import Data.Text (Text, pack, length, intercalate) import Data.Text (Text, pack, length, intercalate)
import Data.Time.Segment (jour) import Data.Time.Segment (jour)
import qualified Data.Vector as V import qualified Data.Vector as V
...@@ -27,6 +27,8 @@ import Data.Vector (Vector) ...@@ -27,6 +27,8 @@ import Data.Vector (Vector)
import GHC.IO (FilePath) import GHC.IO (FilePath)
import GHC.Word (Word8) import GHC.Word (Word8)
import qualified Prelude as Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude hiding (length) import Gargantext.Prelude hiding (length)
import Gargantext.Core.Text import Gargantext.Core.Text
...@@ -192,52 +194,47 @@ delimiter :: Word8 ...@@ -192,52 +194,47 @@ delimiter :: Word8
delimiter = fromIntegral $ ord '\t' delimiter = fromIntegral $ ord '\t'
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text] readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
readCsvOn' fields fp = V.toList readCsvOn' fields fp = do
<$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields) r <- readFile fp
<$> snd pure $ ( V.toList
<$> readFile fp . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
. snd ) <$> r
------------------------------------------------------------------------ ------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a) readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
readFileLazy f = fmap (readByteStringLazy f) . BL.readFile readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a) readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
readFileStrict f = fmap (readByteStringStrict f) . BS.readFile readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a) readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> Either Prelude.String (Header, Vector a)
readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of readByteStringLazy _f bs = decodeByNameWith csvDecodeOptions bs
Left e -> panic (pack e)
Right csvDocs -> csvDocs
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a) readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> Either Prelude.String (Header, Vector a)
readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO use readFileLazy -- | TODO use readFileLazy
readFile :: FilePath -> IO (Header, Vector CsvDoc) readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
readFile = fmap readCsvLazyBS . BL.readFile readFile = fmap readCsvLazyBS . BL.readFile
-- | TODO use readByteStringLazy -- | TODO use readByteStringLazy
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc) readCsvLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of readCsvLazyBS bs = decodeByNameWith csvDecodeOptions bs
Left e -> panic (pack e)
Right csvDocs -> csvDocs
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO use readFileLazy -- | TODO use readFileLazy
readCsvHal :: FilePath -> IO (Header, Vector CsvHal) readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
readCsvHal = fmap readCsvHalLazyBS . BL.readFile readCsvHal = fmap readCsvHalLazyBS . BL.readFile
-- | TODO use readByteStringLazy -- | TODO use readByteStringLazy
readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal) readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of readCsvHalLazyBS bs = decodeByNameWith csvDecodeOptions bs
Left e -> panic (pack e)
Right csvDocs -> csvDocs
readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal) readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -382,18 +379,22 @@ csv2doc (CsvDoc title source ...@@ -382,18 +379,22 @@ csv2doc (CsvDoc title source
Nothing Nothing
------------------------------------------------------------------------ ------------------------------------------------------------------------
parseHal :: FilePath -> IO [HyperdataDocument] parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp parseHal fp = do
r <- readCsvHal fp
pure $ (V.toList . V.map csvHal2doc . snd) <$> r
parseHal' :: BL.ByteString -> [HyperdataDocument] parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseHal' = V.toList . V.map csvHal2doc . snd . readCsvHalLazyBS parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
------------------------------------------------------------------------ ------------------------------------------------------------------------
parseCsv :: FilePath -> IO [HyperdataDocument] parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp parseCsv fp = do
r <- readFile fp
pure $ (V.toList . V.map csv2doc . snd) <$> r
parseCsv' :: BL.ByteString -> [HyperdataDocument] parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS bs
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Csv v3 weighted for phylo -- Csv v3 weighted for phylo
...@@ -425,4 +426,4 @@ readWeightedCsv fp = ...@@ -425,4 +426,4 @@ readWeightedCsv fp =
case decodeByNameWith csvDecodeOptions bs of case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e) Left e -> panic (pack e)
Right corpus -> corpus Right corpus -> corpus
) $ BL.readFile fp ) $ BL.readFile fp
\ No newline at end of file
...@@ -56,7 +56,7 @@ import Data.Map (Map, lookup) ...@@ -56,7 +56,7 @@ import Data.Map (Map, lookup)
import Data.Maybe (catMaybes) import Data.Maybe (catMaybes)
import Data.Monoid import Data.Monoid
import Data.Swagger import Data.Swagger
import Data.Text (splitOn) import qualified Data.Text as T
import Data.Traversable (traverse) import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second) import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic) import GHC.Generics (Generic)
...@@ -178,11 +178,12 @@ flowCorpusFile :: (FlowCmdM env err m) ...@@ -178,11 +178,12 @@ flowCorpusFile :: (FlowCmdM env err m)
-> TermType Lang -> FileFormat -> FilePath -> TermType Lang -> FileFormat -> FilePath
-> m CorpusId -> m CorpusId
flowCorpusFile u n l la ff fp = do flowCorpusFile u n l la ff fp = do
docs <- liftBase ( splitEvery 500 eParsed <- liftBase $ parseFile ff fp
<$> take l case eParsed of
<$> parseFile ff fp Right parsed -> do
) let docs = splitEvery 500 $ take l parsed
flowCorpus u n la (map (map toHyperdataDocument) docs) flowCorpus u n la (map (map toHyperdataDocument) docs)
Left e -> panic $ "Error: " <> (T.pack e)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus -- | TODO improve the needed type to create/update a corpus
...@@ -425,11 +426,11 @@ instance ExtractNgramsT HyperdataDocument ...@@ -425,11 +426,11 @@ instance ExtractNgramsT HyperdataDocument
$ _hd_source doc $ _hd_source doc
institutes = map text2ngrams institutes = map text2ngrams
$ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
$ _hd_institutes doc $ _hd_institutes doc
authors = map text2ngrams authors = map text2ngrams
$ maybe ["Nothing"] (splitOn ", ") $ maybe ["Nothing"] (T.splitOn ", ")
$ _hd_authors doc $ _hd_authors doc
terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP) terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment