Commit d63df339 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[upload] fix csv upload job response

Now the job no longer reports steps in the 'succeeded' field when it actually fails.
parent 8667dfeb
......@@ -16,11 +16,24 @@ Adaptative Phylo binaries
module Main where
import Control.Concurrent.Async (mapConcurrently)
import Crypto.Hash.SHA256 (hash)
import Data.Aeson
import Data.Either (Either(..))
import Data.List (concat, nub, isSuffixOf)
import Data.String (String)
import GHC.IO (FilePath)
import qualified Prelude as Prelude
import System.Environment
import System.Directory (listDirectory,doesFileExist)
import Data.Text (Text, unwords, unpack, replace, pack)
import Crypto.Hash.SHA256 (hash)
import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
import qualified Data.Text as T
import Gargantext.Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
......@@ -36,20 +49,6 @@ import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setCon
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
-- import Gargantext.API.Ngrams.Prelude (toTermList)
import GHC.IO (FilePath)
import Prelude (Either(Left, Right),toInteger)
import System.Environment
import System.Directory (listDirectory,doesFileExist)
import Control.Concurrent.Async (mapConcurrently)
import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
import qualified Data.Text as T
-- import Debug.Trace (trace)
data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
......@@ -84,13 +83,13 @@ toDays y m d = fromIntegral
toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date
toPhyloDate y m d tu = case tu of
Year _ _ _ -> y
Month _ _ _ -> toMonths (toInteger y) m d
Week _ _ _ -> div (toDays (toInteger y) m d) 7
Day _ _ _ -> toDays (toInteger y) m d
Month _ _ _ -> toMonths (Prelude.toInteger y) m d
Week _ _ _ -> div (toDays (Prelude.toInteger y) m d) 7
Day _ _ _ -> toDays (Prelude.toInteger y) m d
toPhyloDate' :: Int -> Int -> Int -> Text
toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (toInteger y) m d
-- Render a (year, month, day) triple as an ISO-8601 date Text (e.g. "2020-01-02").
toPhyloDate' y m d = pack . showGregorian $ fromGregorian (Prelude.toInteger y) m d
--------------
......@@ -113,9 +112,14 @@ termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList p
-- | To transform a Wos file (or [file]) into a list of Docs
wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO ([Document])
wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
wosToDocs limit patterns time path = do
files <- getFilesFromPath path
let parseFile' file = do
eParsed <- parseFile WOS (path <> file)
case eParsed of
Right ps -> pure ps
Left e -> panic $ "Error: " <> (pack e)
take limit
<$> map (\d -> let title = fromJust $ _hd_title d
abstr = if (isJust $ _hd_abstract d)
......@@ -134,22 +138,27 @@ wosToDocs limit patterns time path = do
<$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hd_publication_year d)
&& (isJust $ _hd_title d))
<$> parseFile WOS (path <> file) ) files
<$> parseFile' file) files
-- To transform a Csv file into a list of Document
csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO ([Document])
csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
csvToDocs parser patterns time path =
case parser of
Wos _ -> undefined
Csv limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time)
Csv limit -> do
eR <- Csv.readFile path
case eR of
Right r ->
pure $ Vector.toList
$ Vector.take limit
$ Vector.map (\row -> Document (toPhyloDate (csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time)
(toPhyloDate' (csv_publication_year row) (csv_publication_month row) (csv_publication_day row))
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
Nothing
[]
) <$> snd <$> Csv.readFile path
) $ snd r
Left e -> panic $ "Error: " <> (pack e)
Csv' limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
......
......@@ -15,6 +15,7 @@ compress the contexts around the main terms of the query.
module CleanCsvCorpus where
--import GHC.IO (FilePath)
import Data.Either (Either(..))
import Data.SearchEngine as S
import qualified Data.Set as S
import Data.Text (pack)
......@@ -39,8 +40,9 @@ main = do
--let q = ["water", "scarcity", "morocco", "shortage","flood"]
let q = ["gratuit", "gratuité", "culture", "culturel"]
(h,csvDocs) <- CSV.readFile rPath
eDocs <- CSV.readFile rPath
case eDocs of
Right (h, csvDocs) -> do
putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
putStrLn $ "Mean size of docs:" <> show ( CSV.docsSize csvDocs)
......@@ -53,3 +55,4 @@ main = do
putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs')
CSV.writeFile wPath (h, docs')
Left e -> panic $ "Error: " <> (pack e)
......@@ -17,30 +17,24 @@ Main specifications to index a corpus with a term list
module Main where
import Data.ByteString.Lazy (writeFile)
import Data.Text (pack)
import qualified Data.Text as DT
import Data.Tuple.Extra (both)
import qualified Data.Vector as DV
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
import Control.Monad (zipWithM)
import Control.Monad.IO.Class
import Data.Map (Map)
import qualified Data.Map as DM
import GHC.Generics
import Data.Aeson
import Data.Text (Text)
import Data.ByteString.Lazy (writeFile)
import Data.Either (Either(..))
import Data.List (cycle, concat, unwords)
import Data.List.Split (chunksOf)
import Data.Map (Map)
import qualified Data.Map as DM
import Data.Text (pack, Text)
import qualified Data.Text as DT
import Data.Tuple.Extra (both)
import qualified Data.Vector as DV
import GHC.Generics
import System.IO (hPutStr, hFlush, stderr)
import System.Environment
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
import Gargantext.Prelude
import Gargantext.Core
......@@ -92,11 +86,13 @@ main = do
[corpusFile, termListFile, outputFile] <- getArgs
--corpus :: IO (DM.IntMap [[Text]])
corpus <- DM.fromListWith (<>)
eCorpusFile <- readFile corpusFile
case eCorpusFile of
Right cf -> do
let corpus = DM.fromListWith (<>)
. DV.toList
. DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
. snd
<$> readFile corpusFile
. snd $ cf
-- termListMap :: [Text]
termList <- csvMapTermList termListFile
......@@ -108,6 +104,7 @@ main = do
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
writeFile outputFile $ encode (CoocByYears r)
Left e -> panic $ "Error: " <> (pack e)
......
......@@ -24,6 +24,16 @@ import Data.Maybe
import Data.Text (Text, unwords)
import GHC.Generics
import GHC.IO (FilePath)
import System.Directory (doesFileExist)
import System.Environment
import qualified Data.ByteString.Lazy as L
import qualified Data.List as DL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Vector as DV
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
import qualified Prelude as P
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Prelude
import Gargantext.Core.Text.Context (TermList)
......@@ -36,15 +46,6 @@ import Gargantext.Core.Viz.Phylo.LevelMaker
import Gargantext.Core.Viz.Phylo.Tools
import Gargantext.Core.Viz.Phylo.View.Export
import Gargantext.Core.Viz.Phylo.View.ViewMaker
import System.Directory (doesFileExist)
import System.Environment
import qualified Data.ByteString.Lazy as L
import qualified Data.List as DL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Vector as DV
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
import qualified Prelude as P
--------------
......
......@@ -131,18 +131,6 @@ type ScrapersEnv = JobEnv JobLog JobLog
type ScraperAPI = AsyncJobsAPI JobLog ScraperInput JobLog
-- | Initial log for a job with @n@ steps still to run: nothing has
--   succeeded or failed yet and all @n@ steps are remaining.
--   (Previously @n@ was stored in '_scst_succeeded' and remaining was 0,
--   so a job reported full success before doing any work.)
jobLogInit :: Int -> JobLog
jobLogInit n =
  JobLog { _scst_succeeded = Just 0
         , _scst_failed    = Just 0
         , _scst_remaining = Just n
         , _scst_events    = Just [] }
-- | Record one successful step: succeeded +1, remaining -1.
jobLogSucc :: JobLog -> JobLog
jobLogSucc =
  over (scst_remaining . _Just) (subtract 1)
    . over (scst_succeeded . _Just) (+ 1)
-- | Record one failed step: failed +1, remaining -1.
jobLogErr :: JobLog -> JobLog
jobLogErr =
  over (scst_remaining . _Just) (subtract 1)
    . over (scst_failed . _Just) (+ 1)
------------------------------------------------------------------------
type AsyncJobs event ctI input output =
AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
module Gargantext.API.Job where
import Control.Lens (over, _Just)
import Data.IORef
import Data.Maybe
import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.API.Admin.Orchestrator.Types
jobLogInit :: Int -> JobLog
......@@ -16,25 +17,29 @@ jobLogInit rem =
, _scst_events = Just [] }
jobLogSuccess :: JobLog -> JobLog
jobLogSuccess (JobLog { _scst_succeeded = mSucc
, _scst_remaining = mRem
, _scst_failed = mFail
, _scst_events = evt }) =
JobLog { _scst_succeeded = (+ 1) <$> mSucc
, _scst_remaining = (\x -> x - 1) <$> mRem
, _scst_failed = mFail
, _scst_events = evt }
-- One step finished successfully: bump succeeded, decrement remaining.
-- (The two lens updates touch independent fields, so order is irrelevant.)
jobLogSuccess jl =
  over (scst_remaining . _Just) (subtract 1)
    $ over (scst_succeeded . _Just) (+ 1) jl
jobLogFail :: JobLog -> JobLog
jobLogFail (JobLog { _scst_succeeded = mSucc
-- One step failed: bump failed, decrement remaining.
jobLogFail jl =
  over (scst_remaining . _Just) (subtract 1)
    $ over (scst_failed . _Just) (+ 1) jl
-- | Mark the whole job as failed: every still-remaining step is moved
--   into the failed count and remaining drops to zero.  Succeeded and
--   events are left untouched.
--   (The span previously contained duplicated record fields left over
--   from a mangled diff; this is the coherent new version, with the
--   local binding renamed so it no longer shadows 'Prelude.rem'.)
jobLogFailTotal :: JobLog -> JobLog
jobLogFailTotal (JobLog { _scst_succeeded = mSucc
                        , _scst_remaining = mRem
                        , _scst_failed    = mFail
                        , _scst_events    = evt }) =
  JobLog { _scst_succeeded = mSucc
         , _scst_remaining = newRem
         , _scst_failed    = newFail
         , _scst_events    = evt }
  where
    (newRem, newFail) = case mRem of
      Nothing -> (Nothing, mFail)
      Just r  -> (Just 0, (+ r) <$> mFail)
-- | Prepend a scraper event to the job's event list (when present).
jobLogEvt :: JobLog -> ScraperEvent -> JobLog
jobLogEvt jl evt = over (scst_events . _Just) (evt :) jl
runJobLog :: MonadBase IO m => Int -> (JobLog -> m ()) -> m (m (), m (), m JobLog)
runJobLog num logStatus = do
......
......@@ -27,6 +27,8 @@ import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
import qualified Prelude as Prelude
import Protolude (readFile)
import Servant
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
......@@ -35,8 +37,9 @@ import Test.QuickCheck.Arbitrary
import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, jobLogSucc)
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
import Gargantext.API.Node.Corpus.New.File
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
......@@ -248,9 +251,10 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
PresseRIS -> Parser.parseFormat Parser.RisPresse
-- TODO granularity of the logStatus
docs <- liftBase $ splitEvery 500
<$> take 1000000
<$> parse (cs d)
eDocs <- liftBase $ parse $ cs d
case eDocs of
Right docs' -> do
let docs = splitEvery 500 $ take 1000000 docs'
printDebug "Parsing corpus finished : " cid
logStatus jobLog2
......@@ -266,10 +270,22 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user
pure jobLog3
logStatus jobLog3
pure $ jobLog3
Left e -> do
printDebug "Error" e
logStatus jobLogE
pure jobLogE
where
jobLog2 = jobLogSucc jobLog
jobLog3 = jobLogSucc jobLog2
jobLog2 = jobLogSuccess jobLog
jobLog3 = jobLogSuccess jobLog2
jobLogE = jobLogFailTotal jobLog
-- | Read a file from disk and parse it as a CsvGargV3 corpus,
--   returning either the parser error or the parsed documents.
parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
parseCsvGargV3Path fp =
  readFile fp >>= Parser.parseFormat Parser.CsvGargV3 . cs
{-
addToCorpusWithFile :: FlowCmdM env err m
......
......@@ -17,7 +17,8 @@ import Servant
import Servant.Job.Async
import Web.FormUrlEncoded (FromForm)
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, jobLogInit, jobLogSucc, jobLogErr)
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Job (jobLogInit, jobLogSuccess, jobLogFail)
import Gargantext.API.Node.Corpus.New (addToCorpusWithForm)
import Gargantext.API.Node.Corpus.New.File (FileType(..))
import Gargantext.API.Node.Types (NewWithForm(..))
......@@ -82,8 +83,8 @@ frameCalcUploadAsync uId nId _f logStatus jobLog = do
-- printDebug "[frameCalcUploadAsync] mCId" mCId
jobLog2 <- case mCId of
Nothing -> pure $ jobLogErr jobLog
Nothing -> pure $ jobLogFail jobLog
Just cId ->
addToCorpusWithForm (RootId (NodeId uId)) cId (NewWithForm CSV body Nothing "calc-upload.csv") logStatus jobLog
pure $ jobLogSucc jobLog2
pure $ jobLogSuccess jobLog2
......@@ -43,8 +43,8 @@ import qualified Gargantext.API.Public as Public
import Gargantext.API.Admin.Auth.Types (AuthRequest, AuthResponse, AuthenticatedUser(..), PathId(..))
import Gargantext.API.Admin.Auth (withAccess)
import Gargantext.API.Admin.FrontEnd (FrontEndAPI)
import Gargantext.API.Admin.Orchestrator.Types (jobLogInit)
import Gargantext.API.Count (CountAPI, count, Query)
import Gargantext.API.Job (jobLogInit)
import Gargantext.API.Ngrams (TableNgramsApi, apiNgramsTableDoc)
import Gargantext.API.Node
import Gargantext.API.Prelude
......@@ -285,7 +285,7 @@ addCorpusWithForm user cid =
JobFunction (\i log' ->
let
log'' x = do
printDebug "addToCorpusWithForm" x
printDebug "[addToCorpusWithForm] " x
liftBase $ log' x
in New.addToCorpusWithForm user cid i log'' (jobLogInit 3))
......@@ -295,7 +295,7 @@ addCorpusWithFile user cid =
JobFunction (\i log' ->
let
log'' x = do
printDebug "addToCorpusWithFile" x
printDebug "[addToCorpusWithFile]" x
liftBase $ log' x
in New.addToCorpusWithFile user cid i log'')
......
......@@ -11,14 +11,17 @@ Portability : POSIX
module Gargantext.Core.Ext.IMT where
import Gargantext.Prelude
import Data.Text (Text, splitOn)
import Data.Either (Either(..))
import Data.Map (Map)
import Data.Text (Text, splitOn)
import qualified Data.Set as S
import qualified Data.List as DL
import qualified Data.Vector as DV
import qualified Data.Map as M
import qualified Prelude as Prelude
import Gargantext.Prelude
import Gargantext.Core.Text.Metrics.Utils as Utils
import Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
......@@ -98,8 +101,10 @@ schools = [ School
mapIdSchool :: Map Text Text
mapIdSchool = M.fromList $ Gargantext.Prelude.map (\(School n _ i) -> (i,n)) schools
hal_data :: IO (DV.Vector CsvHal)
hal_data = snd <$> CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
-- | Load the IMT HAL corpus CSV, keeping only the rows (the header from
--   'CSV.readCsvHal' is dropped inside the Either).
hal_data :: IO (Either Prelude.String (DV.Vector CsvHal))
hal_data = fmap snd <$> CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
names :: S.Set Text
names = S.fromList $ Gargantext.Prelude.map (\s -> school_id s) schools
......
......@@ -16,15 +16,21 @@ Format Converter.
module Gargantext.Core.Text.Convert (risPress2csvWrite)
where
import Data.Either (Either(..))
import qualified Data.Text as T
import System.FilePath (FilePath()) -- , takeExtension)
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat(..))
risPress2csvWrite :: FilePath -> IO ()
risPress2csvWrite f = parseFile RisPresse (f <> ".ris")
>>= \hs -> writeDocs2Csv (f <> ".csv") hs
-- Parse "<f>.ris" as RIS Presse and write the documents to "<f>.csv";
-- panics with the parser's error message on failure.
risPress2csvWrite f =
  parseFile RisPresse (f <> ".ris")
    >>= either (\err -> panic $ "Error: " <> (T.pack err))
               (writeDocs2Csv (f <> ".csv"))
......@@ -36,18 +36,20 @@ import Data.String()
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Prelude as Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
......@@ -75,30 +77,40 @@ data FileFormat = WOS | RIS | RisPresse
-- | XML -- Not Implemented / see :
parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument]
parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
parseFormat RisPresse bs = mapM (toDoc RIS)
parseFormat RisPresse bs = do
docs <- mapM (toDoc RIS)
<$> snd
<$> enrichWith RisPresse
$ partitionEithers
$ [runParser' RisPresse bs]
parseFormat WOS bs = mapM (toDoc WOS)
pure $ Right docs
parseFormat WOS bs = do
docs <- mapM (toDoc WOS)
<$> snd
<$> enrichWith WOS
$ partitionEithers
$ [runParser' WOS bs]
pure $ Right docs
-- | No parser implemented for the remaining formats: report this through
--   the Either error channel instead of crashing via 'undefined'.
parseFormat _ _ = pure $ Left "parseFormat: format not supported"
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal p = parseHal p
parseFile CsvGargV3 p = parseCsv p
parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
-- RIS Presse / WOS / generic formats: read, enrich, convert each record
-- to a HyperdataDocument, and wrap the result in Right (these parsers
-- signal errors by other means, so the Either is always Right here).
parseFile RisPresse p = do
  docs <- readFileWith RIS p >>= mapM (toDoc RIS) . snd . enrichWith RisPresse
  pure $ Right docs
parseFile WOS p = do
  docs <- readFileWith WOS p >>= mapM (toDoc WOS) . snd . enrichWith WOS
  pure $ Right docs
parseFile ff p = do
  docs <- readFileWith ff p >>= mapM (toDoc ff) . snd . enrichWith ff
  pure $ Right docs
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
......
......@@ -19,7 +19,7 @@ import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BL
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Either (Either(..))
import Data.Text (Text, pack, length, intercalate)
import Data.Time.Segment (jour)
import qualified Data.Vector as V
......@@ -27,6 +27,8 @@ import Data.Vector (Vector)
import GHC.IO (FilePath)
import GHC.Word (Word8)
import qualified Prelude as Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude hiding (length)
import Gargantext.Core.Text
......@@ -192,52 +194,47 @@ delimiter :: Word8
delimiter = fromIntegral $ ord '\t'
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn' fields fp = V.toList
<$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
<$> snd
<$> readFile fp
-- | Read a CSV file and, for each row, extract the given fields and join
--   them with a single space; parse errors are returned as Left.
readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
readCsvOn' fields fp = do
  eRows <- readFile fp
  let render row = intercalate (pack " ") $ map ($ row) fields
  pure $ (V.toList . V.map render . snd) <$> eRows
------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
-- | Lazily read a file and decode it as named CSV records.
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
readFileLazy proxy fp = readByteStringLazy proxy <$> BL.readFile fp
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
-- | Strictly read a file and decode it as named CSV records.
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
readFileStrict proxy fp = readByteStringStrict proxy <$> BS.readFile fp
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> csvDocs
-- | Decode a lazy ByteString as named CSV records using the project's
--   decode options; the proxy argument only fixes the record type.
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> Either Prelude.String (Header, Vector a)
readByteStringLazy _ = decodeByNameWith csvDecodeOptions
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
-- | Strict-ByteString variant of 'readByteStringLazy'.
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> Either Prelude.String (Header, Vector a)
readByteStringStrict proxy bs = readByteStringLazy proxy (BL.fromStrict bs)
------------------------------------------------------------------------
-- | TODO use readFileLazy
readFile :: FilePath -> IO (Header, Vector CsvDoc)
-- | Read a file and decode it as 'CsvDoc' records.
readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
readFile fp = readCsvLazyBS <$> BL.readFile fp
-- | TODO use readByteStringLazy
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> csvDocs
-- | Decode a lazy ByteString as 'CsvDoc' records; decode errors come
--   back as Left rather than panicking.
readCsvLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
readCsvLazyBS = decodeByNameWith csvDecodeOptions
------------------------------------------------------------------------
-- | TODO use readFileLazy
readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
-- | Read a file and decode it as 'CsvHal' records.
readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
readCsvHal fp = readCsvHalLazyBS <$> BL.readFile fp
-- | TODO use readByteStringLazy
readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> csvDocs
-- | Decode a lazy ByteString as 'CsvHal' records.
readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalLazyBS = decodeByNameWith csvDecodeOptions
readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
-- | Strict-ByteString variant of 'readCsvHalLazyBS'.
readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalBSStrict bs = readCsvHalLazyBS (BL.fromStrict bs)
------------------------------------------------------------------------
......@@ -382,18 +379,22 @@ csv2doc (CsvDoc title source
Nothing
------------------------------------------------------------------------
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
-- | Parse a HAL CSV file into HyperdataDocuments, threading any decode
--   error through the Either.
parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseHal fp = fmap (V.toList . V.map csvHal2doc . snd) <$> readCsvHal fp
parseHal' :: BL.ByteString -> [HyperdataDocument]
parseHal' = V.toList . V.map csvHal2doc . snd . readCsvHalLazyBS
-- | In-memory variant of 'parseHal'.
parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseHal' = fmap (V.toList . V.map csvHal2doc . snd) . readCsvHalLazyBS
------------------------------------------------------------------------
parseCsv :: FilePath -> IO [HyperdataDocument]
parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
-- | Parse a Gargantext CSV file into HyperdataDocuments, threading any
--   decode error through the Either.
parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseCsv fp = fmap (V.toList . V.map csv2doc . snd) <$> readFile fp
parseCsv' :: BL.ByteString -> [HyperdataDocument]
parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
-- | In-memory variant of 'parseCsv'.
parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseCsv' = fmap (V.toList . V.map csv2doc . snd) . readCsvLazyBS
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
......
......@@ -56,7 +56,7 @@ import Data.Map (Map, lookup)
import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
import Data.Text (splitOn)
import qualified Data.Text as T
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
......@@ -178,11 +178,12 @@ flowCorpusFile :: (FlowCmdM env err m)
-> TermType Lang -> FileFormat -> FilePath
-> m CorpusId
flowCorpusFile u n l la ff fp = do
docs <- liftBase ( splitEvery 500
<$> take l
<$> parseFile ff fp
)
eParsed <- liftBase $ parseFile ff fp
case eParsed of
Right parsed -> do
let docs = splitEvery 500 $ take l parsed
flowCorpus u n la (map (map toHyperdataDocument) docs)
Left e -> panic $ "Error: " <> (T.pack e)
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
......@@ -425,11 +426,11 @@ instance ExtractNgramsT HyperdataDocument
$ _hd_source doc
institutes = map text2ngrams
$ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
$ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
$ _hd_institutes doc
authors = map text2ngrams
$ maybe ["Nothing"] (splitOn ", ")
$ maybe ["Nothing"] (T.splitOn ", ")
$ _hd_authors doc
terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment