Commit f095ca6e authored by Przemyslaw Kaminski

[zip] implement zip for parsers

parent d1ffbb9b
Pipeline #2558 failed
in 48 minutes and 11 seconds
......@@ -35,7 +35,7 @@ import Gargantext.Database.Admin.Types.Hyperdata (toHyperdataDocument)
import Gargantext.Database.Admin.Types.Node (CorpusId)
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..))
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..))
main :: IO ()
main = do
......@@ -51,10 +51,10 @@ main = do
Nothing -> panic $ "Cannot read limit: " <> (Text.pack limit)
Just l -> l
corpus :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
corpus = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) limit' tt format corpusPath Nothing (\_ -> pure ())
corpus = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) limit' tt format Plain corpusPath Nothing (\_ -> pure ())
corpusCsvHal :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
corpusCsvHal = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) limit' tt CsvHal corpusPath Nothing (\_ -> pure ())
corpusCsvHal = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) limit' tt CsvHal Plain corpusPath Nothing (\_ -> pure ())
annuaire :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
annuaire = flowAnnuaire (UserName $ cs user) (Left "Annuaire") (Multi EN) corpusPath (\_ -> pure ())
......
......@@ -30,7 +30,7 @@ import GHC.IO (FilePath)
import Gargantext.API.Ngrams.Prelude (toTermList)
import Gargantext.API.Ngrams.Types
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
......@@ -94,7 +94,7 @@ wosToDocs limit patterns time path = do
<$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hd_publication_year d)
&& (isJust $ _hd_title d))
<$> fromRight [] <$> parseFile WOS (path <> file) ) files
<$> fromRight [] <$> parseFile WOS Plain (path <> file) ) files
-- To transform a Csv file into a list of Document
......
......@@ -42,7 +42,7 @@ import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal)
import Gargantext.API.Node.Corpus.New.Types
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
......@@ -50,7 +50,7 @@ import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), FileType(..), parseFormatC)
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
......@@ -270,7 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n) logStatus jobLog = do
printDebug "[addToCorpusWithForm] fileFormat" ff
logStatus jobLog
limit' <- view $ hasConfig . gc_max_docs_parsers
let limit = fromIntegral limit'
let limit = fromIntegral limit' :: Integer
let
parseC = case ft of
CSV_HAL -> Parser.parseFormatC Parser.CsvHal
......@@ -315,7 +315,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n) logStatus jobLog = do
(Multi $ fromMaybe EN l)
Nothing
--(Just $ fromIntegral $ length docs, docsC')
(Just 0, docsC') -- TODO fix number of docs
(Just 0, transPipe liftBase docsC') -- TODO fix number of docs
--(map (map toHyperdataDocument) docs)
(logStatus)
......
......@@ -20,7 +20,7 @@ import Web.FormUrlEncoded (FromForm)
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Job (jobLogInit, jobLogSuccess, jobLogFail)
import Gargantext.API.Node.Corpus.New (addToCorpusWithForm)
import Gargantext.API.Node.Corpus.New.File (FileType(..))
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..), FileType(..))
import Gargantext.API.Node.Types (NewWithForm(..))
import Gargantext.API.Prelude
import Gargantext.Core.Types.Individu (User(..))
......@@ -87,6 +87,6 @@ frameCalcUploadAsync uId nId _f logStatus jobLog = do
jobLog2 <- case mCId of
Nothing -> pure $ jobLogFail jobLog
Just cId ->
addToCorpusWithForm (RootId (NodeId uId)) cId (NewWithForm CSV body Nothing "calc-upload.csv") logStatus jobLog
addToCorpusWithForm (RootId (NodeId uId)) cId (NewWithForm CSV Plain body Nothing "calc-upload.csv") logStatus jobLog
pure $ jobLogSuccess jobLog2
......@@ -35,7 +35,7 @@ import Data.List (concat, lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Text (Text)
import Data.Text (Text, intercalate, pack, unpack)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
......@@ -70,8 +70,7 @@ type ParseError = String
-- | According to the format of Input file,
-- different parser are available.
data FileType = WOS | RIS | RisPresse
| CsvGargV3 | CsvHal
data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
deriving (Show)
-- Implemented (ISI Format)
......@@ -96,12 +95,24 @@ parseFormatC WOS Plain bs = do
.| mapC (map $ first WOS.keys)
.| mapC (map $ both decodeUtf8)
.| mapMC (toDoc WOS)) <$> eDocs
parseFormatC _ft ZIP bs = do
parseFormatC ft ZIP bs = do
path <- liftBase $ emptySystemTempFile "parsed-zip"
liftBase $ DB.writeFile path bs
parsedZip <- liftBase $ withArchive path $ do
DM.keys <$> getEntries
pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
fileContents <- liftBase $ withArchive path $ do
files <- DM.keys <$> getEntries
mapM getEntry files
--printDebug "[parseFormatC] fileContents" fileContents
eContents <- mapM (parseFormatC ft Plain) fileContents
--printDebug "[parseFormatC] contents" contents
--pure $ Left $ "Not implemented for ZIP"
let (errs, contents) = partitionEithers eContents
case errs of
[] ->
case contents of
[] -> pure $ Left "No files in zip"
_ -> pure $ Right $ ( sequenceConduits contents >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
_ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
-- Fallback for every file type / file format combination that has no dedicated
-- clause. Report the problem as a 'Left' (like the other clauses do) instead of
-- crashing with 'undefined': the ZIP clause re-enters 'parseFormatC' for each
-- archive entry and collects such 'Left's into a single error message.
parseFormatC ft _ _ = pure $ Left $ "Parser not implemented for file type: " <> show ft
-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment