Commit f095ca6e authored by Przemyslaw Kaminski

[zip] implement zip for parsers

parent d1ffbb9b
...@@ -35,7 +35,7 @@ import Gargantext.Database.Admin.Types.Hyperdata (toHyperdataDocument) ...@@ -35,7 +35,7 @@ import Gargantext.Database.Admin.Types.Hyperdata (toHyperdataDocument)
import Gargantext.Database.Admin.Types.Node (CorpusId) import Gargantext.Database.Admin.Types.Node (CorpusId)
import Gargantext.Database.Prelude (Cmd) import Gargantext.Database.Prelude (Cmd)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..)) import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..))
main :: IO () main :: IO ()
main = do main = do
...@@ -51,10 +51,10 @@ main = do ...@@ -51,10 +51,10 @@ main = do
Nothing -> panic $ "Cannot read limit: " <> (Text.pack limit) Nothing -> panic $ "Cannot read limit: " <> (Text.pack limit)
Just l -> l Just l -> l
corpus :: forall m. FlowCmdM DevEnv GargError m => m CorpusId corpus :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
corpus = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) limit' tt format corpusPath Nothing (\_ -> pure ()) corpus = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) limit' tt format Plain corpusPath Nothing (\_ -> pure ())
corpusCsvHal :: forall m. FlowCmdM DevEnv GargError m => m CorpusId corpusCsvHal :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
corpusCsvHal = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) limit' tt CsvHal corpusPath Nothing (\_ -> pure ()) corpusCsvHal = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) limit' tt CsvHal Plain corpusPath Nothing (\_ -> pure ())
annuaire :: forall m. FlowCmdM DevEnv GargError m => m CorpusId annuaire :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
annuaire = flowAnnuaire (UserName $ cs user) (Left "Annuaire") (Multi EN) corpusPath (\_ -> pure ()) annuaire = flowAnnuaire (UserName $ cs user) (Left "Annuaire") (Multi EN) corpusPath (\_ -> pure ())
......
...@@ -30,7 +30,7 @@ import GHC.IO (FilePath) ...@@ -30,7 +30,7 @@ import GHC.IO (FilePath)
import Gargantext.API.Ngrams.Prelude (toTermList) import Gargantext.API.Ngrams.Prelude (toTermList)
import Gargantext.API.Ngrams.Types import Gargantext.API.Ngrams.Types
import Gargantext.Core.Text.Context (TermList) import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile) import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight) import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList) import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList) import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
...@@ -94,7 +94,7 @@ wosToDocs limit patterns time path = do ...@@ -94,7 +94,7 @@ wosToDocs limit patterns time path = do
<$> mapConcurrently (\file -> <$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hd_publication_year d) filter (\d -> (isJust $ _hd_publication_year d)
&& (isJust $ _hd_title d)) && (isJust $ _hd_title d))
<$> fromRight [] <$> parseFile WOS (path <> file) ) files <$> fromRight [] <$> parseFile WOS Plain (path <> file) ) files
-- To transform a Csv file into a list of Document -- To transform a Csv file into a list of Document
......
...@@ -42,7 +42,7 @@ import Gargantext.Prelude ...@@ -42,7 +42,7 @@ import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events) import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
import Gargantext.API.Admin.Types (HasSettings) import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage) import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal)
import Gargantext.API.Node.Corpus.New.Types import Gargantext.API.Node.Corpus.New.Types
import Gargantext.API.Node.Corpus.Searx import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types import Gargantext.API.Node.Corpus.Types
...@@ -50,7 +50,7 @@ import Gargantext.API.Node.Types ...@@ -50,7 +50,7 @@ import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-}) import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..)) import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import qualified Gargantext.Core.Text.Corpus.API as API import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), FileType(..), parseFormatC) import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-}) import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
...@@ -270,7 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n) logStatus jobLog = do ...@@ -270,7 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n) logStatus jobLog = do
printDebug "[addToCorpusWithForm] fileFormat" ff printDebug "[addToCorpusWithForm] fileFormat" ff
logStatus jobLog logStatus jobLog
limit' <- view $ hasConfig . gc_max_docs_parsers limit' <- view $ hasConfig . gc_max_docs_parsers
let limit = fromIntegral limit' let limit = fromIntegral limit' :: Integer
let let
parseC = case ft of parseC = case ft of
CSV_HAL -> Parser.parseFormatC Parser.CsvHal CSV_HAL -> Parser.parseFormatC Parser.CsvHal
...@@ -315,7 +315,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n) logStatus jobLog = do ...@@ -315,7 +315,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n) logStatus jobLog = do
(Multi $ fromMaybe EN l) (Multi $ fromMaybe EN l)
Nothing Nothing
--(Just $ fromIntegral $ length docs, docsC') --(Just $ fromIntegral $ length docs, docsC')
(Just 0, docsC') -- TODO fix number of docs (Just 0, transPipe liftBase docsC') -- TODO fix number of docs
--(map (map toHyperdataDocument) docs) --(map (map toHyperdataDocument) docs)
(logStatus) (logStatus)
......
...@@ -20,7 +20,7 @@ import Web.FormUrlEncoded (FromForm) ...@@ -20,7 +20,7 @@ import Web.FormUrlEncoded (FromForm)
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs) import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Job (jobLogInit, jobLogSuccess, jobLogFail) import Gargantext.API.Job (jobLogInit, jobLogSuccess, jobLogFail)
import Gargantext.API.Node.Corpus.New (addToCorpusWithForm) import Gargantext.API.Node.Corpus.New (addToCorpusWithForm)
import Gargantext.API.Node.Corpus.New.File (FileType(..)) import Gargantext.API.Node.Corpus.New.Types (FileFormat(..), FileType(..))
import Gargantext.API.Node.Types (NewWithForm(..)) import Gargantext.API.Node.Types (NewWithForm(..))
import Gargantext.API.Prelude import Gargantext.API.Prelude
import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Core.Types.Individu (User(..))
...@@ -87,6 +87,6 @@ frameCalcUploadAsync uId nId _f logStatus jobLog = do ...@@ -87,6 +87,6 @@ frameCalcUploadAsync uId nId _f logStatus jobLog = do
jobLog2 <- case mCId of jobLog2 <- case mCId of
Nothing -> pure $ jobLogFail jobLog Nothing -> pure $ jobLogFail jobLog
Just cId -> Just cId ->
addToCorpusWithForm (RootId (NodeId uId)) cId (NewWithForm CSV body Nothing "calc-upload.csv") logStatus jobLog addToCorpusWithForm (RootId (NodeId uId)) cId (NewWithForm CSV Plain body Nothing "calc-upload.csv") logStatus jobLog
pure $ jobLogSuccess jobLog2 pure $ jobLogSuccess jobLog2
...@@ -35,7 +35,7 @@ import Data.List (concat, lookup) ...@@ -35,7 +35,7 @@ import Data.List (concat, lookup)
import Data.Ord() import Data.Ord()
import Data.String (String()) import Data.String (String())
import Data.String() import Data.String()
import Data.Text (Text) import Data.Text (Text, intercalate, pack, unpack)
import Data.Text.Encoding (decodeUtf8) import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second) import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension) import System.FilePath (FilePath(), takeExtension)
...@@ -70,8 +70,7 @@ type ParseError = String ...@@ -70,8 +70,7 @@ type ParseError = String
-- | According to the format of Input file, -- | According to the format of Input file,
-- different parser are available. -- different parser are available.
data FileType = WOS | RIS | RisPresse data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
| CsvGargV3 | CsvHal
deriving (Show) deriving (Show)
-- Implemented (ISI Format) -- Implemented (ISI Format)
...@@ -96,12 +95,24 @@ parseFormatC WOS Plain bs = do ...@@ -96,12 +95,24 @@ parseFormatC WOS Plain bs = do
.| mapC (map $ first WOS.keys) .| mapC (map $ first WOS.keys)
.| mapC (map $ both decodeUtf8) .| mapC (map $ both decodeUtf8)
.| mapMC (toDoc WOS)) <$> eDocs .| mapMC (toDoc WOS)) <$> eDocs
parseFormatC _ft ZIP bs = do parseFormatC ft ZIP bs = do
path <- liftBase $ emptySystemTempFile "parsed-zip" path <- liftBase $ emptySystemTempFile "parsed-zip"
liftBase $ DB.writeFile path bs liftBase $ DB.writeFile path bs
parsedZip <- liftBase $ withArchive path $ do fileContents <- liftBase $ withArchive path $ do
DM.keys <$> getEntries files <- DM.keys <$> getEntries
pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip mapM getEntry files
--printDebug "[parseFormatC] fileContents" fileContents
eContents <- mapM (parseFormatC ft Plain) fileContents
--printDebug "[parseFormatC] contents" contents
--pure $ Left $ "Not implemented for ZIP"
let (errs, contents) = partitionEithers eContents
case errs of
[] ->
case contents of
[] -> pure $ Left "No files in zip"
_ -> pure $ Right $ ( sequenceConduits contents >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
_ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
parseFormatC _ _ _ = undefined parseFormatC _ _ _ = undefined
-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument]) -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment