Commit d1ffbb9b authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[conduit] some work towards flow migration to conduit

Doesn't compile yet.

This is somewhere in Flow.hs in types for ConduitT () a m ()
where m monad is IO.
parent c8d0773c
Pipeline #2552 failed with stage
in 25 minutes and 53 seconds
...@@ -42,7 +42,7 @@ import Gargantext.Database.Prelude (HasConnectionPool, HasConfig) ...@@ -42,7 +42,7 @@ import Gargantext.Database.Prelude (HasConnectionPool, HasConfig)
import Gargantext.Prelude import Gargantext.Prelude
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Network.HTTP.Media ((//), (/:)) import Network.HTTP.Media ((//), (/:))
import qualified Prelude as Prelude import qualified Prelude
import Servant import Servant
( (:<|>) (..) ( (:<|>) (..)
, (:>) , (:>)
......
...@@ -22,7 +22,7 @@ import Gargantext.Database.Prelude (HasConnectionPool, HasConfig) ...@@ -22,7 +22,7 @@ import Gargantext.Database.Prelude (HasConnectionPool, HasConfig)
import qualified Gargantext.Database.Schema.Node as N import qualified Gargantext.Database.Schema.Node as N
import Gargantext.Prelude import Gargantext.Prelude
import GHC.Generics (Generic) import GHC.Generics (Generic)
import qualified Prelude as Prelude import qualified Prelude
import Text.Read (readEither) import Text.Read (readEither)
data Node = Node data Node = Node
......
...@@ -58,7 +58,7 @@ import qualified Data.Map as Map ...@@ -58,7 +58,7 @@ import qualified Data.Map as Map
import qualified Data.Set as Set import qualified Data.Set as Set
import qualified Data.Text as Text import qualified Data.Text as Text
import qualified Data.Vector as Vec import qualified Data.Vector as Vec
import qualified Prelude as Prelude import qualified Prelude
import qualified Protolude as P import qualified Protolude as P
------------------------------------------------------------------------ ------------------------------------------------------------------------
type GETAPI = Summary "Get List" type GETAPI = Summary "Get List"
......
...@@ -14,7 +14,7 @@ import Web.FormUrlEncoded (FromForm, ToForm) ...@@ -14,7 +14,7 @@ import Web.FormUrlEncoded (FromForm, ToForm)
import Protolude import Protolude
import Gargantext.API.Ngrams.Types (NgramsList) import Gargantext.API.Ngrams.Types (NgramsList)
import Gargantext.API.Node.Corpus.New.File (FileType(..)) import Gargantext.API.Node.Corpus.New.Types (FileType(..))
import Gargantext.Core.Utils.Prefix (unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
------------------------------------------------------------------------ ------------------------------------------------------------------------
......
...@@ -25,7 +25,7 @@ import Servant.Job.Types ...@@ -25,7 +25,7 @@ import Servant.Job.Types
import Servant.Job.Utils (jsonOptions) import Servant.Job.Utils (jsonOptions)
import Web.FormUrlEncoded (FromForm) import Web.FormUrlEncoded (FromForm)
import qualified Gargantext.API.Node.Corpus.New.File as NewFile import qualified Gargantext.API.Node.Corpus.New.Types as NewTypes
import Gargantext.API.Admin.Orchestrator.Types hiding (AsyncJobs) import Gargantext.API.Admin.Orchestrator.Types hiding (AsyncJobs)
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Utils.Prefix (unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
...@@ -40,7 +40,7 @@ type Api = Summary "New Annuaire endpoint" ...@@ -40,7 +40,7 @@ type Api = Summary "New Annuaire endpoint"
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
data AnnuaireWithForm = AnnuaireWithForm data AnnuaireWithForm = AnnuaireWithForm
{ _wf_filetype :: !NewFile.FileType { _wf_filetype :: !NewTypes.FileType
, _wf_data :: !Text , _wf_data :: !Text
, _wf_lang :: !(Maybe Lang) , _wf_lang :: !(Maybe Lang)
} deriving (Eq, Show, Generic) } deriving (Eq, Show, Generic)
......
...@@ -43,14 +43,14 @@ import Gargantext.Prelude ...@@ -43,14 +43,14 @@ import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events) import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
import Gargantext.API.Admin.Types (HasSettings) import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage) import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
import Gargantext.API.Node.Corpus.New.File import Gargantext.API.Node.Corpus.New.Types
import Gargantext.API.Node.Corpus.Searx import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Types import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-}) import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..)) import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import qualified Gargantext.Core.Text.Corpus.API as API import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat) import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), FileType(..), parseFormatC)
import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-}) import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
...@@ -264,9 +264,10 @@ addToCorpusWithForm :: (FlowCmdM env err m) ...@@ -264,9 +264,10 @@ addToCorpusWithForm :: (FlowCmdM env err m)
-> (JobLog -> m ()) -> (JobLog -> m ())
-> JobLog -> JobLog
-> m JobLog -> m JobLog
addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do addToCorpusWithForm user cid (NewWithForm ft ff d l _n) logStatus jobLog = do
printDebug "[addToCorpusWithForm] Parsing corpus: " cid printDebug "[addToCorpusWithForm] Parsing corpus: " cid
printDebug "[addToCorpusWithForm] fileType" ft printDebug "[addToCorpusWithForm] fileType" ft
printDebug "[addToCorpusWithForm] fileFormat" ff
logStatus jobLog logStatus jobLog
limit' <- view $ hasConfig . gc_max_docs_parsers limit' <- view $ hasConfig . gc_max_docs_parsers
let limit = fromIntegral limit' let limit = fromIntegral limit'
...@@ -276,33 +277,33 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do ...@@ -276,33 +277,33 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
CSV -> Parser.parseFormatC Parser.CsvGargV3 CSV -> Parser.parseFormatC Parser.CsvGargV3
WOS -> Parser.parseFormatC Parser.WOS WOS -> Parser.parseFormatC Parser.WOS
PresseRIS -> Parser.parseFormatC Parser.RisPresse PresseRIS -> Parser.parseFormatC Parser.RisPresse
ZIP -> Parser.parseFormatC Parser.ZIP
-- TODO granularity of the logStatus -- TODO granularity of the logStatus
let data' = case ft of let data' = case ff of
ZIP -> case BSB64.decode $ TE.encodeUtf8 d of Plain -> cs d
ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
Right decoded -> decoded Right decoded -> decoded
_ -> cs d eDocsC <- liftBase $ parseC ff data'
eDocsC <- liftBase $ parseC data'
case eDocsC of case eDocsC of
Right docsC -> do Right docsC -> do
-- TODO Add progress (jobStatus) update for docs - this is a -- TODO Add progress (jobStatus) update for docs - this is a
-- long action -- long action
let docsC' = zipSources (yieldMany [1..]) docsC let docsC' = zipSources (yieldMany [1..]) docsC
.| mapMC \(idx, doc) -> do .| mapMC (\(idx, doc) ->
if idx > limit then do if idx > limit then do
printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit) --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
let panicMsg' = [ "[addToCorpusWithForm] number of docs " let panicMsg' = [ "[addToCorpusWithForm] number of docs "
, "exceeds the MAX_DOCS_PARSERS limit (" , "exceeds the MAX_DOCS_PARSERS limit ("
, show limit , show limit
, ")" ] , ")" ]
let panicMsg = T.concat $ T.pack <$> panicMsg' let panicMsg = T.concat $ T.pack <$> panicMsg'
logStatus $ jobLogFailTotalWithMessage panicMsg jobLog --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
panic panicMsg panic panicMsg
else else
pure doc pure doc)
.| mapC toHyperdataDocument
--printDebug "Parsing corpus finished : " cid --printDebug "Parsing corpus finished : " cid
--logStatus jobLog2 --logStatus jobLog2
...@@ -313,16 +314,18 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do ...@@ -313,16 +314,18 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
(Right [cid]) (Right [cid])
(Multi $ fromMaybe EN l) (Multi $ fromMaybe EN l)
Nothing Nothing
(Just $ fromIntegral $ length docs, docsC' .| mapC toHyperdataDocument) --(Just $ fromIntegral $ length docs, docsC')
(Just 0, docsC') -- TODO fix number of docs
--(map (map toHyperdataDocument) docs) --(map (map toHyperdataDocument) docs)
logStatus (logStatus)
printDebug "Extraction finished : " cid printDebug "Extraction finished : " cid
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text) printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user -- TODO uncomment this
--sendMail user
logStatus jobLog3 logStatus jobLog3
pure $ jobLog3 pure jobLog3
Left e -> do Left e -> do
printDebug "[addToCorpusWithForm] parse error" e printDebug "[addToCorpusWithForm] parse error" e
......
...@@ -20,19 +20,16 @@ module Gargantext.API.Node.Corpus.New.File ...@@ -20,19 +20,16 @@ module Gargantext.API.Node.Corpus.New.File
import Control.Lens ((.~), (?~)) import Control.Lens ((.~), (?~))
import Control.Monad (forM) import Control.Monad (forM)
import Data.Aeson
import Data.Maybe import Data.Maybe
import Data.Monoid (mempty) import Data.Monoid (mempty)
import Data.Swagger import Data.Swagger
import Data.Text (Text()) import Data.Text (Text())
import GHC.Generics (Generic)
import Servant import Servant
import Servant.Multipart import Servant.Multipart
import Servant.Swagger.Internal import Servant.Swagger.Internal
import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
import Gargantext.API.Node.Corpus.New.Types
import Gargantext.Core.Types (TODO) import Gargantext.Core.Types (TODO)
import Gargantext.Database.Admin.Types.Node import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Prelude -- (Cmd, CmdM) import Gargantext.Database.Prelude -- (Cmd, CmdM)
...@@ -41,38 +38,9 @@ import Gargantext.Prelude.Crypto.Hash (hash) ...@@ -41,38 +38,9 @@ import Gargantext.Prelude.Crypto.Hash (hash)
------------------------------------------------------------- -------------------------------------------------------------
type Hash = Text type Hash = Text
data FileType = CSV
| CSV_HAL
| PresseRIS
| WOS
| ZIP
deriving (Eq, Show, Generic)
instance ToSchema FileType
instance Arbitrary FileType where arbitrary = elements [CSV, PresseRIS]
instance ToParamSchema FileType
instance FromJSON FileType
instance ToJSON FileType
instance ToParamSchema (MultipartData Mem) where toParamSchema _ = toParamSchema (Proxy :: Proxy TODO) instance ToParamSchema (MultipartData Mem) where toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
instance FromHttpApiData FileType
where
parseUrlPiece "CSV" = pure CSV
parseUrlPiece "CSV_HAL" = pure CSV_HAL
parseUrlPiece "PresseRis" = pure PresseRIS
parseUrlPiece "ZIP" = pure ZIP
parseUrlPiece "WOS" = pure WOS
parseUrlPiece _ = pure CSV -- TODO error here
instance ToHttpApiData FileType where
toUrlPiece t = case t of
CSV -> "CSV"
CSV_HAL -> "CSV_HAL"
PresseRIS -> "PresseRis"
ZIP -> "ZIP"
WOS -> "WOS"
instance (ToParamSchema a, HasSwagger sub) => instance (ToParamSchema a, HasSwagger sub) =>
HasSwagger (MultipartForm tag a :> sub) where HasSwagger (MultipartForm tag a :> sub) where
-- TODO -- TODO
...@@ -89,6 +57,7 @@ instance (ToParamSchema a, HasSwagger sub) => ...@@ -89,6 +57,7 @@ instance (ToParamSchema a, HasSwagger sub) =>
type WithUpload' = Summary "Upload file(s) to a corpus" type WithUpload' = Summary "Upload file(s) to a corpus"
:> QueryParam "fileType" FileType :> QueryParam "fileType" FileType
:> QueryParam "fileFormat" FileFormat
:> MultipartForm Mem (MultipartData Mem) :> MultipartForm Mem (MultipartData Mem)
:> Post '[JSON] [Hash] :> Post '[JSON] [Hash]
...@@ -96,11 +65,14 @@ type WithUpload' = Summary "Upload file(s) to a corpus" ...@@ -96,11 +65,14 @@ type WithUpload' = Summary "Upload file(s) to a corpus"
--postUpload :: NodeId -> GargServer UploadAPI --postUpload :: NodeId -> GargServer UploadAPI
postUpload :: NodeId postUpload :: NodeId
-> Maybe FileType -> Maybe FileType
-> Maybe FileFormat
-> MultipartData Mem -> MultipartData Mem
-> Cmd err [Hash] -> Cmd err [Hash]
postUpload _ Nothing _ = panic "fileType is a required parameter" postUpload _ Nothing _ _ = panic "fileType is a required parameter"
postUpload _ (Just fileType) multipartData = do postUpload _ _ Nothing _ = panic "fileFormat is a required parameter"
postUpload _ (Just fileType) (Just fileFormat) multipartData = do
printDebug "File Type: " fileType printDebug "File Type: " fileType
printDebug "File format: " fileFormat
is <- liftBase $ do is <- liftBase $ do
printDebug "Inputs:" () printDebug "Inputs:" ()
forM (inputs multipartData) $ \input -> do forM (inputs multipartData) $ \input -> do
......
module Gargantext.API.Node.Corpus.New.Types where
import Data.Aeson
import Data.Swagger
import Data.Text (pack)
import GHC.Generics (Generic)
import Servant
import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
import Gargantext.Prelude
data FileType = CSV
| CSV_HAL
| PresseRIS
| WOS
deriving (Eq, Show, Generic)
instance ToSchema FileType
instance Arbitrary FileType where arbitrary = elements [CSV, PresseRIS]
instance ToParamSchema FileType
instance FromJSON FileType
instance ToJSON FileType
instance FromHttpApiData FileType where
parseUrlPiece "CSV" = pure CSV
parseUrlPiece "CSV_HAL" = pure CSV_HAL
parseUrlPiece "PresseRis" = pure PresseRIS
parseUrlPiece "WOS" = pure WOS
parseUrlPiece _ = pure CSV -- TODO error here
instance ToHttpApiData FileType where
toUrlPiece = pack . show
data FileFormat = Plain | ZIP
deriving (Eq, Show, Generic)
instance ToSchema FileFormat
instance Arbitrary FileFormat where arbitrary = elements [ Plain, ZIP ]
instance ToParamSchema FileFormat
instance FromJSON FileFormat
instance ToJSON FileFormat
instance FromHttpApiData FileFormat where
parseUrlPiece "Plain" = pure Plain
parseUrlPiece "ZIP" = pure ZIP
parseUrlPiece _ = pure Plain -- TODO error here
instance ToHttpApiData FileFormat where
toUrlPiece = pack . show
...@@ -17,7 +17,7 @@ import GHC.Generics (Generic) ...@@ -17,7 +17,7 @@ import GHC.Generics (Generic)
import Network.HTTP.Client import Network.HTTP.Client
import Network.HTTP.Client.TLS import Network.HTTP.Client.TLS
import qualified Prelude as Prelude import qualified Prelude
import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text) import Protolude (catMaybes, encodeUtf8, rightToMaybe, Text)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Prelude.Config import Gargantext.Prelude.Config
......
...@@ -19,14 +19,15 @@ import Gargantext.Core (Lang(..){-, allLangs-}) ...@@ -19,14 +19,15 @@ import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Utils.Prefix (unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
import Gargantext.Prelude import Gargantext.Prelude
import qualified Gargantext.Database.GargDB as GargDB import qualified Gargantext.Database.GargDB as GargDB
import Gargantext.API.Node.Corpus.New.File (FileType) import Gargantext.API.Node.Corpus.New.Types (FileType, FileFormat)
------------------------------------------------------- -------------------------------------------------------
data NewWithForm = NewWithForm data NewWithForm = NewWithForm
{ _wf_filetype :: !FileType { _wf_filetype :: !FileType
, _wf_data :: !Text -- NOTE for binary files, this represents base-64 data , _wf_fileformat :: !FileFormat
, _wf_lang :: !(Maybe Lang) , _wf_data :: !Text -- NOTE for binary files, this represents base-64 data
, _wf_name :: !Text , _wf_lang :: !(Maybe Lang)
, _wf_name :: !Text
} deriving (Eq, Show, Generic) } deriving (Eq, Show, Generic)
makeLenses ''NewWithForm makeLenses ''NewWithForm
......
...@@ -19,7 +19,7 @@ import qualified Data.Set as S ...@@ -19,7 +19,7 @@ import qualified Data.Set as S
import qualified Data.List as DL import qualified Data.List as DL
import qualified Data.Vector as DV import qualified Data.Vector as DV
import qualified Data.Map as M import qualified Data.Map as M
import qualified Prelude as Prelude import qualified Prelude
import Gargantext.Prelude import Gargantext.Prelude
......
...@@ -22,12 +22,12 @@ import System.FilePath (FilePath()) -- , takeExtension) ...@@ -22,12 +22,12 @@ import System.FilePath (FilePath()) -- , takeExtension)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv) import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat(..)) import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat(..), FileType(..))
risPress2csvWrite :: FilePath -> IO () risPress2csvWrite :: FilePath -> IO ()
risPress2csvWrite f = do risPress2csvWrite f = do
eContents <- parseFile RisPresse (f <> ".ris") eContents <- parseFile RisPresse Plain (f <> ".ris")
case eContents of case eContents of
Right contents -> writeDocs2Csv (f <> ".csv") contents Right contents -> writeDocs2Csv (f <> ".csv") contents
Left e -> panic $ "Error: " <> (T.pack e) Left e -> panic $ "Error: " <> (T.pack e)
......
...@@ -20,15 +20,15 @@ please follow the types. ...@@ -20,15 +20,15 @@ please follow the types.
{-# LANGUAGE PackageImports #-} {-# LANGUAGE PackageImports #-}
module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat) module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC)
where where
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries) import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Conduit import Conduit
import Control.Concurrent.Async as CCA (mapConcurrently) import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad.Identity (runIdentity) import Control.Monad.Trans.Control (MonadBaseControl)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Control.Monad (join) import Control.Monad (join)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..)) import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers) import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup) import Data.List (concat, lookup)
...@@ -44,13 +44,14 @@ import qualified Data.ByteString.Char8 as DBC ...@@ -44,13 +44,14 @@ import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM import qualified Data.Map as DM
import qualified Data.Text as DT import qualified Data.Text as DT
import qualified Prelude as Prelude import qualified Prelude
import System.IO.Temp (emptySystemTempFile) import System.IO.Temp (emptySystemTempFile)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv', parseCsvC) import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich) import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault) -- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
...@@ -69,9 +70,8 @@ type ParseError = String ...@@ -69,9 +70,8 @@ type ParseError = String
-- | According to the format of Input file, -- | According to the format of Input file,
-- different parser are available. -- different parser are available.
data FileFormat = WOS | RIS | RisPresse data FileType = WOS | RIS | RisPresse
| CsvGargV3 | CsvHal | CsvGargV3 | CsvHal
| ZIP
deriving (Show) deriving (Show)
-- Implemented (ISI Format) -- Implemented (ISI Format)
...@@ -80,71 +80,72 @@ data FileFormat = WOS | RIS | RisPresse ...@@ -80,71 +80,72 @@ data FileFormat = WOS | RIS | RisPresse
-- | PDF -- Not Implemented / pdftotext and import Pandoc ? -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see : -- | XML -- Not Implemented / see :
parseFormatC :: FileFormat -> DB.ByteString -> IO (Either Prelude.String (ConduitT () HyperdataDocument IO ())) parseFormatC :: MonadBaseControl IO m => FileType -> FileFormat -> DB.ByteString -> m (Either Prelude.String (ConduitT () HyperdataDocument IO ()))
parseFormatC CsvGargV3 bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs) parseFormatC CsvGargV3 Plain bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
parseFormatC CsvHal bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs) parseFormatC CsvHal Plain bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
parseFormatC RisPresse bs = do parseFormatC RisPresse Plain bs = do
docs <- snd --docs <- enrichWith RisPresse
<$> enrichWith RisPresse let eDocs = runParser' RisPresse bs
$ partitionEithers pure $ (\docs -> yieldMany docs
$ [runParser' RisPresse bs] .| mapC presseEnrich
pure $ (\docs' -> yieldMany docs' .| mapMC (toDoc RIS)) <$> docs .| mapC (map $ both decodeUtf8)
parseFormatC WOS bs = do .| mapMC (toDoc RIS)) <$> eDocs
docs <- snd parseFormatC WOS Plain bs = do
<$> enrichWith WOS let eDocs = runParser' WOS bs
$ partitionEithers pure $ (\docs -> yieldMany docs
$ [runParser' WOS bs] .| mapC (map $ first WOS.keys)
pure $ (\docs' -> yieldMany docs' .| mapMC (toDoc WOS)) <$> docs .| mapC (map $ both decodeUtf8)
parseFormatC ZIP bs = do .| mapMC (toDoc WOS)) <$> eDocs
path <- emptySystemTempFile "parsed-zip" parseFormatC _ft ZIP bs = do
DB.writeFile path bs path <- liftBase $ emptySystemTempFile "parsed-zip"
parsedZip <- withArchive path $ do liftBase $ DB.writeFile path bs
DM.keys <$> getEntries parsedZip <- liftBase $ withArchive path $ do
pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
parseFormatC _ _ = undefined
parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
parseFormat RisPresse bs = do
docs <- mapM (toDoc RIS)
<$> snd
<$> enrichWith RisPresse
$ partitionEithers
$ [runParser' RisPresse bs]
pure $ Right docs
parseFormat WOS bs = do
docs <- mapM (toDoc WOS)
<$> snd
<$> enrichWith WOS
$ partitionEithers
$ [runParser' WOS bs]
pure $ Right docs
parseFormat ZIP bs = do
path <- emptySystemTempFile "parsed-zip"
DB.writeFile path bs
parsedZip <- withArchive path $ do
DM.keys <$> getEntries DM.keys <$> getEntries
pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
parseFormat _ _ = undefined parseFormatC _ _ _ = undefined
-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
-- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
-- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
-- parseFormat RisPresse bs = do
-- docs <- mapM (toDoc RIS)
-- <$> snd
-- <$> enrichWith RisPresse
-- $ partitionEithers
-- $ [runParser' RisPresse bs]
-- pure $ Right docs
-- parseFormat WOS bs = do
-- docs <- mapM (toDoc WOS)
-- <$> snd
-- <$> enrichWith WOS
-- $ partitionEithers
-- $ [runParser' WOS bs]
-- pure $ Right docs
-- parseFormat ZIP bs = do
-- path <- emptySystemTempFile "parsed-zip"
-- DB.writeFile path bs
-- parsedZip <- withArchive path $ do
-- DM.keys <$> getEntries
-- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
-- parseFormat _ _ = undefined
-- | Parse file into documents -- | Parse file into documents
-- TODO manage errors here -- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message -- TODO: to debug maybe add the filepath in error message
parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument]) parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal p = parseHal p parseFile CsvHal Plain p = parseHal p
parseFile CsvGargV3 p = parseCsv p parseFile CsvGargV3 Plain p = parseCsv p
parseFile RisPresse p = do parseFile RisPresse Plain p = do
docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
pure $ Right docs pure $ Right docs
parseFile WOS p = do parseFile WOS Plain p = do
docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
pure $ Right docs pure $ Right docs
parseFile ff p = do parseFile ff _ p = do
docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
pure $ Right docs pure $ Right docs
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS -- TODO use language for RIS
toDoc ff d = do toDoc ff d = do
-- let abstract = lookup "abstract" d -- let abstract = lookup "abstract" d
...@@ -174,7 +175,7 @@ toDoc ff d = do ...@@ -174,7 +175,7 @@ toDoc ff d = do
, _hd_publication_second = Nothing , _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (DT.pack . show) lang } , _hd_language_iso2 = Just $ (DT.pack . show) lang }
enrichWith :: FileFormat enrichWith :: FileType
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]]) -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS = enrichWith' (map (first WOS.keys)) enrichWith WOS = enrichWith' (map (first WOS.keys))
...@@ -189,7 +190,7 @@ enrichWith' f = second (map both' . map f . concat) ...@@ -189,7 +190,7 @@ enrichWith' f = second (map both' . map f . concat)
readFileWith :: FileFormat -> FilePath readFileWith :: FileType -> FilePath
-> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]]) -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do readFileWith format path = do
files <- case takeExtension path of files <- case takeExtension path of
...@@ -200,19 +201,19 @@ readFileWith format path = do ...@@ -200,19 +201,19 @@ readFileWith format path = do
-- | withParser: -- | withParser:
-- According to the format of the text, choose the right parser. -- According to the format of the text, choose the right parser.
-- TODO withParser :: FileFormat -> Parser [Document] -- TODO withParser :: FileType -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]] withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = WOS.parser withParser WOS = WOS.parser
withParser RIS = RIS.parser withParser RIS = RIS.parser
--withParser ODT = odtParser --withParser ODT = odtParser
--withParser XML = xmlParser --withParser XML = xmlParser
withParser _ = panic "[ERROR] Parser not implemented yet" withParser _ = panic "[ERROR] Parser not implemented yet"
runParser :: FileFormat -> DB.ByteString runParser :: FileType -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]]) -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ runParser' format text runParser format text = pure $ runParser' format text
runParser' :: FileFormat -> DB.ByteString runParser' :: FileType -> DB.ByteString
-> (Either String [[(DB.ByteString, DB.ByteString)]]) -> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' format text = parseOnly (withParser format) text runParser' format text = parseOnly (withParser format) text
......
...@@ -29,7 +29,7 @@ import Data.Vector (Vector) ...@@ -29,7 +29,7 @@ import Data.Vector (Vector)
import GHC.IO (FilePath) import GHC.IO (FilePath)
import GHC.Word (Word8) import GHC.Word (Word8)
import qualified Prelude as Prelude import qualified Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude hiding (length) import Gargantext.Prelude hiding (length)
......
...@@ -38,7 +38,7 @@ import Gargantext.Database.Query.Table.Node.Error ...@@ -38,7 +38,7 @@ import Gargantext.Database.Query.Table.Node.Error
import Gargantext.Database.Query.Tree import Gargantext.Database.Query.Tree
import Gargantext.Database.Schema.Ngrams import Gargantext.Database.Schema.Ngrams
import Gargantext.Prelude import Gargantext.Prelude
import qualified Prelude as Prelude import qualified Prelude
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
......
...@@ -42,7 +42,7 @@ import Gargantext.Database.Schema.Context ...@@ -42,7 +42,7 @@ import Gargantext.Database.Schema.Context
import Gargantext.Database.Schema.Node import Gargantext.Database.Schema.Node
import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Prelude import Gargantext.Prelude
import Prelude as Prelude import Prelude
import System.Process as Shell import System.Process as Shell
import qualified Data.ByteString.Lazy as Lazy import qualified Data.ByteString.Lazy as Lazy
import qualified Data.List as List import qualified Data.List as List
......
...@@ -74,7 +74,7 @@ import Gargantext.Core.Ext.IMT (toSchoolName) ...@@ -74,7 +74,7 @@ import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire) import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types import Gargantext.Core.Flow.Types
import Gargantext.Core.Text import Gargantext.Core.Text
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat) import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat, FileType)
import Gargantext.Core.Text.List (buildNgramsLists) import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..)) import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith) import Gargantext.Core.Text.List.Social (FlowSocialListWith)
...@@ -106,7 +106,7 @@ import Gargantext.Prelude ...@@ -106,7 +106,7 @@ import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash) import Gargantext.Prelude.Crypto.Hash (Hash)
import qualified Gargantext.Core.Text.Corpus.API as API import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add) import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
import qualified Prelude as Prelude import qualified Prelude
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Imports for upgrade function -- Imports for upgrade function
...@@ -189,18 +189,21 @@ flowCorpusFile :: (FlowCmdM env err m) ...@@ -189,18 +189,21 @@ flowCorpusFile :: (FlowCmdM env err m)
=> User => User
-> Either CorpusName [CorpusId] -> Either CorpusName [CorpusId]
-> Limit -- Limit the number of docs (for dev purpose) -> Limit -- Limit the number of docs (for dev purpose)
-> TermType Lang -> FileFormat -> FilePath -> TermType Lang
-> FileType
-> FileFormat
-> FilePath
-> Maybe FlowSocialListWith -> Maybe FlowSocialListWith
-> (JobLog -> m ()) -> (JobLog -> m ())
-> m CorpusId -> m CorpusId
flowCorpusFile u n _l la ff fp mfslw logStatus = do flowCorpusFile u n _l la ft ff fp mfslw logStatus = do
eParsed <- liftBase $ parseFile ff fp eParsed <- liftBase $ parseFile ft ff fp
case eParsed of case eParsed of
Right parsed -> do Right parsed -> do
flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
--let docs = splitEvery 500 $ take l parsed --let docs = splitEvery 500 $ take l parsed
--flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
Left e -> panic $ "Error: " <> (T.pack e) Left e -> panic $ "Error: " <> T.pack e
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus -- | TODO improve the needed type to create/update a corpus
......
...@@ -3,7 +3,7 @@ module Gargantext.Utils.Servant where ...@@ -3,7 +3,7 @@ module Gargantext.Utils.Servant where
import qualified Data.ByteString.Lazy.Char8 as BSC import qualified Data.ByteString.Lazy.Char8 as BSC
import Data.Csv (encodeDefaultOrderedByName, DefaultOrdered, ToNamedRecord) import Data.Csv (encodeDefaultOrderedByName, DefaultOrdered, ToNamedRecord)
import Network.HTTP.Media ((//), (/:)) import Network.HTTP.Media ((//), (/:))
import qualified Prelude as Prelude import qualified Prelude
import Protolude import Protolude
import Protolude.Partial (read) import Protolude.Partial (read)
import Servant import Servant
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment