Commit 5da469ed authored by Alexandre Delanoë

[UPLOAD] 2 others file format

parent e4cbfa19
......@@ -51,12 +51,12 @@ main = do
let
--tt = (Unsupervised EN 6 0 Nothing)
tt = (Multi EN)
format = CsvGargV3 -- CsvHalFormat --WOS
format = CsvGargV3 -- CsvHal --WOS
corpus :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
corpus = flowCorpusFile (cs user) (Left (cs name :: Text)) (read limit :: Int) tt format corpusPath
corpusCsvHal :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
corpusCsvHal = flowCorpusFile (cs user) (Left (cs name :: Text)) (read limit :: Int) tt CsvHalFormat corpusPath
corpusCsvHal = flowCorpusFile (cs user) (Left (cs name :: Text)) (read limit :: Int) tt CsvHal corpusPath
annuaire :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
annuaire = flowAnnuaire (cs user) (Left "Annuaire") (Multi EN) corpusPath
......
......@@ -43,7 +43,7 @@ import Gargantext.Database.Types.Node (CorpusId)
import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
import Gargantext.Database.Types.Node (UserId)
import Gargantext.Prelude
import Gargantext.Text.Corpus.Parsers.CSV (parseCsv', parseHal')
import Gargantext.Text.Corpus.Parsers (FileFormat(..), parseFormat)
import Gargantext.Text.Terms (TermType(..))
import Servant
import Servant.API.Flatten (Flat)
......@@ -227,13 +227,14 @@ addToCorpusWithForm cid (WithForm ft d) logStatus = do
let
parse = case ft of
CSV_HAL -> parseHal'
CSV -> parseCsv'
_ -> parseHal'
docs = splitEvery 500
$ take 1000000
$ parse (cs d)
CSV_HAL -> parseFormat CsvHal
CSV -> parseFormat CsvGargV3
_ -> parseFormat CsvHal
docs <- liftIO
$ splitEvery 500
<$> take 1000000
<$> parse (cs d)
logStatus ScraperStatus { _scst_succeeded = Just 1
, _scst_failed = Just 0
......
......@@ -22,38 +22,38 @@ please follow the types.
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText)
module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
where
--import Data.ByteString (ByteString)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join)
import qualified Data.ByteString.Char8 as DBC
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat)
import Data.List (lookup)
import Data.List (concat, lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.Map as DM
import qualified Data.Text as DT
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS
import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
import Gargantext.Prelude
import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import qualified Gargantext.Text.Corpus.Parsers.Date as Date
import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseCsv)
import Gargantext.Text.Learn (detectLangDefault)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Lazy as DBL
import qualified Data.ByteString.Char8 as DBC
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Gargantext.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS
------------------------------------------------------------------------
type ParseError = String
......@@ -68,7 +68,7 @@ type ParseError = String
-- | According to the format of the input file,
-- different parsers are available.
data FileFormat = WOS | RIS | RisPresse
| CsvGargV3 | CsvHalFormat
| CsvGargV3 | CsvHal
deriving (Show)
-- Implemented (ISI Format)
......@@ -78,20 +78,30 @@ data FileFormat = WOS | RIS | RisPresse
-- | XML -- Not Implemented / see :
{-
parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
parseFormat = undefined
-}
-- | Parse an in-memory strict 'DB.ByteString' into documents, according to
-- the given 'FileFormat'.
--
-- * 'CsvGargV3' and 'CsvHal' convert the input to a lazy ByteString and
--   hand it to @parseCsv'@ / @parseHal'@ respectively.
-- * 'RisPresse' and 'WOS' run 'runParser'' on the input, post-process the
--   result with 'enrichWith' and convert every record with 'toDoc'
--   (note that 'RisPresse' records are converted as 'RIS').
--
-- NOTE: for 'RisPresse' and 'WOS' the parse errors (the 'Left' results
-- collected by 'partitionEithers') are dropped by 'snd'; a failed parse
-- therefore yields an empty list of documents.
-- TODO manage errors here
--
-- Any other format is not supported yet: instead of a bare 'undefined',
-- a 'panic' with an explicit message is raised, consistent with
-- 'withParser'.
parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument]
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
parseFormat RisPresse bs = mapM (toDoc RIS)
                         . snd
                         . enrichWith RisPresse
                         . partitionEithers
                         $ [runParser' RisPresse bs]
parseFormat WOS       bs = mapM (toDoc WOS)
                         . snd
                         . enrichWith WOS
                         . partitionEithers
                         $ [runParser' WOS bs]
parseFormat _         _  = panic "[ERROR] parseFormat: parser not implemented yet for this format"
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseFile CsvHalFormat p = parseHal p
parseFile CsvHal p = parseHal p
parseFile CsvGargV3 p = parseCsv p
parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
......@@ -130,12 +140,14 @@ enrichWith WOS = enrichWith' (map (first WOS.keys))
enrichWith _ = enrichWith' identity
enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
-- | Flatten the nested record lists by one level, apply @f@ to every
-- record, then decode each key and value from UTF-8 to 'Text'.
-- The first component of the pair is passed through untouched.
enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
            -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map (decodeFields . f) . concat)
  where
    -- Decode both sides of every (key, value) pair of one record.
    decodeFields = map (both decodeUtf8)
readFileWith :: FileFormat -> FilePath
-> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
......@@ -157,7 +169,11 @@ withParser _ = panic "[ERROR] Parser not implemented yet"
runParser :: FileFormat -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text
runParser format text = pure $ runParser' format text
-- | Pure counterpart of 'runParser': run the parser selected by
-- 'withParser' for the given format over the raw input with 'parseOnly'.
-- The result is exactly what 'parseOnly' returns: a 'Left' error message
-- or the 'Right' list of parsed records.
runParser' :: FileFormat -> DB.ByteString
           -> Either String [[(DB.ByteString, DB.ByteString)]]
runParser' format = parseOnly (withParser format)
openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
......
......@@ -32,8 +32,8 @@ import Gargantext.Prelude hiding (length)
import Gargantext.Text
import Gargantext.Text.Context
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString as BS
import qualified Data.Vector as V
import qualified Data.ByteString as BS
import qualified Data.Vector as V
---------------------------------------------------------------
headerCsvGargV3 :: Header
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment