Commit 5da469ed authored by Alexandre Delanoë

[UPLOAD] 2 others file format

parent e4cbfa19
...@@ -51,12 +51,12 @@ main = do ...@@ -51,12 +51,12 @@ main = do
let let
--tt = (Unsupervised EN 6 0 Nothing) --tt = (Unsupervised EN 6 0 Nothing)
tt = (Multi EN) tt = (Multi EN)
format = CsvGargV3 -- CsvHalFormat --WOS format = CsvGargV3 -- CsvHal --WOS
corpus :: forall m. FlowCmdM DevEnv GargError m => m CorpusId corpus :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
corpus = flowCorpusFile (cs user) (Left (cs name :: Text)) (read limit :: Int) tt format corpusPath corpus = flowCorpusFile (cs user) (Left (cs name :: Text)) (read limit :: Int) tt format corpusPath
corpusCsvHal :: forall m. FlowCmdM DevEnv GargError m => m CorpusId corpusCsvHal :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
corpusCsvHal = flowCorpusFile (cs user) (Left (cs name :: Text)) (read limit :: Int) tt CsvHalFormat corpusPath corpusCsvHal = flowCorpusFile (cs user) (Left (cs name :: Text)) (read limit :: Int) tt CsvHal corpusPath
annuaire :: forall m. FlowCmdM DevEnv GargError m => m CorpusId annuaire :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
annuaire = flowAnnuaire (cs user) (Left "Annuaire") (Multi EN) corpusPath annuaire = flowAnnuaire (cs user) (Left "Annuaire") (Multi EN) corpusPath
......
...@@ -43,7 +43,7 @@ import Gargantext.Database.Types.Node (CorpusId) ...@@ -43,7 +43,7 @@ import Gargantext.Database.Types.Node (CorpusId)
import Gargantext.Database.Types.Node (ToHyperdataDocument(..)) import Gargantext.Database.Types.Node (ToHyperdataDocument(..))
import Gargantext.Database.Types.Node (UserId) import Gargantext.Database.Types.Node (UserId)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Text.Corpus.Parsers.CSV (parseCsv', parseHal') import Gargantext.Text.Corpus.Parsers (FileFormat(..), parseFormat)
import Gargantext.Text.Terms (TermType(..)) import Gargantext.Text.Terms (TermType(..))
import Servant import Servant
import Servant.API.Flatten (Flat) import Servant.API.Flatten (Flat)
...@@ -227,13 +227,14 @@ addToCorpusWithForm cid (WithForm ft d) logStatus = do ...@@ -227,13 +227,14 @@ addToCorpusWithForm cid (WithForm ft d) logStatus = do
let let
parse = case ft of parse = case ft of
CSV_HAL -> parseHal' CSV_HAL -> parseFormat CsvHal
CSV -> parseCsv' CSV -> parseFormat CsvGargV3
_ -> parseHal' _ -> parseFormat CsvHal
docs = splitEvery 500 docs <- liftIO
$ take 1000000 $ splitEvery 500
$ parse (cs d) <$> take 1000000
<$> parse (cs d)
logStatus ScraperStatus { _scst_succeeded = Just 1 logStatus ScraperStatus { _scst_succeeded = Just 1
, _scst_failed = Just 0 , _scst_failed = Just 0
......
...@@ -22,38 +22,38 @@ please follow the types. ...@@ -22,38 +22,38 @@ please follow the types.
{-# LANGUAGE PackageImports #-} {-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText) module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
where where
--import Data.ByteString (ByteString) --import Data.ByteString (ByteString)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries) import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently) import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join) import Control.Monad (join)
import qualified Data.ByteString.Char8 as DBC
import Data.Attoparsec.ByteString (parseOnly, Parser) import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..)) import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers) import Data.Either.Extra (partitionEithers)
import Data.List (concat) import Data.List (concat, lookup)
import Data.List (lookup)
import Data.Ord() import Data.Ord()
import Data.String (String()) import Data.String (String())
import Data.String() import Data.String()
import Data.Text (Text) import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8) import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second) import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.Map as DM
import qualified Data.Text as DT
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..)) import Gargantext.Database.Types.Node (HyperdataDocument(..))
import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS import Gargantext.Prelude
import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich) import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import qualified Gargantext.Text.Corpus.Parsers.Date as Date
import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseCsv)
import Gargantext.Text.Learn (detectLangDefault) import Gargantext.Text.Learn (detectLangDefault)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Lazy as DBL
import qualified Data.ByteString.Char8 as DBC
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Gargantext.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS
------------------------------------------------------------------------ ------------------------------------------------------------------------
type ParseError = String type ParseError = String
...@@ -68,7 +68,7 @@ type ParseError = String ...@@ -68,7 +68,7 @@ type ParseError = String
-- | According to the format of the input file, -- | According to the format of the input file,
-- different parsers are available. -- different parsers are available.
data FileFormat = WOS | RIS | RisPresse data FileFormat = WOS | RIS | RisPresse
| CsvGargV3 | CsvHalFormat | CsvGargV3 | CsvHal
deriving (Show) deriving (Show)
-- Implemented (ISI Format) -- Implemented (ISI Format)
...@@ -78,20 +78,30 @@ data FileFormat = WOS | RIS | RisPresse ...@@ -78,20 +78,30 @@ data FileFormat = WOS | RIS | RisPresse
-- | XML -- Not Implemented / see : -- | XML -- Not Implemented / see :
{- parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument]
parseFormat :: FileFormat -> ByteString -> [HyperdataDocument] parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
parseFormat = undefined parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
-} parseFormat RisPresse bs = mapM (toDoc RIS)
<$> snd
<$> enrichWith RisPresse
$ partitionEithers
$ [runParser' RisPresse bs]
parseFormat WOS bs = mapM (toDoc WOS)
<$> snd
<$> enrichWith WOS
$ partitionEithers
$ [runParser' WOS bs]
parseFormat _ _ = undefined
-- | Parse file into documents -- | Parse file into documents
-- TODO manage errors here -- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message -- TODO: to debug maybe add the filepath in error message
parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument] parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseFile CsvHalFormat p = parseHal p parseFile CsvHal p = parseHal p
parseFile CsvGargV3 p = parseCsv p parseFile CsvGargV3 p = parseCsv p
parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS -- TODO use language for RIS
...@@ -130,12 +140,14 @@ enrichWith WOS = enrichWith' (map (first WOS.keys)) ...@@ -130,12 +140,14 @@ enrichWith WOS = enrichWith' (map (first WOS.keys))
enrichWith _ = enrichWith' identity enrichWith _ = enrichWith' identity
enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)]) enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]]) -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat) enrichWith' f = second (map both' . map f . concat)
where where
both' = map (both decodeUtf8) both' = map (both decodeUtf8)
readFileWith :: FileFormat -> FilePath readFileWith :: FileFormat -> FilePath
-> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]]) -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do readFileWith format path = do
...@@ -157,7 +169,11 @@ withParser _ = panic "[ERROR] Parser not implemented yet" ...@@ -157,7 +169,11 @@ withParser _ = panic "[ERROR] Parser not implemented yet"
runParser :: FileFormat -> DB.ByteString runParser :: FileFormat -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]]) -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text runParser format text = pure $ runParser' format text
runParser' :: FileFormat -> DB.ByteString
-> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' format text = parseOnly (withParser format) text
openZip :: FilePath -> IO [DB.ByteString] openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do openZip fp = do
......
...@@ -32,8 +32,8 @@ import Gargantext.Prelude hiding (length) ...@@ -32,8 +32,8 @@ import Gargantext.Prelude hiding (length)
import Gargantext.Text import Gargantext.Text
import Gargantext.Text.Context import Gargantext.Text.Context
import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import qualified Data.Vector as V import qualified Data.Vector as V
--------------------------------------------------------------- ---------------------------------------------------------------
headerCsvGargV3 :: Header headerCsvGargV3 :: Header
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment