[istex] implement zip file upload for istex

parent 03bd0bf6
Pipeline #5213 failed with stages
in 12 minutes and 11 seconds
......@@ -55,4 +55,4 @@ main = do
putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs')
CSV.writeFile wPath (h, docs')
Left e -> panic $ "Error: " <> (pack e)
Left e -> panic $ "Error: " <> e
......@@ -18,34 +18,25 @@ Main specifications to index a corpus with a term list
module Main where
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
import Control.Monad (zipWithM)
import Control.Monad.IO.Class
import Data.Aeson
import Data.ByteString.Lazy (writeFile)
import Data.Either (Either(..))
import Data.List (cycle, concat, unwords)
import Data.List.Split (chunksOf)
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as DM
import Data.Text (pack, Text)
import qualified Data.Text as DT
import Data.Map.Strict qualified as DM
import Data.Text (pack)
import Data.Text qualified as DT
import Data.Text.Lazy qualified as DTL
import Data.Text.Lazy.Encoding qualified as TLE
import Data.Tuple.Extra (both)
import qualified Data.Vector as DV
import Data.Vector qualified as DV
import GHC.Generics
import System.IO (hPutStr, hFlush, stderr)
import System.Environment
import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.WithList
import Gargantext.Core.Text.Corpus.Parsers.CSV (readCSVFile, csv_title, csv_abstract, csv_publication_year, unIntOrDec, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.Corpus.Parsers.CSV (readCSVFile, csv_title, csv_abstract, csv_publication_year, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms (terms)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList
import Gargantext.Prelude hiding (show)
import Protolude
import System.IO (hFlush)
------------------------------------------------------------------------
-- OUTPUT format
......@@ -78,7 +69,7 @@ filterTermsAndCooc patterns (year, ts) = do
log m = do
tid <- myThreadId
(p, _) <- threadCapability tid
putStrLn . unwords $
putText . unwords $
["filterTermsAndCooc:", m, show year, "on proc", show p]
main :: IO ()
......@@ -97,14 +88,14 @@ main = do
-- termListMap :: [Text]
termList <- csvMapTermList termListFile
putStrLn $ show $ length termList
putText $ show $ length termList
let patterns = buildPatterns termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
writeFile outputFile $ encode (CoocByYears r)
Left e -> panic $ "Error: " <> (pack e)
writeFile outputFile $ DTL.toStrict $ TLE.decodeUtf8 $ encode (CoocByYears r)
Left e -> panic $ "Error: " <> e
......@@ -113,7 +104,7 @@ main = do
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hPutStr stderr "\rDone\n"
liftIO $ hPutStr stderr ("\rDone\n" :: Text)
pure bs
where
g c x = do
......@@ -130,6 +121,7 @@ mapConcurrentlyChunked f ts = do
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms' :: Applicative f => Patterns -> Text -> f [[Text]]
terms' pats txt = pure $ concat $ extractTermsWithList pats txt
......
......@@ -94,7 +94,7 @@ source-repository-package
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/istex.git
tag: a34bb341236d82cf3d488210bc1d8448a98f5808
tag: 9b1bd17f3ed38eab83e675bb68278922217a9c73
source-repository-package
type: git
......@@ -167,3 +167,4 @@ package hmatrix
package sparse-linear
ghc-options: -O2 -fsimpl-tick-factor=10000 -fdicts-cheap -fdicts-strict -flate-dmd-anal -fno-state-hack
......@@ -147,6 +147,7 @@ library
Gargantext.Utils.Jobs.State
Gargantext.Utils.SpacyNLP
Gargantext.Utils.Tuple
Gargantext.Utils.Zip
other-modules:
Gargantext.API.Admin.Auth
Gargantext.API.Admin.FrontEnd
......@@ -229,6 +230,7 @@ library
Gargantext.Core.Text.Corpus.Parsers.Iramuteq
Gargantext.Core.Text.Corpus.Parsers.Isidore
Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
Gargantext.Core.Text.Corpus.Parsers.Json2Csv
Gargantext.Core.Text.Corpus.Parsers.RIS
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
......@@ -665,6 +667,7 @@ executable gargantext-cli
, gargantext-prelude
, ini ^>= 0.4.1
, optparse-generic ^>= 1.4.7
, protolude ^>= 0.3.0
, split ^>= 0.2.3.4
, text ^>= 1.2.4.1
, unordered-containers ^>= 0.2.16.0
......
......@@ -87,7 +87,7 @@ addContact :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
addContact u nId (AddContactParams fn ln) jobHandle = do
markStarted 2 jobHandle
_ <- flow (Nothing :: Maybe HyperdataAnnuaire) u (Right [nId]) (Multi EN) Nothing (Just 1, yield $ hyperdataContact fn ln) jobHandle
_ <- flow (Nothing :: Maybe HyperdataAnnuaire) u (Right [nId]) (Multi EN) Nothing (1, yield $ hyperdataContact fn ln) jobHandle
markComplete jobHandle
addContact _uId _nId _p jobHandle = do
......
......@@ -42,7 +42,7 @@ import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Corpus.Update (addLanguageToCorpus)
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..), withDefaultLanguage)
import Gargantext.Core (Lang(..), withDefaultLanguage, defaultLanguage)
import Gargantext.Core.NodeStory (HasNodeStoryImmediateSaver, HasNodeArchiveStoryImmediateSaver, currentVersion)
import Gargantext.Core.Text.Corpus.API qualified as API
import Gargantext.Core.Text.Corpus.Parsers qualified as Parser (FileType(..), parseFormatC)
......@@ -278,33 +278,35 @@ addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
-> NewWithForm
-> JobHandle m
-> m ()
addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n sel) jobHandle = do
addToCorpusWithForm user cid nwf jobHandle = do
-- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
-- printDebug "[addToCorpusWithForm] fileType" ft
-- printDebug "[addToCorpusWithForm] fileFormat" ff
let l = nwf ^. wf_lang . non defaultLanguage
addLanguageToCorpus cid l
limit' <- view $ hasConfig . gc_max_docs_parsers
let limit = fromIntegral limit' :: Integer
let
parseC = case ft of
CSV_HAL -> Parser.parseFormatC Parser.CsvHal
parseC = case (nwf ^. wf_filetype) of
CSV -> Parser.parseFormatC Parser.CsvGargV3
WOS -> Parser.parseFormatC Parser.WOS
PresseRIS -> Parser.parseFormatC Parser.RisPresse
CSV_HAL -> Parser.parseFormatC Parser.CsvHal
Iramuteq -> Parser.parseFormatC Parser.Iramuteq
Istex -> Parser.parseFormatC Parser.Istex
JSON -> Parser.parseFormatC Parser.JSON
PresseRIS -> Parser.parseFormatC Parser.RisPresse
WOS -> Parser.parseFormatC Parser.WOS
-- TODO granularity of the logStatus
let data' = case ff of
Plain -> cs d
ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
let data' = case (nwf ^. wf_fileformat) of
Plain -> cs (nwf ^. wf_data)
ZIP -> case BSB64.decode $ TE.encodeUtf8 (nwf ^. wf_data) of
Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
Right decoded -> decoded
eDocsC <- liftBase $ parseC ff data'
eDocsC <- liftBase $ parseC (nwf ^. wf_fileformat) data'
case eDocsC of
Right (mCount, docsC) -> do
Right (count, docsC) -> do
-- TODO Add progress (jobStatus) update for docs - this is a
-- long action
......@@ -333,9 +335,9 @@ addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n
_cid' <- flowCorpus user
(Right [cid])
(Multi l)
(Just sel)
(Just (nwf ^. wf_selection))
--(Just $ fromIntegral $ length docs, docsC')
(mCount, transPipe liftBase docsC') -- TODO fix number of docs
(count, transPipe liftBase docsC') -- TODO fix number of docs
--(map (map toHyperdataDocument) docs)
jobHandle
......@@ -347,7 +349,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n
markComplete jobHandle
Left e -> do
printDebug "[addToCorpusWithForm] parse error" e
markFailed (Just $ T.pack e) jobHandle
markFailed (Just e) jobHandle
{-
addToCorpusWithFile :: FlowCmdM env err m
......
......@@ -12,6 +12,7 @@ import Gargantext.Prelude
data FileType = CSV
| CSV_HAL
| Istex
| PresseRIS
| WOS
| Iramuteq
......@@ -26,6 +27,7 @@ instance ToJSON FileType
instance FromHttpApiData FileType where
parseUrlPiece "CSV" = pure CSV
parseUrlPiece "CSV_HAL" = pure CSV_HAL
parseUrlPiece "Istex" = pure Istex
parseUrlPiece "PresseRis" = pure PresseRIS
parseUrlPiece "WOS" = pure WOS
parseUrlPiece "Iramuteq" = pure Iramuteq
......
......@@ -90,6 +90,13 @@ frameCalcUploadAsync uId nId (FrameCalcUpload _wf_lang _wf_selection) jobHandle
case mCId of
Nothing -> markFailure 1 Nothing jobHandle
Just cId ->
addToCorpusWithForm (RootId (NodeId uId)) cId (NewWithForm CSV Plain body _wf_lang "calc-upload.csv" _wf_selection) jobHandle
addToCorpusWithForm (RootId (NodeId uId))
cId
(NewWithForm { _wf_filetype = CSV
, _wf_fileformat = Plain
, _wf_data = body
, _wf_lang
, _wf_name = "calc-upload.csv"
, _wf_selection }) jobHandle
markComplete jobHandle
......@@ -14,22 +14,17 @@ Portability : POSIX
module Gargantext.Core.Ext.IMT where
import Data.Either (Either(..))
import Data.List qualified as DL
import Data.Map.Strict (Map)
import Data.Text (Text, splitOn)
import qualified Data.Set as S
import qualified Data.List as DL
import qualified Data.Vector as DV
import qualified Data.Map.Strict as M
import qualified Prelude
import Data.Map.Strict qualified as M
import Data.Morpheus.Types (GQLType)
import Data.Set qualified as S
import Data.Text (Text, splitOn)
import Data.Vector qualified as DV
import GHC.Generics (Generic)
import Gargantext.Prelude
import Gargantext.Core.Text.Metrics.Utils as Utils
import Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
import Gargantext.Core.Text.Metrics.Utils as Utils
import Gargantext.Prelude
data School = School { school_shortName :: Text
, school_longName :: Text
......@@ -112,7 +107,7 @@ mapIdSchool :: Map Text Text
mapIdSchool = M.fromList $ Gargantext.Prelude.map
(\(School { school_shortName, school_id }) -> (school_id, school_shortName)) schools
hal_data :: IO (Either Prelude.String (DV.Vector CsvHal))
hal_data :: IO (Either Text (DV.Vector CsvHal))
hal_data = do
r <- CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
pure $ snd <$> r
......
......@@ -17,12 +17,10 @@ module Gargantext.Core.Text.Convert (risPress2csvWrite)
where
import Data.Either (Either(..))
import qualified Data.Text as T
import System.FilePath (FilePath()) -- , takeExtension)
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat(..), FileType(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
import Gargantext.Prelude
import System.FilePath (FilePath()) -- , takeExtension)
risPress2csvWrite :: FilePath -> IO ()
......@@ -30,7 +28,7 @@ risPress2csvWrite f = do
eContents <- parseFile RisPresse Plain (f <> ".ris")
case eContents of
Right contents -> writeDocs2Csv (f <> ".csv") contents
Left e -> panic $ "Error: " <> (T.pack e)
Left e -> panic $ "Error: " <> e
......@@ -14,19 +14,16 @@ module Gargantext.Core.Text.Corpus.API.Istex
where
import Data.Either (Either(..))
import Data.List (concat)
import Data.List qualified as List
import Data.Maybe
import Data.Text (Text, pack)
import qualified Data.Text as Text
import qualified Data.List as List
import Data.Text (Text)
import Data.Text qualified as Text
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import qualified Gargantext.Defaults as Defaults
import Gargantext.Prelude
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified ISTEX as ISTEX
import qualified ISTEX.Client as ISTEX
import ISTEX qualified as ISTEX
import ISTEX.Client qualified as ISTEX
type Query = Text
type MaxResults = Maybe Int
......@@ -76,31 +73,4 @@ toDoc' :: Lang -> ISTEX.Documents -> IO [HyperdataDocument]
toDoc' la docs' = mapM (toDoc la) (ISTEX._documents_hits docs')
--printDebug "ISTEX" (ISTEX._documents_total docs')
-- | TODO remove dateSplit here
-- TODO current year as default
toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
toDoc la (ISTEX.Document i t a ab d s) = do
--printDebug "ISTEX date" d
(utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
--printDebug "toDoc Istex" (utctime, (pub_year, pub_month, pub_day))
pure $ HyperdataDocument { _hd_bdd = Just "Istex"
, _hd_doi = Just i
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = t
, _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" (map ISTEX._author_name a)
, _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" (concat $ (map ISTEX._author_affiliations) a)
, _hd_source = Just $ foldl (\x y -> x <> ", " <> y) "" (catMaybes $ map ISTEX._source_title s)
, _hd_abstract = ab
, _hd_publication_date = fmap (pack . show) utctime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (pack . show) la
}
This diff is collapsed.
......@@ -16,25 +16,19 @@ module Gargantext.Core.Text.Corpus.Parsers.CSV where
import Conduit
import Control.Applicative
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BL
import Data.Char (ord)
import Data.ByteString qualified as BS
import Data.ByteString.Lazy qualified as BL
import Data.Csv
import Data.Either (Either(..))
import Data.Maybe (fromMaybe)
import Data.Text (Text, pack, length, intercalate)
import Data.Text (pack)
import Data.Text qualified as T
import Data.Time.Segment (jour)
import qualified Data.Vector as V
import Data.Vector (Vector)
import GHC.IO (FilePath)
import GHC.Word (Word8)
import qualified Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude hiding (length)
import Data.Vector qualified as V
import Gargantext.Core.Text
import Gargantext.Core.Text.Context
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude hiding (length, show)
import Protolude
---------------------------------------------------------------
headerCsvGargV3 :: Header
......@@ -117,7 +111,7 @@ fromDocs docs = V.map fromDocs' docs
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
splitDoc m splt doc = let docSize = (T.length $ csv_abstract doc) in
if docSize > 1000
then
if (mod (round m) docSize) >= 10
......@@ -148,7 +142,7 @@ type Mean = Double
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean ls
where
ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
ls = V.toList $ V.map (fromIntegral . T.length . csv_abstract) csvDoc
---------------------------------------------------------------
......@@ -158,7 +152,7 @@ unIntOrDec :: IntOrDec -> Int
unIntOrDec (IntOrDec i) = i
instance FromField IntOrDec where
parseField s = case runParser (parseField s :: Parser Int) of
Left _err -> IntOrDec <$> Prelude.floor <$> (parseField s :: Parser Double)
Left _err -> IntOrDec <$> floor <$> (parseField s :: Parser Double)
Right n -> pure $ IntOrDec n
instance ToField IntOrDec where
toField (IntOrDec i) = toField i
......@@ -230,44 +224,40 @@ csvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
delimiter :: Delimiter -> Word8
delimiter Tab = fromIntegral $ ord '\t'
delimiter Comma = fromIntegral $ ord ','
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
readCsvOn' fields fp = do
r <- readCSVFile fp
pure $ ( V.toList
. V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
. snd ) <$> r
------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a) => proxy a -> Delimiter -> FilePath -> IO (Either Prelude.String (Header, Vector a))
readFileLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
readFileStrict :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> FilePath
-> IO (Either Prelude.String (Header, Vector a))
-> IO (Either Text (Header, Vector a))
readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> BL.ByteString
-> Either Prelude.String (Header, Vector a)
readByteStringLazy _f d bs = decodeByNameWith (csvDecodeOptions d) bs
-> Either Text (Header, Vector a)
readByteStringLazy _f d bs = first pack $ decodeByNameWith (csvDecodeOptions d) bs
readByteStringStrict :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> BS.ByteString
-> Either Prelude.String (Header, Vector a)
-> Either Text (Header, Vector a)
readByteStringStrict d ff = (readByteStringLazy d ff) . BL.fromStrict
------------------------------------------------------------------------
-- | TODO use readFileLazy
readCSVFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
readCSVFile :: FilePath -> IO (Either Text (Header, Vector CsvDoc))
readCSVFile fp = do
result <- fmap (readCsvLazyBS Comma) $ BL.readFile fp
case result of
......@@ -277,20 +267,24 @@ readCSVFile fp = do
-- | TODO use readByteStringLazy
readCsvLazyBS :: Delimiter -> BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
readCsvLazyBS d bs = decodeByNameWith (csvDecodeOptions d) bs
readCsvLazyBS :: Delimiter
-> BL.ByteString
-> Either Text (Header, Vector CsvDoc)
readCsvLazyBS d bs = first pack $ decodeByNameWith (csvDecodeOptions d) bs
------------------------------------------------------------------------
-- | TODO use readFileLazy
readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
readCsvHal = fmap readCsvHalLazyBS . BL.readFile
readCsvHal :: FilePath -> IO (Either Text (Header, Vector CsvHal))
readCsvHal fp = do
c <- BL.readFile fp
pure $ readCsvHalLazyBS c
-- | TODO use readByteStringLazy
readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalLazyBS bs = decodeByNameWith (csvDecodeOptions Tab) bs
readCsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector CsvHal)
readCsvHalLazyBS bs = first pack $ decodeByNameWith (csvDecodeOptions Tab) bs
readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
readCsvHalBSStrict :: BS.ByteString -> Either Text (Header, Vector CsvHal)
readCsvHalBSStrict bs = readCsvHalLazyBS $ BL.fromStrict bs
------------------------------------------------------------------------
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
......@@ -437,25 +431,25 @@ csv2doc (CsvDoc { .. })
pubDay = fromMaybe defaultDay csv_publication_day
------------------------------------------------------------------------
parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
parseHal fp = do
r <- readCsvHal fp
pure $ (V.toList . V.map csvHal2doc . snd) <$> r
parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseHal' :: BL.ByteString -> Either Text [HyperdataDocument]
parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
------------------------------------------------------------------------
parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseCsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseCsv fp = fmap (V.toList . V.map csv2doc . snd) <$> readCSVFile fp
{-
parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseCsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS Comma bs
-}
parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseCsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseCsv' bs = do
let
result = case readCsvLazyBS Comma bs of
......@@ -464,15 +458,13 @@ parseCsv' bs = do
(V.toList . V.map csv2doc . snd) <$> result
parseCsvC :: BL.ByteString
-> Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument Identity ())
parseCsvC bs = do
let
result = case readCsvLazyBS Comma bs of
-> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
parseCsvC bs =
(\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC csv2doc)) <$> eResult
where
eResult = case readCsvLazyBS Comma bs of
Left _err -> readCsvLazyBS Tab bs
Right res -> Right res
case result of
Left err -> Left err
Right r -> Right (Just $ Prelude.fromIntegral $ Prelude.length $ snd r, (yieldMany $ snd r) .| mapC csv2doc)
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
......
......@@ -53,12 +53,11 @@ dateSplit Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
dateSplit (Just txt) = do
utcTime <- parse txt
let (y, m, d) = split' utcTime
pure (Just utcTime, (Just y, Just m,Just d))
pure (Just utcTime, (Just y, Just m, Just d))
split' :: UTCTime -> (Year, Month, Day)
split' utcTime = (fromIntegral y, m, d)
split' (UTCTime day _) = (fromIntegral y, m, d)
where
(UTCTime day _) = utcTime
(y,m,d) = toGregorian day
type Year = Int
......
......@@ -15,18 +15,17 @@ JSON parser for Gargantext corpus files.
module Gargantext.Core.Text.Corpus.Parsers.JSON where
-- import Gargantext.Database.Schema.Node (NodePoly(..))
import Conduit
import Data.Aeson
import qualified Data.ByteString.Lazy as BL
import Data.Either (Either(..))
import Data.Text
import Data.ByteString.Lazy qualified as BL
import Data.Text qualified as T
import GHC.Generics
import qualified Prelude
import Gargantext.Core (Lang)
import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex qualified as Istex
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
-- import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Prelude hiding (length)
import Protolude
data JSONStruct =
......@@ -62,18 +61,27 @@ instance FromJSON JSONNgrams
------------------------------------------------------------------------
-- | TODO: documents -> document -> hyperdata + title etc
readJSONLazyBS :: BL.ByteString -> Either Prelude.String JSONStruct
readJSONLazyBS bs = eitherDecode bs
readJSONLazyBS :: (FromJSON a) => BL.ByteString -> Either Text a
readJSONLazyBS bs = first T.pack $ eitherDecode bs
parseJSONC :: BL.ByteString
-> Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument Identity ())
parseJSONC bs = do
case readJSONLazyBS bs of
Left err -> Left err
Right (JSONStruct { documents }) ->
Right ( Just $ Prelude.fromIntegral $ Prelude.length documents
, yieldMany documents .| mapC doc2hyperdoc )
-> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
parseJSONC bs = f <$> readJSONLazyBS bs
where
f (JSONStruct { documents }) =
( fromIntegral $ length documents
, yieldMany documents .| mapC doc2hyperdoc )
doc2hyperdoc :: JSONStructDocument -> HyperdataDocument
doc2hyperdoc (JSONStructDocument { document = JSONDocument { hyperdata } }) = hyperdata
parseIstex :: Lang
-> BL.ByteString
-> IO (Either Text HyperdataDocument)
parseIstex l bs = do
let ej = readJSONLazyBS bs
case ej of
Left err -> pure $ Left err
Right j -> Right <$> Istex.toDoc l j
This diff is collapsed.
{-|
Module : Gargantext.Utils.Aeson
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Utilities for handling zip files
-}
module Gargantext.Utils.Aeson where
import Data.Aeson.Types
......
{-|
Module : Gargantext.Utils.Jobs
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
module Gargantext.Utils.Jobs (
......
{-|
Module : Gargantext.Utils.Servant
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Utils.Servant where
import qualified Data.ByteString.Lazy.Char8 as BSC
......
{-|
Module : Gargantext.Utils.Tuple
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Utils.Tuple where
import Protolude
......
{-|
Module : Gargantext.Utils.UTCTime
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeFamilies #-}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment