[istex] implement zip file upload for istex

3dd27b5a · Przemyslaw Kaminski · 03bd0bf6 · 3dd27b5a · 3dd27b5a · 3dd27b5a
Verified Commit 3dd27b5a authored Oct 06, 2023 by Przemyslaw Kaminski
21 changed files
--- a/bin/gargantext-cli/CleanCsvCorpus.hs
+++ b/bin/gargantext-cli/CleanCsvCorpus.hs
@@ -55,4 +55,4 @@ main = do
      putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs')

      CSV.writeFile wPath (h, docs')
-    Left e -> panic $ "Error: " <> (pack e)
+    Left e -> panic $ "Error: " <> e
--- a/bin/gargantext-cli/Main.hs
+++ b/bin/gargantext-cli/Main.hs
@@ -18,34 +18,25 @@ Main specifications to index a corpus with a term list
 module Main where

 import Control.Concurrent.Async as CCA (mapConcurrently)
-import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
-import Control.Monad (zipWithM)
 import Control.Monad.IO.Class
 import Data.Aeson
-import Data.ByteString.Lazy (writeFile)
-import Data.Either (Either(..))
-import Data.List (cycle, concat, unwords)
 import Data.List.Split (chunksOf)
-import Data.Map.Strict (Map)
-import qualified Data.Map.Strict    as DM
-import Data.Text (pack, Text)
-import qualified Data.Text as DT
+import Data.Map.Strict qualified as DM
+import Data.Text (pack)
+import Data.Text qualified as DT
+import Data.Text.Lazy qualified as DTL
+import Data.Text.Lazy.Encoding qualified as TLE
 import Data.Tuple.Extra (both)
-import qualified Data.Vector as DV
+import Data.Vector qualified as DV
 import GHC.Generics
-import System.IO (hPutStr, hFlush, stderr)
-import System.Environment
-
-import Gargantext.Prelude
-import Gargantext.Core
-import Gargantext.Core.Types
-import Gargantext.Core.Text.Terms
 import Gargantext.Core.Text.Context
-import Gargantext.Core.Text.Terms.WithList
-import Gargantext.Core.Text.Corpus.Parsers.CSV (readCSVFile, csv_title, csv_abstract, csv_publication_year, unIntOrDec, fromMIntOrDec, defaultYear)
+import Gargantext.Core.Text.Corpus.Parsers.CSV (readCSVFile, csv_title, csv_abstract, csv_publication_year, fromMIntOrDec, defaultYear)
 import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
-import Gargantext.Core.Text.Terms (terms)
 import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
+import Gargantext.Core.Text.Terms.WithList
+import Gargantext.Prelude hiding (show)
+import Protolude
+import System.IO (hFlush)

 ------------------------------------------------------------------------
 -- OUTPUT format
@@ -78,7 +69,7 @@ filterTermsAndCooc patterns (year, ts) = do
    log m = do
      tid    <- myThreadId
      (p, _) <- threadCapability tid
-      putStrLn . unwords $
+      putText . unwords $
        ["filterTermsAndCooc:", m, show year, "on proc", show p]

 main :: IO ()
@@ -97,14 +88,14 @@ main = do
      -- termListMap :: [Text]
      termList <- csvMapTermList termListFile

-      putStrLn $ show $ length termList
+      putText $ show $ length termList

      let patterns = buildPatterns termList

      -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
      r <-  mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
-      writeFile outputFile $ encode (CoocByYears r)
-    Left e -> panic $ "Error: " <> (pack e)
+      writeFile outputFile $ DTL.toStrict $ TLE.decodeUtf8 $ encode (CoocByYears r)
+    Left e -> panic $ "Error: " <> e



@@ -113,7 +104,7 @@ main = do
 mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
 mapMP f xs = do
    bs <- zipWithM g (cycle "-\\|/") xs
-    liftIO $ hPutStr stderr "\rDone\n"
+    liftIO $ hPutStr stderr ("\rDone\n" :: Text)
    pure bs
  where
    g c x = do
@@ -130,6 +121,7 @@ mapConcurrentlyChunked f ts = do


 --terms' :: Patterns -> Text -> Corpus [[Text]]
+terms' :: Applicative f => Patterns -> Text -> f [[Text]]
 terms' pats txt = pure $ concat $ extractTermsWithList pats txt



--- a/cabal.project
+++ b/cabal.project
@@ -94,7 +94,7 @@ source-repository-package
 source-repository-package
    type: git
    location: https://gitlab.iscpif.fr/gargantext/crawlers/istex.git
-    tag: a34bb341236d82cf3d488210bc1d8448a98f5808
+    tag: 9b1bd17f3ed38eab83e675bb68278922217a9c73

 source-repository-package
    type: git
@@ -167,3 +167,4 @@ package hmatrix

 package sparse-linear
    ghc-options: -O2 -fsimpl-tick-factor=10000 -fdicts-cheap -fdicts-strict -flate-dmd-anal  -fno-state-hack
+
--- a/gargantext.cabal
+++ b/gargantext.cabal
@@ -147,6 +147,7 @@ library
      Gargantext.Utils.Jobs.State
      Gargantext.Utils.SpacyNLP
      Gargantext.Utils.Tuple
+      Gargantext.Utils.Zip
  other-modules:
      Gargantext.API.Admin.Auth
      Gargantext.API.Admin.FrontEnd
@@ -229,6 +230,7 @@ library
      Gargantext.Core.Text.Corpus.Parsers.Iramuteq
      Gargantext.Core.Text.Corpus.Parsers.Isidore
      Gargantext.Core.Text.Corpus.Parsers.JSON
+      Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
      Gargantext.Core.Text.Corpus.Parsers.Json2Csv
      Gargantext.Core.Text.Corpus.Parsers.RIS
      Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
@@ -665,6 +667,7 @@ executable gargantext-cli
    , gargantext-prelude
    , ini ^>= 0.4.1
    , optparse-generic ^>= 1.4.7
+    , protolude ^>= 0.3.0
    , split ^>= 0.2.3.4
    , text ^>= 1.2.4.1
    , unordered-containers ^>= 0.2.16.0

--- a/src/Gargantext/API/Node/Contact.hs
+++ b/src/Gargantext/API/Node/Contact.hs
@@ -87,7 +87,7 @@ addContact :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
 addContact u nId (AddContactParams fn ln) jobHandle = do

  markStarted 2 jobHandle
-  _ <- flow (Nothing :: Maybe HyperdataAnnuaire) u (Right [nId]) (Multi EN) Nothing (Just 1, yield $ hyperdataContact fn ln) jobHandle
+  _ <- flow (Nothing :: Maybe HyperdataAnnuaire) u (Right [nId]) (Multi EN) Nothing (1, yield $ hyperdataContact fn ln) jobHandle

  markComplete jobHandle
 addContact _uId _nId _p jobHandle = do

--- a/src/Gargantext/API/Node/Corpus/New.hs
+++ b/src/Gargantext/API/Node/Corpus/New.hs
@@ -42,7 +42,7 @@ import Gargantext.API.Node.Corpus.Searx
 import Gargantext.API.Node.Corpus.Types
 import Gargantext.API.Node.Corpus.Update (addLanguageToCorpus)
 import Gargantext.API.Node.Types
-import Gargantext.Core (Lang(..), withDefaultLanguage)
+import Gargantext.Core (Lang(..), withDefaultLanguage, defaultLanguage)
 import Gargantext.Core.NodeStory (HasNodeStoryImmediateSaver, HasNodeArchiveStoryImmediateSaver, currentVersion)
 import Gargantext.Core.Text.Corpus.API qualified as API
 import Gargantext.Core.Text.Corpus.Parsers qualified as Parser (FileType(..), parseFormatC)
@@ -278,33 +278,35 @@ addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
                    -> NewWithForm
                    -> JobHandle m
                    -> m ()
-addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n sel) jobHandle = do
+addToCorpusWithForm user cid nwf jobHandle = do
  -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
  -- printDebug "[addToCorpusWithForm] fileType" ft
  -- printDebug "[addToCorpusWithForm] fileFormat" ff

+  let l = nwf ^. wf_lang . non defaultLanguage
  addLanguageToCorpus cid l

  limit' <- view $ hasConfig . gc_max_docs_parsers
  let limit = fromIntegral limit' :: Integer
  let
-    parseC = case ft of
-      CSV_HAL   -> Parser.parseFormatC Parser.CsvHal
+    parseC = case (nwf ^. wf_filetype) of
      CSV       -> Parser.parseFormatC Parser.CsvGargV3
-      WOS       -> Parser.parseFormatC Parser.WOS
-      PresseRIS -> Parser.parseFormatC Parser.RisPresse
+      CSV_HAL   -> Parser.parseFormatC Parser.CsvHal
      Iramuteq  -> Parser.parseFormatC Parser.Iramuteq
+      Istex     -> Parser.parseFormatC Parser.Istex
      JSON      -> Parser.parseFormatC Parser.JSON
+      PresseRIS -> Parser.parseFormatC Parser.RisPresse
+      WOS       -> Parser.parseFormatC Parser.WOS

  -- TODO granularity of the logStatus
-  let data' = case ff of
-        Plain -> cs d
-        ZIP   -> case BSB64.decode $ TE.encodeUtf8 d of
+  let data' = case (nwf ^. wf_fileformat) of
+        Plain -> cs (nwf ^. wf_data)
+        ZIP   -> case BSB64.decode $ TE.encodeUtf8 (nwf ^. wf_data) of
          Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
          Right decoded -> decoded
-  eDocsC <- liftBase $ parseC ff data'
+  eDocsC <- liftBase $ parseC (nwf ^. wf_fileformat) data'
  case eDocsC of
-    Right (mCount, docsC) -> do
+    Right (count, docsC) -> do
      -- TODO Add progress (jobStatus) update for docs - this is a
      -- long action

@@ -333,9 +335,9 @@ addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n
      _cid' <- flowCorpus user
                          (Right [cid])
                          (Multi l)
-                          (Just sel)
+                          (Just (nwf ^. wf_selection))
                          --(Just $ fromIntegral $ length docs, docsC')
-                          (mCount, transPipe liftBase docsC') -- TODO fix number of docs
+                          (count, transPipe liftBase docsC') -- TODO fix number of docs
                          --(map (map toHyperdataDocument) docs)
                          jobHandle

@@ -347,7 +349,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n
      markComplete jobHandle
    Left e -> do
      printDebug "[addToCorpusWithForm] parse error" e
-      markFailed (Just $ T.pack e) jobHandle
+      markFailed (Just e) jobHandle

 {-
 addToCorpusWithFile :: FlowCmdM env err m

--- a/src/Gargantext/API/Node/Corpus/New/Types.hs
+++ b/src/Gargantext/API/Node/Corpus/New/Types.hs
@@ -12,6 +12,7 @@ import Gargantext.Prelude

 data FileType = CSV
              | CSV_HAL
+              | Istex
              | PresseRIS
              | WOS
              | Iramuteq
@@ -26,6 +27,7 @@ instance ToJSON FileType
 instance FromHttpApiData FileType where
  parseUrlPiece "CSV"       = pure CSV
  parseUrlPiece "CSV_HAL"   = pure CSV_HAL
+  parseUrlPiece "Istex"     = pure Istex
  parseUrlPiece "PresseRis" = pure PresseRIS
  parseUrlPiece "WOS"       = pure WOS
  parseUrlPiece "Iramuteq"  = pure Iramuteq

--- a/src/Gargantext/API/Node/FrameCalcUpload.hs
+++ b/src/Gargantext/API/Node/FrameCalcUpload.hs
@@ -90,6 +90,13 @@ frameCalcUploadAsync uId nId (FrameCalcUpload _wf_lang _wf_selection) jobHandle
  case mCId of
    Nothing -> markFailure 1 Nothing jobHandle
    Just cId ->
-      addToCorpusWithForm (RootId (NodeId uId)) cId (NewWithForm CSV Plain body _wf_lang "calc-upload.csv" _wf_selection) jobHandle
+      addToCorpusWithForm (RootId (NodeId uId))
+                          cId
+                          (NewWithForm { _wf_filetype = CSV
+                                       , _wf_fileformat = Plain
+                                       , _wf_data = body
+                                       , _wf_lang
+                                       , _wf_name = "calc-upload.csv"
+                                       , _wf_selection }) jobHandle

  markComplete jobHandle
--- a/src/Gargantext/Core/Ext/IMT.hs
+++ b/src/Gargantext/Core/Ext/IMT.hs
@@ -14,22 +14,17 @@ Portability : POSIX
 module Gargantext.Core.Ext.IMT where

 import Data.Either (Either(..))
+import Data.List qualified as DL
 import Data.Map.Strict (Map)
-import Data.Text (Text, splitOn)
-
-import qualified Data.Set    as S
-import qualified Data.List   as DL
-import qualified Data.Vector as DV
-import qualified Data.Map.Strict    as M
-import qualified Prelude
-
+import Data.Map.Strict qualified as M
 import Data.Morpheus.Types (GQLType)
+import Data.Set qualified as S
+import Data.Text (Text, splitOn)
+import Data.Vector qualified as DV
 import GHC.Generics (Generic)
-
-import Gargantext.Prelude
-
-import Gargantext.Core.Text.Metrics.Utils      as Utils
 import Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
+import Gargantext.Core.Text.Metrics.Utils      as Utils
+import Gargantext.Prelude

 data School = School { school_shortName :: Text
                     , school_longName  :: Text
@@ -112,7 +107,7 @@ mapIdSchool :: Map Text Text
 mapIdSchool = M.fromList $ Gargantext.Prelude.map
                (\(School { school_shortName, school_id }) -> (school_id, school_shortName)) schools

-hal_data :: IO (Either Prelude.String (DV.Vector CsvHal))
+hal_data :: IO (Either Text (DV.Vector CsvHal))
 hal_data = do
  r <- CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
  pure $ snd <$> r

--- a/src/Gargantext/Core/Text/Convert.hs
+++ b/src/Gargantext/Core/Text/Convert.hs
@@ -17,12 +17,10 @@ module Gargantext.Core.Text.Convert (risPress2csvWrite)
    where

 import Data.Either (Either(..))
-import qualified Data.Text as T
-import System.FilePath (FilePath()) -- , takeExtension)
-
-import Gargantext.Prelude
-import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat(..), FileType(..))
+import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
+import Gargantext.Prelude
+import System.FilePath (FilePath()) -- , takeExtension)


 risPress2csvWrite :: FilePath -> IO ()
@@ -30,7 +28,7 @@ risPress2csvWrite f = do
  eContents <- parseFile RisPresse Plain (f <> ".ris")
  case eContents of
    Right contents -> writeDocs2Csv (f <> ".csv") contents
-    Left e         -> panic $ "Error: " <> (T.pack e)
+    Left e         -> panic $ "Error: " <> e



--- a/src/Gargantext/Core/Text/Corpus/API/Istex.hs
+++ b/src/Gargantext/Core/Text/Corpus/API/Istex.hs
@@ -14,19 +14,16 @@ module Gargantext.Core.Text.Corpus.API.Istex
    where

 import Data.Either (Either(..))
-import Data.List (concat)
+import Data.List qualified as List
 import Data.Maybe
-import Data.Text (Text, pack)
-
-import qualified Data.Text as Text
-import qualified Data.List as List
+import Data.Text (Text)
+import Data.Text qualified as Text
 import Gargantext.Core (Lang(..))
+import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc)
 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
-import qualified Gargantext.Defaults as Defaults
 import Gargantext.Prelude
-import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
-import qualified ISTEX        as ISTEX
-import qualified ISTEX.Client as ISTEX
+import ISTEX qualified as ISTEX
+import ISTEX.Client qualified as ISTEX

 type Query = Text
 type MaxResults = Maybe Int
@@ -76,31 +73,4 @@ toDoc' :: Lang -> ISTEX.Documents -> IO [HyperdataDocument]
 toDoc' la docs' =  mapM (toDoc la) (ISTEX._documents_hits docs')
  --printDebug "ISTEX" (ISTEX._documents_total docs')

-- | TODO remove dateSplit here
-- TODO current year as default
-toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
-toDoc la (ISTEX.Document i t a ab d s) = do
-  --printDebug "ISTEX date" d
-  (utctime, (pub_year, pub_month, pub_day)) <-
-        Date.dateSplit (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
-  --printDebug "toDoc Istex" (utctime, (pub_year, pub_month, pub_day))
-  pure $ HyperdataDocument { _hd_bdd       = Just "Istex"
-                           , _hd_doi       = Just i
-                           , _hd_url       = Nothing
-                           , _hd_uniqId    = Nothing
-                           , _hd_uniqIdBdd = Nothing
-                           , _hd_page      = Nothing
-                           , _hd_title     = t
-                           , _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" (map ISTEX._author_name a)
-                           , _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" (concat $ (map ISTEX._author_affiliations) a)
-                           , _hd_source = Just $ foldl (\x y -> x <> ", " <> y) "" (catMaybes $ map ISTEX._source_title s)
-                           , _hd_abstract = ab
-                           , _hd_publication_date = fmap (pack . show) utctime
-                           , _hd_publication_year = pub_year
-                           , _hd_publication_month = pub_month
-                           , _hd_publication_day = pub_day
-                           , _hd_publication_hour = Nothing
-                           , _hd_publication_minute = Nothing
-                           , _hd_publication_second = Nothing
-                           , _hd_language_iso2 = Just $ (pack . show) la
-                           }
+
--- a/src/Gargantext/Core/Text/Corpus/Parsers.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers.hs
--- a/src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
@@ -16,25 +16,19 @@ module Gargantext.Core.Text.Corpus.Parsers.CSV where

 import Conduit
 import Control.Applicative
-import qualified Data.ByteString      as BS
-import qualified Data.ByteString.Lazy as BL
-import Data.Char (ord)
+import Data.ByteString qualified as BS
+import Data.ByteString.Lazy qualified as BL
 import Data.Csv
-import Data.Either (Either(..))
-import Data.Maybe (fromMaybe)
-import Data.Text (Text, pack, length, intercalate)
+import Data.Text (pack)
+import Data.Text qualified as T
 import Data.Time.Segment (jour)
-import qualified Data.Vector          as V
 import Data.Vector (Vector)
-import GHC.IO (FilePath)
-import GHC.Word (Word8)
-
-import qualified Prelude
-
-import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
-import Gargantext.Prelude hiding (length)
+import Data.Vector qualified as V
 import Gargantext.Core.Text
 import Gargantext.Core.Text.Context
+import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
+import Gargantext.Prelude hiding (length, show)
+import Protolude

 ---------------------------------------------------------------
 headerCsvGargV3 :: Header
@@ -117,7 +111,7 @@ fromDocs docs = V.map fromDocs' docs
 -- | Split a document in its context
 -- TODO adapt the size of the paragraph according to the corpus average
 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
-splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
+splitDoc m splt doc = let docSize = (T.length $ csv_abstract doc) in
                          if docSize > 1000
                            then
                              if (mod (round m) docSize) >= 10
@@ -148,7 +142,7 @@ type Mean = Double
 docsSize :: Vector CsvDoc -> Mean
 docsSize csvDoc = mean ls
  where
-    ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
+    ls = V.toList $ V.map (fromIntegral . T.length . csv_abstract) csvDoc


 ---------------------------------------------------------------
@@ -158,7 +152,7 @@ unIntOrDec :: IntOrDec -> Int
 unIntOrDec (IntOrDec i) = i
 instance FromField IntOrDec where
  parseField s = case runParser (parseField s :: Parser Int) of
-    Left _err -> IntOrDec <$> Prelude.floor <$> (parseField s :: Parser Double)
+    Left _err -> IntOrDec <$> floor <$> (parseField s :: Parser Double)
    Right n   -> pure $ IntOrDec n
 instance ToField IntOrDec where
  toField (IntOrDec i) = toField i
@@ -230,44 +224,40 @@ csvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
 delimiter :: Delimiter -> Word8
 delimiter Tab   = fromIntegral $ ord '\t'
 delimiter Comma = fromIntegral $ ord ','
------------------------------------------------------------------------
------------------------------------------------------------------------
-readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
-readCsvOn' fields fp = do
-  r <- readCSVFile fp
-  pure $ ( V.toList
-          . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
-          . snd ) <$> r

 ------------------------------------------------------------------------

-readFileLazy :: (FromNamedRecord a) => proxy a -> Delimiter -> FilePath -> IO (Either Prelude.String (Header, Vector a))
+readFileLazy :: (FromNamedRecord a)
+             => proxy a
+             -> Delimiter
+             -> FilePath
+             -> IO (Either Text (Header, Vector a))
 readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile

 readFileStrict :: (FromNamedRecord a)
               => proxy a
               -> Delimiter
               -> FilePath
-               -> IO (Either Prelude.String (Header, Vector a))
+               -> IO (Either Text (Header, Vector a))
 readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile

 readByteStringLazy :: (FromNamedRecord a)
                   => proxy a
                   -> Delimiter
                   -> BL.ByteString
-                   -> Either Prelude.String (Header, Vector a)
-readByteStringLazy _f d bs = decodeByNameWith (csvDecodeOptions d) bs
+                   -> Either Text (Header, Vector a)
+readByteStringLazy _f d bs = first pack $ decodeByNameWith (csvDecodeOptions d) bs

 readByteStringStrict :: (FromNamedRecord a)
                     => proxy a
                     -> Delimiter
                     -> BS.ByteString
-                     -> Either Prelude.String (Header, Vector a)
+                     -> Either Text (Header, Vector a)
 readByteStringStrict d ff = (readByteStringLazy d ff) . BL.fromStrict

 ------------------------------------------------------------------------
 -- | TODO use readFileLazy
-readCSVFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
+readCSVFile :: FilePath -> IO (Either Text (Header, Vector CsvDoc))
 readCSVFile fp = do
  result <- fmap (readCsvLazyBS Comma) $ BL.readFile fp
  case result of
@@ -277,20 +267,24 @@ readCSVFile fp = do


 -- | TODO use readByteStringLazy
-readCsvLazyBS :: Delimiter -> BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
-readCsvLazyBS d bs = decodeByNameWith (csvDecodeOptions d) bs
+readCsvLazyBS :: Delimiter
+              -> BL.ByteString
+              -> Either Text (Header, Vector CsvDoc)
+readCsvLazyBS d bs = first pack $ decodeByNameWith (csvDecodeOptions d) bs

 ------------------------------------------------------------------------
 -- | TODO use readFileLazy
-readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
-readCsvHal = fmap readCsvHalLazyBS . BL.readFile
+readCsvHal :: FilePath -> IO (Either Text (Header, Vector CsvHal))
+readCsvHal fp = do
+  c <- BL.readFile fp
+  pure $ readCsvHalLazyBS c

 -- | TODO use readByteStringLazy
-readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
-readCsvHalLazyBS bs = decodeByNameWith (csvDecodeOptions Tab) bs
+readCsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector CsvHal)
+readCsvHalLazyBS bs = first pack $ decodeByNameWith (csvDecodeOptions Tab) bs

-readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
-readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
+readCsvHalBSStrict :: BS.ByteString -> Either Text (Header, Vector CsvHal)
+readCsvHalBSStrict bs = readCsvHalLazyBS $ BL.fromStrict bs

 ------------------------------------------------------------------------
 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
@@ -437,25 +431,25 @@ csv2doc (CsvDoc { .. })
    pubDay = fromMaybe defaultDay csv_publication_day

 ------------------------------------------------------------------------
-parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
+parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
 parseHal fp = do
  r <- readCsvHal fp
  pure $ (V.toList . V.map csvHal2doc . snd) <$> r

-parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
+parseHal' :: BL.ByteString -> Either Text [HyperdataDocument]
 parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs

 ------------------------------------------------------------------------

-parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
+parseCsv :: FilePath -> IO (Either Text [HyperdataDocument])
 parseCsv fp = fmap (V.toList . V.map csv2doc . snd) <$> readCSVFile fp

 {-
-parseCsv' ::  BL.ByteString -> Either Prelude.String [HyperdataDocument]
+parseCsv' ::  BL.ByteString -> Either Text [HyperdataDocument]
 parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS Comma bs
 -}

-parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
+parseCsv' :: BL.ByteString -> Either Text [HyperdataDocument]
 parseCsv' bs = do
  let
    result = case readCsvLazyBS Comma bs of
@@ -464,15 +458,13 @@ parseCsv' bs = do
  (V.toList . V.map csv2doc . snd) <$> result

 parseCsvC :: BL.ByteString
-          -> Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument Identity ())
-parseCsvC bs = do
-  let
-    result = case readCsvLazyBS Comma bs of
+          -> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
+parseCsvC bs =
+  (\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC csv2doc)) <$> eResult
+  where
+    eResult = case readCsvLazyBS Comma bs of
      Left  _err -> readCsvLazyBS Tab bs
      Right res -> Right res
-  case result of
-    Left err -> Left err
-    Right r -> Right (Just $ Prelude.fromIntegral $ Prelude.length $ snd r, (yieldMany $ snd r) .| mapC csv2doc)

 ------------------------------------------------------------------------
 -- Csv v3 weighted for phylo

--- a/src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
@@ -53,12 +53,11 @@ dateSplit Nothing    = pure (Nothing, (Nothing, Nothing, Nothing))
 dateSplit (Just txt) = do
  utcTime <- parse txt
  let (y, m, d) = split' utcTime
-  pure (Just utcTime, (Just y, Just m,Just d))
+  pure (Just utcTime, (Just y, Just m, Just d))

 split' :: UTCTime -> (Year, Month, Day)
-split' utcTime = (fromIntegral y, m, d)
+split' (UTCTime day _) = (fromIntegral y, m, d)
  where
-    (UTCTime day _) = utcTime
    (y,m,d)         = toGregorian day

 type Year  = Int

--- a/src/Gargantext/Core/Text/Corpus/Parsers/JSON.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/JSON.hs
@@ -15,18 +15,17 @@ JSON parser for Gargantext corpus files.

 module Gargantext.Core.Text.Corpus.Parsers.JSON where

+-- import Gargantext.Database.Schema.Node (NodePoly(..))
 import Conduit
 import Data.Aeson
-import qualified Data.ByteString.Lazy as BL
-import Data.Either (Either(..))
-import Data.Text
+import Data.ByteString.Lazy qualified as BL
+import Data.Text qualified as T
 import GHC.Generics
-
-import qualified Prelude
-
+import Gargantext.Core (Lang)
+import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex qualified as Istex
 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
-- import Gargantext.Database.Schema.Node (NodePoly(..))
 import Gargantext.Prelude hiding (length)
+import Protolude


 data JSONStruct =
@@ -62,18 +61,27 @@ instance FromJSON JSONNgrams

 ------------------------------------------------------------------------
 -- | TODO: documents -> document -> hyperdata + title etc
-readJSONLazyBS :: BL.ByteString -> Either Prelude.String JSONStruct
-readJSONLazyBS bs = eitherDecode bs
+readJSONLazyBS :: (FromJSON a) => BL.ByteString -> Either Text a
+readJSONLazyBS bs = first T.pack $ eitherDecode bs


 parseJSONC :: BL.ByteString
-           -> Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument Identity ())
-parseJSONC bs = do
-  case readJSONLazyBS bs of
-    Left err -> Left err
-    Right (JSONStruct { documents }) ->
-      Right ( Just $ Prelude.fromIntegral $ Prelude.length documents
-            , yieldMany documents .| mapC doc2hyperdoc )
+           -> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
+parseJSONC bs = f <$> readJSONLazyBS bs
+  where
+    f (JSONStruct { documents }) =
+      ( fromIntegral $ length documents
+      , yieldMany documents .| mapC doc2hyperdoc )

 doc2hyperdoc :: JSONStructDocument -> HyperdataDocument
 doc2hyperdoc (JSONStructDocument { document = JSONDocument { hyperdata } }) = hyperdata
+
+
+parseIstex :: Lang
+           -> BL.ByteString
+           -> IO (Either Text HyperdataDocument)
+parseIstex l bs = do
+  let ej = readJSONLazyBS bs
+  case ej of
+    Left err -> pure $ Left err
+    Right j -> Right <$> Istex.toDoc l j
--- a/src/Gargantext/Database/Action/Flow.hs
+++ b/src/Gargantext/Database/Action/Flow.hs
--- a/src/Gargantext/Utils/Aeson.hs
+++ b/src/Gargantext/Utils/Aeson.hs
+{-|
+Module      : Gargantext.Utils.Aeson
+Description : Gargantext utilities
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Utilities for handling zip files
+-}
+
 module Gargantext.Utils.Aeson where

 import Data.Aeson.Types

--- a/src/Gargantext/Utils/Jobs.hs
+++ b/src/Gargantext/Utils/Jobs.hs
+{-|
+Module      : Gargantext.Utils.Jobs
+Description : Gargantext utilities
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+-}
+
 {-# LANGUAGE TemplateHaskell #-}
 {-# LANGUAGE TypeFamilies #-}
 module Gargantext.Utils.Jobs (

--- a/src/Gargantext/Utils/Servant.hs
+++ b/src/Gargantext/Utils/Servant.hs
+{-|
+Module      : Gargantext.Utils.Servant
+Description : Gargantext utilities
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+-}
+
 module Gargantext.Utils.Servant where

 import qualified Data.ByteString.Lazy.Char8 as BSC

--- a/src/Gargantext/Utils/Tuple.hs
+++ b/src/Gargantext/Utils/Tuple.hs
+
+{-|
+Module      : Gargantext.Utils.Tuple
+Description : Gargantext utilities
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+-}
+
 module Gargantext.Utils.Tuple where

 import Protolude

--- a/src/Gargantext/Utils/UTCTime.hs
+++ b/src/Gargantext/Utils/UTCTime.hs
+{-|
+Module      : Gargantext.Utils.UTCTime
+Description : Gargantext utilities
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+-}
+
+
 {-# LANGUAGE DeriveAnyClass             #-}
 {-# LANGUAGE DeriveGeneric              #-}
 {-# LANGUAGE TypeFamilies               #-}