[istex] implement zip file upload for istex

parent 03bd0bf6
Pipeline #5213 failed with stages
in 12 minutes and 11 seconds
...@@ -55,4 +55,4 @@ main = do ...@@ -55,4 +55,4 @@ main = do
putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs') putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs')
CSV.writeFile wPath (h, docs') CSV.writeFile wPath (h, docs')
Left e -> panic $ "Error: " <> (pack e) Left e -> panic $ "Error: " <> e
...@@ -18,34 +18,25 @@ Main specifications to index a corpus with a term list ...@@ -18,34 +18,25 @@ Main specifications to index a corpus with a term list
module Main where module Main where
import Control.Concurrent.Async as CCA (mapConcurrently) import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
import Control.Monad (zipWithM)
import Control.Monad.IO.Class import Control.Monad.IO.Class
import Data.Aeson import Data.Aeson
import Data.ByteString.Lazy (writeFile)
import Data.Either (Either(..))
import Data.List (cycle, concat, unwords)
import Data.List.Split (chunksOf) import Data.List.Split (chunksOf)
import Data.Map.Strict (Map) import Data.Map.Strict qualified as DM
import qualified Data.Map.Strict as DM import Data.Text (pack)
import Data.Text (pack, Text) import Data.Text qualified as DT
import qualified Data.Text as DT import Data.Text.Lazy qualified as DTL
import Data.Text.Lazy.Encoding qualified as TLE
import Data.Tuple.Extra (both) import Data.Tuple.Extra (both)
import qualified Data.Vector as DV import Data.Vector qualified as DV
import GHC.Generics import GHC.Generics
import System.IO (hPutStr, hFlush, stderr)
import System.Environment
import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Context import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.WithList import Gargantext.Core.Text.Corpus.Parsers.CSV (readCSVFile, csv_title, csv_abstract, csv_publication_year, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.Corpus.Parsers.CSV (readCSVFile, csv_title, csv_abstract, csv_publication_year, unIntOrDec, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList) import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms (terms)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs) import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList
import Gargantext.Prelude hiding (show)
import Protolude
import System.IO (hFlush)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- OUTPUT format -- OUTPUT format
...@@ -78,7 +69,7 @@ filterTermsAndCooc patterns (year, ts) = do ...@@ -78,7 +69,7 @@ filterTermsAndCooc patterns (year, ts) = do
log m = do log m = do
tid <- myThreadId tid <- myThreadId
(p, _) <- threadCapability tid (p, _) <- threadCapability tid
putStrLn . unwords $ putText . unwords $
["filterTermsAndCooc:", m, show year, "on proc", show p] ["filterTermsAndCooc:", m, show year, "on proc", show p]
main :: IO () main :: IO ()
...@@ -97,14 +88,14 @@ main = do ...@@ -97,14 +88,14 @@ main = do
-- termListMap :: [Text] -- termListMap :: [Text]
termList <- csvMapTermList termListFile termList <- csvMapTermList termListFile
putStrLn $ show $ length termList putText $ show $ length termList
let patterns = buildPatterns termList let patterns = buildPatterns termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus) -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus) r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
writeFile outputFile $ encode (CoocByYears r) writeFile outputFile $ DTL.toStrict $ TLE.decodeUtf8 $ encode (CoocByYears r)
Left e -> panic $ "Error: " <> (pack e) Left e -> panic $ "Error: " <> e
...@@ -113,7 +104,7 @@ main = do ...@@ -113,7 +104,7 @@ main = do
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b] mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do mapMP f xs = do
bs <- zipWithM g (cycle "-\\|/") xs bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hPutStr stderr "\rDone\n" liftIO $ hPutStr stderr ("\rDone\n" :: Text)
pure bs pure bs
where where
g c x = do g c x = do
...@@ -130,6 +121,7 @@ mapConcurrentlyChunked f ts = do ...@@ -130,6 +121,7 @@ mapConcurrentlyChunked f ts = do
--terms' :: Patterns -> Text -> Corpus [[Text]] --terms' :: Patterns -> Text -> Corpus [[Text]]
terms' :: Applicative f => Patterns -> Text -> f [[Text]]
terms' pats txt = pure $ concat $ extractTermsWithList pats txt terms' pats txt = pure $ concat $ extractTermsWithList pats txt
......
...@@ -94,7 +94,7 @@ source-repository-package ...@@ -94,7 +94,7 @@ source-repository-package
source-repository-package source-repository-package
type: git type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/istex.git location: https://gitlab.iscpif.fr/gargantext/crawlers/istex.git
tag: a34bb341236d82cf3d488210bc1d8448a98f5808 tag: 9b1bd17f3ed38eab83e675bb68278922217a9c73
source-repository-package source-repository-package
type: git type: git
...@@ -167,3 +167,4 @@ package hmatrix ...@@ -167,3 +167,4 @@ package hmatrix
package sparse-linear package sparse-linear
ghc-options: -O2 -fsimpl-tick-factor=10000 -fdicts-cheap -fdicts-strict -flate-dmd-anal -fno-state-hack ghc-options: -O2 -fsimpl-tick-factor=10000 -fdicts-cheap -fdicts-strict -flate-dmd-anal -fno-state-hack
...@@ -147,6 +147,7 @@ library ...@@ -147,6 +147,7 @@ library
Gargantext.Utils.Jobs.State Gargantext.Utils.Jobs.State
Gargantext.Utils.SpacyNLP Gargantext.Utils.SpacyNLP
Gargantext.Utils.Tuple Gargantext.Utils.Tuple
Gargantext.Utils.Zip
other-modules: other-modules:
Gargantext.API.Admin.Auth Gargantext.API.Admin.Auth
Gargantext.API.Admin.FrontEnd Gargantext.API.Admin.FrontEnd
...@@ -229,6 +230,7 @@ library ...@@ -229,6 +230,7 @@ library
Gargantext.Core.Text.Corpus.Parsers.Iramuteq Gargantext.Core.Text.Corpus.Parsers.Iramuteq
Gargantext.Core.Text.Corpus.Parsers.Isidore Gargantext.Core.Text.Corpus.Parsers.Isidore
Gargantext.Core.Text.Corpus.Parsers.JSON Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
Gargantext.Core.Text.Corpus.Parsers.Json2Csv Gargantext.Core.Text.Corpus.Parsers.Json2Csv
Gargantext.Core.Text.Corpus.Parsers.RIS Gargantext.Core.Text.Corpus.Parsers.RIS
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
...@@ -665,6 +667,7 @@ executable gargantext-cli ...@@ -665,6 +667,7 @@ executable gargantext-cli
, gargantext-prelude , gargantext-prelude
, ini ^>= 0.4.1 , ini ^>= 0.4.1
, optparse-generic ^>= 1.4.7 , optparse-generic ^>= 1.4.7
, protolude ^>= 0.3.0
, split ^>= 0.2.3.4 , split ^>= 0.2.3.4
, text ^>= 1.2.4.1 , text ^>= 1.2.4.1
, unordered-containers ^>= 0.2.16.0 , unordered-containers ^>= 0.2.16.0
......
...@@ -87,7 +87,7 @@ addContact :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m) ...@@ -87,7 +87,7 @@ addContact :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
addContact u nId (AddContactParams fn ln) jobHandle = do addContact u nId (AddContactParams fn ln) jobHandle = do
markStarted 2 jobHandle markStarted 2 jobHandle
_ <- flow (Nothing :: Maybe HyperdataAnnuaire) u (Right [nId]) (Multi EN) Nothing (Just 1, yield $ hyperdataContact fn ln) jobHandle _ <- flow (Nothing :: Maybe HyperdataAnnuaire) u (Right [nId]) (Multi EN) Nothing (1, yield $ hyperdataContact fn ln) jobHandle
markComplete jobHandle markComplete jobHandle
addContact _uId _nId _p jobHandle = do addContact _uId _nId _p jobHandle = do
......
...@@ -42,7 +42,7 @@ import Gargantext.API.Node.Corpus.Searx ...@@ -42,7 +42,7 @@ import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Corpus.Update (addLanguageToCorpus) import Gargantext.API.Node.Corpus.Update (addLanguageToCorpus)
import Gargantext.API.Node.Types import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..), withDefaultLanguage) import Gargantext.Core (Lang(..), withDefaultLanguage, defaultLanguage)
import Gargantext.Core.NodeStory (HasNodeStoryImmediateSaver, HasNodeArchiveStoryImmediateSaver, currentVersion) import Gargantext.Core.NodeStory (HasNodeStoryImmediateSaver, HasNodeArchiveStoryImmediateSaver, currentVersion)
import Gargantext.Core.Text.Corpus.API qualified as API import Gargantext.Core.Text.Corpus.API qualified as API
import Gargantext.Core.Text.Corpus.Parsers qualified as Parser (FileType(..), parseFormatC) import Gargantext.Core.Text.Corpus.Parsers qualified as Parser (FileType(..), parseFormatC)
...@@ -278,33 +278,35 @@ addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m) ...@@ -278,33 +278,35 @@ addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
-> NewWithForm -> NewWithForm
-> JobHandle m -> JobHandle m
-> m () -> m ()
addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n sel) jobHandle = do addToCorpusWithForm user cid nwf jobHandle = do
-- printDebug "[addToCorpusWithForm] Parsing corpus: " cid -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
-- printDebug "[addToCorpusWithForm] fileType" ft -- printDebug "[addToCorpusWithForm] fileType" ft
-- printDebug "[addToCorpusWithForm] fileFormat" ff -- printDebug "[addToCorpusWithForm] fileFormat" ff
let l = nwf ^. wf_lang . non defaultLanguage
addLanguageToCorpus cid l addLanguageToCorpus cid l
limit' <- view $ hasConfig . gc_max_docs_parsers limit' <- view $ hasConfig . gc_max_docs_parsers
let limit = fromIntegral limit' :: Integer let limit = fromIntegral limit' :: Integer
let let
parseC = case ft of parseC = case (nwf ^. wf_filetype) of
CSV_HAL -> Parser.parseFormatC Parser.CsvHal
CSV -> Parser.parseFormatC Parser.CsvGargV3 CSV -> Parser.parseFormatC Parser.CsvGargV3
WOS -> Parser.parseFormatC Parser.WOS CSV_HAL -> Parser.parseFormatC Parser.CsvHal
PresseRIS -> Parser.parseFormatC Parser.RisPresse
Iramuteq -> Parser.parseFormatC Parser.Iramuteq Iramuteq -> Parser.parseFormatC Parser.Iramuteq
Istex -> Parser.parseFormatC Parser.Istex
JSON -> Parser.parseFormatC Parser.JSON JSON -> Parser.parseFormatC Parser.JSON
PresseRIS -> Parser.parseFormatC Parser.RisPresse
WOS -> Parser.parseFormatC Parser.WOS
-- TODO granularity of the logStatus -- TODO granularity of the logStatus
let data' = case ff of let data' = case (nwf ^. wf_fileformat) of
Plain -> cs d Plain -> cs (nwf ^. wf_data)
ZIP -> case BSB64.decode $ TE.encodeUtf8 d of ZIP -> case BSB64.decode $ TE.encodeUtf8 (nwf ^. wf_data) of
Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
Right decoded -> decoded Right decoded -> decoded
eDocsC <- liftBase $ parseC ff data' eDocsC <- liftBase $ parseC (nwf ^. wf_fileformat) data'
case eDocsC of case eDocsC of
Right (mCount, docsC) -> do Right (count, docsC) -> do
-- TODO Add progress (jobStatus) update for docs - this is a -- TODO Add progress (jobStatus) update for docs - this is a
-- long action -- long action
...@@ -333,9 +335,9 @@ addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n ...@@ -333,9 +335,9 @@ addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n
_cid' <- flowCorpus user _cid' <- flowCorpus user
(Right [cid]) (Right [cid])
(Multi l) (Multi l)
(Just sel) (Just (nwf ^. wf_selection))
--(Just $ fromIntegral $ length docs, docsC') --(Just $ fromIntegral $ length docs, docsC')
(mCount, transPipe liftBase docsC') -- TODO fix number of docs (count, transPipe liftBase docsC') -- TODO fix number of docs
--(map (map toHyperdataDocument) docs) --(map (map toHyperdataDocument) docs)
jobHandle jobHandle
...@@ -347,7 +349,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n ...@@ -347,7 +349,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n
markComplete jobHandle markComplete jobHandle
Left e -> do Left e -> do
printDebug "[addToCorpusWithForm] parse error" e printDebug "[addToCorpusWithForm] parse error" e
markFailed (Just $ T.pack e) jobHandle markFailed (Just e) jobHandle
{- {-
addToCorpusWithFile :: FlowCmdM env err m addToCorpusWithFile :: FlowCmdM env err m
......
...@@ -12,6 +12,7 @@ import Gargantext.Prelude ...@@ -12,6 +12,7 @@ import Gargantext.Prelude
data FileType = CSV data FileType = CSV
| CSV_HAL | CSV_HAL
| Istex
| PresseRIS | PresseRIS
| WOS | WOS
| Iramuteq | Iramuteq
...@@ -26,6 +27,7 @@ instance ToJSON FileType ...@@ -26,6 +27,7 @@ instance ToJSON FileType
instance FromHttpApiData FileType where instance FromHttpApiData FileType where
parseUrlPiece "CSV" = pure CSV parseUrlPiece "CSV" = pure CSV
parseUrlPiece "CSV_HAL" = pure CSV_HAL parseUrlPiece "CSV_HAL" = pure CSV_HAL
parseUrlPiece "Istex" = pure Istex
parseUrlPiece "PresseRis" = pure PresseRIS parseUrlPiece "PresseRis" = pure PresseRIS
parseUrlPiece "WOS" = pure WOS parseUrlPiece "WOS" = pure WOS
parseUrlPiece "Iramuteq" = pure Iramuteq parseUrlPiece "Iramuteq" = pure Iramuteq
......
...@@ -90,6 +90,13 @@ frameCalcUploadAsync uId nId (FrameCalcUpload _wf_lang _wf_selection) jobHandle ...@@ -90,6 +90,13 @@ frameCalcUploadAsync uId nId (FrameCalcUpload _wf_lang _wf_selection) jobHandle
case mCId of case mCId of
Nothing -> markFailure 1 Nothing jobHandle Nothing -> markFailure 1 Nothing jobHandle
Just cId -> Just cId ->
addToCorpusWithForm (RootId (NodeId uId)) cId (NewWithForm CSV Plain body _wf_lang "calc-upload.csv" _wf_selection) jobHandle addToCorpusWithForm (RootId (NodeId uId))
cId
(NewWithForm { _wf_filetype = CSV
, _wf_fileformat = Plain
, _wf_data = body
, _wf_lang
, _wf_name = "calc-upload.csv"
, _wf_selection }) jobHandle
markComplete jobHandle markComplete jobHandle
...@@ -14,22 +14,17 @@ Portability : POSIX ...@@ -14,22 +14,17 @@ Portability : POSIX
module Gargantext.Core.Ext.IMT where module Gargantext.Core.Ext.IMT where
import Data.Either (Either(..)) import Data.Either (Either(..))
import Data.List qualified as DL
import Data.Map.Strict (Map) import Data.Map.Strict (Map)
import Data.Text (Text, splitOn) import Data.Map.Strict qualified as M
import qualified Data.Set as S
import qualified Data.List as DL
import qualified Data.Vector as DV
import qualified Data.Map.Strict as M
import qualified Prelude
import Data.Morpheus.Types (GQLType) import Data.Morpheus.Types (GQLType)
import Data.Set qualified as S
import Data.Text (Text, splitOn)
import Data.Vector qualified as DV
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Gargantext.Prelude
import Gargantext.Core.Text.Metrics.Utils as Utils
import Gargantext.Core.Text.Corpus.Parsers.CSV as CSV import Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
import Gargantext.Core.Text.Metrics.Utils as Utils
import Gargantext.Prelude
data School = School { school_shortName :: Text data School = School { school_shortName :: Text
, school_longName :: Text , school_longName :: Text
...@@ -112,7 +107,7 @@ mapIdSchool :: Map Text Text ...@@ -112,7 +107,7 @@ mapIdSchool :: Map Text Text
mapIdSchool = M.fromList $ Gargantext.Prelude.map mapIdSchool = M.fromList $ Gargantext.Prelude.map
(\(School { school_shortName, school_id }) -> (school_id, school_shortName)) schools (\(School { school_shortName, school_id }) -> (school_id, school_shortName)) schools
hal_data :: IO (Either Prelude.String (DV.Vector CsvHal)) hal_data :: IO (Either Text (DV.Vector CsvHal))
hal_data = do hal_data = do
r <- CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv" r <- CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
pure $ snd <$> r pure $ snd <$> r
......
...@@ -17,12 +17,10 @@ module Gargantext.Core.Text.Convert (risPress2csvWrite) ...@@ -17,12 +17,10 @@ module Gargantext.Core.Text.Convert (risPress2csvWrite)
where where
import Data.Either (Either(..)) import Data.Either (Either(..))
import qualified Data.Text as T
import System.FilePath (FilePath()) -- , takeExtension)
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat(..), FileType(..)) import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat(..), FileType(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
import Gargantext.Prelude
import System.FilePath (FilePath()) -- , takeExtension)
risPress2csvWrite :: FilePath -> IO () risPress2csvWrite :: FilePath -> IO ()
...@@ -30,7 +28,7 @@ risPress2csvWrite f = do ...@@ -30,7 +28,7 @@ risPress2csvWrite f = do
eContents <- parseFile RisPresse Plain (f <> ".ris") eContents <- parseFile RisPresse Plain (f <> ".ris")
case eContents of case eContents of
Right contents -> writeDocs2Csv (f <> ".csv") contents Right contents -> writeDocs2Csv (f <> ".csv") contents
Left e -> panic $ "Error: " <> (T.pack e) Left e -> panic $ "Error: " <> e
...@@ -14,19 +14,16 @@ module Gargantext.Core.Text.Corpus.API.Istex ...@@ -14,19 +14,16 @@ module Gargantext.Core.Text.Corpus.API.Istex
where where
import Data.Either (Either(..)) import Data.Either (Either(..))
import Data.List (concat) import Data.List qualified as List
import Data.Maybe import Data.Maybe
import Data.Text (Text, pack) import Data.Text (Text)
import Data.Text qualified as Text
import qualified Data.Text as Text
import qualified Data.List as List
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex (toDoc)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import qualified Gargantext.Defaults as Defaults
import Gargantext.Prelude import Gargantext.Prelude
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date import ISTEX qualified as ISTEX
import qualified ISTEX as ISTEX import ISTEX.Client qualified as ISTEX
import qualified ISTEX.Client as ISTEX
type Query = Text type Query = Text
type MaxResults = Maybe Int type MaxResults = Maybe Int
...@@ -76,31 +73,4 @@ toDoc' :: Lang -> ISTEX.Documents -> IO [HyperdataDocument] ...@@ -76,31 +73,4 @@ toDoc' :: Lang -> ISTEX.Documents -> IO [HyperdataDocument]
toDoc' la docs' = mapM (toDoc la) (ISTEX._documents_hits docs') toDoc' la docs' = mapM (toDoc la) (ISTEX._documents_hits docs')
--printDebug "ISTEX" (ISTEX._documents_total docs') --printDebug "ISTEX" (ISTEX._documents_total docs')
-- | TODO remove dateSplit here
-- TODO current year as default
toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
toDoc la (ISTEX.Document i t a ab d s) = do
--printDebug "ISTEX date" d
(utctime, (pub_year, pub_month, pub_day)) <-
Date.dateSplit (maybe (Just $ pack $ show Defaults.year) (Just . pack . show) d)
--printDebug "toDoc Istex" (utctime, (pub_year, pub_month, pub_day))
pure $ HyperdataDocument { _hd_bdd = Just "Istex"
, _hd_doi = Just i
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = t
, _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" (map ISTEX._author_name a)
, _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" (concat $ (map ISTEX._author_affiliations) a)
, _hd_source = Just $ foldl (\x y -> x <> ", " <> y) "" (catMaybes $ map ISTEX._source_title s)
, _hd_abstract = ab
, _hd_publication_date = fmap (pack . show) utctime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (pack . show) la
}
This diff is collapsed.
...@@ -16,25 +16,19 @@ module Gargantext.Core.Text.Corpus.Parsers.CSV where ...@@ -16,25 +16,19 @@ module Gargantext.Core.Text.Corpus.Parsers.CSV where
import Conduit import Conduit
import Control.Applicative import Control.Applicative
import qualified Data.ByteString as BS import Data.ByteString qualified as BS
import qualified Data.ByteString.Lazy as BL import Data.ByteString.Lazy qualified as BL
import Data.Char (ord)
import Data.Csv import Data.Csv
import Data.Either (Either(..)) import Data.Text (pack)
import Data.Maybe (fromMaybe) import Data.Text qualified as T
import Data.Text (Text, pack, length, intercalate)
import Data.Time.Segment (jour) import Data.Time.Segment (jour)
import qualified Data.Vector as V
import Data.Vector (Vector) import Data.Vector (Vector)
import GHC.IO (FilePath) import Data.Vector qualified as V
import GHC.Word (Word8)
import qualified Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude hiding (length)
import Gargantext.Core.Text import Gargantext.Core.Text
import Gargantext.Core.Text.Context import Gargantext.Core.Text.Context
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude hiding (length, show)
import Protolude
--------------------------------------------------------------- ---------------------------------------------------------------
headerCsvGargV3 :: Header headerCsvGargV3 :: Header
...@@ -117,7 +111,7 @@ fromDocs docs = V.map fromDocs' docs ...@@ -117,7 +111,7 @@ fromDocs docs = V.map fromDocs' docs
-- | Split a document in its context -- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average -- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in splitDoc m splt doc = let docSize = (T.length $ csv_abstract doc) in
if docSize > 1000 if docSize > 1000
then then
if (mod (round m) docSize) >= 10 if (mod (round m) docSize) >= 10
...@@ -148,7 +142,7 @@ type Mean = Double ...@@ -148,7 +142,7 @@ type Mean = Double
docsSize :: Vector CsvDoc -> Mean docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean ls docsSize csvDoc = mean ls
where where
ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc ls = V.toList $ V.map (fromIntegral . T.length . csv_abstract) csvDoc
--------------------------------------------------------------- ---------------------------------------------------------------
...@@ -158,7 +152,7 @@ unIntOrDec :: IntOrDec -> Int ...@@ -158,7 +152,7 @@ unIntOrDec :: IntOrDec -> Int
unIntOrDec (IntOrDec i) = i unIntOrDec (IntOrDec i) = i
instance FromField IntOrDec where instance FromField IntOrDec where
parseField s = case runParser (parseField s :: Parser Int) of parseField s = case runParser (parseField s :: Parser Int) of
Left _err -> IntOrDec <$> Prelude.floor <$> (parseField s :: Parser Double) Left _err -> IntOrDec <$> floor <$> (parseField s :: Parser Double)
Right n -> pure $ IntOrDec n Right n -> pure $ IntOrDec n
instance ToField IntOrDec where instance ToField IntOrDec where
toField (IntOrDec i) = toField i toField (IntOrDec i) = toField i
...@@ -230,44 +224,40 @@ csvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d} ...@@ -230,44 +224,40 @@ csvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
delimiter :: Delimiter -> Word8 delimiter :: Delimiter -> Word8
delimiter Tab = fromIntegral $ ord '\t' delimiter Tab = fromIntegral $ ord '\t'
delimiter Comma = fromIntegral $ ord ',' delimiter Comma = fromIntegral $ ord ','
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
readCsvOn' fields fp = do
r <- readCSVFile fp
pure $ ( V.toList
. V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
. snd ) <$> r
------------------------------------------------------------------------ ------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a) => proxy a -> Delimiter -> FilePath -> IO (Either Prelude.String (Header, Vector a)) readFileLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
readFileStrict :: (FromNamedRecord a) readFileStrict :: (FromNamedRecord a)
=> proxy a => proxy a
-> Delimiter -> Delimiter
-> FilePath -> FilePath
-> IO (Either Prelude.String (Header, Vector a)) -> IO (Either Text (Header, Vector a))
readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a) readByteStringLazy :: (FromNamedRecord a)
=> proxy a => proxy a
-> Delimiter -> Delimiter
-> BL.ByteString -> BL.ByteString
-> Either Prelude.String (Header, Vector a) -> Either Text (Header, Vector a)
readByteStringLazy _f d bs = decodeByNameWith (csvDecodeOptions d) bs readByteStringLazy _f d bs = first pack $ decodeByNameWith (csvDecodeOptions d) bs
readByteStringStrict :: (FromNamedRecord a) readByteStringStrict :: (FromNamedRecord a)
=> proxy a => proxy a
-> Delimiter -> Delimiter
-> BS.ByteString -> BS.ByteString
-> Either Prelude.String (Header, Vector a) -> Either Text (Header, Vector a)
readByteStringStrict d ff = (readByteStringLazy d ff) . BL.fromStrict readByteStringStrict d ff = (readByteStringLazy d ff) . BL.fromStrict
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO use readFileLazy -- | TODO use readFileLazy
readCSVFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc)) readCSVFile :: FilePath -> IO (Either Text (Header, Vector CsvDoc))
readCSVFile fp = do readCSVFile fp = do
result <- fmap (readCsvLazyBS Comma) $ BL.readFile fp result <- fmap (readCsvLazyBS Comma) $ BL.readFile fp
case result of case result of
...@@ -277,20 +267,24 @@ readCSVFile fp = do ...@@ -277,20 +267,24 @@ readCSVFile fp = do
-- | TODO use readByteStringLazy -- | TODO use readByteStringLazy
readCsvLazyBS :: Delimiter -> BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc) readCsvLazyBS :: Delimiter
readCsvLazyBS d bs = decodeByNameWith (csvDecodeOptions d) bs -> BL.ByteString
-> Either Text (Header, Vector CsvDoc)
readCsvLazyBS d bs = first pack $ decodeByNameWith (csvDecodeOptions d) bs
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO use readFileLazy -- | TODO use readFileLazy
readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal)) readCsvHal :: FilePath -> IO (Either Text (Header, Vector CsvHal))
readCsvHal = fmap readCsvHalLazyBS . BL.readFile readCsvHal fp = do
c <- BL.readFile fp
pure $ readCsvHalLazyBS c
-- | TODO use readByteStringLazy -- | TODO use readByteStringLazy
readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal) readCsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector CsvHal)
readCsvHalLazyBS bs = decodeByNameWith (csvDecodeOptions Tab) bs readCsvHalLazyBS bs = first pack $ decodeByNameWith (csvDecodeOptions Tab) bs
readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal) readCsvHalBSStrict :: BS.ByteString -> Either Text (Header, Vector CsvHal)
readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict readCsvHalBSStrict bs = readCsvHalLazyBS $ BL.fromStrict bs
------------------------------------------------------------------------ ------------------------------------------------------------------------
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO () writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
...@@ -437,25 +431,25 @@ csv2doc (CsvDoc { .. }) ...@@ -437,25 +431,25 @@ csv2doc (CsvDoc { .. })
pubDay = fromMaybe defaultDay csv_publication_day pubDay = fromMaybe defaultDay csv_publication_day
------------------------------------------------------------------------ ------------------------------------------------------------------------
parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument]) parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
parseHal fp = do parseHal fp = do
r <- readCsvHal fp r <- readCsvHal fp
pure $ (V.toList . V.map csvHal2doc . snd) <$> r pure $ (V.toList . V.map csvHal2doc . snd) <$> r
parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument] parseHal' :: BL.ByteString -> Either Text [HyperdataDocument]
parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
------------------------------------------------------------------------ ------------------------------------------------------------------------
parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument]) parseCsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseCsv fp = fmap (V.toList . V.map csv2doc . snd) <$> readCSVFile fp parseCsv fp = fmap (V.toList . V.map csv2doc . snd) <$> readCSVFile fp
{- {-
parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument] parseCsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS Comma bs parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS Comma bs
-} -}
parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument] parseCsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseCsv' bs = do parseCsv' bs = do
let let
result = case readCsvLazyBS Comma bs of result = case readCsvLazyBS Comma bs of
...@@ -464,15 +458,13 @@ parseCsv' bs = do ...@@ -464,15 +458,13 @@ parseCsv' bs = do
(V.toList . V.map csv2doc . snd) <$> result (V.toList . V.map csv2doc . snd) <$> result
parseCsvC :: BL.ByteString parseCsvC :: BL.ByteString
-> Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument Identity ()) -> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
parseCsvC bs = do parseCsvC bs =
let (\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC csv2doc)) <$> eResult
result = case readCsvLazyBS Comma bs of where
eResult = case readCsvLazyBS Comma bs of
Left _err -> readCsvLazyBS Tab bs Left _err -> readCsvLazyBS Tab bs
Right res -> Right res Right res -> Right res
case result of
Left err -> Left err
Right r -> Right (Just $ Prelude.fromIntegral $ Prelude.length $ snd r, (yieldMany $ snd r) .| mapC csv2doc)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Csv v3 weighted for phylo -- Csv v3 weighted for phylo
......
...@@ -53,12 +53,11 @@ dateSplit Nothing = pure (Nothing, (Nothing, Nothing, Nothing)) ...@@ -53,12 +53,11 @@ dateSplit Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
dateSplit (Just txt) = do dateSplit (Just txt) = do
utcTime <- parse txt utcTime <- parse txt
let (y, m, d) = split' utcTime let (y, m, d) = split' utcTime
pure (Just utcTime, (Just y, Just m,Just d)) pure (Just utcTime, (Just y, Just m, Just d))
split' :: UTCTime -> (Year, Month, Day) split' :: UTCTime -> (Year, Month, Day)
split' utcTime = (fromIntegral y, m, d) split' (UTCTime day _) = (fromIntegral y, m, d)
where where
(UTCTime day _) = utcTime
(y,m,d) = toGregorian day (y,m,d) = toGregorian day
type Year = Int type Year = Int
......
...@@ -15,18 +15,17 @@ JSON parser for Gargantext corpus files. ...@@ -15,18 +15,17 @@ JSON parser for Gargantext corpus files.
module Gargantext.Core.Text.Corpus.Parsers.JSON where module Gargantext.Core.Text.Corpus.Parsers.JSON where
-- import Gargantext.Database.Schema.Node (NodePoly(..))
import Conduit import Conduit
import Data.Aeson import Data.Aeson
import qualified Data.ByteString.Lazy as BL import Data.ByteString.Lazy qualified as BL
import Data.Either (Either(..)) import Data.Text qualified as T
import Data.Text
import GHC.Generics import GHC.Generics
import Gargantext.Core (Lang)
import qualified Prelude import Gargantext.Core.Text.Corpus.Parsers.JSON.Istex qualified as Istex
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
-- import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Prelude hiding (length) import Gargantext.Prelude hiding (length)
import Protolude
data JSONStruct = data JSONStruct =
...@@ -62,18 +61,27 @@ instance FromJSON JSONNgrams ...@@ -62,18 +61,27 @@ instance FromJSON JSONNgrams
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO: documents -> document -> hyperdata + title etc -- | TODO: documents -> document -> hyperdata + title etc
readJSONLazyBS :: BL.ByteString -> Either Prelude.String JSONStruct readJSONLazyBS :: (FromJSON a) => BL.ByteString -> Either Text a
readJSONLazyBS bs = eitherDecode bs readJSONLazyBS bs = first T.pack $ eitherDecode bs
parseJSONC :: BL.ByteString parseJSONC :: BL.ByteString
-> Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument Identity ()) -> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
parseJSONC bs = do parseJSONC bs = f <$> readJSONLazyBS bs
case readJSONLazyBS bs of where
Left err -> Left err f (JSONStruct { documents }) =
Right (JSONStruct { documents }) -> ( fromIntegral $ length documents
Right ( Just $ Prelude.fromIntegral $ Prelude.length documents , yieldMany documents .| mapC doc2hyperdoc )
, yieldMany documents .| mapC doc2hyperdoc )
doc2hyperdoc :: JSONStructDocument -> HyperdataDocument doc2hyperdoc :: JSONStructDocument -> HyperdataDocument
doc2hyperdoc (JSONStructDocument { document = JSONDocument { hyperdata } }) = hyperdata doc2hyperdoc (JSONStructDocument { document = JSONDocument { hyperdata } }) = hyperdata
parseIstex :: Lang
-> BL.ByteString
-> IO (Either Text HyperdataDocument)
parseIstex l bs = do
let ej = readJSONLazyBS bs
case ej of
Left err -> pure $ Left err
Right j -> Right <$> Istex.toDoc l j
This diff is collapsed.
{-|
Module : Gargantext.Utils.Aeson
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Utilities for handling zip files
-}
module Gargantext.Utils.Aeson where module Gargantext.Utils.Aeson where
import Data.Aeson.Types import Data.Aeson.Types
......
{-|
Module : Gargantext.Utils.Jobs
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-} {-# LANGUAGE TypeFamilies #-}
module Gargantext.Utils.Jobs ( module Gargantext.Utils.Jobs (
......
{-|
Module : Gargantext.Utils.Servant
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Utils.Servant where module Gargantext.Utils.Servant where
import qualified Data.ByteString.Lazy.Char8 as BSC import qualified Data.ByteString.Lazy.Char8 as BSC
......
{-|
Module : Gargantext.Utils.Tuple
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Utils.Tuple where module Gargantext.Utils.Tuple where
import Protolude import Protolude
......
{-|
Module : Gargantext.Utils.UTCTime
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeFamilies #-} {-# LANGUAGE TypeFamilies #-}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment