Verified Commit dca7d379 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Merge branch 'dev' into 341-dev-websockets

parents 5286cb51 e147c115
## Version 0.0.7.1.6.3
* [BACK][FIX][CSV; TSV in all codebase (#348)](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/348)
* [FRONT][FIX][CSV TSV in all codebase (#676)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/676)
* [FRONT][FIX][TSV export for ngrams table]
## Version 0.0.7.1.6.2
* [FRONT][FIX][Pb with term selection in phylomemy (#673)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/673)
## Version 0.0.7.1.6.1
* [FRONT][FIX][Display Phylomemy parameters (#580)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/580)
......
{
"corpusPath" : "Gargantext_DocsList-nodeId-185487.csv",
"listPath" : "Gargantext_NgramsList-185488.csv",
"corpusPath" : "Gargantext_DocsList-nodeId-185487.tsv",
"listPath" : "Gargantext_NgramsList-185488.tsv",
"outputPath" : "data",
"corpusParser" : {
"tag" : "Csv",
"_csv_limit" : 1500000
"tag" : "Tsv",
"_tsv_limit" : 1500000
},
"listParser" : "V3",
"phyloName" : "bpa",
......
......@@ -19,15 +19,15 @@ import Data.Set qualified as S
import Data.Text (pack)
import Data.Vector (Vector)
import Data.Vector qualified as V
import Gargantext.Core.Text.Corpus.Parsers.CSV qualified as CSV
import Gargantext.Core.Text.Corpus.Parsers.TSV qualified as TSV
import Gargantext.Core.Text.Search
import Gargantext.Prelude
------------------------------------------------------------------------
type Query = [S.Term]
filterDocs :: [DocId] -> Vector CSV.CsvGargV3 -> Vector CSV.CsvGargV3
filterDocs docIds = V.filter (\doc -> S.member (CSV.d_docId doc) $ S.fromList docIds )
filterDocs :: [DocId] -> Vector TSV.TsvGargV3 -> Vector TSV.TsvGargV3
filterDocs docIds = V.filter (\doc -> S.member (TSV.d_docId doc) $ S.fromList docIds )
main :: IO ()
......@@ -37,19 +37,19 @@ main = do
--let q = ["water", "scarcity", "morocco", "shortage","flood"]
let q = ["gratuit", "gratuité", "culture", "culturel"]
eDocs <- CSV.readCSVFile rPath
eDocs <- TSV.readTSVFile rPath
case eDocs of
Right (h, csvDocs) -> do
putStrLn ("Number of documents before:" <> show (V.length csvDocs) :: Text)
putStrLn ("Mean size of docs:" <> show ( CSV.docsSize csvDocs) :: Text)
Right (h, tsvDocs) -> do
putStrLn ("Number of documents before:" <> show (V.length tsvDocs) :: Text)
putStrLn ("Mean size of docs:" <> show ( TSV.docsSize tsvDocs) :: Text)
let docs = CSV.toDocs csvDocs
let docs = TSV.toDocs tsvDocs
let engine = S.insertDocs docs initialDocSearchEngine
let docIds = S.query engine (map pack q)
let docs' = CSV.fromDocs $ filterDocs docIds (V.fromList docs)
let docs' = TSV.fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn ("Number of documents after:" <> show (V.length docs') :: Text)
putStrLn ("Mean size of docs:" <> show (CSV.docsSize docs') :: Text)
putStrLn ("Mean size of docs:" <> show (TSV.docsSize docs') :: Text)
CSV.writeFile wPath (h, docs')
TSV.writeFile wPath (h, docs')
Left e -> panicTrace $ "Error: " <> e
......@@ -26,8 +26,8 @@ import Data.Text.Lazy.Encoding qualified as TLE
import Data.Tuple.Extra (both)
import Data.Vector qualified as DV
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.CSV (readCSVFile, csv_title, csv_abstract, csv_publication_year, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, tsv_title, tsv_abstract, tsv_publication_year, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList )
import Gargantext.Prelude
......@@ -72,16 +72,16 @@ main = do
[corpusFile, termListFile, outputFile] <- getArgs
--corpus :: IO (DM.IntMap [[Text]])
eCorpusFile <- readCSVFile corpusFile
eCorpusFile <- readTSVFile corpusFile
case eCorpusFile of
Right cf -> do
let corpus = DM.fromListWith (<>)
. DV.toList
. DV.map (\n -> (fromMIntOrDec defaultYear $ csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
. DV.map (\n -> (fromMIntOrDec defaultYear $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)]))
. snd $ cf
-- termListMap :: [Text]
termList <- csvMapTermList termListFile
termList <- tsvMapTermList termListFile
putText $ show $ length termList
......
......@@ -41,7 +41,7 @@ main = do
let
--tt = (Unsupervised EN 6 0 Nothing)
tt = (Multi EN)
format = CsvGargV3 -- CsvHal --WOS
format = TsvGargV3 -- TsvHal --WOS
limit' = case (readMaybe limit :: Maybe Limit) of
Nothing -> panicTrace $ "Cannot read limit: " <> (Text.pack limit)
Just l -> l
......@@ -49,8 +49,8 @@ main = do
mkCorpusUser = MkCorpusUserNormalCorpusName (UserName $ cs user) (cs name :: Text)
corpus = flowCorpusFile mkCorpusUser limit' tt format Plain corpusPath Nothing DevJobHandle
corpusCsvHal :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
corpusCsvHal = flowCorpusFile mkCorpusUser limit' tt CsvHal Plain corpusPath Nothing DevJobHandle
corpusTsvHal :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
corpusTsvHal = flowCorpusFile mkCorpusUser limit' tt TsvHal Plain corpusPath Nothing DevJobHandle
annuaire :: forall m. (FlowCmdM DevEnv BackendInternalError m, MonadJobStatus m, JobHandle m ~ DevJobHandle) => m CorpusId
annuaire = flowAnnuaire (MkCorpusUserNormalCorpusName (UserName $ cs user) "Annuaire") (Multi EN) corpusPath DevJobHandle
......@@ -71,8 +71,8 @@ main = do
then runCmdGargDev env corpus
else pure 0 --(cs "false")
_ <- if fun == "corpusCsvHal"
then runCmdGargDev env corpusCsvHal
_ <- if fun == "corpusTsvHal"
then runCmdGargDev env corpusTsvHal
else pure 0 --(cs "false")
_ <- if fun == "annuaire"
......
......@@ -26,7 +26,7 @@ phyloConfig outdir = PhyloConfig {
corpusPath = "corpus.csv"
, listPath = "list.csv"
, outputPath = outdir
, corpusParser = Csv {_csv_limit = 150000}
, corpusParser = Tsv {_tsv_limit = 150000}
, listParser = V4
, phyloName = "phylo_profile_test"
, phyloScale = 2
......
......@@ -31,9 +31,9 @@ import Gargantext.API.Ngrams.Prelude (toTermList)
import Gargantext.API.Ngrams.Types
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
import Gargantext.Core.Text.Corpus.Parsers.CSV qualified as Csv
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Corpus.Parsers.TSV (tsv_title, tsv_abstract, tsv_publication_year, tsv_publication_month, tsv_publication_day, tsv'_source, tsv'_title, tsv'_abstract, tsv'_publication_year, tsv'_publication_month, tsv'_publication_day, tsv'_weight)
import Gargantext.Core.Text.Corpus.Parsers.TSV qualified as Tsv
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Core.Types.Main (ListType(..))
......
......@@ -15,9 +15,9 @@ import Gargantext.API.Ngrams.Prelude (toTermList)
import Gargantext.API.Ngrams.Types
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
import Gargantext.Core.Text.Corpus.Parsers.CSV qualified as Csv
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Corpus.Parsers.TSV (tsv_title, tsv_abstract, tsv_publication_year, tsv_publication_month, tsv_publication_day, tsv'_source, tsv'_title, tsv'_abstract, tsv'_publication_year, tsv'_publication_month, tsv'_publication_day, tsv'_weight)
import Gargantext.Core.Text.Corpus.Parsers.TSV qualified as Tsv
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
import Gargantext.Core.Types.Main (ListType(..))
......@@ -76,29 +76,29 @@ wosToDocs limit patterns time path = do
<$> fromRight [] <$> parseFile WOS Plain (path <> file) ) files
-- To transform a Csv file into a list of Document
csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
csvToDocs parser patterns time path =
-- To transform a Tsv file into a list of Document
tsvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
tsvToDocs parser patterns time path =
case parser of
Wos _ -> Prelude.error "csvToDocs: unimplemented"
Csv limit -> Vector.toList
Wos _ -> Prelude.error "tsvToDocs: unimplemented"
Tsv limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
(toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row) (fromMaybe Csv.defaultMonth $ csv_publication_month row) (fromMaybe Csv.defaultDay $ csv_publication_day row) time)
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
<$> Vector.map (\row -> Document (toPhyloDate (Tsv.fromMIntOrDec Tsv.defaultYear $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
(toPhyloDate' (Tsv.fromMIntOrDec Tsv.defaultYear $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
(termsInText patterns $ (tsv_title row) <> " " <> (tsv_abstract row))
Nothing
[]
time
) <$> snd <$> either (\err -> panicTrace $ "CSV error" <> (show err)) identity <$> Csv.readCSVFile path
Csv' limit -> Vector.toList
) <$> snd <$> either (\err -> panicTrace $ "TSV error" <> (show err)) identity <$> Tsv.readTSVFile path
Tsv' limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
(toPhyloDate' (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
(termsInText patterns $ (csv'_title row) <> " " <> (csv'_abstract row))
(Just $ csv'_weight row)
(map (T.strip . pack) $ splitOn ";" (unpack $ (csv'_source row)))
<$> Vector.map (\row -> Document (toPhyloDate (tsv'_publication_year row) (tsv'_publication_month row) (tsv'_publication_day row) time)
(toPhyloDate' (tsv'_publication_year row) (tsv'_publication_month row) (tsv'_publication_day row) time)
(termsInText patterns $ (tsv'_title row) <> " " <> (tsv'_abstract row))
(Just $ tsv'_weight row)
(map (T.strip . pack) $ splitOn ";" (unpack $ (tsv'_source row)))
time
) <$> snd <$> Csv.readWeightedCsv path
) <$> snd <$> Tsv.readWeightedTsv path
-- To parse a file into a list of Document
......@@ -107,8 +107,8 @@ fileToDocsAdvanced parser path time lst = do
let patterns = buildPatterns lst
case parser of
Wos limit -> wosToDocs limit patterns time path
Csv _ -> csvToDocs parser patterns time path
Csv' _ -> csvToDocs parser patterns time path
Tsv _ -> tsvToDocs parser patterns time path
Tsv' _ -> tsvToDocs parser patterns time path
fileToDocsDefault :: CorpusParser -> FilePath -> [TimeUnit] -> TermList -> IO [Document]
fileToDocsDefault parser path timeUnits lst =
......@@ -140,7 +140,7 @@ readListV4 path = do
fileToList :: ListParser -> FilePath -> IO TermList
fileToList parser path =
case parser of
V3 -> csvMapTermList path
V3 -> tsvMapTermList path
V4 -> fromJust
<$> toTermList MapTerm NgramsTerms
<$> readListV4 path
......
......@@ -5,7 +5,7 @@ cabal-version: 3.4
-- see: https://github.com/sol/hpack
name: gargantext
version: 0.0.7.1.6.1
version: 0.0.7.1.6.3
synopsis: Search, map, share
description: Please see README.md
category: Data
......@@ -38,13 +38,13 @@ data-files:
test-data/phylo/bpa_phylo_test.json
test-data/phylo/cleopatre.golden.json
test-data/phylo/nadal.golden.json
test-data/phylo/nadal_docslist.golden.csv
test-data/phylo/nadal_ngramslist.golden.csv
test-data/phylo/nadal_docslist.golden.tsv
test-data/phylo/nadal_ngramslist.golden.tsv
test-data/phylo/issue-290-small.golden.json
test-data/phylo/open_science.json
test-data/phylo/small-phylo.golden.json
test-data/phylo/small_phylo_docslist.csv
test-data/phylo/small_phylo_ngramslist.csv
test-data/phylo/small_phylo_docslist.tsv
test-data/phylo/small_phylo_ngramslist.tsv
test-data/phylo/187481.json
test-data/phylo/phylo2dot2json.golden.json
test-data/stemming/lancaster.txt
......@@ -190,13 +190,13 @@ library
Gargantext.Core.Text.Corpus.API.OpenAlex
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV
Gargantext.Core.Text.Corpus.Parsers.TSV
Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.List
Gargantext.Core.Text.List.Group.WithStem
Gargantext.Core.Text.List.Formats.CSV
Gargantext.Core.Text.List.Formats.TSV
Gargantext.Core.Text.Metrics
Gargantext.Core.Text.Metrics.CharByChar
Gargantext.Core.Text.Metrics.Count
......
......@@ -96,7 +96,7 @@ data GargJob
= TableNgramsJob
| ForgotPasswordJob
| UpdateNgramsListJobJSON
| UpdateNgramsListJobCSV
| UpdateNgramsListJobTSV
| AddContactJob
| AddFileJob
| DocumentFromWriteNodeJob
......
......@@ -18,7 +18,7 @@ module Gargantext.API.Ngrams.List
where
import Data.ByteString.Lazy qualified as BSL
import Data.Csv qualified as Csv
import Data.Csv qualified as Tsv
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict (toList)
......@@ -57,7 +57,7 @@ getAPI :: Named.GETAPI (AsServerT (GargM Env BackendInternalError))
getAPI = Named.GETAPI $ \listId -> Named.ListEndpoints
{ listJSONEp = getJson listId
, listJSONZipEp = getJsonZip listId
, listCSVEp = getCsv listId
, listTSVEp = getTsv listId
}
--
......@@ -91,17 +91,17 @@ getJsonZip lId = do
]
) nlz
getCsv :: HasNodeStory env err m
getTsv :: HasNodeStory env err m
=> ListId
-> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
getCsv lId = do
getTsv lId = do
lst <- getNgramsList lId
pure $ case Map.lookup NgramsTerms lst of
Nothing -> noHeader Map.empty
Just (Versioned { _v_data }) ->
addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
, pack $ show (_NodeId lId)
, ".csv"
, ".tsv"
]
) _v_data
......@@ -140,43 +140,43 @@ postAsyncJSON l ngramsList jobHandle = do
--
-- CSV API
-- TSV API
--
csvAPI :: Named.CSVAPI (AsServerT (GargM Env BackendInternalError))
csvAPI = csvPostAsync
tsvAPI :: Named.TSVAPI (AsServerT (GargM Env BackendInternalError))
tsvAPI = tsvPostAsync
------------------------------------------------------------------------
csvPostAsync :: Named.CSVAPI (AsServerT (GargM Env BackendInternalError))
csvPostAsync = Named.CSVAPI $ \lId -> AsyncJobs $
serveJobsAPI UpdateNgramsListJobCSV $ \jHandle f -> do
case ngramsListFromCSVData (_wtf_data f) of
tsvPostAsync :: Named.TSVAPI (AsServerT (GargM Env BackendInternalError))
tsvPostAsync = Named.TSVAPI $ \lId -> AsyncJobs $
serveJobsAPI UpdateNgramsListJobTSV $ \jHandle f -> do
case ngramsListFromTSVData (_wtf_data f) of
Left err -> serverError $ err500 { errReasonPhrase = err }
Right ngramsList -> postAsyncJSON lId ngramsList jHandle
-- | Tries converting a text file into an 'NgramList', so that we can reuse the
-- existing JSON endpoint for the CSV upload.
ngramsListFromCSVData :: Text -> Either Prelude.String NgramsList
ngramsListFromCSVData csvData = case decodeCsv of
-- /NOTE/ The legacy CSV data only supports terms in imports and exports, so this is
-- existing JSON endpoint for the TSV upload.
ngramsListFromTSVData :: Text -> Either Prelude.String NgramsList
ngramsListFromTSVData tsvData = case decodeTsv of
-- /NOTE/ The legacy TSV data only supports terms in imports and exports, so this is
-- all we care about.
Left err -> Left $ "Invalid CSV found in ngramsListFromCSVData: " <> err
Left err -> Left $ "Invalid TSV found in ngramsListFromTSVData: " <> err
Right terms -> pure $ Map.fromList [ (NgramsTerms, Versioned 0 $ mconcat . Vec.toList $ terms) ]
where
binaryData = BSL.fromStrict $ P.encodeUtf8 csvData
binaryData = BSL.fromStrict $ P.encodeUtf8 tsvData
decodeCsv :: Either Prelude.String (Vector NgramsTableMap)
decodeCsv = Csv.decodeWithP csvToNgramsTableMap
(Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
Csv.HasHeader
decodeTsv :: Either Prelude.String (Vector NgramsTableMap)
decodeTsv = Tsv.decodeWithP tsvToNgramsTableMap
(Tsv.defaultDecodeOptions { Tsv.decDelimiter = fromIntegral (P.ord '\t') })
Tsv.HasHeader
binaryData
-- | Converts a plain CSV 'Record' into an NgramsTableMap
csvToNgramsTableMap :: Csv.Record -> Csv.Parser NgramsTableMap
csvToNgramsTableMap record = case Vec.toList record of
-- | Converts a plain TSV 'Record' into an NgramsTableMap
tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser NgramsTableMap
tsvToNgramsTableMap record = case Vec.toList record of
(map P.decodeUtf8 -> [status, label, forms])
-> pure $ conv status label forms
_ -> Prelude.fail "csvToNgramsTableMap failed"
_ -> Prelude.fail "tsvToNgramsTableMap failed"
where
conv :: Text -> Text -> Text -> NgramsTableMap
......@@ -199,12 +199,12 @@ csvToNgramsTableMap record = case Vec.toList record of
------------------------------------------------------------------------
-- | This is for debugging the CSV parser in the REPL
importCsvFile :: forall env err m. (HasNodeStory env err m, HasServerError err, MonadJobStatus m)
-- | This is for debugging the TSV parser in the REPL
importTsvFile :: forall env err m. (HasNodeStory env err m, HasServerError err, MonadJobStatus m)
=> ListId -> P.FilePath -> m ()
importCsvFile lId fp = do
importTsvFile lId fp = do
contents <- liftBase $ P.readFile fp
case ngramsListFromCSVData contents of
case ngramsListFromTSVData contents of
Left err -> serverError $ err500 { errReasonPhrase = err }
Right ngramsList -> postAsyncJSON lId ngramsList (noJobHandle @m Proxy)
......
......@@ -24,7 +24,7 @@ import Control.Category ((>>>))
import Control.Lens (makePrisms, Iso', iso, from, (.=), (?=), (#), to, folded, {-withIndex, ifolded,-} view, use, (^?), (%~), (%=), at, _Just, Each(..), itraverse_, both, forOf_, (?~), over)
import Data.Aeson hiding ((.=))
import Data.Csv (defaultEncodeOptions, encodeByNameWith, header, namedRecord, EncodeOptions(..), NamedRecord, Quoting(QuoteNone))
import Data.Csv qualified as Csv
import Data.Csv qualified as Tsv
import Data.HashMap.Strict.InsOrd qualified as InsOrdHashMap
import Data.List qualified as List
import Data.Map.Strict qualified as Map
......@@ -47,7 +47,7 @@ import Gargantext.Database.Admin.Types.Node (ContextId)
import Gargantext.Database.Prelude (fromField', HasConnectionPool, HasConfig, CmdM')
import Gargantext.Prelude hiding (IsString, hash, from, replace, to)
import Gargantext.Prelude.Crypto.Hash (IsHashable(..))
import Gargantext.Utils.Servant (CSV, ZIP)
import Gargantext.Utils.Servant (TSV, ZIP)
import Gargantext.Utils.Zip (zipContentsPure)
import Servant ( FromHttpApiData(parseUrlPiece), ToHttpApiData(toUrlPiece), Required, Strict, QueryParam', MimeRender(.. ), MimeUnrender(..))
import Servant.Job.Utils (jsonOptions)
......@@ -299,10 +299,10 @@ data NgramsSearchQuery = NgramsSearchQuery
type NgramsTableMap = Map NgramsTerm NgramsRepoElement
-- CSV:
-- TSV:
-- header: status\tlabel\tforms
-- item: map\taccountability\taccounting|&|accoutns|&|account
instance MimeRender CSV NgramsTableMap where
instance MimeRender TSV NgramsTableMap where
-- mimeRender _ _val = encode ([] :: [(Text, Text)])
mimeRender _ val = encodeByNameWith encOptions (header ["status", "label", "forms"]) $ fn <$> Map.toList val
where
......@@ -310,9 +310,9 @@ instance MimeRender CSV NgramsTableMap where
, encQuoting = QuoteNone }
fn :: (NgramsTerm, NgramsRepoElement) -> NamedRecord
fn (NgramsTerm term, NgramsRepoElement { _nre_list, _nre_children }) =
namedRecord [ "status" Csv..= toText _nre_list
, "label" Csv..= term
, "forms" Csv..= T.intercalate "|&|" (unNgramsTerm <$> mSetToList _nre_children)]
namedRecord [ "status" Tsv..= toText _nre_list
, "label" Tsv..= term
, "forms" Tsv..= T.intercalate "|&|" (unNgramsTerm <$> mSetToList _nre_children)]
toText :: ListType -> Text
toText CandidateTerm = "candidate"
toText MapTerm = "map"
......
......@@ -240,8 +240,8 @@ addToCorpusWithForm user cid nwf jobHandle = do
let limit = fromIntegral limit' :: Integer
let
parseC = case (nwf ^. wf_filetype) of
CSV -> Parser.parseFormatC Parser.CsvGargV3
CSV_HAL -> Parser.parseFormatC Parser.CsvHal
TSV -> Parser.parseFormatC Parser.TsvGargV3
TSV_HAL -> Parser.parseFormatC Parser.TsvHal
Iramuteq -> Parser.parseFormatC Parser.Iramuteq
Istex -> Parser.parseFormatC Parser.Istex
JSON -> Parser.parseFormatC Parser.JSON
......
......@@ -8,8 +8,8 @@ import Servant
import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
data FileType = CSV
| CSV_HAL
data FileType = TSV
| TSV_HAL
| Istex
| PresseRIS
| WOS
......@@ -17,14 +17,14 @@ data FileType = CSV
| JSON
deriving (Eq, Show, Generic)
instance ToSchema FileType
instance Arbitrary FileType where arbitrary = elements [CSV, PresseRIS]
instance Arbitrary FileType where arbitrary = elements [TSV, PresseRIS]
instance ToParamSchema FileType
instance FromJSON FileType
instance ToJSON FileType
instance FromHttpApiData FileType where
parseUrlPiece "CSV" = pure CSV
parseUrlPiece "CSV_HAL" = pure CSV_HAL
parseUrlPiece "TSV" = pure TSV
parseUrlPiece "TSV_HAL" = pure TSV_HAL
parseUrlPiece "Istex" = pure Istex
parseUrlPiece "PresseRis" = pure PresseRIS
parseUrlPiece "WOS" = pure WOS
......
......@@ -41,7 +41,7 @@ documentExportAPI :: IsGargServer env err m
documentExportAPI userNodeId dId = Named.DocumentExportAPI $ Named.DocumentExportEndpoints
{ exportJSONEp = getDocumentsJSON userNodeId dId
, exportJSONZipEp = getDocumentsJSONZip userNodeId dId
, exportCSVEp = getDocumentsCSV userNodeId dId
, exportTSVEp = getDocumentsTSV userNodeId dId
}
--------------------------------------------------
......@@ -98,16 +98,16 @@ getDocumentsJSONZip userNodeId pId = do
, dezFileName dexpz
, ".zip" ]) dexpz
getDocumentsCSV :: NodeId
getDocumentsTSV :: NodeId
-- ^ The Node ID of the target user
-> DocId
-> GargNoServer (Headers '[Header "Content-Disposition" T.Text] T.Text) -- [Document]
getDocumentsCSV userNodeId pId = do
getDocumentsTSV userNodeId pId = do
dJSON <- getDocumentsJSON userNodeId pId
let DocumentExport { _de_documents } = getResponse dJSON
let ret = TE.decodeUtf8 $ BSC.toStrict $ encodeDefaultOrderedByNameWith (defaultEncodeOptions {encDelimiter = fromIntegral $ ord '\t', encQuoting = QuoteAll }) _de_documents
pure $ addHeader (T.concat [ "attachment; filename=GarganText_DocsList-"
, T.pack $ show pId
, ".csv"])
, ".tsv"])
ret
......@@ -89,7 +89,7 @@ frameCalcUploadAsync authenticatedUser nId (FrameCalcUpload _wf_lang _wf_selecti
-- FIXME(adn) Audit this conversion.
addToCorpusWithForm (RootId userNodeId)
cId
(NewWithForm { _wf_filetype = CSV
(NewWithForm { _wf_filetype = TSV
, _wf_fileformat = Plain
, _wf_data = body
, _wf_lang
......
......@@ -31,7 +31,7 @@ data DocumentExportEndpoints mode = DocumentExportEndpoints
:> Get '[JSON] (Headers '[Servant.Header "Content-Disposition" Text] DocumentExport)
, exportJSONZipEp :: mode :- "json.zip"
:> Get '[ZIP] (Headers '[Servant.Header "Content-Disposition" Text] DocumentExportZIP)
, exportCSVEp :: mode :- "csv"
, exportTSVEp :: mode :- "tsv"
:> Get '[PlainText] (Headers '[Servant.Header "Content-Disposition" Text] Text)
} deriving Generic
......
......@@ -5,7 +5,7 @@ module Gargantext.API.Routes.Named.List (
GETAPI(..)
, ListEndpoints(..)
, JSONAPI(..)
, CSVAPI(..)
, TSVAPI(..)
) where
import Data.Text (Text)
......@@ -30,7 +30,7 @@ data ListEndpoints mode = ListEndpoints
:> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
, listJSONZipEp :: mode :- "json.zip"
:> Get '[GUS.ZIP] (Headers '[Header "Content-Disposition" Text] NgramsListZIP)
, listCSVEp :: mode :- "csv" :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
, listTSVEp :: mode :- "tsv" :> Get '[GUS.TSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
} deriving Generic
newtype JSONAPI mode = JSONAPI
......@@ -44,11 +44,11 @@ newtype JSONAPI mode = JSONAPI
} deriving Generic
newtype CSVAPI mode = CSVAPI
{ updateListCSVEp :: mode :- Summary "Update List (legacy v3 CSV)"
newtype TSVAPI mode = TSVAPI
{ updateListTSVEp :: mode :- Summary "Update List (legacy v3 TSV)"
:> "lists"
:> Capture "listId" ListId
:> "csv"
:> "tsv"
:> "add"
:> "form"
:> "async"
......
......@@ -91,7 +91,7 @@ data GargPrivateAPI' mode = GargPrivateAPI'
, addWithQueryEp :: mode :- NamedRoutes AddWithQuery
, listGetAPI :: mode :- NamedRoutes List.GETAPI
, listJsonAPI :: mode :- NamedRoutes List.JSONAPI
, listCsvAPI :: mode :- NamedRoutes List.CSVAPI
, listTsvAPI :: mode :- NamedRoutes List.TSVAPI
, shareUrlEp :: mode :- "shareurl" :> NamedRoutes ShareURL
} deriving Generic
......
......@@ -63,6 +63,6 @@ serverPrivateGargAPI' authenticatedUser@(AuthenticatedUser userNodeId userId)
, addWithQueryEp = addCorpusWithQuery (RootId userNodeId)
, listGetAPI = List.getAPI
, listJsonAPI = List.jsonAPI
, listCsvAPI = List.csvAPI
, listTsvAPI = List.tsvAPI
, shareUrlEp = shareURL
}
......@@ -19,7 +19,7 @@ import Data.Morpheus.Types (GQLType)
import Data.Set qualified as S
import Data.Text (splitOn)
import Data.Vector qualified as DV
import Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
import Gargantext.Core.Text.Corpus.Parsers.TSV as TSV
import Gargantext.Core.Text.Metrics.Utils as Utils
import Gargantext.Prelude
......@@ -104,9 +104,9 @@ mapIdSchool :: Map Text Text
mapIdSchool = M.fromList $ Gargantext.Prelude.map
(\(School { school_shortName, school_id }) -> (school_id, school_shortName)) schools
hal_data :: IO (Either Text (DV.Vector CsvHal))
hal_data :: IO (Either Text (DV.Vector TsvHal))
hal_data = do
r <- CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
r <- TSV.readTsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
pure $ snd <$> r
names :: S.Set Text
......@@ -117,7 +117,7 @@ toSchoolName t = case M.lookup t mapIdSchool of
Nothing -> t
Just t' -> t'
publisBySchool :: DV.Vector CsvHal -> [(Maybe Text, Int)]
publisBySchool :: DV.Vector TsvHal -> [(Maybe Text, Int)]
publisBySchool hal_data' = Gargantext.Prelude.map (\(i,n) -> (M.lookup i mapIdSchool, n))
$ DL.filter (\i -> S.member (fst i) names)
$ DL.reverse
......@@ -126,5 +126,5 @@ publisBySchool hal_data' = Gargantext.Prelude.map (\(i,n) -> (M.lookup i mapIdSc
$ Utils.freq
$ DL.concat
$ DV.toList
$ DV.map (\n -> splitOn ( ", ") (csvHal_instStructId_i n) )
$ DV.filter (\n -> csvHal_publication_year n == 2017) hal_data'
$ DV.map (\n -> splitOn ( ", ") (tsvHal_instStructId_i n) )
$ DV.filter (\n -> tsvHal_publication_year n == 2017) hal_data'
......@@ -22,7 +22,7 @@ import Data.Csv ( (.:), header, decodeByNameWith, FromNamedRecord(..), Header )
import Data.Text qualified as T
import Data.Vector (Vector)
import Data.Vector qualified as Vector
import Gargantext.Core.Text.Corpus.Parsers.CSV ( csvDecodeOptions, Delimiter(Tab) )
import Gargantext.Core.Text.Corpus.Parsers.TSV ( tsvDecodeOptions, Delimiter(Tab) )
import Gargantext.Database.Admin.Types.Hyperdata.Contact
import Gargantext.Prelude
import System.FilePath.Posix (takeExtension)
......@@ -30,7 +30,7 @@ import System.FilePath.Posix (takeExtension)
------------------------------------------------------------------------
readFile_Annuaire :: FilePath -> IO [HyperdataContact]
readFile_Annuaire fp = case takeExtension fp of
".csv" -> readCSVFile_Annuaire fp
".csv" -> readTSVFile_Annuaire fp
".data" -> deserialiseImtUsersFromFile fp
unknownExt -> panicTrace $ "[G.C.E.I.readFile_Annuaire] extension unknown: " <> T.pack unknownExt
......@@ -69,7 +69,7 @@ data IMTUser = IMTUser
, date_modification :: Maybe Text
} deriving (Eq, Show, Generic)
-- | CSV instance
-- | TSV instance
instance FromNamedRecord IMTUser where
parseNamedRecord r = do
id <- r .: "id"
......@@ -105,21 +105,21 @@ instance FromNamedRecord IMTUser where
date_modification <- r .: "date_modification"
pure $ IMTUser {..}
headerCSVannuaire :: Header
headerCSVannuaire =
headerTSVannuaire :: Header
headerTSVannuaire =
header ["id","entite","mail","nom","prenom","fonction","fonction2","tel","fax","service","groupe","entite2","service2","groupe2","bureau","url","pservice","pfonction","afonction","afonction2","grprech","appellation","lieu","aprecision","atel","sexe","statut","idutilentite","actif","idutilsiecoles","date_modification"]
readCSVFile_Annuaire :: FilePath -> IO [HyperdataContact]
readCSVFile_Annuaire fp = do
users <- snd <$> readCSVFile_Annuaire' fp
readTSVFile_Annuaire :: FilePath -> IO [HyperdataContact]
readTSVFile_Annuaire fp = do
users <- snd <$> readTSVFile_Annuaire' fp
pure $ map imtUser2gargContact $ Vector.toList users
readCSVFile_Annuaire' :: FilePath -> IO (Header, Vector IMTUser)
readCSVFile_Annuaire' = fmap readCsvHalLazyBS' . BL.readFile
readTSVFile_Annuaire' :: FilePath -> IO (Header, Vector IMTUser)
readTSVFile_Annuaire' = fmap readTsvHalLazyBS' . BL.readFile
where
readCsvHalLazyBS' :: BL.ByteString -> (Header, Vector IMTUser)
readCsvHalLazyBS' bs = case decodeByNameWith (csvDecodeOptions Tab) bs of
readTsvHalLazyBS' :: BL.ByteString -> (Header, Vector IMTUser)
readTsvHalLazyBS' bs = case decodeByNameWith (tsvDecodeOptions Tab) bs of
Left e -> panicTrace (cs e)
Right rows -> rows
......
......@@ -13,19 +13,19 @@ Format Converter.
{-# LANGUAGE PackageImports #-}
module Gargantext.Core.Text.Convert (risPress2csvWrite)
module Gargantext.Core.Text.Convert (risPress2tsvWrite)
where
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat(..), FileType(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv)
import Gargantext.Prelude
risPress2csvWrite :: FilePath -> IO ()
risPress2csvWrite f = do
risPress2tsvWrite :: FilePath -> IO ()
risPress2tsvWrite f = do
eContents <- parseFile RisPresse Plain (f <> ".ris")
case eContents of
Right contents -> writeDocs2Csv (f <> ".csv") contents
Right contents -> writeDocs2Tsv (f <> ".csv") contents
Left e -> panicTrace $ "Error: " <> e
......
......@@ -15,13 +15,13 @@ module Gargantext.Core.Text.Corpus.API.Isidore (
get
-- * Internals (possibly unused?)
, isidore2csvFile
, isidore2tsvFile
) where
import Data.Text qualified as Text
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers (cleanText)
import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults
......@@ -50,12 +50,12 @@ get lang l q a = do
hDocs <- mapM (isidoreToDoc lang) (toIsidoreDocs iDocs)
pure hDocs
isidore2csvFile :: FilePath -> Lang -> Maybe Isidore.Limit
isidore2tsvFile :: FilePath -> Lang -> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
-> IO ()
isidore2csvFile fp lang li tq aq = do
isidore2tsvFile fp lang li tq aq = do
hdocs <- get lang li tq aq
writeDocs2Csv fp hdocs
writeDocs2Tsv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc lang (IsidoreDoc t a d u s as) = do
......
......@@ -47,7 +47,7 @@ import Data.Text qualified as DT
import Data.Tuple.Extra (both) -- , first, second)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.TSV (parseHal, parseTsv, parseTsvC)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
import Gargantext.Core.Text.Corpus.Parsers.Iramuteq qualified as Iramuteq
......@@ -78,8 +78,8 @@ type ParseError = Text
data FileType = WOS
| RIS
| RisPresse
| CsvGargV3
| CsvHal
| TsvGargV3
| TsvHal
| Iramuteq
| JSON
| Istex
......@@ -109,11 +109,11 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
-> FileFormat
-> DB.ByteString
-> m (Either DT.Text (Integer, ConduitT () HyperdataDocument IO ()))
do_parse CsvGargV3 Plain bs = do
let eParsedC = parseCsvC $ DBL.fromStrict bs
do_parse TsvGargV3 Plain bs = do
let eParsedC = parseTsvC $ DBL.fromStrict bs
pure (second (transPipe (pure . runIdentity)) <$> eParsedC)
do_parse CsvHal Plain bs = do
let eParsedC = parseCsvC $ DBL.fromStrict bs
do_parse TsvHal Plain bs = do
let eParsedC = parseTsvC $ DBL.fromStrict bs
pure (second (transPipe (pure . runIdentity)) <$> eParsedC)
do_parse Istex Plain bs = do
ep <- liftBase $ parseIstex EN $ DBL.fromStrict bs
......@@ -188,8 +188,8 @@ etale = concatMap etale'
-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
-- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
-- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
-- parseFormat TsvGargV3 bs = pure $ parseTsv' $ DBL.fromStrict bs
-- parseFormat TsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
-- parseFormat RisPresse bs = do
-- docs <- mapM (toDoc RIS)
-- <$> snd
......@@ -220,8 +220,8 @@ parseFile :: FileType
-> FileFormat
-> FilePath
-> IO (Either Text [HyperdataDocument])
parseFile CsvGargV3 Plain p = parseCsv p
parseFile CsvHal Plain p = parseHal p
parseFile TsvGargV3 Plain p = parseTsv p
parseFile TsvHal Plain p = parseHal p
parseFile RisPresse Plain p = do
docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
pure $ Right docs
......
......@@ -18,7 +18,7 @@ import Data.ByteString.Lazy qualified as DBL
import Data.List qualified as List
import Data.Text qualified as DT
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (hyperdataDocument2csv)
import Gargantext.Core.Text.Corpus.Parsers.TSV (hyperdataDocument2tsv)
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
......@@ -29,13 +29,13 @@ import System.Directory -- (getDirectoryContents)
type FileOut = FilePath
book2csv :: Int -> FileDir -> FileOut -> IO ()
book2csv n f_in f_out = do
book2tsv :: Int -> FileDir -> FileOut -> IO ()
book2tsv n f_in f_out = do
files <- filesOf f_in
texts <- readPublis f_in files
let publis = concatMap (file2publi n) texts
let docs = zipWith publiToHyperdata [1..] publis
DBL.writeFile f_out (hyperdataDocument2csv docs)
DBL.writeFile f_out (hyperdataDocument2tsv docs)
filesOf :: FileDir -> IO [FilePath]
filesOf fd = List.sort -- sort by filename
......
......@@ -14,14 +14,14 @@ Json parser to export towoard CSV GargV3 format.
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.Text.Corpus.Parsers.Json2Csv (json2csv, readPatents)
module Gargantext.Core.Text.Corpus.Parsers.Json2Csv (json2tsv, readPatents)
where
import Data.Aeson ( decode )
import Data.ByteString.Lazy (readFile)
import Data.Text (unpack)
import Data.Vector (fromList)
import Gargantext.Core.Text.Corpus.Parsers.CSV (CsvDoc(..), writeFile, headerCsvGargV3)
import Gargantext.Core.Text.Corpus.Parsers.TSV (TsvDoc(..), writeFile, headerTsvGargV3)
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (readFile, writeFile)
......@@ -41,20 +41,20 @@ readPatents fp = decode <$> readFile fp
type FilePathIn = FilePath
type FilePathOut = FilePath
json2csv :: FilePathIn -> FilePathOut -> IO ()
json2csv fin fout = do
patents <- maybe (panicTrace "json2csv error") identity <$> readPatents fin
writeFile fout (headerCsvGargV3, fromList $ map patent2csvDoc patents)
patent2csvDoc :: Patent -> CsvDoc
patent2csvDoc (Patent { .. }) =
CsvDoc { csv_title = _patent_title
, csv_source = "Source"
, csv_publication_year = Just $ read (unpack _patent_year)
, csv_publication_month = Just $ Defaults.month
, csv_publication_day = Just $ Defaults.day
, csv_abstract = _patent_abstract
, csv_authors = "Authors" }
json2tsv :: FilePathIn -> FilePathOut -> IO ()
json2tsv fin fout = do
patents <- maybe (panicTrace "json2tsv error") identity <$> readPatents fin
writeFile fout (headerTsvGargV3, fromList $ map patent2tsvDoc patents)
patent2tsvDoc :: Patent -> TsvDoc
patent2tsvDoc (Patent { .. }) =
TsvDoc { tsv_title = _patent_title
, tsv_source = "Source"
, tsv_publication_year = Just $ read (unpack _patent_year)
, tsv_publication_month = Just $ Defaults.month
, tsv_publication_day = Just $ Defaults.day
, tsv_abstract = _patent_abstract
, tsv_authors = "Authors" }
......
{-|
Module : Gargantext.Core.Text.List.Formats.CSV
Module : Gargantext.Core.Text.List.Formats.TSV
Description :
Copyright : (c) CNRS, 2018-Present
License : AGPL + CECILL v3
......@@ -7,12 +7,12 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
CSV parser for Gargantext corpus files.
TSV parser for Gargantext corpus files.
-}
module Gargantext.Core.Text.List.Formats.CSV where
module Gargantext.Core.Text.List.Formats.TSV where
import Control.Applicative
import Data.ByteString.Lazy qualified as BL
......@@ -25,74 +25,74 @@ import Gargantext.Core.Text.Context
import Gargantext.Prelude hiding (length)
------------------------------------------------------------------------
csvMapTermList :: FilePath -> IO TermList
csvMapTermList fp = csv2list CsvMap <$> snd <$> fromCsvListFile fp
tsvMapTermList :: FilePath -> IO TermList
tsvMapTermList fp = tsv2list TsvMap <$> snd <$> fromTsvListFile fp
csv2list :: CsvListType -> Vector CsvList -> TermList
csv2list lt vs = V.toList $ V.map (\(CsvList _ label forms)
-> (DT.words label, [DT.words label] <> (filter (not . null) . map DT.words $ DT.splitOn csvListFormsDelimiter forms)))
$ V.filter (\l -> csvList_status l == lt ) vs
tsv2list :: TsvListType -> Vector TsvList -> TermList
tsv2list lt vs = V.toList $ V.map (\(TsvList _ label forms)
-> (DT.words label, [DT.words label] <> (filter (not . null) . map DT.words $ DT.splitOn tsvListFormsDelimiter forms)))
$ V.filter (\l -> tsvList_status l == lt ) vs
------------------------------------------------------------------------
data CsvListType = CsvMap | CsvStop | CsvCandidate
data TsvListType = TsvMap | TsvStop | TsvCandidate
deriving (Read, Show, Eq)
------------------------------------------------------------------------
-- CSV List Main Configuration
csvListFieldDelimiter :: Char
csvListFieldDelimiter = '\t'
-- TSV List Main Configuration
tsvListFieldDelimiter :: Char
tsvListFieldDelimiter = '\t'
csvListFormsDelimiter :: Text
csvListFormsDelimiter = "|&|"
tsvListFormsDelimiter :: Text
tsvListFormsDelimiter = "|&|"
------------------------------------------------------------------------
data CsvList = CsvList
{ csvList_status :: !CsvListType
, csvList_label :: !Text
, csvList_forms :: !Text
data TsvList = TsvList
{ tsvList_status :: !TsvListType
, tsvList_label :: !Text
, tsvList_forms :: !Text
}
deriving (Show)
------------------------------------------------------------------------
instance FromNamedRecord CsvList where
parseNamedRecord r = CsvList <$> r .: "status"
instance FromNamedRecord TsvList where
parseNamedRecord r = TsvList <$> r .: "status"
<*> r .: "label"
<*> r .: "forms"
instance ToNamedRecord CsvList where
toNamedRecord (CsvList s l f) =
instance ToNamedRecord TsvList where
toNamedRecord (TsvList s l f) =
namedRecord [ "status" .= s
, "label" .= l
, "forms" .= f
]
------------------------------------------------------------------------
instance FromField CsvListType where
parseField "map" = pure CsvMap
parseField "main" = pure CsvCandidate
parseField "candidate" = pure CsvCandidate -- backward compat
parseField "stop" = pure CsvStop
instance FromField TsvListType where
parseField "map" = pure TsvMap
parseField "main" = pure TsvCandidate
parseField "candidate" = pure TsvCandidate -- backward compat
parseField "stop" = pure TsvStop
parseField _ = mzero
instance ToField CsvListType where
toField CsvMap = "map"
toField CsvCandidate = "main"
toField CsvStop = "stop"
instance ToField TsvListType where
toField TsvMap = "map"
toField TsvCandidate = "main"
toField TsvStop = "stop"
------------------------------------------------------------------------
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = (defaultDecodeOptions
{decDelimiter = fromIntegral $ ord csvListFieldDelimiter}
tsvDecodeOptions :: DecodeOptions
tsvDecodeOptions = (defaultDecodeOptions
{decDelimiter = fromIntegral $ ord tsvListFieldDelimiter}
)
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = ( defaultEncodeOptions
{encDelimiter = fromIntegral $ ord csvListFieldDelimiter}
tsvEncodeOptions :: EncodeOptions
tsvEncodeOptions = ( defaultEncodeOptions
{encDelimiter = fromIntegral $ ord tsvListFieldDelimiter}
)
------------------------------------------------------------------------
fromCsvListFile :: FilePath -> IO (Header, Vector CsvList)
fromCsvListFile fp = do
csvData <- BL.readFile fp
case decodeByNameWith csvDecodeOptions csvData of
fromTsvListFile :: FilePath -> IO (Header, Vector TsvList)
fromTsvListFile fp = do
tsvData <- BL.readFile fp
case decodeByNameWith tsvDecodeOptions tsvData of
Left e -> panicTrace (pack e)
Right csvList -> pure csvList
Right tsvList -> pure tsvList
------------------------------------------------------------------------
toCsvListFile :: FilePath -> (Header, Vector CsvList) -> IO ()
toCsvListFile fp (h, vs) = BL.writeFile fp $
encodeByNameWith csvEncodeOptions h (V.toList vs)
toTsvListFile :: FilePath -> (Header, Vector TsvList) -> IO ()
toTsvListFile fp (h, vs) = BL.writeFile fp $
encodeByNameWith tsvEncodeOptions h (V.toList vs)
------------------------------------------------------------------------
......@@ -7,7 +7,7 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This search Engine is first made to clean CSV file according to a query.
This search Engine is first made to clean TSV file according to a query.
Starting from this model, a specific Gargantext engine will be made
(using more metrics scores/features).
......@@ -17,7 +17,7 @@ module Gargantext.Core.Text.Search where
import Data.Ix
import Data.SearchEngine
import Gargantext.Core.Text.Corpus.Parsers.CSV
import Gargantext.Core.Text.Corpus.Parsers.TSV
import Gargantext.Core.Text.Terms.Mono (monoTexts)
import Gargantext.Core.Text.Terms.Mono.Stem as ST
import Gargantext.Prelude
......@@ -29,7 +29,7 @@ import Gargantext.Prelude
type DocId = Int
type DocSearchEngine = SearchEngine
CsvGargV3
TsvGargV3
DocId
DocField
NoFeatures
......@@ -42,7 +42,7 @@ initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine =
initSearchEngine docSearchConfig defaultSearchRankParameters
docSearchConfig :: SearchConfig CsvGargV3 DocId DocField NoFeatures
docSearchConfig :: SearchConfig TsvGargV3 DocId DocField NoFeatures
docSearchConfig =
SearchConfig {
documentKey = d_docId,
......@@ -51,7 +51,7 @@ docSearchConfig =
documentFeatureValue = const noFeatures
}
where
extractTerms :: CsvGargV3 -> DocField -> [Text]
extractTerms :: TsvGargV3 -> DocField -> [Text]
extractTerms doc TitleField = monoTexts (d_title doc)
extractTerms doc AbstractField = monoTexts (d_abstract doc)
......
......@@ -13,7 +13,7 @@ where
import qualified Data.Map.Strict as Map
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import Gargantext.Utils.Servant (CSV, Markdown)
import Gargantext.Utils.Servant (TSV, Markdown)
import Network.HTTP.Client (newManager, Request(..))
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Protolude
......@@ -26,10 +26,10 @@ newtype DocId = DocId { fromDocId :: Text }
newtype Data = Data { fromData :: Text }
data ContentType a =
CTPlain a
| CTCSV a
| CTTSV a
-- TODO SocialCalc, Excel XML ?
instance MimeRender CSV Data where
instance MimeRender TSV Data where
mimeRender p (Data d) = mimeRender p d
instance MimeRender PlainText Data where
mimeRender p (Data d) = mimeRender p d
......@@ -49,14 +49,14 @@ type EthercalcAPI =
:> ReqBody '[PlainText] Data
:> Put '[PlainText] Text
-- csv
-- tsv
:<|>
ReqBody '[CSV] Data
:> Post '[PlainText, CSV] Text
ReqBody '[TSV] Data
:> Post '[PlainText, TSV] Text
:<|>
Capture "docId" DocId
:> ReqBody '[CSV] Data
:> Put '[PlainText, CSV] Text
:> ReqBody '[TSV] Data
:> Put '[PlainText, TSV] Text
)
ethercalcAPI :: Proxy EthercalcAPI
......@@ -64,16 +64,16 @@ ethercalcAPI = Proxy
ethercalcNewPlain :: Data -> ClientM Text
ethercalcUpdatePlain :: DocId -> Data -> ClientM Text
ethercalcNewCSV :: Data -> ClientM Text
ethercalcUpdateCSV :: DocId -> Data -> ClientM Text
ethercalcNewTSV :: Data -> ClientM Text
ethercalcUpdateTSV :: DocId -> Data -> ClientM Text
ethercalcNewPlain :<|> ethercalcUpdatePlain
:<|> ethercalcNewCSV :<|> ethercalcUpdateCSV = client ethercalcAPI
:<|> ethercalcNewTSV :<|> ethercalcUpdateTSV = client ethercalcAPI
------------------------------
-- | Create new or update existing Ethercalc document (depending on
-- `Maybe DocId` constructor). `Data` can be in various formats (CSV,
-- `Maybe DocId` constructor). `Data` can be in various formats (TSV,
-- etc).
ethercalc :: Host -> Maybe DocId -> ContentType Data -> IO (Either ClientError Text)
ethercalc (Host host) mDocId ctD = do
......@@ -81,9 +81,9 @@ ethercalc (Host host) mDocId ctD = do
let env = mkClientEnv manager' (BaseUrl Https (T.unpack host) 443 "")
case (mDocId, ctD) of
(Nothing, CTPlain d) -> runClientM (ethercalcNewPlain d) env
(Nothing, CTCSV d) -> runClientM (ethercalcNewCSV d) env
(Nothing, CTTSV d) -> runClientM (ethercalcNewTSV d) env
(Just docId, CTPlain d) -> runClientM (ethercalcUpdatePlain docId d) env
(Just docId, CTCSV d) -> runClientM (ethercalcUpdateCSV docId d) env
(Just docId, CTTSV d) -> runClientM (ethercalcUpdateTSV docId d) env
-----------------------------------
......
......@@ -43,8 +43,8 @@ import Test.QuickCheck.Instances.Vector()
data CorpusParser =
Wos {_wos_limit :: Int}
| Csv {_csv_limit :: Int}
| Csv' {_csv'_limit :: Int}
| Tsv {_tsv_limit :: Int}
| Tsv' {_tsv'_limit :: Int}
deriving (Show,Generic,Eq, ToExpr)
instance ToSchema CorpusParser where
......@@ -223,7 +223,7 @@ defaultConfig =
PhyloConfig { corpusPath = "corpus.csv" -- useful for commandline only
, listPath = "list.csv" -- useful for commandline only
, outputPath = "data/"
, corpusParser = Csv 150000
, corpusParser = Tsv 150000
, listParser = V4
, phyloName = pack "Phylo Name"
, phyloScale = 2
......@@ -725,8 +725,8 @@ instance Arbitrary PhyloConfig where
instance Arbitrary CorpusParser where
arbitrary = oneof [ Wos <$> arbitrary
, Csv <$> arbitrary
, Csv' <$> arbitrary
, Tsv <$> arbitrary
, Tsv' <$> arbitrary
]
instance Arbitrary ListParser where
......
......@@ -344,7 +344,7 @@ toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
Fis _ _ ->
let fis = parMap rpar (\(prd,docs) ->
case (corpusParser $ getConfig phylo) of
Csv' _ -> let lst = toList
Tsv' _ -> let lst = toList
$ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
in (prd, map (\f -> Clustering (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
_ -> let lst = toList
......
......@@ -70,8 +70,8 @@ getHyperdataFrameContents (HyperdataFrame { _hf_base, _hf_frame_id }) = do
r <- Wreq.get $ T.unpack path
pure $ decodeUtf8 $ toStrict $ r ^. Wreq.responseBody
getHyperdataFrameCSV :: HyperdataFrame -> IO Text
getHyperdataFrameCSV (HyperdataFrame { _hf_base, _hf_frame_id }) = do
getHyperdataFrameTSV :: HyperdataFrame -> IO Text
getHyperdataFrameTSV (HyperdataFrame { _hf_base, _hf_frame_id }) = do
let path = T.concat [_hf_base, "/", _hf_frame_id, ".csv"]
r <- Wreq.get $ T.unpack path
pure $ decodeUtf8 $ toStrict $ r ^. Wreq.responseBody
......@@ -69,7 +69,7 @@ parseGargJob s = case s of
"tablengrams" -> Just TableNgramsJob
"forgotpassword" -> Just ForgotPasswordJob
"updatengramslistjson" -> Just UpdateNgramsListJobJSON
"updatengramslistcsv" -> Just UpdateNgramsListJobCSV
"updatengramslisttsv" -> Just UpdateNgramsListJobTSV
"addcontact" -> Just AddContactJob
"addfile" -> Just AddFileJob
"documentfromwritenode" -> Just DocumentFromWriteNodeJob
......
......@@ -21,18 +21,18 @@ import Protolude.Partial (read)
import Servant ( Accept(contentType), MimeRender(..), MimeUnrender(mimeUnrender) )
data CSV = CSV
data TSV = TSV
instance Accept CSV where
instance Accept TSV where
contentType _ = "text" // "csv" /: ("charset", "utf-8")
instance (DefaultOrdered a, ToNamedRecord a) => MimeRender CSV [a] where
instance (DefaultOrdered a, ToNamedRecord a) => MimeRender TSV [a] where
mimeRender _ = encodeDefaultOrderedByName
instance MimeRender CSV T.Text where
instance MimeRender TSV T.Text where
mimeRender _ = BSC.fromStrict . TE.encodeUtf8
instance Read a => MimeUnrender CSV a where
instance Read a => MimeUnrender TSV a where
mimeUnrender _ bs = case BSC.take len bs of
"text/csv" -> pure . read . BSC.unpack $ BSC.drop len bs
_ -> Left "didn't start with the magic incantation"
......@@ -40,7 +40,7 @@ instance Read a => MimeUnrender CSV a where
len :: Int64
len = fromIntegral $ length ("text/csv" :: Prelude.String)
--instance ToNamedRecord a => MimeRender CSV [a] where
--instance ToNamedRecord a => MimeRender TSV [a] where
-- mimeRender _ val = encode val
----------------------------
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -1323,8 +1323,8 @@
"tag": "MaxClique"
},
"corpusParser": {
"_csv_limit": 150000,
"tag": "Csv"
"_tsv_limit": 150000,
"tag": "Tsv"
},
"corpusPath": "corpus.csv",
"defaultMode": false,
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -25,7 +25,7 @@ import Data.Text.IO qualified as TIO
import Fmt
import Gargantext.API.Admin.Auth.Types (Token)
import Gargantext.API.Ngrams qualified as APINgrams
import Gargantext.API.Ngrams.List ( ngramsListFromCSVData )
import Gargantext.API.Ngrams.List ( ngramsListFromTSVData )
import Gargantext.API.Ngrams.Types ( MSet(..), NgramsPatch(..), NgramsRepoElement(..), NgramsTablePatch(..), NgramsTerm(..), Versioned(..), mSetToList, toNgramsPatch, ne_children, ne_ngrams, vc_data, _NgramsTable )
import Gargantext.Core.Text.Ngrams
import Gargantext.Core.Types ( CorpusId, ListId, ListType(..), NodeId, _NodeId )
......@@ -190,7 +190,7 @@ tests = sequential $ aroundAll withTestDBAndPort $ do
it "parses CSV via ngramsListFromCSVData" $ \((_testEnv, _port), _app) -> do
simpleNgrams <- liftIO (TIO.readFile =<< getDataFileName "test-data/ngrams/simple.csv")
ngramsListFromCSVData simpleNgrams `shouldBe`
ngramsListFromTSVData simpleNgrams `shouldBe`
Right (Map.fromList [ (NgramsTerms, Versioned 0 $ Map.fromList [
(NgramsTerm "abelian group", NgramsRepoElement 1 MapTerm Nothing Nothing (MSet mempty))
, (NgramsTerm "brazorf", NgramsRepoElement 1 StopTerm Nothing Nothing (MSet mempty))
......
......@@ -12,7 +12,7 @@ import Data.GraphViz.Attributes.Complete qualified as Graphviz
import Data.Text.Lazy as TL
import Data.TreeDiff
import Data.Vector qualified as V
import Gargantext.Core.Text.List.Formats.CSV
import Gargantext.Core.Text.List.Formats.TSV
import Gargantext.Core.Types.Phylo
import Gargantext.Core.Viz.Phylo hiding (EdgeType(..))
import Gargantext.Core.Viz.Phylo.API.Tools (readPhylo, phylo2dot2json)
......@@ -30,7 +30,7 @@ phyloTestConfig = PhyloConfig {
corpusPath = "corpus.csv"
, listPath = "list.csv"
, outputPath = "data/"
, corpusParser = Csv {_csv_limit = 150000}
, corpusParser = Tsv {_tsv_limit = 150000}
, listParser = V4
, phyloName = "Phylo Name"
, phyloScale = 2
......@@ -79,8 +79,8 @@ testCleopatreWithoutLinkExpectedOutput = do
testNadalWithoutLinkExpectedOutput :: Assertion
testNadalWithoutLinkExpectedOutput = do
corpusPath' <- getDataFileName "test-data/phylo/nadal_docslist.golden.csv"
listPath' <- getDataFileName "test-data/phylo/nadal_ngramslist.golden.csv"
corpusPath' <- getDataFileName "test-data/phylo/nadal_docslist.golden.tsv"
listPath' <- getDataFileName "test-data/phylo/nadal_ngramslist.golden.tsv"
let config = phyloTestConfig { corpusPath = corpusPath'
, listPath = listPath'
, listParser = V3
......@@ -97,11 +97,11 @@ testNadalWithoutLinkExpectedOutput = do
testSmallPhyloWithoutLinkExpectedOutput :: Assertion
testSmallPhyloWithoutLinkExpectedOutput = do
bpaConfig <- getDataFileName "bench-data/phylo/bpa-config.json"
corpusPath' <- getDataFileName "test-data/phylo/small_phylo_docslist.csv"
listPath' <- getDataFileName "test-data/phylo/small_phylo_ngramslist.csv"
corpusPath' <- getDataFileName "test-data/phylo/small_phylo_docslist.tsv"
listPath' <- getDataFileName "test-data/phylo/small_phylo_ngramslist.tsv"
(Right config) <- fmap (\pcfg -> pcfg { corpusPath = corpusPath'
, listPath = listPath'
}) <$> (eitherDecodeFileStrict' bpaConfig)
}) <$> (JSON.eitherDecodeFileStrict' bpaConfig)
mapList <- fileToList (listParser config) (listPath config)
corpus <- fileToDocsDefault (corpusParser config)
(corpusPath config)
......@@ -234,13 +234,13 @@ testRelComp_Connected = do
testToPhyloDeterminism :: Assertion
testToPhyloDeterminism = do
corpusPath' <- getDataFileName "test-data/phylo/GarganText_DocsList-nodeId-187481.csv"
listPath' <- getDataFileName "test-data/phylo/GarganText_NgramsList-187482.csv"
corpusPath' <- getDataFileName "test-data/phylo/GarganText_DocsList-nodeId-187481.tsv"
listPath' <- getDataFileName "test-data/phylo/GarganText_NgramsList-187482.tsv"
let config = phyloTestConfig { corpusPath = corpusPath'
, listPath = listPath'
, listParser = V3
}
mapList <- csvMapTermList (listPath config)
mapList <- tsvMapTermList (listPath config)
corpus <- fileToDocsDefault (corpusParser config)
(corpusPath config)
[Year 3 1 5,Month 3 1 5,Week 4 2 5]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment