Commit ec9fe2d7 authored by Grégoire Locqueville's avatar Grégoire Locqueville

Make tests play nice with the changes

Not mergeable: the tests fail for now. I suspect the problem comes from
the tests for `testCorrectFile`, where the function `checkFromLine` tries
to look up a wrong line number. Some more investigation would be needed
to figure out exactly what happens (probably should show the TSV file
being tested).

Anyway, here are the changes in this commit:
- Encapsulation of the TSV module in an `Internal` module, so we can export
  just the interface, but still expose the internals to the tests
- Correction of type errors in the tests to have something that typechecks
  with the changes introduced before
parent 51c8a407
Pipeline #6836 canceled with stages
in 2 minutes and 45 seconds
...@@ -197,6 +197,7 @@ library ...@@ -197,6 +197,7 @@ library
Gargantext.Core.Text.Corpus.Parsers Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.Date Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.TSV Gargantext.Core.Text.Corpus.Parsers.TSV
Gargantext.Core.Text.Corpus.Parsers.TSV.Internal
Gargantext.Core.Text.Corpus.Query Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.List Gargantext.Core.Text.List
Gargantext.Core.Text.List.Formats.TSV Gargantext.Core.Text.List.Formats.TSV
......
...@@ -33,631 +33,4 @@ module Gargantext.Core.Text.Corpus.Parsers.TSV ...@@ -33,631 +33,4 @@ module Gargantext.Core.Text.Corpus.Parsers.TSV
) )
where where
import Conduit (ConduitT, yieldMany, mapC, (.|)) import Gargantext.Core.Text.Corpus.Parsers.TSV.Internal
import Data.ByteString.Lazy qualified as BL
import Data.Csv qualified as CSV
import Data.Csv ((.=), (.:))
import Data.Text qualified as T
import Data.Text.Lazy qualified as TL
import Data.Text.Lazy.Encoding qualified as TLE
import Data.Text.Read qualified as DTR
import Data.Time.Segment (jour)
import Data.Vector qualified as V
import Data.Vector ((!?))
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude
import Gargantext.Utils.Jobs.Error as Warn
---------------------------------
-- Utility types and functions --
---------------------------------
-- | Day of the UNIX reference time (UNIX epoch)
defaultYear :: Int
defaultYear = 1973
-- | UNIX epoch month
defaultMonth :: Int
defaultMonth = 1
-- | UNIX epoch year
defaultDay :: Int
defaultDay = 1
-- | The possible delimiters for a CSV file
data ColumnDelimiter = Tab | Comma | Semicolon deriving (Eq, Show)
-- | Concrete representation of the above type
toWord8 :: ColumnDelimiter -> Word8
toWord8 Tab = fromIntegral $ ord '\t'
toWord8 Comma = fromIntegral $ ord ','
toWord8 Semicolon = fromIntegral $ ord ';'
-- | Just the newline character in `Word8` form; handy to have around
newline :: Word8
newline = fromIntegral $ ord '\n'
-- | Split a text depending on the delimiter type
split :: ColumnDelimiter -> BL.ByteString -> [BL.ByteString]
split delimiter txt = BL.splitWith (== toWord8 delimiter) txt
-- | Remove all occurences of `needle` in `haystack`
removeText :: T.Text -- ^ The text to remove (the `needle`)
-> T.Text -- ^ The original text (the `haystack`)
-> T.Text
removeText needle haystack = T.replace needle "" haystack
-- | Convert from raw byte string into `Text`
lBLToText :: BL.ByteString -> Text
lBLToText b = TL.toStrict $ TLE.decodeUtf8 b
-- | Fail in the Either monad iff a predicate is False
assertE :: Bool -- ^ The predicate to be tested
-> e -- ^ The error if the predicate fails
-> Either e ()
assertE predicate err = if predicate then Right () else Left err
-- | Trying getting a value out of a `Maybe`; if it does not work,
-- return an error message
tryE :: Maybe a -- ^ The value to extract
-> e -- ^ The error message
-> Either e a -- ^ The extracted value, in the `Either` monad
tryE Nothing err = Left err
tryE (Just value) _ = Right value
-- | Tests whether the input text can be parsed as a number
parseNumber :: Text -> Maybe Int
parseNumber str = case DTR.decimal str of
Right (x, "") -> Just x
_ -> Nothing
-- | An integer that may be the result of parsing either an integer value
-- or a floating-point value. If the parsed value was a floating-point number,
-- it is cropped to take only its integer part.
newtype IntOrDec = IntOrDec { unIntOrDec :: Int }
deriving (Show, Eq, Read)
instance CSV.FromField IntOrDec where
-- | Try parsing the field as an `Int`; if it fails, parse it as a `Double`
parseField s = case CSV.runParser (CSV.parseField s :: CSV.Parser Int) of
Left _ -> IntOrDec . floor <$> (CSV.parseField s :: CSV.Parser Double)
Right n -> pure $ IntOrDec n
instance CSV.ToField IntOrDec where
toField = CSV.toField . unIntOrDec
-- | Default options for TSV encoding, except for the delimiter,
-- which is passed as argument
defaultEncodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.EncodeOptions
defaultEncodingOptionsWithDelimiter d = CSV.defaultEncodeOptions {CSV.encDelimiter = toWord8 d}
-- | Default options for TSV decoding, except for the delimiter,
-- which is passed as argument
defaultDecodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.DecodeOptions
defaultDecodingOptionsWithDelimiter d = CSV.defaultDecodeOptions {CSV.decDelimiter = toWord8 d}
--------------------
-- Basic TSV type --
--------------------
data TsvDoc = TsvDoc
{ tsv_title :: !Text
, tsv_source :: !Text
, tsv_publication_year :: !(Maybe IntOrDec)
, tsv_publication_month :: !(Maybe Int)
, tsv_publication_day :: !(Maybe Int)
, tsv_abstract :: !Text
, tsv_authors :: !Text
}
deriving (Show)
instance CSV.FromNamedRecord TsvDoc where
parseNamedRecord r = do
tsv_title <- r .: "title" <|> r .: "Title"
tsv_source <- r .: "source" <|> r .: "Source"
tsv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
tsv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
tsv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
tsv_abstract <- r .: "abstract" <|> r .: "Abstract"
tsv_authors <- r .: "authors" <|> r .: "Authors"
pure $ TsvDoc { .. }
instance CSV.ToNamedRecord TsvDoc where
toNamedRecord (TsvDoc{ .. }) =
CSV.namedRecord [ "title" .= tsv_title
, "source" .= tsv_source
, "publication_year" .= tsv_publication_year
, "publication_month" .= tsv_publication_month
, "publication_day" .= tsv_publication_day
, "abstract" .= tsv_abstract
, "authors" .= tsv_authors
]
tsv2doc :: TsvDoc -> HyperdataDocument
tsv2doc (TsvDoc { .. })
= HyperdataDocument { _hd_bdd = Just "TsvHal"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just tsv_title
, _hd_authors = Just tsv_authors
, _hd_institutes = Nothing
, _hd_source = Just tsv_source
, _hd_abstract = Just tsv_abstract
, _hd_publication_date = Just $ T.pack . show $ jour (fromIntegral pubYear)
pubMonth
pubDay
, _hd_publication_year = Just pubYear
, _hd_publication_month = Just pubMonth
, _hd_publication_day = Just pubDay
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
where
pubYear = maybe defaultYear unIntOrDec tsv_publication_year
pubMonth = fromMaybe defaultMonth tsv_publication_month
pubDay = fromMaybe defaultDay tsv_publication_day
-- | Minimal header for a working TSV import
-- TODO why is this so much different from `minimalExportHeader`?
minimalTsvImportHeaders :: [Text]
minimalTsvImportHeaders = [ "Publication Day"
, "Publication Month"
, "Publication Year"
, "Authors"
, "Title"
, "Source"
, "Abstract"
]
-- | Minimal header for a working TSV export
-- TODO why is this so much different from `minimalImportHeader`?
minimalTsvExportHeader :: CSV.Header
minimalTsvExportHeader =
CSV.header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
-- | Parses a TSV file into the native HyperdataDocument datatype..
-- Returns `Left <error message>` upon failure
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv filepath = do
eitherTsvFile <- readTSVFile filepath
return $ do
(_header, tsvDocs) <- eitherTsvFile
let hyperdataDocs = tsv2doc <$> tsvDocs
return $ V.toList hyperdataDocs
-- | Conduit version of `parseTsv`
parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity (), WarningDiagnostic)
parseTsvC tsvFile =
(\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc, warning)) <$> eResult
where
(eResult, warning) = case testCorrectFile tsvFile of
Left err -> (Left err, Warn.MalformedCorpus "")
Right (del,warn) -> (readTsvLazyBS del tsvFile, Warn.MalformedCorpus (T.intercalate (T.pack "\n") warn))
-- | TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (CSV.Header, V.Vector TsvDoc))
readTSVFile fp = do
file <- BL.readFile fp
case testCorrectFile file of
Left _err -> pure $ Left _err
Right (del,_) -> pure $ readTsvLazyBS del file
-- | Write a TSV file containing a collection of hyperdata documents
writeDocs2Tsv :: FilePath -- ^ The output filepath
-> [HyperdataDocument] -- ^ The hyperdata documents to write
-> IO ()
writeDocs2Tsv filepath hyperdataDocs = do
let tsvDocs = hyperdataDocument2tsvDoc <$> hyperdataDocs
let fileToWrite = CSV.encodeByNameWith (defaultEncodingOptionsWithDelimiter Tab)
minimalTsvExportHeader
tsvDocs
BL.writeFile filepath fileToWrite
-- | Convert between the native hyperdata datatype
-- and the transition TsvDoc datatype
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h =
TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
where
m = maybe "" identity
mI = maybe 0 identity
-------------------------------------
readTsvLazyBS :: ColumnDelimiter
-> BL.ByteString
-> Either Text (CSV.Header, V.Vector TsvDoc)
readTsvLazyBS d bs = first T.pack $ CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter d) bs
testCorrectFile :: BL.ByteString -> Either Text (ColumnDelimiter, [Text])
testCorrectFile file = do
delimiter <- findColumnDelimiter file
let rows = BL.splitWith (== newline) file
headers <- getHeaders rows delimiter
(\content -> (delimiter, content)) <$> checkRows rows delimiter headers
-- | Given a delimiter type and a list of headers, checks that the rows
-- of the input CSV/TSV file, split into lines, are valid. If they are, return
-- the list of warnings encountered during checking.
-- Note that the first line (the header line) must be included,
-- but will not be checked here.
checkRows :: [BL.ByteString] -- ^ The file, split in rows
-> ColumnDelimiter -- ^ The purported delimiter type
-> [Text] -- ^ The headers
-> Either Text [Text]
checkRows fileLines delimiter headers =
checkFromLine 2 (V.fromList fileLines) delimiter headers
-- | Detect which delimiter a file is using; if detection fails, an error message
-- is output inside the `Left` constructor
findColumnDelimiter :: BL.ByteString -> Either Text ColumnDelimiter
findColumnDelimiter bs
| testDelimiter Tab bs = Right Tab
| testDelimiter Comma bs = Right Comma
| testDelimiter Semicolon bs = Right Semicolon
| otherwise = Left (T.pack "Couldn't interpret file as a valid CSV/TSV file, no matter the delimiter (comma, tab or semicolon)")
-- | Get a TSV file's header fields
getHeaders :: [BL.ByteString] -- ^ The input file, split into lines
-> ColumnDelimiter -- ^ This CSV/TSV's delimiter
-> Either Text [Text] -- ^ Left: error message; Right: the input file's headers
getHeaders rows delimiter =
case fmap lBLToText . BL.splitWith (== toWord8 delimiter) <$> V.fromList rows !? 0 of
Nothing -> Left "Error in function `getHeaders`: Empty file"
Just headers -> case testAllHeadersPresence $ removeText "\"" <$> headers of
Left missing -> Left $ "Error in function `getHeaders: Missing headers: " <> T.intercalate ", " missing
Right () -> Right headers
-- | Check that all required headers are present for the basic TSV type
testAllHeadersPresence :: [Text] -- ^ List of headers
-> Either [Text] () -- ^ Left: list of missing headers, if any; Right: success
testAllHeadersPresence headers = do
-- Remove the \r character that sometimes appears at the end of a line
let cleanHeaders = removeText "\r" <$> headers
let missingHeaders = filter (`notElem` cleanHeaders) minimalTsvImportHeaders
if null missingHeaders
then Right ()
else Left missingHeaders
-- | Test whether the given file has the given character as a delimiter
testDelimiter :: ColumnDelimiter -> BL.ByteString -> Bool
testDelimiter delimiter file = fromMaybe False $ do
-- do-notation in the Maybe monad for nice pattern matching
let rows = V.fromList $ BL.splitWith (== newline) file
headers <- BL.splitWith (== toWord8 delimiter) <$> rows !? 0
firstRow <- BL.splitWith (== toWord8 delimiter) <$> rows !? 1
return $ length headers == length firstRow && length headers > 2
-- | Check a TSV file's rows starting at the given line number
checkFromLine :: Int -- ^ The line number from which to check
-> V.Vector BL.ByteString -- ^ Individual lines of the TSV file
-> ColumnDelimiter -- ^ The delimiter used in this file (comma, tab, ...)
-> [Text] -- ^ Expected TSV headers
-> Either Text [Text]
-- ^ `Left <error>` if an error has occured, otherwise `Right <list of warnings>`
checkFromLine lineNumber tsvLines delim headers = do
assertE (length tsvLines == lineNumber)
"Error in function checkIntegrity: queried line number is greater than actual number of lines"
currentLine <- tryE (tsvLines !? lineNumber)
"Error in function checkIntegrity: no line at queried number"
(y, val) <- checkPartialRow tsvLines delim headers currentLine lineNumber
currentLineWarnings <- checkRow val delim headers (lineNumber + 1)
restOfWarnings <- checkFromLine (y+1) tsvLines delim headers
return $ currentLineWarnings <> restOfWarnings
-- | Parse the row given as input. If the input is not a whole row, only part of a row,
-- fetch the next line and interpret it as the rest of the row
checkPartialRow :: V.Vector BL.ByteString -- ^ The TSV file, split into lines
-> ColumnDelimiter -- ^ The type of delimiter
-> [Text] -- ^ Headers
-> BL.ByteString -- ^ Input row, or part of a row
-> Int -- ^ Current line number
-> Either Text (Int, [BL.ByteString])
-- ^ Left: Error message; Right: (Line number after the row, list of cells)
checkPartialRow fileLines delimiter headers partialRow lineNum = do
let partialCells = split delimiter partialRow
if length partialCells == length headers
-- We've got a full row, we can check its contents:
then Right $ checkNextLine fileLines delimiter headers partialRow lineNum
else do
assertE ((length partialCells <= length headers) && (V.length fileLines < (lineNum + 1)))
(T.pack $ "Cannot parse the file at line " <> show lineNum <> ". Maybe because of a delimiter")
val <- tryE ((BL.append partialRow <$> fileLines) !? (lineNum+1))
"checkPartialRow"
checkPartialRow fileLines delimiter headers val (lineNum+1)
-- | Looks up the next line...?
checkNextLine :: V.Vector BL.ByteString -- ^ All rows
-> ColumnDelimiter -- ^ Type of delimiter
-> [Text] -- ^ Headers
-> BL.ByteString -- ^ ???
-> Int -- ^ Line number
-> (Int, [BL.ByteString])
checkNextLine allLines delimiter headers res lineNumber =
case allLines !? (lineNumber + 1) of
Nothing -> (lineNumber, split delimiter res)
Just nextLine -> let nextRow = split delimiter nextLine in
if length nextRow > 1
then (lineNumber, split delimiter res)
else checkNextLine allLines delimiter headers (res <> nextLine) (lineNumber + 1)
-- | Check for any errors or warnings in a given row
checkRow :: [BL.ByteString] -- ^ The row to check, split into cells
-> ColumnDelimiter -- ^ The type of delimiter
-> [Text] -- ^ The list of headers
-> Int -- ^ Current line; used for logging purposes
-> Either Text [Text] -- ^ List of warnings
-- Empty row & no headers: nothing to check:
checkRow [] _ [] _ = Right []
-- More cells than headers: error
checkRow _ delimiter [] line = Left $
"Too many fields at line " <> (T.pack . show) line <> "." <>
if delimiter /= Comma
then ""
else "Try using tabulation as a delimiter. Other delimiters like comma (,) may appear in some texts."
-- More headers than cells: error
checkRow [] _ _ line = Left $ "Missing one field on line " <> (T.pack . show) line
-- Still some fields and headers to consume
checkRow (v:val) delimiter (h:headers) line = do
warning <- checkValue v h line
warnings <- checkRow val delimiter headers line
return $ warning <> warnings
-- | Tests whether a given chunk of text is a valid field
checkValue :: BL.ByteString -- ^ The field to check
-> Text -- ^ Current column header; passed as input for logging purposes
-> Int -- ^ Current line number; passed as input for logging purposes
-> Either Text [Text]
checkValue field columnHeader lineNumber = case columnHeader of
"Publication Day" -> checkNumber field columnHeader lineNumber
"Publication Month" -> checkNumber field columnHeader lineNumber
"Publication Year" -> checkNumber field columnHeader lineNumber
"Authors" -> checkTextField field columnHeader lineNumber
"Title" -> checkTextField field columnHeader lineNumber
"Source" -> checkTextField field columnHeader lineNumber
"Abstract" -> checkTextField field columnHeader lineNumber
_ -> Right []
-- | Checks whether a given chunk of text is a valid number
checkNumber :: BL.ByteString -- ^ The text to parse as a number
-> Text -- ^ Current header; passed as input for logging purposes
-> Int -- ^ Current line; passed as input for logging purposes
-> Either Text [Text] -- ^ List of warnings
checkNumber field columnHeader line = do
let cleanField = removeText "\"" $ lBLToText field
value <- tryE (parseNumber cleanField) $
"Error in column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " : value is not a number "
assertE (value >= 0) $
"Value of column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " is negative"
return []
-- | Checks whether a given chunk of text is a valid text field
-- (i.e. does not have parasite quotes)
checkTextField :: BL.ByteString -- ^ The text to parse as a number
-> Text -- ^ Current header; passed as input for logging purposes
-> Int -- ^ Current line; passed as input for logging purposes
-> Either Text [Text] -- ^ List of warnings
checkTextField field columnHeader ligne = do
let cleanField = removeText "\"\"" $ lBLToText field
if T.null cleanField
then Right ["The column '" <> columnHeader <> "' at line " <> T.pack (show ligne) <> " is empty"]
else do
assertE (
-- Field is not empty...
(T.length cleanField > 0) &&
-- ... and either there are no quotes at all...
( T.length (T.filter (== '\"') cleanField) == 0 ||
-- ... or there is only one quote at the beginning and one at the end
((T.head cleanField == '"') &&
(T.last cleanField == '"') &&
(T.length (T.filter (== '\"') cleanField) == 2)
)
)) $
"Encapsulation problem at line " <> T.pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field "
return []
---------------------------------------
-- | Tsv v3 weighted for phylo
data Tsv' = Tsv'
{ tsv'_title :: !Text
, tsv'_source :: !Text
, tsv'_publication_year :: !Int
, tsv'_publication_month :: !Int
, tsv'_publication_day :: !Int
, tsv'_abstract :: !Text
, tsv'_authors :: !Text
, tsv'_weight :: !Double } deriving (Show)
instance CSV.FromNamedRecord Tsv' where
parseNamedRecord r = do
tsv'_title <- r .: "title"
tsv'_source <- r .: "source"
tsv'_publication_year <- r .: "publication_year"
tsv'_publication_month <- r .: "publication_month"
tsv'_publication_day <- r .: "publication_day"
tsv'_abstract <- r .: "abstract"
tsv'_authors <- r .: "authors"
tsv'_weight <- r .: "weight"
pure $ Tsv' { .. }
readWeightedTsv :: FilePath -> IO (CSV.Header, V.Vector Tsv')
readWeightedTsv fp =
(\bs ->
case CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) bs of
Left e -> panicTrace (T.pack e)
Right corpus -> corpus
) <$> BL.readFile fp
-----------------------
-- | The data of a HAL document
data TsvHal = TsvHal
{ tsvHal_title :: !Text
, tsvHal_source :: !Text
, tsvHal_publication_year :: !Integer
, tsvHal_publication_month :: !Int
, tsvHal_publication_day :: !Int
, tsvHal_abstract :: !Text
, tsvHal_authors :: !Text
, tsvHal_url :: !Text
, tsvHal_isbn_s :: !Text
, tsvHal_issue_s :: !Text
, tsvHal_journalPublisher_s:: !Text
, tsvHal_language_s :: !Text
, tsvHal_doiId_s :: !Text
, tsvHal_authId_i :: !Text
, tsvHal_instStructId_i :: !Text
, tsvHal_deptStructId_i :: !Text
, tsvHal_labStructId_i :: !Text
, tsvHal_rteamStructId_i :: !Text
, tsvHal_docType_s :: !Text
}
deriving (Show)
instance CSV.FromNamedRecord TsvHal where
parseNamedRecord r = do
tsvHal_title <- r .: "title"
tsvHal_source <- r .: "source"
tsvHal_publication_year <- r .: "publication_year"
tsvHal_publication_month <- r .: "publication_month"
tsvHal_publication_day <- r .: "publication_day"
tsvHal_abstract <- r .: "abstract"
tsvHal_authors <- r .: "authors"
tsvHal_url <- r .: "url"
tsvHal_isbn_s <- r .: "isbn_s"
tsvHal_issue_s <- r .: "issue_s"
tsvHal_journalPublisher_s <- r .: "journalPublisher_s"
tsvHal_language_s <- r .: "language_s"
tsvHal_doiId_s <- r .: "doiId_s"
tsvHal_authId_i <- r .: "authId_i"
tsvHal_instStructId_i <- r .: "instStructId_i"
tsvHal_deptStructId_i <- r .: "deptStructId_i"
tsvHal_labStructId_i <- r .: "labStructId_i"
tsvHal_rteamStructId_i <- r .: "rteamStructId_i"
tsvHal_docType_s <- r .: "docType_s"
pure $ TsvHal { .. }
instance CSV.ToNamedRecord TsvHal where
toNamedRecord (TsvHal { .. }) =
CSV.namedRecord [ "title" .= tsvHal_title
, "source" .= tsvHal_source
, "publication_year" .= tsvHal_publication_year
, "publication_month" .= tsvHal_publication_month
, "publication_day" .= tsvHal_publication_day
, "abstract" .= tsvHal_abstract
, "authors" .= tsvHal_authors
, "url" .= tsvHal_url
, "isbn_s" .= tsvHal_isbn_s
, "issue_s" .= tsvHal_issue_s
, "journalPublisher_s" .= tsvHal_journalPublisher_s
, "language_s" .= tsvHal_language_s
, "doiId_s" .= tsvHal_doiId_s
, "authId_i" .= tsvHal_authId_i
, "instStructId_i" .= tsvHal_instStructId_i
, "deptStructId_i" .= tsvHal_deptStructId_i
, "labStructId_i" .= tsvHal_labStructId_i
, "rteamStructId_i" .= tsvHal_rteamStructId_i
, "docType_s" .= tsvHal_docType_s
]
readTsvHal :: FilePath
-> IO (Either Text (CSV.Header, V.Vector TsvHal))
readTsvHal path = do
file <- BL.readFile path
return . first T.pack $ -- Applying `first` to the error message because we need a `Text`
CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) file
parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
parseHal fp = do
r <- readTsvHal fp
pure $ V.toList . V.map tsvHal2doc . snd <$> r
tsvHal2doc :: TsvHal -> HyperdataDocument
tsvHal2doc (TsvHal { .. }) =
HyperdataDocument { _hd_bdd = Just "TsvHal"
, _hd_doi = Just tsvHal_doiId_s
, _hd_url = Just tsvHal_url
, _hd_page = Nothing
, _hd_title = Just tsvHal_title
, _hd_authors = Just tsvHal_authors
, _hd_institutes = Just tsvHal_instStructId_i
, _hd_source = Just tsvHal_source
, _hd_abstract = Just tsvHal_abstract
, _hd_publication_date = Just $ T.pack . show $ jour tsvHal_publication_year
tsvHal_publication_month
tsvHal_publication_day
, _hd_publication_year = Just $ fromIntegral tsvHal_publication_year
, _hd_publication_month = Just tsvHal_publication_month
, _hd_publication_day = Just tsvHal_publication_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
TSV parser for Gargantext corpus files.
-}
module Gargantext.Core.Text.Corpus.Parsers.TSV.Internal
( ColumnDelimiter (..)
, IntOrDec (..)
, TsvDoc (..)
, TsvHal (..)
, Tsv' (..)
, defaultDay
, defaultDecodingOptionsWithDelimiter
, defaultEncodingOptionsWithDelimiter
, defaultMonth
, defaultYear
, parseHal
, parseTsv
, parseTsvC
, readTSVFile
, readTsvHal
, readWeightedTsv
, writeDocs2Tsv
, checkNumber
, checkRow
, checkTextField
, findColumnDelimiter
, getHeaders
, testCorrectFile
, toWord8
)
where
import Conduit (ConduitT, yieldMany, mapC, (.|))
import Data.ByteString.Lazy qualified as BL
import Data.Csv qualified as CSV
import Data.Csv ((.=), (.:))
import Data.Text qualified as T
import Data.Text.Lazy qualified as TL
import Data.Text.Lazy.Encoding qualified as TLE
import Data.Text.Read qualified as DTR
import Data.Time.Segment (jour)
import Data.Vector qualified as V
import Data.Vector ((!?))
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude
import Gargantext.Utils.Jobs.Error as Warn
---------------------------------
-- Utility types and functions --
---------------------------------
-- | Day of the UNIX reference time (UNIX epoch)
defaultYear :: Int
defaultYear = 1973
-- | UNIX epoch month
defaultMonth :: Int
defaultMonth = 1
-- | UNIX epoch year
defaultDay :: Int
defaultDay = 1
-- | The possible delimiters for a CSV file
data ColumnDelimiter = Tab | Comma | Semicolon deriving (Eq, Show)
-- | Concrete representation of the above type
toWord8 :: ColumnDelimiter -> Word8
toWord8 Tab = fromIntegral $ ord '\t'
toWord8 Comma = fromIntegral $ ord ','
toWord8 Semicolon = fromIntegral $ ord ';'
-- | Just the newline character in `Word8` form; handy to have around
newline :: Word8
newline = fromIntegral $ ord '\n'
-- | Split a text depending on the delimiter type
split :: ColumnDelimiter -> BL.ByteString -> [BL.ByteString]
split delimiter txt = BL.splitWith (== toWord8 delimiter) txt
-- | Remove all occurences of `needle` in `haystack`
removeText :: T.Text -- ^ The text to remove (the `needle`)
-> T.Text -- ^ The original text (the `haystack`)
-> T.Text
removeText needle haystack = T.replace needle "" haystack
-- | Convert from raw byte string into `Text`
lBLToText :: BL.ByteString -> Text
lBLToText b = TL.toStrict $ TLE.decodeUtf8 b
-- | Fail in the Either monad iff a predicate is False
assertE :: Bool -- ^ The predicate to be tested
-> e -- ^ The error if the predicate fails
-> Either e ()
assertE predicate err = if predicate then Right () else Left err
-- | Trying getting a value out of a `Maybe`; if it does not work,
-- return an error message
tryE :: Maybe a -- ^ The value to extract
-> e -- ^ The error message
-> Either e a -- ^ The extracted value, in the `Either` monad
tryE Nothing err = Left err
tryE (Just value) _ = Right value
-- | Tests whether the input text can be parsed as a number
parseNumber :: Text -> Maybe Int
parseNumber str = case DTR.decimal str of
Right (x, "") -> Just x
_ -> Nothing
-- | An integer that may be the result of parsing either an integer value
-- or a floating-point value. If the parsed value was a floating-point number,
-- it is cropped to take only its integer part.
newtype IntOrDec = IntOrDec { unIntOrDec :: Int }
deriving (Show, Eq, Read)
instance CSV.FromField IntOrDec where
-- | Try parsing the field as an `Int`; if it fails, parse it as a `Double`
parseField s = case CSV.runParser (CSV.parseField s :: CSV.Parser Int) of
Left _ -> IntOrDec . floor <$> (CSV.parseField s :: CSV.Parser Double)
Right n -> pure $ IntOrDec n
instance CSV.ToField IntOrDec where
toField = CSV.toField . unIntOrDec
-- | Default options for TSV encoding, except for the delimiter,
-- which is passed as argument
defaultEncodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.EncodeOptions
defaultEncodingOptionsWithDelimiter d = CSV.defaultEncodeOptions {CSV.encDelimiter = toWord8 d}
-- | Default options for TSV decoding, except for the delimiter,
-- which is passed as argument
defaultDecodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.DecodeOptions
defaultDecodingOptionsWithDelimiter d = CSV.defaultDecodeOptions {CSV.decDelimiter = toWord8 d}
--------------------
-- Basic TSV type --
--------------------
data TsvDoc = TsvDoc
{ tsv_title :: !Text
, tsv_source :: !Text
, tsv_publication_year :: !(Maybe IntOrDec)
, tsv_publication_month :: !(Maybe Int)
, tsv_publication_day :: !(Maybe Int)
, tsv_abstract :: !Text
, tsv_authors :: !Text
}
deriving (Show)
instance CSV.FromNamedRecord TsvDoc where
parseNamedRecord r = do
tsv_title <- r .: "title" <|> r .: "Title"
tsv_source <- r .: "source" <|> r .: "Source"
tsv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
tsv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
tsv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
tsv_abstract <- r .: "abstract" <|> r .: "Abstract"
tsv_authors <- r .: "authors" <|> r .: "Authors"
pure $ TsvDoc { .. }
instance CSV.ToNamedRecord TsvDoc where
toNamedRecord (TsvDoc{ .. }) =
CSV.namedRecord [ "title" .= tsv_title
, "source" .= tsv_source
, "publication_year" .= tsv_publication_year
, "publication_month" .= tsv_publication_month
, "publication_day" .= tsv_publication_day
, "abstract" .= tsv_abstract
, "authors" .= tsv_authors
]
tsv2doc :: TsvDoc -> HyperdataDocument
tsv2doc (TsvDoc { .. })
= HyperdataDocument { _hd_bdd = Just "TsvHal"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just tsv_title
, _hd_authors = Just tsv_authors
, _hd_institutes = Nothing
, _hd_source = Just tsv_source
, _hd_abstract = Just tsv_abstract
, _hd_publication_date = Just $ T.pack . show $ jour (fromIntegral pubYear)
pubMonth
pubDay
, _hd_publication_year = Just pubYear
, _hd_publication_month = Just pubMonth
, _hd_publication_day = Just pubDay
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
where
pubYear = maybe defaultYear unIntOrDec tsv_publication_year
pubMonth = fromMaybe defaultMonth tsv_publication_month
pubDay = fromMaybe defaultDay tsv_publication_day
-- | Minimal header for a working TSV import
-- TODO why is this so much different from `minimalExportHeader`?
minimalTsvImportHeaders :: [Text]
minimalTsvImportHeaders = [ "Publication Day"
, "Publication Month"
, "Publication Year"
, "Authors"
, "Title"
, "Source"
, "Abstract"
]
-- | Minimal header for a working TSV export
-- TODO why is this so much different from `minimalImportHeader`?
minimalTsvExportHeader :: CSV.Header
minimalTsvExportHeader =
CSV.header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
-- | Parses a TSV file into the native HyperdataDocument datatype..
-- Returns `Left <error message>` upon failure
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv filepath = do
eitherTsvFile <- readTSVFile filepath
return $ do
(_header, tsvDocs) <- eitherTsvFile
let hyperdataDocs = tsv2doc <$> tsvDocs
return $ V.toList hyperdataDocs
-- | Conduit version of `parseTsv`
parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity (), WarningDiagnostic)
parseTsvC tsvFile =
(\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc, warning)) <$> eResult
where
(eResult, warning) = case testCorrectFile tsvFile of
Left err -> (Left err, Warn.MalformedCorpus "")
Right (del,warn) -> (readTsvLazyBS del tsvFile, Warn.MalformedCorpus (T.intercalate (T.pack "\n") warn))
-- | TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (CSV.Header, V.Vector TsvDoc))
readTSVFile fp = do
file <- BL.readFile fp
case testCorrectFile file of
Left _err -> pure $ Left _err
Right (del,_) -> pure $ readTsvLazyBS del file
-- | Write a TSV file containing a collection of hyperdata documents
writeDocs2Tsv :: FilePath -- ^ The output filepath
-> [HyperdataDocument] -- ^ The hyperdata documents to write
-> IO ()
writeDocs2Tsv filepath hyperdataDocs = do
let tsvDocs = hyperdataDocument2tsvDoc <$> hyperdataDocs
let fileToWrite = CSV.encodeByNameWith (defaultEncodingOptionsWithDelimiter Tab)
minimalTsvExportHeader
tsvDocs
BL.writeFile filepath fileToWrite
-- | Convert between the native hyperdata datatype
-- and the transition TsvDoc datatype
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h =
TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
where
m = maybe "" identity
mI = maybe 0 identity
-------------------------------------
readTsvLazyBS :: ColumnDelimiter
-> BL.ByteString
-> Either Text (CSV.Header, V.Vector TsvDoc)
readTsvLazyBS d bs = first T.pack $ CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter d) bs
testCorrectFile :: BL.ByteString -> Either Text (ColumnDelimiter, [Text])
testCorrectFile file = do
delimiter <- findColumnDelimiter file
let rows = BL.splitWith (== newline) file
headers <- getHeaders rows delimiter
(\content -> (delimiter, content)) <$> checkRows rows delimiter headers
-- | Given a delimiter type and a list of headers, checks that the rows
-- of the input CSV/TSV file, split into lines, are valid. If they are, return
-- the list of warnings encountered during checking.
-- Note that the first line (the header line) must be included,
-- but will not be checked here.
checkRows :: [BL.ByteString] -- ^ The file, split in rows
-> ColumnDelimiter -- ^ The purported delimiter type
-> [Text] -- ^ The headers
-> Either Text [Text]
checkRows fileLines delimiter headers =
checkFromLine 2 (V.fromList fileLines) delimiter headers
-- | Detect which delimiter a file is using; if detection fails, an error message
-- is output inside the `Left` constructor
findColumnDelimiter :: BL.ByteString -> Either Text ColumnDelimiter
findColumnDelimiter bs
| testDelimiter Tab bs = Right Tab
| testDelimiter Comma bs = Right Comma
| testDelimiter Semicolon bs = Right Semicolon
| otherwise = Left (T.pack "Couldn't interpret file as a valid CSV/TSV file, no matter the delimiter (comma, tab or semicolon)")
-- | Get a TSV file's header fields
getHeaders :: [BL.ByteString] -- ^ The input file, split into lines
-> ColumnDelimiter -- ^ This CSV/TSV's delimiter
-> Either Text [Text] -- ^ Left: error message; Right: the input file's headers
getHeaders rows delimiter =
case fmap lBLToText . BL.splitWith (== toWord8 delimiter) <$> V.fromList rows !? 0 of
Nothing -> Left "Error in function `getHeaders`: Empty file"
Just headers -> case testAllHeadersPresence $ removeText "\"" <$> headers of
Left missing -> Left $ "Error in function `getHeaders: Missing headers: " <> T.intercalate ", " missing
Right () -> Right headers
-- | Check that all required headers are present for the basic TSV type
testAllHeadersPresence :: [Text] -- ^ List of headers
-> Either [Text] () -- ^ Left: list of missing headers, if any; Right: success
testAllHeadersPresence headers = do
-- Remove the \r character that sometimes appears at the end of a line
let cleanHeaders = removeText "\r" <$> headers
let missingHeaders = filter (`notElem` cleanHeaders) minimalTsvImportHeaders
if null missingHeaders
then Right ()
else Left missingHeaders
-- | Test whether the given file has the given character as a delimiter
testDelimiter :: ColumnDelimiter -> BL.ByteString -> Bool
testDelimiter delimiter file = fromMaybe False $ do
-- do-notation in the Maybe monad for nice pattern matching
let rows = V.fromList $ BL.splitWith (== newline) file
headers <- BL.splitWith (== toWord8 delimiter) <$> rows !? 0
firstRow <- BL.splitWith (== toWord8 delimiter) <$> rows !? 1
return $ length headers == length firstRow && length headers > 2
-- | Check a TSV file's rows starting at the given line number
checkFromLine :: Int -- ^ The line number from which to check
-> V.Vector BL.ByteString -- ^ Individual lines of the TSV file
-> ColumnDelimiter -- ^ The delimiter used in this file (comma, tab, ...)
-> [Text] -- ^ Expected TSV headers
-> Either Text [Text]
-- ^ `Left <error>` if an error has occured, otherwise `Right <list of warnings>`
checkFromLine lineNumber tsvLines delim headers = do
assertE (length tsvLines == lineNumber)
"Error in function checkIntegrity: queried line number is greater than actual number of lines"
currentLine <- tryE (tsvLines !? lineNumber)
"Error in function checkIntegrity: no line at queried number"
(y, val) <- checkPartialRow tsvLines delim headers currentLine lineNumber
currentLineWarnings <- checkRow val delim headers (lineNumber + 1)
restOfWarnings <- checkFromLine (y+1) tsvLines delim headers
return $ currentLineWarnings <> restOfWarnings
-- | Parse the row given as input. If the input is not a whole row, only part of a row,
-- fetch the next line and interpret it as the rest of the row
checkPartialRow :: V.Vector BL.ByteString -- ^ The TSV file, split into lines
-> ColumnDelimiter -- ^ The type of delimiter
-> [Text] -- ^ Headers
-> BL.ByteString -- ^ Input row, or part of a row
-> Int -- ^ Current line number
-> Either Text (Int, [BL.ByteString])
-- ^ Left: Error message; Right: (Line number after the row, list of cells)
checkPartialRow fileLines delimiter headers partialRow lineNum = do
let partialCells = split delimiter partialRow
if length partialCells == length headers
-- We've got a full row, we can check its contents:
then Right $ checkNextLine fileLines delimiter headers partialRow lineNum
else do
assertE ((length partialCells <= length headers) && (V.length fileLines < (lineNum + 1)))
(T.pack $ "Cannot parse the file at line " <> show lineNum <> ". Maybe because of a delimiter")
val <- tryE ((BL.append partialRow <$> fileLines) !? (lineNum+1))
"checkPartialRow"
checkPartialRow fileLines delimiter headers val (lineNum+1)
-- | Looks up the next line...?
checkNextLine :: V.Vector BL.ByteString -- ^ All rows
-> ColumnDelimiter -- ^ Type of delimiter
-> [Text] -- ^ Headers
-> BL.ByteString -- ^ ???
-> Int -- ^ Line number
-> (Int, [BL.ByteString])
checkNextLine allLines delimiter headers res lineNumber =
case allLines !? (lineNumber + 1) of
Nothing -> (lineNumber, split delimiter res)
Just nextLine -> let nextRow = split delimiter nextLine in
if length nextRow > 1
then (lineNumber, split delimiter res)
else checkNextLine allLines delimiter headers (res <> nextLine) (lineNumber + 1)
-- | Check for any errors or warnings in a given row
checkRow :: [BL.ByteString] -- ^ The row to check, split into cells
-> ColumnDelimiter -- ^ The type of delimiter
-> [Text] -- ^ The list of headers
-> Int -- ^ Current line; used for logging purposes
-> Either Text [Text] -- ^ List of warnings
-- Empty row & no headers: nothing to check:
checkRow [] _ [] _ = Right []
-- More cells than headers: error
checkRow _ delimiter [] line = Left $
"Too many fields at line " <> (T.pack . show) line <> "." <>
if delimiter /= Comma
then ""
else "Try using tabulation as a delimiter. Other delimiters like comma (,) may appear in some texts."
-- More headers than cells: error
checkRow [] _ _ line = Left $ "Missing one field on line " <> (T.pack . show) line
-- Still some fields and headers to consume
checkRow (v:val) delimiter (h:headers) line = do
warning <- checkValue v h line
warnings <- checkRow val delimiter headers line
return $ warning <> warnings
-- | Tests whether a given chunk of text is a valid field
checkValue :: BL.ByteString -- ^ The field to check
-> Text -- ^ Current column header; passed as input for logging purposes
-> Int -- ^ Current line number; passed as input for logging purposes
-> Either Text [Text]
checkValue field columnHeader lineNumber = case columnHeader of
"Publication Day" -> checkNumber field columnHeader lineNumber
"Publication Month" -> checkNumber field columnHeader lineNumber
"Publication Year" -> checkNumber field columnHeader lineNumber
"Authors" -> checkTextField field columnHeader lineNumber
"Title" -> checkTextField field columnHeader lineNumber
"Source" -> checkTextField field columnHeader lineNumber
"Abstract" -> checkTextField field columnHeader lineNumber
_ -> Right []
-- | Checks whether a given chunk of text is a valid number
checkNumber :: BL.ByteString -- ^ The text to parse as a number
-> Text -- ^ Current header; passed as input for logging purposes
-> Int -- ^ Current line; passed as input for logging purposes
-> Either Text [Text] -- ^ List of warnings
checkNumber field columnHeader line = do
let cleanField = removeText "\"" $ lBLToText field
value <- tryE (parseNumber cleanField) $
"Error in column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " : value is not a number "
assertE (value >= 0) $
"Value of column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " is negative"
return []
-- | Checks whether a given chunk of text is a valid text field
-- (i.e. does not have parasite quotes)
checkTextField :: BL.ByteString -- ^ The text to parse as a number
-> Text -- ^ Current header; passed as input for logging purposes
-> Int -- ^ Current line; passed as input for logging purposes
-> Either Text [Text] -- ^ List of warnings
checkTextField field columnHeader ligne = do
let cleanField = removeText "\"\"" $ lBLToText field
if T.null cleanField
then Right ["The column '" <> columnHeader <> "' at line " <> T.pack (show ligne) <> " is empty"]
else do
assertE (
-- Field is not empty...
(T.length cleanField > 0) &&
-- ... and either there are no quotes at all...
( T.length (T.filter (== '\"') cleanField) == 0 ||
-- ... or there is only one quote at the beginning and one at the end
((T.head cleanField == '"') &&
(T.last cleanField == '"') &&
(T.length (T.filter (== '\"') cleanField) == 2)
)
)) $
"Encapsulation problem at line " <> T.pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field "
return []
---------------------------------------
-- | Tsv v3 weighted for phylo
data Tsv' = Tsv'
{ tsv'_title :: !Text
, tsv'_source :: !Text
, tsv'_publication_year :: !Int
, tsv'_publication_month :: !Int
, tsv'_publication_day :: !Int
, tsv'_abstract :: !Text
, tsv'_authors :: !Text
, tsv'_weight :: !Double } deriving (Show)
instance CSV.FromNamedRecord Tsv' where
parseNamedRecord r = do
tsv'_title <- r .: "title"
tsv'_source <- r .: "source"
tsv'_publication_year <- r .: "publication_year"
tsv'_publication_month <- r .: "publication_month"
tsv'_publication_day <- r .: "publication_day"
tsv'_abstract <- r .: "abstract"
tsv'_authors <- r .: "authors"
tsv'_weight <- r .: "weight"
pure $ Tsv' { .. }
readWeightedTsv :: FilePath -> IO (CSV.Header, V.Vector Tsv')
readWeightedTsv fp =
(\bs ->
case CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) bs of
Left e -> panicTrace (T.pack e)
Right corpus -> corpus
) <$> BL.readFile fp
-----------------------
-- | The data of a HAL document
data TsvHal = TsvHal
{ tsvHal_title :: !Text
, tsvHal_source :: !Text
, tsvHal_publication_year :: !Integer
, tsvHal_publication_month :: !Int
, tsvHal_publication_day :: !Int
, tsvHal_abstract :: !Text
, tsvHal_authors :: !Text
, tsvHal_url :: !Text
, tsvHal_isbn_s :: !Text
, tsvHal_issue_s :: !Text
, tsvHal_journalPublisher_s:: !Text
, tsvHal_language_s :: !Text
, tsvHal_doiId_s :: !Text
, tsvHal_authId_i :: !Text
, tsvHal_instStructId_i :: !Text
, tsvHal_deptStructId_i :: !Text
, tsvHal_labStructId_i :: !Text
, tsvHal_rteamStructId_i :: !Text
, tsvHal_docType_s :: !Text
}
deriving (Show)
instance CSV.FromNamedRecord TsvHal where
parseNamedRecord r = do
tsvHal_title <- r .: "title"
tsvHal_source <- r .: "source"
tsvHal_publication_year <- r .: "publication_year"
tsvHal_publication_month <- r .: "publication_month"
tsvHal_publication_day <- r .: "publication_day"
tsvHal_abstract <- r .: "abstract"
tsvHal_authors <- r .: "authors"
tsvHal_url <- r .: "url"
tsvHal_isbn_s <- r .: "isbn_s"
tsvHal_issue_s <- r .: "issue_s"
tsvHal_journalPublisher_s <- r .: "journalPublisher_s"
tsvHal_language_s <- r .: "language_s"
tsvHal_doiId_s <- r .: "doiId_s"
tsvHal_authId_i <- r .: "authId_i"
tsvHal_instStructId_i <- r .: "instStructId_i"
tsvHal_deptStructId_i <- r .: "deptStructId_i"
tsvHal_labStructId_i <- r .: "labStructId_i"
tsvHal_rteamStructId_i <- r .: "rteamStructId_i"
tsvHal_docType_s <- r .: "docType_s"
pure $ TsvHal { .. }
instance CSV.ToNamedRecord TsvHal where
toNamedRecord (TsvHal { .. }) =
CSV.namedRecord [ "title" .= tsvHal_title
, "source" .= tsvHal_source
, "publication_year" .= tsvHal_publication_year
, "publication_month" .= tsvHal_publication_month
, "publication_day" .= tsvHal_publication_day
, "abstract" .= tsvHal_abstract
, "authors" .= tsvHal_authors
, "url" .= tsvHal_url
, "isbn_s" .= tsvHal_isbn_s
, "issue_s" .= tsvHal_issue_s
, "journalPublisher_s" .= tsvHal_journalPublisher_s
, "language_s" .= tsvHal_language_s
, "doiId_s" .= tsvHal_doiId_s
, "authId_i" .= tsvHal_authId_i
, "instStructId_i" .= tsvHal_instStructId_i
, "deptStructId_i" .= tsvHal_deptStructId_i
, "labStructId_i" .= tsvHal_labStructId_i
, "rteamStructId_i" .= tsvHal_rteamStructId_i
, "docType_s" .= tsvHal_docType_s
]
readTsvHal :: FilePath
-> IO (Either Text (CSV.Header, V.Vector TsvHal))
readTsvHal path = do
file <- BL.readFile path
return . first T.pack $ -- Applying `first` to the error message because we need a `Text`
CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) file
parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
parseHal fp = do
r <- readTsvHal fp
pure $ V.toList . V.map tsvHal2doc . snd <$> r
tsvHal2doc :: TsvHal -> HyperdataDocument
tsvHal2doc (TsvHal { .. }) =
HyperdataDocument { _hd_bdd = Just "TsvHal"
, _hd_doi = Just tsvHal_doiId_s
, _hd_url = Just tsvHal_url
, _hd_page = Nothing
, _hd_title = Just tsvHal_title
, _hd_authors = Just tsvHal_authors
, _hd_institutes = Just tsvHal_instStructId_i
, _hd_source = Just tsvHal_source
, _hd_abstract = Just tsvHal_abstract
, _hd_publication_date = Just $ T.pack . show $ jour tsvHal_publication_year
tsvHal_publication_month
tsvHal_publication_day
, _hd_publication_year = Just $ fromIntegral tsvHal_publication_year
, _hd_publication_month = Just tsvHal_publication_month
, _hd_publication_day = Just tsvHal_publication_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
...@@ -2,19 +2,16 @@ ...@@ -2,19 +2,16 @@
module Test.Core.Text.Corpus.TSV (tests) where module Test.Core.Text.Corpus.TSV (tests) where
import Gargantext.Core.Text.Corpus.Parsers.TSV import Gargantext.Core.Text.Corpus.Parsers.TSV.Internal
import Test.QuickCheck
import Test.QuickCheck.Instances ()
import Data.ByteString.Lazy.UTF8 as BLU
import Data.ByteString.Lazy as BL import Data.ByteString.Lazy as BL
import Data.Char ( ord ) import Data.ByteString.Lazy.UTF8 as BLU
import Data.Char (ord)
import Data.Text as DT (Text, pack, null, elem) import Data.Text as DT (Text, pack, null, elem)
import Data.Text.Encoding as DT import Data.Text.Encoding as DT
import Prelude import Prelude
import Test.QuickCheck
import Test.QuickCheck.Instances ()
import Test.Tasty import Test.Tasty
import Test.Tasty.HUnit
import Test.Tasty.QuickCheck hiding (Positive, Negative) import Test.Tasty.QuickCheck hiding (Positive, Negative)
tests :: TestTree tests :: TestTree
...@@ -30,12 +27,12 @@ tests = testGroup "TSV Parser" [ ...@@ -30,12 +27,12 @@ tests = testGroup "TSV Parser" [
delimiterBS :: Delimiter -> BL.ByteString delimiterBS :: ColumnDelimiter -> BL.ByteString
delimiterBS Tab = BLU.fromString "\t" delimiterBS Tab = BLU.fromString "\t"
delimiterBS Comma = BLU.fromString "," delimiterBS Comma = BLU.fromString ","
delimiterBS Line = BLU.fromString "\n" delimiterBS Semicolon = BLU.fromString ";"
data File = File { fDelimiter :: Delimiter data File = File { fDelimiter :: ColumnDelimiter
, allCorpus :: [RandomCorpus] , allCorpus :: [RandomCorpus]
} }
deriving (Show) deriving (Show)
...@@ -56,19 +53,19 @@ instance Arbitrary File where ...@@ -56,19 +53,19 @@ instance Arbitrary File where
arbitrarySizedFile :: Int -> Gen File arbitrarySizedFile :: Int -> Gen File
arbitrarySizedFile m = do arbitrarySizedFile m = do
del <- elements [Tab, Comma] del <- elements [Tab, Comma, Semicolon]
corp <- vectorOf m (generateRandomCorpus) corp <- vectorOf m (generateRandomCorpus)
return (File del corp) return (File del corp)
delimiterToText :: Delimiter -> Text delimiterToText :: ColumnDelimiter -> Text
delimiterToText Tab = DT.pack "\t" delimiterToText Tab = DT.pack "\t"
delimiterToText Comma = DT.pack "," delimiterToText Comma = DT.pack ","
delimiterToText Line = DT.pack "\n" delimiterToText Semicolon = DT.pack ";"
delimiterToString :: Delimiter -> Char delimiterToString :: ColumnDelimiter -> Char
delimiterToString Tab = '\t' delimiterToString Tab = '\t'
delimiterToString Comma = ',' delimiterToString Comma = ','
delimiterToString Line = '\n' delimiterToString Semicolon = ';'
textToBL :: Text -> BL.ByteString textToBL :: Text -> BL.ByteString
textToBL b = BL.fromChunks . return . DT.encodeUtf8 $ b textToBL b = BL.fromChunks . return . DT.encodeUtf8 $ b
...@@ -85,7 +82,7 @@ generateRandomCorpus = RandomCorpus ...@@ -85,7 +82,7 @@ generateRandomCorpus = RandomCorpus
generateFileDelimiter :: Gen File generateFileDelimiter :: Gen File
generateFileDelimiter = do generateFileDelimiter = do
del <- elements [Tab, Comma] del <- elements [Tab, Comma, Semicolon]
m <- choose (1,5) m <- choose (1,5)
corp <- vectorOf m (generateRandomCorpus) corp <- vectorOf m (generateRandomCorpus)
return (File del corp) return (File del corp)
...@@ -106,7 +103,7 @@ randomHeaderList = frequency [ ...@@ -106,7 +103,7 @@ randomHeaderList = frequency [
] ]
--TODO add delimiter --TODO add delimiter
createLineFromCorpus :: RandomCorpus -> Delimiter -> BL.ByteString createLineFromCorpus :: RandomCorpus -> ColumnDelimiter -> BL.ByteString
createLineFromCorpus corpus delD = do createLineFromCorpus corpus delD = do
let aut = (DT.pack "\"") <> (authors corpus) <> (DT.pack "\"") let aut = (DT.pack "\"") <> (authors corpus) <> (DT.pack "\"")
let tit = (DT.pack "\"") <> (title corpus) <> (DT.pack "\"") let tit = (DT.pack "\"") <> (title corpus) <> (DT.pack "\"")
...@@ -118,7 +115,7 @@ createLineFromCorpus corpus delD = do ...@@ -118,7 +115,7 @@ createLineFromCorpus corpus delD = do
let del = delimiterToText delD let del = delimiterToText delD
textToBL(pDay <> del <> pMonth <> del <> pYears <> del <> aut <> del <> tit <> del <> sou <> del <> abt) textToBL(pDay <> del <> pMonth <> del <> pYears <> del <> aut <> del <> tit <> del <> sou <> del <> abt)
createLineFromCorpusWithNewLine :: RandomCorpus -> Delimiter -> BL.ByteString createLineFromCorpusWithNewLine :: RandomCorpus -> ColumnDelimiter -> BL.ByteString
createLineFromCorpusWithNewLine corpus delD = do createLineFromCorpusWithNewLine corpus delD = do
let aut = (DT.pack "\"") <> (authors corpus) <> (DT.pack "\"") let aut = (DT.pack "\"") <> (authors corpus) <> (DT.pack "\"")
let tit = (DT.pack "\"") <> (title corpus) <> (DT.pack "\"") let tit = (DT.pack "\"") <> (title corpus) <> (DT.pack "\"")
...@@ -143,9 +140,9 @@ createFileWithNewLine file = do ...@@ -143,9 +140,9 @@ createFileWithNewLine file = do
let allLines = BL.intercalate (BLU.fromString "\n") $ Prelude.map (\x -> createLineFromCorpus x (fDelimiter file)) (allCorpus file) let allLines = BL.intercalate (BLU.fromString "\n") $ Prelude.map (\x -> createLineFromCorpus x (fDelimiter file)) (allCorpus file)
headers <> (BLU.fromString "\n") <> allLines headers <> (BLU.fromString "\n") <> allLines
validRandomCorpus :: RandomCorpus -> Delimiter -> Bool validRandomCorpus :: RandomCorpus -> ColumnDelimiter -> Bool
validRandomCorpus tsv del validRandomCorpus tsv del
| BL.length (BL.filter (==delimiter del) (createLineFromCorpus tsv del)) > 3= True | BL.length (BL.filter (== toWord8 del) (createLineFromCorpus tsv del)) > 3= True
| DT.null $ abstract tsv = True | DT.null $ abstract tsv = True
| DT.null $ title tsv = True | DT.null $ title tsv = True
| DT.null $ authors tsv = True | DT.null $ authors tsv = True
...@@ -161,7 +158,7 @@ testValidNumber :: Property ...@@ -161,7 +158,7 @@ testValidNumber :: Property
testValidNumber = forAll generateNumber (\s -> do testValidNumber = forAll generateNumber (\s -> do
let nbText = DT.pack $ show s let nbText = DT.pack $ show s
let bl = textToBL nbText let bl = textToBL nbText
case validNumber bl nbText 1 [] of case checkNumber bl nbText 1 of
Right _ -> True Right _ -> True
Left _ | BL.empty == bl -> True Left _ | BL.empty == bl -> True
| s < 1 -> True | s < 1 -> True
...@@ -171,7 +168,7 @@ testValidNumber = forAll generateNumber (\s -> do ...@@ -171,7 +168,7 @@ testValidNumber = forAll generateNumber (\s -> do
testValidText :: Property testValidText :: Property
testValidText = forAll generateString (\s -> testValidText = forAll generateString (\s ->
let bl = textToBL s in let bl = textToBL s in
case validTextField bl s 1 [] of case checkTextField bl s 1 of
Right _ -> True Right _ -> True
Left _ | BL.empty == bl -> True Left _ | BL.empty == bl -> True
| (fromIntegral $ ord '\"') `BL.elem` bl -> True | (fromIntegral $ ord '\"') `BL.elem` bl -> True
...@@ -184,15 +181,11 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do ...@@ -184,15 +181,11 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
let del = Tab let del = Tab
let line = createLineFromCorpus tsv del let line = createLineFromCorpus tsv del
let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"] let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let splitLine = BL.splitWith (==delimiter del) line let splitLine = BL.splitWith (== toWord8 del) line
case testErrorPerLine splitLine del headers 1 [] of case checkRow splitLine del headers 1 of
Right _ -> True Right _ -> True
Left _ -> validRandomCorpus tsv del) Left _ -> validRandomCorpus tsv del)
--check :
-- True Del
-- False Error
-- Test if a file is OK -- Test if a file is OK
testTestCorrectFile :: Property testTestCorrectFile :: Property
testTestCorrectFile = forAll generateFile (\file -> do testTestCorrectFile = forAll generateFile (\file -> do
...@@ -202,8 +195,8 @@ testTestCorrectFile = forAll generateFile (\file -> do ...@@ -202,8 +195,8 @@ testTestCorrectFile = forAll generateFile (\file -> do
Left _ -> Prelude.all (\x -> do Left _ -> Prelude.all (\x -> do
let del = fDelimiter file let del = fDelimiter file
let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"] let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del let splitLine = BL.splitWith (== toWord8 del) $ createLineFromCorpus x del
case testErrorPerLine splitLine del headers 1 [] of case checkRow splitLine del headers 1 of
Right _ -> True Right _ -> True
Left _ -> validRandomCorpus x del) (allCorpus file)) Left _ -> validRandomCorpus x del) (allCorpus file))
...@@ -217,20 +210,20 @@ testTestCorrectFileWithNewLine = forAll generateFile (\file -> do ...@@ -217,20 +210,20 @@ testTestCorrectFileWithNewLine = forAll generateFile (\file -> do
Left _ -> Prelude.all (\x -> do Left _ -> Prelude.all (\x -> do
let del = fDelimiter file let del = fDelimiter file
let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"] let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del let splitLine = BL.splitWith (== toWord8 del) $ createLineFromCorpus x del
case testErrorPerLine splitLine del headers 1 [] of case checkRow splitLine del headers 1 of
Right _ -> True Right _ -> True
Left _ -> validRandomCorpus x del) (allCorpus file)) Left _ -> validRandomCorpus x del) (allCorpus file))
testFindDelimiter :: Property testFindDelimiter :: Property
testFindDelimiter = forAll generateFileDelimiter (\file -> do testFindDelimiter = forAll generateFileDelimiter (\file -> do
let tsv = createFile file let tsv = createFile file
case findDelimiter tsv of case findColumnDelimiter tsv of
Right _ -> True Right _ -> True
Left _ -> do Left _ -> do
let line = Prelude.head $ allCorpus file let line = Prelude.head $ allCorpus file
let del = delimiterToString $ fDelimiter file let del = delimiterToString $ fDelimiter file
let delLine = delimiterToString Line let delLine = '\n'
del `DT.elem` (abstract line) || del `DT.elem` (authors line) || del `DT.elem` (title line) || del `DT.elem` (source line) || delLine `DT.elem` (abstract line) || delLine `DT.elem` (authors line) || delLine `DT.elem` (title line) || delLine `DT.elem` (source line)) del `DT.elem` (abstract line) || del `DT.elem` (authors line) || del `DT.elem` (title line) || del `DT.elem` (source line) || delLine `DT.elem` (abstract line) || delLine `DT.elem` (authors line) || delLine `DT.elem` (title line) || delLine `DT.elem` (source line))
testGetHeader :: Property testGetHeader :: Property
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment