Commit 51c8a407 authored by Grégoire Locqueville's avatar Grégoire Locqueville

Reorganize TSV parsing code

parent e4dfb4bd
Pipeline #6835 canceled with stages
...@@ -20,7 +20,7 @@ import Data.Tuple.Extra (both) ...@@ -20,7 +20,7 @@ import Data.Tuple.Extra (both)
import Data.Vector qualified as DV import Data.Vector qualified as DV
import GHC.Generics import GHC.Generics
import Gargantext.Core.Text.Context (TermList) import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, tsv_title, tsv_abstract, tsv_publication_year, fromMIntOrDec, defaultYear) import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, unIntOrDec, tsv_title, tsv_abstract, tsv_publication_year, defaultYear)
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList) import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs) import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList ) import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList )
...@@ -52,7 +52,7 @@ filterTermsAndCoocCLI (CorpusFile corpusFile) (TermListFile termListFile) (Outpu ...@@ -52,7 +52,7 @@ filterTermsAndCoocCLI (CorpusFile corpusFile) (TermListFile termListFile) (Outpu
Right cf -> do Right cf -> do
let corpus = DM.fromListWith (<>) let corpus = DM.fromListWith (<>)
. DV.toList . DV.toList
. DV.map (\n -> (fromMIntOrDec defaultYear $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)])) . DV.map (\n -> (maybe defaultYear unIntOrDec $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)]))
. snd $ cf . snd $ cf
-- termListMap :: [Text] -- termListMap :: [Text]
......
...@@ -82,8 +82,8 @@ tsvToDocs parser patterns time path = ...@@ -82,8 +82,8 @@ tsvToDocs parser patterns time path =
Wos _ -> Prelude.error "tsvToDocs: unimplemented" Wos _ -> Prelude.error "tsvToDocs: unimplemented"
Tsv limit -> Vector.toList Tsv limit -> Vector.toList
<$> Vector.take limit <$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (Tsv.fromMIntOrDec Tsv.defaultYear $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time) <$> Vector.map (\row -> Document (toPhyloDate (maybe Tsv.defaultYear Tsv.unIntOrDec $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
(toPhyloDate' (Tsv.fromMIntOrDec Tsv.defaultYear $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time) (toPhyloDate' (maybe Tsv.defaultYear Tsv.unIntOrDec $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
(termsInText patterns $ (tsv_title row) <> " " <> (tsv_abstract row)) (termsInText patterns $ (tsv_title row) <> " " <> (tsv_abstract row))
Nothing Nothing
[] []
......
...@@ -22,7 +22,7 @@ import Data.Csv ( (.:), header, decodeByNameWith, FromNamedRecord(..), Header ) ...@@ -22,7 +22,7 @@ import Data.Csv ( (.:), header, decodeByNameWith, FromNamedRecord(..), Header )
import Data.Text qualified as T import Data.Text qualified as T
import Data.Vector (Vector) import Data.Vector (Vector)
import Data.Vector qualified as Vector import Data.Vector qualified as Vector
import Gargantext.Core.Text.Corpus.Parsers.TSV ( tsvDecodeOptions, ColumnDelimiter(Tab) ) import Gargantext.Core.Text.Corpus.Parsers.TSV (defaultDecodingOptionsWithDelimiter, ColumnDelimiter(Tab) )
import Gargantext.Database.Admin.Types.Hyperdata.Contact import Gargantext.Database.Admin.Types.Hyperdata.Contact
import Gargantext.Prelude import Gargantext.Prelude
import System.FilePath.Posix (takeExtension) import System.FilePath.Posix (takeExtension)
...@@ -119,7 +119,7 @@ readTSVFile_Annuaire' :: FilePath -> IO (Header, Vector IMTUser) ...@@ -119,7 +119,7 @@ readTSVFile_Annuaire' :: FilePath -> IO (Header, Vector IMTUser)
readTSVFile_Annuaire' = fmap readTsvHalLazyBS' . BL.readFile readTSVFile_Annuaire' = fmap readTsvHalLazyBS' . BL.readFile
where where
readTsvHalLazyBS' :: BL.ByteString -> (Header, Vector IMTUser) readTsvHalLazyBS' :: BL.ByteString -> (Header, Vector IMTUser)
readTsvHalLazyBS' bs = case decodeByNameWith (tsvDecodeOptions Tab) bs of readTsvHalLazyBS' bs = case decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) bs of
Left e -> panicTrace (cs e) Left e -> panicTrace (cs e)
Right rows -> rows Right rows -> rows
......
...@@ -12,109 +12,141 @@ TSV parser for Gargantext corpus files. ...@@ -12,109 +12,141 @@ TSV parser for Gargantext corpus files.
-} -}
module Gargantext.Core.Text.Corpus.Parsers.TSV where module Gargantext.Core.Text.Corpus.Parsers.TSV
( ColumnDelimiter (..)
import Conduit ( ConduitT, (.|), yieldMany, mapC ) , IntOrDec (..)
import Data.ByteString qualified as BS , TsvDoc (..)
, TsvHal (..)
, Tsv' (..)
, defaultDay
, defaultDecodingOptionsWithDelimiter
, defaultEncodingOptionsWithDelimiter
, defaultMonth
, defaultYear
, parseHal
, parseTsv
, parseTsvC
, readTSVFile
, readTsvHal
, readWeightedTsv
, writeDocs2Tsv
)
where
import Conduit (ConduitT, yieldMany, mapC, (.|))
import Data.ByteString.Lazy qualified as BL import Data.ByteString.Lazy qualified as BL
import Data.Csv import Data.Csv qualified as CSV
import Data.Text (pack) import Data.Csv ((.=), (.:))
import Data.Text qualified as T import Data.Text qualified as T
import Data.Text.Lazy qualified as TL import Data.Text.Lazy qualified as TL
import Data.Text.Lazy.Encoding qualified as TL import Data.Text.Lazy.Encoding qualified as TLE
import Data.Text.Read qualified as DTR import Data.Text.Read qualified as DTR
import Data.Time.Segment (jour) import Data.Time.Segment (jour)
import Data.Vector (Vector)
import Data.Vector qualified as V import Data.Vector qualified as V
import Gargantext.Core.Text ( sentences, unsentences ) import Data.Vector ((!?))
import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Utils.Jobs.Error as Warn import Gargantext.Utils.Jobs.Error as Warn
-- | The possible delimiters for a CSV file
data LineDelimiter = Newline deriving (Eq, Show)
data ColumnDelimiter = Tab | Comma | Semicolon deriving (Eq, Show)
class ToWord8 a where
-- | Convert abstract representation into a Word8 character
toWord8 :: a -> Word8
instance ToWord8 LineDelimiter where ---------------------------------
toWord8 Newline = fromIntegral $ ord '\n' -- Utility types and functions --
---------------------------------
instance ToWord8 ColumnDelimiter where -- | Day of the UNIX reference time (UNIX epoch)
toWord8 Tab = fromIntegral $ ord '\t' defaultYear :: Int
toWord8 Comma = fromIntegral $ ord ',' defaultYear = 1973
toWord8 Semicolon = fromIntegral $ ord ';'
--------------------------------------------------------------- -- | UNIX epoch month
-- | Minimal header for a working TSV import defaultMonth :: Int
minimalTsvHeader :: Header defaultMonth = 1
minimalTsvHeader =
header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc m splt doc =
let docSize = (T.length $ tsv_abstract doc) in
if (docSize > 1000) && (mod (round m) docSize >= 10)
then splitDoc' splt doc
else V.fromList [doc]
where
splitDoc' :: SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc' contextSize (TsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
where
firstDoc = TsvDoc { tsv_abstract = firstAbstract, .. }
firstAbstract = head' "splitDoc'1" abstracts
nextDocs = map (\txt -> TsvDoc { tsv_title = head' "splitDoc'2" $ sentences txt -- | UNIX epoch year
, tsv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt defaultDay :: Int
, .. } defaultDay = 1
) (tail' "splitDoc'2" abstracts)
-- | The possible delimiters for a CSV file
data ColumnDelimiter = Tab | Comma | Semicolon deriving (Eq, Show)
abstracts = (splitBy $ contextSize) tsv_abstract -- | Concrete representation of the above type
toWord8 :: ColumnDelimiter -> Word8
toWord8 Tab = fromIntegral $ ord '\t'
toWord8 Comma = fromIntegral $ ord ','
toWord8 Semicolon = fromIntegral $ ord ';'
--------------------------------------------------------------- -- | Just the newline character in `Word8` form; handy to have around
--------------------------------------------------------------- newline :: Word8
type Mean = Double newline = fromIntegral $ ord '\n'
docsSize :: Vector TsvDoc -> Mean -- | Split a text depending on the delimiter type
docsSize tsvDoc = mean ls split :: ColumnDelimiter -> BL.ByteString -> [BL.ByteString]
where split delimiter txt = BL.splitWith (== toWord8 delimiter) txt
ls = V.toList $ V.map (fromIntegral . T.length . tsv_abstract) tsvDoc
-- | Remove all occurences of `needle` in `haystack`
removeText :: T.Text -- ^ The text to remove (the `needle`)
-> T.Text -- ^ The original text (the `haystack`)
-> T.Text
removeText needle haystack = T.replace needle "" haystack
--------------------------------------------------------------- -- | Convert from raw byte string into `Text`
newtype IntOrDec = IntOrDec Int lBLToText :: BL.ByteString -> Text
lBLToText b = TL.toStrict $ TLE.decodeUtf8 b
-- | Fail in the Either monad iff a predicate is False
assertE :: Bool -- ^ The predicate to be tested
-> e -- ^ The error if the predicate fails
-> Either e ()
assertE predicate err = if predicate then Right () else Left err
-- | Trying getting a value out of a `Maybe`; if it does not work,
-- return an error message
tryE :: Maybe a -- ^ The value to extract
-> e -- ^ The error message
-> Either e a -- ^ The extracted value, in the `Either` monad
tryE Nothing err = Left err
tryE (Just value) _ = Right value
-- | Tests whether the input text can be parsed as a number
parseNumber :: Text -> Maybe Int
parseNumber str = case DTR.decimal str of
Right (x, "") -> Just x
_ -> Nothing
-- | An integer that may be the result of parsing either an integer value
-- or a floating-point value. If the parsed value was a floating-point number,
-- it is cropped to take only its integer part.
newtype IntOrDec = IntOrDec { unIntOrDec :: Int }
deriving (Show, Eq, Read) deriving (Show, Eq, Read)
unIntOrDec :: IntOrDec -> Int
unIntOrDec (IntOrDec i) = i instance CSV.FromField IntOrDec where
instance FromField IntOrDec where -- | Try parsing the field as an `Int`; if it fails, parse it as a `Double`
parseField s = case runParser (parseField s :: Parser Int) of parseField s = case CSV.runParser (CSV.parseField s :: CSV.Parser Int) of
Left _err -> IntOrDec . floor <$> (parseField s :: Parser Double) Left _ -> IntOrDec . floor <$> (CSV.parseField s :: CSV.Parser Double)
Right n -> pure $ IntOrDec n Right n -> pure $ IntOrDec n
instance ToField IntOrDec where instance CSV.ToField IntOrDec where
toField (IntOrDec i) = toField i toField = CSV.toField . unIntOrDec
-- | Default options for TSV encoding, except for the delimiter,
-- which is passed as argument
defaultEncodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.EncodeOptions
defaultEncodingOptionsWithDelimiter d = CSV.defaultEncodeOptions {CSV.encDelimiter = toWord8 d}
-- | Default options for TSV decoding, except for the delimiter,
-- which is passed as argument
defaultDecodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.DecodeOptions
defaultDecodingOptionsWithDelimiter d = CSV.defaultDecodeOptions {CSV.decDelimiter = toWord8 d}
--------------------
-- Basic TSV type --
--------------------
fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
defaultYear :: Int
defaultYear = 1973
defaultMonth :: Int
defaultMonth = 1
defaultDay :: Int
defaultDay = 1
data TsvDoc = TsvDoc data TsvDoc = TsvDoc
{ tsv_title :: !Text { tsv_title :: !Text
...@@ -127,7 +159,7 @@ data TsvDoc = TsvDoc ...@@ -127,7 +159,7 @@ data TsvDoc = TsvDoc
} }
deriving (Show) deriving (Show)
instance FromNamedRecord TsvDoc where instance CSV.FromNamedRecord TsvDoc where
parseNamedRecord r = do parseNamedRecord r = do
tsv_title <- r .: "title" <|> r .: "Title" tsv_title <- r .: "title" <|> r .: "Title"
tsv_source <- r .: "source" <|> r .: "Source" tsv_source <- r .: "source" <|> r .: "Source"
...@@ -138,9 +170,9 @@ instance FromNamedRecord TsvDoc where ...@@ -138,9 +170,9 @@ instance FromNamedRecord TsvDoc where
tsv_authors <- r .: "authors" <|> r .: "Authors" tsv_authors <- r .: "authors" <|> r .: "Authors"
pure $ TsvDoc { .. } pure $ TsvDoc { .. }
instance ToNamedRecord TsvDoc where instance CSV.ToNamedRecord TsvDoc where
toNamedRecord (TsvDoc{ .. }) = toNamedRecord (TsvDoc{ .. }) =
namedRecord [ "title" .= tsv_title CSV.namedRecord [ "title" .= tsv_title
, "source" .= tsv_source , "source" .= tsv_source
, "publication_year" .= tsv_publication_year , "publication_year" .= tsv_publication_year
, "publication_month" .= tsv_publication_month , "publication_month" .= tsv_publication_month
...@@ -149,251 +181,371 @@ instance ToNamedRecord TsvDoc where ...@@ -149,251 +181,371 @@ instance ToNamedRecord TsvDoc where
, "authors" .= tsv_authors , "authors" .= tsv_authors
] ]
tsv2doc :: TsvDoc -> HyperdataDocument
tsv2doc (TsvDoc { .. })
= HyperdataDocument { _hd_bdd = Just "TsvHal"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just tsv_title
, _hd_authors = Just tsv_authors
, _hd_institutes = Nothing
, _hd_source = Just tsv_source
, _hd_abstract = Just tsv_abstract
, _hd_publication_date = Just $ T.pack . show $ jour (fromIntegral pubYear)
pubMonth
pubDay
, _hd_publication_year = Just pubYear
, _hd_publication_month = Just pubMonth
, _hd_publication_day = Just pubDay
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
where
pubYear = maybe defaultYear unIntOrDec tsv_publication_year
pubMonth = fromMaybe defaultMonth tsv_publication_month
pubDay = fromMaybe defaultDay tsv_publication_day
-- | Minimal header for a working TSV import
-- TODO why is this so much different from `minimalExportHeader`?
minimalTsvImportHeaders :: [Text]
minimalTsvImportHeaders = [ "Publication Day"
, "Publication Month"
, "Publication Year"
, "Authors"
, "Title"
, "Source"
, "Abstract"
]
-- | Minimal header for a working TSV export
-- TODO why is this so much different from `minimalImportHeader`?
minimalTsvExportHeader :: CSV.Header
minimalTsvExportHeader =
CSV.header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
-- | Parses a TSV file into the native HyperdataDocument datatype..
-- Returns `Left <error message>` upon failure
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv filepath = do
eitherTsvFile <- readTSVFile filepath
return $ do
(_header, tsvDocs) <- eitherTsvFile
let hyperdataDocs = tsv2doc <$> tsvDocs
return $ V.toList hyperdataDocs
-- | Conduit version of `parseTsv`
parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity (), WarningDiagnostic)
parseTsvC tsvFile =
(\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc, warning)) <$> eResult
where
(eResult, warning) = case testCorrectFile tsvFile of
Left err -> (Left err, Warn.MalformedCorpus "")
Right (del,warn) -> (readTsvLazyBS del tsvFile, Warn.MalformedCorpus (T.intercalate (T.pack "\n") warn))
-- | TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (CSV.Header, V.Vector TsvDoc))
readTSVFile fp = do
file <- BL.readFile fp
case testCorrectFile file of
Left _err -> pure $ Left _err
Right (del,_) -> pure $ readTsvLazyBS del file
-- | Write a TSV file containing a collection of hyperdata documents
writeDocs2Tsv :: FilePath -- ^ The output filepath
-> [HyperdataDocument] -- ^ The hyperdata documents to write
-> IO ()
writeDocs2Tsv filepath hyperdataDocs = do
let tsvDocs = hyperdataDocument2tsvDoc <$> hyperdataDocs
let fileToWrite = CSV.encodeByNameWith (defaultEncodingOptionsWithDelimiter Tab)
minimalTsvExportHeader
tsvDocs
BL.writeFile filepath fileToWrite
-- | Convert between the native hyperdata datatype
-- and the transition TsvDoc datatype
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h hyperdataDocument2tsvDoc h =
TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h , tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h , tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h , tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h , tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h , tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h } , tsv_authors = m $ _hd_authors h }
where where
m = maybe "" identity m = maybe "" identity
mI = maybe 0 identity mI = maybe 0 identity
tsvDecodeOptions :: ColumnDelimiter -> DecodeOptions -------------------------------------
tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = toWord8 d}
tsvEncodeOptions :: ColumnDelimiter -> EncodeOptions
tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = toWord8 d}
------------------------------------------------------------------------ readTsvLazyBS :: ColumnDelimiter
-> BL.ByteString
-> Either Text (CSV.Header, V.Vector TsvDoc)
readTsvLazyBS d bs = first T.pack $ CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter d) bs
testDelimiter :: ColumnDelimiter -> BL.ByteString -> Bool testCorrectFile :: BL.ByteString -> Either Text (ColumnDelimiter, [Text])
testDelimiter del bs = testCorrectFile file = do
let x = BL.splitWith (== toWord8 Newline) bs delimiter <- findColumnDelimiter file
vec = V.fromList x in let rows = BL.splitWith (== newline) file
case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 0) of headers <- getHeaders rows delimiter
Nothing -> False (\content -> (delimiter, content)) <$> checkRows rows delimiter headers
Just e -> case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 1) of
Nothing -> False
Just f -> length e == length f && length e > 2 -- | Given a delimiter type and a list of headers, checks that the rows
-- of the input CSV/TSV file, split into lines, are valid. If they are, return
-- the list of warnings encountered during checking.
-- Note that the first line (the header line) must be included,
-- but will not be checked here.
checkRows :: [BL.ByteString] -- ^ The file, split in rows
-> ColumnDelimiter -- ^ The purported delimiter type
-> [Text] -- ^ The headers
-> Either Text [Text]
checkRows fileLines delimiter headers =
checkFromLine 2 (V.fromList fileLines) delimiter headers
-- | Detect which delimiter a file is using; if detection fails, an error message
-- is output inside the `Left` constructor
findColumnDelimiter :: BL.ByteString -> Either Text ColumnDelimiter findColumnDelimiter :: BL.ByteString -> Either Text ColumnDelimiter
findColumnDelimiter bs findColumnDelimiter bs
| testDelimiter Tab bs = Right Tab | testDelimiter Tab bs = Right Tab
| testDelimiter Comma bs = Right Comma | testDelimiter Comma bs = Right Comma
| otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line") | testDelimiter Semicolon bs = Right Semicolon
| otherwise = Left (T.pack "Couldn't interpret file as a valid CSV/TSV file, no matter the delimiter (comma, tab or semicolon)")
isNumeric :: Text -> Either Bool Int
isNumeric str = case DTR.decimal str of
Right (x,y) -> if y == ""
then Right x
else Left False
Left _ -> Left False
lBLToText :: BL.ByteString -> Text -- | Get a TSV file's header fields
lBLToText b = TL.toStrict $ TL.decodeUtf8 b getHeaders :: [BL.ByteString] -- ^ The input file, split into lines
-> ColumnDelimiter -- ^ This CSV/TSV's delimiter
validNumber :: BL.ByteString -> Text -> Int -> [Text] -> Either Text [Text] -> Either Text [Text] -- ^ Left: error message; Right: the input file's headers
validNumber x columnHeader ligne warn = do getHeaders rows delimiter =
let number = T.replace (T.pack "\"") (T.pack "") (lBLToText x) case fmap lBLToText . BL.splitWith (== toWord8 delimiter) <$> V.fromList rows !? 0 of
case isNumeric number of Nothing -> Left "Error in function `getHeaders`: Empty file"
Right val Just headers -> case testAllHeadersPresence $ removeText "\"" <$> headers of
| val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative") Left missing -> Left $ "Error in function `getHeaders: Missing headers: " <> T.intercalate ", " missing
| otherwise -> Right warn Right () -> Right headers
Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " : value is not a number ")
validTextField :: BL.ByteString -> Text -> Int -> [Text] -> Either Text [Text]
validTextField x columnHeader ligne warn = do
let xs = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x) in
if not (T.null xs)
then
if (T.length xs > 0) && ((T.length (T.filter (== '\"') xs) == 0) || ((T.head xs == '"') && (T.last xs == '"') && (T.length (T.filter (== '\"') xs) == 2)))
then Right warn
else Left $ ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field ")
else Right ( ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty") : warn)
-- else Left $ ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty")
-- Put a warning for the user to know their is a problem (empty column)
testValue :: BL.ByteString -> Text -> Int -> [Text] -> Either Text [Text]
testValue val columnHeader ligne warn = case columnHeader of
"Publication Day" -> validNumber val columnHeader ligne warn
"Publication Month" -> validNumber val columnHeader ligne warn
"Publication Year" -> validNumber val columnHeader ligne warn
"Authors" -> validTextField val columnHeader ligne warn
"Title" -> validTextField val columnHeader ligne warn
"Source" -> validTextField val columnHeader ligne warn
"Abstract" -> validTextField val columnHeader ligne warn
_ -> Right warn
testErrorPerLine :: [BL.ByteString] -> ColumnDelimiter -> [Text] -> Int -> [Text] -> Either Text [Text]
testErrorPerLine [] _ [] _ warn = Right warn
testErrorPerLine _ del [] l _ | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
| otherwise = Left (pack $ "Too much field at line " <> show l)
testErrorPerLine [] _ _ l _ = Left (pack $ "Missing one field at line " <> show l)
testErrorPerLine (v:val) del (h:headers) ligne warn =
case testValue v h ligne warn of
Left _err -> Left _err
Right warning -> testErrorPerLine val del headers ligne warning
checkNextLine :: Vector BL.ByteString -> ColumnDelimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
checkNextLine bl del headers res x = do
case BL.splitWith (== toWord8 del) <$> ((V.!?) bl (x+1)) of
Nothing -> Right (x, (BL.splitWith (== toWord8 del) res))
Just value -> if length value > 1
then Right (x, (BL.splitWith (== toWord8 del) res))
else case BL.append res <$> ((V.!?) bl (x+1)) of
Nothing -> Left "checkNextLine2"
Just val -> checkNextLine bl del headers val (x+1)
getMultipleLinefile :: Vector BL.ByteString -> ColumnDelimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
getMultipleLinefile bl del headers res x = do
let tmp = BL.splitWith (== toWord8 del) res in
if length tmp == length headers
then checkNextLine bl del headers res x
else
if (length tmp > length headers) || (V.length bl == (x + 1))
then Left (pack $ "Cannot parse the file at line " <> show x <> ". Maybe because of a delimiter")
else do
case BL.append res <$> ((V.!?) bl (x+1)) of
Nothing -> Left "getMultipleLinefile"
Just val -> getMultipleLinefile bl del headers val (x+1)
-- | Check that the file is well-formed and throw warnings/errors accordingly
checkIntegrity :: Vector BL.ByteString -- ^ Individual lines of the TSV file
-> ColumnDelimiter -- ^ The delimiter used in this file (comma, tab, ...)
-> [Text] -- ^ Expected TSV headers
-> Int -- ^ Current line number
-> [Text] -- ^ Warning accumulator
-> Either Text [Text] -- ^ Left error if an error has occured, otherwise Right (list of warnings)
checkIntegrity tsvLines delim headers lineNumber warnings
| length tsvLines == lineNumber = Right warnings
| otherwise =
case (V.!?) tsvLines lineNumber of
Nothing -> Left "Gargantext.Core.Text.Corpus.Parsers.Tsv: error in function checkTsv"
Just currentLine ->
case getMultipleLinefile tsvLines delim headers currentLine lineNumber of
Left _err -> Left _err
Right (y, val) -> case testErrorPerLine val delim headers (lineNumber + 1) warnings of
Left _err -> Left _err
Right warning -> checkIntegrity tsvLines delim headers (y+1) warning
testIfErrorInFile :: [BL.ByteString] -> ColumnDelimiter -> [Text] -> Either Text [Text]
testIfErrorInFile bl del headers = checkIntegrity (V.fromList bl) del headers 2 []
testCorrectFile :: BL.ByteString -> Either Text (ColumnDelimiter, [Text])
testCorrectFile bs =
case findColumnDelimiter bs of
Left _err -> Left _err
Right del -> do
let bl = BL.splitWith (== toWord8 Newline) bs in
case getHeaders bl del of
Left _err -> Left _err
Right headers -> (\content -> (del, content)) <$> testIfErrorInFile bl del headers
-- | Check that all required headers are present for the basic TSV type
testAllHeadersPresence :: [Text] -- ^ List of headers
-> Either [Text] () -- ^ Left: list of missing headers, if any; Right: success
testAllHeadersPresence headers = do
-- Remove the \r character that sometimes appears at the end of a line
let cleanHeaders = removeText "\r" <$> headers
let missingHeaders = filter (`notElem` cleanHeaders) minimalTsvImportHeaders
if null missingHeaders
then Right ()
else Left missingHeaders
----------Test headers added to ggt -- | Test whether the given file has the given character as a delimiter
testDelimiter :: ColumnDelimiter -> BL.ByteString -> Bool
testDelimiter delimiter file = fromMaybe False $ do
-- do-notation in the Maybe monad for nice pattern matching
let rows = V.fromList $ BL.splitWith (== newline) file
headers <- BL.splitWith (== toWord8 delimiter) <$> rows !? 0
firstRow <- BL.splitWith (== toWord8 delimiter) <$> rows !? 1
return $ length headers == length firstRow && length headers > 2
-- use a map to remove \r that sometimes appear at the end of a line -- | Check a TSV file's rows starting at the given line number
testAllHeadersPresence :: [Text] -> Either Text [Text] checkFromLine :: Int -- ^ The line number from which to check
testAllHeadersPresence headers = do -> V.Vector BL.ByteString -- ^ Individual lines of the TSV file
let listHeaders = filter (`notElem` (map (T.replace (T.pack "\r") (T.pack ""))headers)) ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"] -> ColumnDelimiter -- ^ The delimiter used in this file (comma, tab, ...)
if null listHeaders -> [Text] -- ^ Expected TSV headers
then Right headers -> Either Text [Text]
else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders) -- ^ `Left <error>` if an error has occured, otherwise `Right <list of warnings>`
checkFromLine lineNumber tsvLines delim headers = do
getHeaders :: [BL.ByteString] -> ColumnDelimiter -> Either Text [Text] assertE (length tsvLines == lineNumber)
getHeaders bl del = do "Error in function checkIntegrity: queried line number is greater than actual number of lines"
let vec = V.fromList bl in currentLine <- tryE (tsvLines !? lineNumber)
case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 0) of "Error in function checkIntegrity: no line at queried number"
Nothing -> Left "Error getHeaders" (y, val) <- checkPartialRow tsvLines delim headers currentLine lineNumber
Just headers -> testAllHeadersPresence (map (\x -> T.replace (T.pack "\"") (T.pack "") (lBLToText x)) headers) currentLineWarnings <- checkRow val delim headers (lineNumber + 1)
restOfWarnings <- checkFromLine (y+1) tsvLines delim headers
return $ currentLineWarnings <> restOfWarnings
------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a) -- | Parse the row given as input. If the input is not a whole row, only part of a row,
=> proxy a -- fetch the next line and interpret it as the rest of the row
-> ColumnDelimiter checkPartialRow :: V.Vector BL.ByteString -- ^ The TSV file, split into lines
-> FilePath -> ColumnDelimiter -- ^ The type of delimiter
-> IO (Either Text (Header, Vector a)) -> [Text] -- ^ Headers
readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile -> BL.ByteString -- ^ Input row, or part of a row
-> Int -- ^ Current line number
readFileStrict :: (FromNamedRecord a) -> Either Text (Int, [BL.ByteString])
=> proxy a -- ^ Left: Error message; Right: (Line number after the row, list of cells)
-> ColumnDelimiter checkPartialRow fileLines delimiter headers partialRow lineNum = do
-> FilePath let partialCells = split delimiter partialRow
-> IO (Either Text (Header, Vector a)) if length partialCells == length headers
readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile -- We've got a full row, we can check its contents:
then Right $ checkNextLine fileLines delimiter headers partialRow lineNum
readByteStringLazy :: (FromNamedRecord a) else do
=> proxy a assertE ((length partialCells <= length headers) && (V.length fileLines < (lineNum + 1)))
-> ColumnDelimiter (T.pack $ "Cannot parse the file at line " <> show lineNum <> ". Maybe because of a delimiter")
-> BL.ByteString val <- tryE ((BL.append partialRow <$> fileLines) !? (lineNum+1))
-> Either Text (Header, Vector a) "checkPartialRow"
readByteStringLazy _f d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs checkPartialRow fileLines delimiter headers val (lineNum+1)
-- | Looks up the next line...?
checkNextLine :: V.Vector BL.ByteString -- ^ All rows
-> ColumnDelimiter -- ^ Type of delimiter
-> [Text] -- ^ Headers
-> BL.ByteString -- ^ ???
-> Int -- ^ Line number
-> (Int, [BL.ByteString])
checkNextLine allLines delimiter headers res lineNumber =
case allLines !? (lineNumber + 1) of
Nothing -> (lineNumber, split delimiter res)
Just nextLine -> let nextRow = split delimiter nextLine in
if length nextRow > 1
then (lineNumber, split delimiter res)
else checkNextLine allLines delimiter headers (res <> nextLine) (lineNumber + 1)
-- | Check for any errors or warnings in a given row
checkRow :: [BL.ByteString] -- ^ The row to check, split into cells
-> ColumnDelimiter -- ^ The type of delimiter
-> [Text] -- ^ The list of headers
-> Int -- ^ Current line; used for logging purposes
-> Either Text [Text] -- ^ List of warnings
-- Empty row & no headers: nothing to check:
checkRow [] _ [] _ = Right []
-- More cells than headers: error
checkRow _ delimiter [] line = Left $
"Too many fields at line " <> (T.pack . show) line <> "." <>
if delimiter /= Comma
then ""
else "Try using tabulation as a delimiter. Other delimiters like comma (,) may appear in some texts."
-- More headers than cells: error
checkRow [] _ _ line = Left $ "Missing one field on line " <> (T.pack . show) line
-- Still some fields and headers to consume
checkRow (v:val) delimiter (h:headers) line = do
warning <- checkValue v h line
warnings <- checkRow val delimiter headers line
return $ warning <> warnings
-- | Tests whether a given chunk of text is a valid field
checkValue :: BL.ByteString -- ^ The field to check
-> Text -- ^ Current column header; passed as input for logging purposes
-> Int -- ^ Current line number; passed as input for logging purposes
-> Either Text [Text]
checkValue field columnHeader lineNumber = case columnHeader of
"Publication Day" -> checkNumber field columnHeader lineNumber
"Publication Month" -> checkNumber field columnHeader lineNumber
"Publication Year" -> checkNumber field columnHeader lineNumber
"Authors" -> checkTextField field columnHeader lineNumber
"Title" -> checkTextField field columnHeader lineNumber
"Source" -> checkTextField field columnHeader lineNumber
"Abstract" -> checkTextField field columnHeader lineNumber
_ -> Right []
-- | Checks whether a given chunk of text is a valid number
checkNumber :: BL.ByteString -- ^ The text to parse as a number
-> Text -- ^ Current header; passed as input for logging purposes
-> Int -- ^ Current line; passed as input for logging purposes
-> Either Text [Text] -- ^ List of warnings
checkNumber field columnHeader line = do
let cleanField = removeText "\"" $ lBLToText field
value <- tryE (parseNumber cleanField) $
"Error in column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " : value is not a number "
assertE (value >= 0) $
"Value of column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " is negative"
return []
-- | Checks whether a given chunk of text is a valid text field
-- (i.e. does not have parasite quotes)
checkTextField :: BL.ByteString -- ^ The text to parse as a number
-> Text -- ^ Current header; passed as input for logging purposes
-> Int -- ^ Current line; passed as input for logging purposes
-> Either Text [Text] -- ^ List of warnings
checkTextField field columnHeader ligne = do
let cleanField = removeText "\"\"" $ lBLToText field
if T.null cleanField
then Right ["The column '" <> columnHeader <> "' at line " <> T.pack (show ligne) <> " is empty"]
else do
assertE (
-- Field is not empty...
(T.length cleanField > 0) &&
-- ... and either there are no quotes at all...
( T.length (T.filter (== '\"') cleanField) == 0 ||
-- ... or there is only one quote at the beginning and one at the end
((T.head cleanField == '"') &&
(T.last cleanField == '"') &&
(T.length (T.filter (== '\"') cleanField) == 2)
)
)) $
"Encapsulation problem at line " <> T.pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field "
return []
---------------------------------------
-- | Tsv v3 weighted for phylo
data Tsv' = Tsv'
{ tsv'_title :: !Text
, tsv'_source :: !Text
, tsv'_publication_year :: !Int
, tsv'_publication_month :: !Int
, tsv'_publication_day :: !Int
, tsv'_abstract :: !Text
, tsv'_authors :: !Text
, tsv'_weight :: !Double } deriving (Show)
readByteStringStrict :: (FromNamedRecord a)
=> proxy a
-> ColumnDelimiter
-> BS.ByteString
-> Either Text (Header, Vector a)
readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
------------------------------------------------------------------------ instance CSV.FromNamedRecord Tsv' where
-- | TODO use readFileLazy parseNamedRecord r = do
readTSVFile :: FilePath -> IO (Either Text (Header, Vector TsvDoc)) tsv'_title <- r .: "title"
readTSVFile fp = do tsv'_source <- r .: "source"
file <- BL.readFile fp tsv'_publication_year <- r .: "publication_year"
case (testCorrectFile file) of tsv'_publication_month <- r .: "publication_month"
Left _err -> pure $ Left _err tsv'_publication_day <- r .: "publication_day"
Right (del,_) -> pure $ readTsvLazyBS del file tsv'_abstract <- r .: "abstract"
tsv'_authors <- r .: "authors"
tsv'_weight <- r .: "weight"
pure $ Tsv' { .. }
-- | TODO use readByteStringLazy
readTsvLazyBS :: ColumnDelimiter
-> BL.ByteString
-> Either Text (Header, Vector TsvDoc)
readTsvLazyBS d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
------------------------------------------------------------------------ readWeightedTsv :: FilePath -> IO (CSV.Header, V.Vector Tsv')
-- | TODO use readFileLazy readWeightedTsv fp =
readTsvHal :: FilePath -> IO (Either Text (Header, Vector TsvHal)) (\bs ->
readTsvHal fp = do case CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) bs of
c <- BL.readFile fp Left e -> panicTrace (T.pack e)
pure $ readTsvHalLazyBS c Right corpus -> corpus
) <$> BL.readFile fp
-- | TODO use readByteStringLazy
readTsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalLazyBS bs = first pack $ decodeByNameWith (tsvDecodeOptions Tab) bs
readTsvHalBSStrict :: BS.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalBSStrict bs = readTsvHalLazyBS $ BL.fromStrict bs
------------------------------------------------------------------------
writeFile :: FilePath -> (Header, Vector TsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp $
encodeByNameWith (tsvEncodeOptions Tab) h (V.toList vs)
writeDocs2Tsv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Tsv fp hs = BL.writeFile fp $ hyperdataDocument2tsv hs
hyperdataDocument2tsv :: [HyperdataDocument] -> BL.ByteString -----------------------
hyperdataDocument2tsv hs = encodeByNameWith (tsvEncodeOptions Tab) minimalTsvHeader (map hyperdataDocument2tsvDoc hs)
------------------------------------------------------------------------ -- | The data of a HAL document
-- Hal Format
data TsvHal = TsvHal data TsvHal = TsvHal
{ tsvHal_title :: !Text { tsvHal_title :: !Text
, tsvHal_source :: !Text , tsvHal_source :: !Text
...@@ -420,7 +572,7 @@ data TsvHal = TsvHal ...@@ -420,7 +572,7 @@ data TsvHal = TsvHal
} }
deriving (Show) deriving (Show)
instance FromNamedRecord TsvHal where instance CSV.FromNamedRecord TsvHal where
parseNamedRecord r = do parseNamedRecord r = do
tsvHal_title <- r .: "title" tsvHal_title <- r .: "title"
tsvHal_source <- r .: "source" tsvHal_source <- r .: "source"
...@@ -443,10 +595,9 @@ instance FromNamedRecord TsvHal where ...@@ -443,10 +595,9 @@ instance FromNamedRecord TsvHal where
tsvHal_docType_s <- r .: "docType_s" tsvHal_docType_s <- r .: "docType_s"
pure $ TsvHal { .. } pure $ TsvHal { .. }
instance ToNamedRecord TsvHal where instance CSV.ToNamedRecord TsvHal where
--toNamedRecord (TsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
toNamedRecord (TsvHal { .. }) = toNamedRecord (TsvHal { .. }) =
namedRecord [ "title" .= tsvHal_title CSV.namedRecord [ "title" .= tsvHal_title
, "source" .= tsvHal_source , "source" .= tsvHal_source
, "publication_year" .= tsvHal_publication_year , "publication_year" .= tsvHal_publication_year
...@@ -472,6 +623,21 @@ instance ToNamedRecord TsvHal where ...@@ -472,6 +623,21 @@ instance ToNamedRecord TsvHal where
, "docType_s" .= tsvHal_docType_s , "docType_s" .= tsvHal_docType_s
] ]
readTsvHal :: FilePath
-> IO (Either Text (CSV.Header, V.Vector TsvHal))
readTsvHal path = do
file <- BL.readFile path
return . first T.pack $ -- Applying `first` to the error message because we need a `Text`
CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) file
parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
parseHal fp = do
r <- readTsvHal fp
pure $ V.toList . V.map tsvHal2doc . snd <$> r
tsvHal2doc :: TsvHal -> HyperdataDocument tsvHal2doc :: TsvHal -> HyperdataDocument
tsvHal2doc (TsvHal { .. }) = tsvHal2doc (TsvHal { .. }) =
HyperdataDocument { _hd_bdd = Just "TsvHal" HyperdataDocument { _hd_bdd = Just "TsvHal"
...@@ -483,7 +649,7 @@ tsvHal2doc (TsvHal { .. }) = ...@@ -483,7 +649,7 @@ tsvHal2doc (TsvHal { .. }) =
, _hd_institutes = Just tsvHal_instStructId_i , _hd_institutes = Just tsvHal_instStructId_i
, _hd_source = Just tsvHal_source , _hd_source = Just tsvHal_source
, _hd_abstract = Just tsvHal_abstract , _hd_abstract = Just tsvHal_abstract
, _hd_publication_date = Just $ pack . show $ jour tsvHal_publication_year , _hd_publication_date = Just $ T.pack . show $ jour tsvHal_publication_year
tsvHal_publication_month tsvHal_publication_month
tsvHal_publication_day tsvHal_publication_day
, _hd_publication_year = Just $ fromIntegral tsvHal_publication_year , _hd_publication_year = Just $ fromIntegral tsvHal_publication_year
...@@ -495,100 +661,3 @@ tsvHal2doc (TsvHal { .. }) = ...@@ -495,100 +661,3 @@ tsvHal2doc (TsvHal { .. }) =
, _hd_language_iso2 = Nothing , _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing } , _hd_institutes_tree = Nothing }
tsv2doc :: TsvDoc -> HyperdataDocument
tsv2doc (TsvDoc { .. })
= HyperdataDocument { _hd_bdd = Just "TsvHal"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just tsv_title
, _hd_authors = Just tsv_authors
, _hd_institutes = Nothing
, _hd_source = Just tsv_source
, _hd_abstract = Just tsv_abstract
, _hd_publication_date = Just $ pack . show $ jour (fromIntegral pubYear)
pubMonth
pubDay
, _hd_publication_year = Just pubYear
, _hd_publication_month = Just pubMonth
, _hd_publication_day = Just pubDay
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
where
pubYear = fromMIntOrDec defaultYear tsv_publication_year
pubMonth = fromMaybe defaultMonth tsv_publication_month
pubDay = fromMaybe defaultDay tsv_publication_day
------------------------------------------------------------------------
parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
parseHal fp = do
r <- readTsvHal fp
pure $ V.toList . V.map tsvHal2doc . snd <$> r
parseHal' :: BL.ByteString -> Either Text [HyperdataDocument]
parseHal' bs = V.toList . V.map tsvHal2doc . snd <$> readTsvHalLazyBS bs
------------------------------------------------------------------------
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv fp = fmap (V.toList . V.map tsv2doc . snd) <$> readTSVFile fp
{-
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = (V.toList . V.map tsv2doc . snd) <$> readTsvLazyBS Comma bs
-}
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = do
let
result = case (testCorrectFile bs) of
Left _err -> Left _err
Right (del,_) -> readTsvLazyBS del bs
V.toList . V.map tsv2doc . snd <$> result
parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity (), WarningDiagnostic)
parseTsvC bs =
(\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc, warning)) <$> eResult
where
(eResult, warning) = case (testCorrectFile bs) of
Left _err -> (Left _err, Warn.MalformedCorpus "")
Right (del,warn) -> (readTsvLazyBS del bs, Warn.MalformedCorpus (T.intercalate (pack "\n") warn))
------------------------------------------------------------------------
-- Tsv v3 weighted for phylo
data Tsv' = Tsv'
{ tsv'_title :: !Text
, tsv'_source :: !Text
, tsv'_publication_year :: !Int
, tsv'_publication_month :: !Int
, tsv'_publication_day :: !Int
, tsv'_abstract :: !Text
, tsv'_authors :: !Text
, tsv'_weight :: !Double } deriving (Show)
instance FromNamedRecord Tsv' where
parseNamedRecord r = do
tsv'_title <- r .: "title"
tsv'_source <- r .: "source"
tsv'_publication_year <- r .: "publication_year"
tsv'_publication_month <- r .: "publication_month"
tsv'_publication_day <- r .: "publication_day"
tsv'_abstract <- r .: "abstract"
tsv'_authors <- r .: "authors"
tsv'_weight <- r .: "weight"
pure $ Tsv' { .. }
readWeightedTsv :: FilePath -> IO (Header, Vector Tsv')
readWeightedTsv fp =
fmap (\bs ->
case decodeByNameWith (tsvDecodeOptions Tab) bs of
Left e -> panicTrace (pack e)
Right corpus -> corpus
) $ BL.readFile fp
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment