Commit 51c8a407 authored by Grégoire Locqueville

Reorganize TSV parsing code

parent e4dfb4bd
Pipeline #6835 canceled with stages
@@ -20,7 +20,7 @@ import Data.Tuple.Extra (both)
import Data.Vector qualified as DV
import GHC.Generics
import Gargantext.Core.Text.Context (TermList)
-import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, tsv_title, tsv_abstract, tsv_publication_year, fromMIntOrDec, defaultYear)
+import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, unIntOrDec, tsv_title, tsv_abstract, tsv_publication_year, defaultYear)
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList )
@@ -52,7 +52,7 @@ filterTermsAndCoocCLI (CorpusFile corpusFile) (TermListFile termListFile) (Outpu
      Right cf -> do
        let corpus = DM.fromListWith (<>)
                   . DV.toList
-                  . DV.map (\n -> (fromMIntOrDec defaultYear $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)]))
+                  . DV.map (\n -> (maybe defaultYear unIntOrDec $ tsv_publication_year n, [(tsv_title n) <> " " <> (tsv_abstract n)]))
                   . snd $ cf
        -- termListMap :: [Text]
...
@@ -82,8 +82,8 @@ tsvToDocs parser patterns time path =
      Wos _ -> Prelude.error "tsvToDocs: unimplemented"
      Tsv limit -> Vector.toList
        <$> Vector.take limit
-       <$> Vector.map (\row -> Document (toPhyloDate (Tsv.fromMIntOrDec Tsv.defaultYear $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
-                                        (toPhyloDate' (Tsv.fromMIntOrDec Tsv.defaultYear $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
+       <$> Vector.map (\row -> Document (toPhyloDate (maybe Tsv.defaultYear Tsv.unIntOrDec $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
+                                        (toPhyloDate' (maybe Tsv.defaultYear Tsv.unIntOrDec $ tsv_publication_year row) (fromMaybe Tsv.defaultMonth $ tsv_publication_month row) (fromMaybe Tsv.defaultDay $ tsv_publication_day row) time)
                                         (termsInText patterns $ (tsv_title row) <> " " <> (tsv_abstract row))
                                         Nothing
                                         []
...
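For reference, the old helper `fromMIntOrDec defaultYear` and the new spelling `maybe defaultYear unIntOrDec` used at the call sites above agree on both present and missing years; a quick GHCi check (illustrative only, not part of the commit):

>>> maybe defaultYear unIntOrDec (Just (IntOrDec 2005))
2005
>>> maybe defaultYear unIntOrDec Nothing
1973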
@@ -22,7 +22,7 @@ import Data.Csv ( (.:), header, decodeByNameWith, FromNamedRecord(..), Header )
import Data.Text qualified as T
import Data.Vector (Vector)
import Data.Vector qualified as Vector
-import Gargantext.Core.Text.Corpus.Parsers.TSV ( tsvDecodeOptions, ColumnDelimiter(Tab) )
+import Gargantext.Core.Text.Corpus.Parsers.TSV (defaultDecodingOptionsWithDelimiter, ColumnDelimiter(Tab) )
import Gargantext.Database.Admin.Types.Hyperdata.Contact
import Gargantext.Prelude
import System.FilePath.Posix (takeExtension)
@@ -119,7 +119,7 @@ readTSVFile_Annuaire' :: FilePath -> IO (Header, Vector IMTUser)
readTSVFile_Annuaire' = fmap readTsvHalLazyBS' . BL.readFile
  where
    readTsvHalLazyBS' :: BL.ByteString -> (Header, Vector IMTUser)
-    readTsvHalLazyBS' bs = case decodeByNameWith (tsvDecodeOptions Tab) bs of
+    readTsvHalLazyBS' bs = case decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) bs of
      Left e -> panicTrace (cs e)
      Right rows -> rows
...
@@ -12,135 +12,167 @@ TSV parser for Gargantext corpus files.
-}
-module Gargantext.Core.Text.Corpus.Parsers.TSV where
+module Gargantext.Core.Text.Corpus.Parsers.TSV
+  ( ColumnDelimiter (..)
+  , IntOrDec (..)
+  , TsvDoc (..)
+  , TsvHal (..)
+  , Tsv' (..)
+  , defaultDay
+  , defaultDecodingOptionsWithDelimiter
+  , defaultEncodingOptionsWithDelimiter
+  , defaultMonth
+  , defaultYear
+  , parseHal
+  , parseTsv
+  , parseTsvC
+  , readTSVFile
+  , readTsvHal
+  , readWeightedTsv
+  , writeDocs2Tsv
+  )
+where
-import Conduit ( ConduitT, (.|), yieldMany, mapC )
-import Data.ByteString qualified as BS
-import Data.ByteString.Lazy qualified as BL
-import Data.Csv
-import Data.Text (pack)
-import Data.Text qualified as T
-import Data.Text.Lazy qualified as TL
-import Data.Text.Lazy.Encoding qualified as TL
-import Data.Text.Read qualified as DTR
+import Conduit (ConduitT, yieldMany, mapC, (.|))
+import Data.ByteString.Lazy qualified as BL
+import Data.Csv qualified as CSV
+import Data.Csv ((.=), (.:))
+import Data.Text qualified as T
+import Data.Text.Lazy qualified as TL
+import Data.Text.Lazy.Encoding qualified as TLE
+import Data.Text.Read qualified as DTR
import Data.Time.Segment (jour)
-import Data.Vector (Vector)
import Data.Vector qualified as V
-import Gargantext.Core.Text ( sentences, unsentences )
-import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
+import Data.Vector ((!?))
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude
import Gargantext.Utils.Jobs.Error as Warn
---------------------------------
-- Utility types and functions --
---------------------------------
-- | Default publication year, used when a document provides none
defaultYear :: Int
defaultYear = 1973
-- | Default publication month (January, as in the UNIX epoch)
defaultMonth :: Int
defaultMonth = 1
-- | Default publication day (the first of the month, as in the UNIX epoch)
defaultDay :: Int
defaultDay = 1
-- | The possible delimiters for a CSV file
-data LineDelimiter = Newline deriving (Eq, Show)
data ColumnDelimiter = Tab | Comma | Semicolon deriving (Eq, Show)
-class ToWord8 a where
-  -- | Convert abstract representation into a Word8 character
-  toWord8 :: a -> Word8
-instance ToWord8 LineDelimiter where
-  toWord8 Newline = fromIntegral $ ord '\n'
-instance ToWord8 ColumnDelimiter where
-  toWord8 Tab = fromIntegral $ ord '\t'
-  toWord8 Comma = fromIntegral $ ord ','
-  toWord8 Semicolon = fromIntegral $ ord ';'
----------------------------------------------------------------
--- | Minimal header for a working TSV import
-minimalTsvHeader :: Header
-minimalTsvHeader =
-  header [ "title"
-         , "source"
-         , "publication_year"
-         , "publication_month"
-         , "publication_day"
-         , "abstract"
-         , "authors"
-         ]
+-- | Concrete representation of the above type
+toWord8 :: ColumnDelimiter -> Word8
+toWord8 Tab = fromIntegral $ ord '\t'
+toWord8 Comma = fromIntegral $ ord ','
+toWord8 Semicolon = fromIntegral $ ord ';'
+-- | Just the newline character in `Word8` form; handy to have around
+newline :: Word8
+newline = fromIntegral $ ord '\n'
+-- | Split a text depending on the delimiter type
+split :: ColumnDelimiter -> BL.ByteString -> [BL.ByteString]
+split delimiter txt = BL.splitWith (== toWord8 delimiter) txt
+-- | Remove all occurrences of `needle` in `haystack`
+removeText :: T.Text -- ^ The text to remove (the `needle`)
+           -> T.Text -- ^ The original text (the `haystack`)
+           -> T.Text
+removeText needle haystack = T.replace needle "" haystack
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc m splt doc =
let docSize = (T.length $ tsv_abstract doc) in
if (docSize > 1000) && (mod (round m) docSize >= 10)
then splitDoc' splt doc
else V.fromList [doc]
where
splitDoc' :: SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc' contextSize (TsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
where
firstDoc = TsvDoc { tsv_abstract = firstAbstract, .. }
firstAbstract = head' "splitDoc'1" abstracts
-        nextDocs = map (\txt -> TsvDoc { tsv_title = head' "splitDoc'2" $ sentences txt
-                                       , tsv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
-                                       , .. }
-                       ) (tail' "splitDoc'2" abstracts)
-        abstracts = (splitBy $ contextSize) tsv_abstract
+-- | Convert from raw byte string into `Text`
+lBLToText :: BL.ByteString -> Text
+lBLToText b = TL.toStrict $ TLE.decodeUtf8 b
+-- | Fail in the Either monad iff a predicate is False
+assertE :: Bool -- ^ The predicate to be tested
+        -> e    -- ^ The error if the predicate fails
+        -> Either e ()
+assertE predicate err = if predicate then Right () else Left err
+-- | Try getting a value out of a `Maybe`; if it does not work,
+-- return an error message
+tryE :: Maybe a    -- ^ The value to extract
+     -> e          -- ^ The error message
+     -> Either e a -- ^ The extracted value, in the `Either` monad
+tryE Nothing err = Left err
+tryE (Just value) _ = Right value
+-- | Tests whether the input text can be parsed as a number
+parseNumber :: Text -> Maybe Int
+parseNumber str = case DTR.decimal str of
+  Right (x, "") -> Just x
+  _             -> Nothing
+-- | An integer that may be the result of parsing either an integer value
+-- or a floating-point value. If the parsed value was a floating-point number,
+-- it is cropped to take only its integer part.
+newtype IntOrDec = IntOrDec { unIntOrDec :: Int }
+  deriving (Show, Eq, Read)
+instance CSV.FromField IntOrDec where
-- | Try parsing the field as an `Int`; if it fails, parse it as a `Double`
parseField s = case CSV.runParser (CSV.parseField s :: CSV.Parser Int) of
Left _ -> IntOrDec . floor <$> (CSV.parseField s :: CSV.Parser Double)
Right n -> pure $ IntOrDec n
instance CSV.ToField IntOrDec where
toField = CSV.toField . unIntOrDec
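The `FromField` instance above first tries to parse the field as an `Int` and falls back to `Double`, cropping to the integer part. Illustratively (GHCi, with OverloadedStrings for the field literals; not part of the commit):

>>> CSV.runParser (CSV.parseField "2005" :: CSV.Parser IntOrDec)
Right (IntOrDec {unIntOrDec = 2005})
>>> CSV.runParser (CSV.parseField "2005.7" :: CSV.Parser IntOrDec)
Right (IntOrDec {unIntOrDec = 2005})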
---------------------------------------------------------------
---------------------------------------------------------------
type Mean = Double
-docsSize :: Vector TsvDoc -> Mean
-docsSize tsvDoc = mean ls
-  where
-    ls = V.toList $ V.map (fromIntegral . T.length . tsv_abstract) tsvDoc
+-- | Default options for TSV encoding, except for the delimiter,
+-- which is passed as argument
+defaultEncodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.EncodeOptions
+defaultEncodingOptionsWithDelimiter d = CSV.defaultEncodeOptions {CSV.encDelimiter = toWord8 d}
+-- | Default options for TSV decoding, except for the delimiter,
+-- which is passed as argument
+defaultDecodingOptionsWithDelimiter :: ColumnDelimiter -> CSV.DecodeOptions
+defaultDecodingOptionsWithDelimiter d = CSV.defaultDecodeOptions {CSV.decDelimiter = toWord8 d}
--------------------
-- Basic TSV type --
--------------------
---------------------------------------------------------------
newtype IntOrDec = IntOrDec Int
deriving (Show, Eq, Read)
unIntOrDec :: IntOrDec -> Int
unIntOrDec (IntOrDec i) = i
instance FromField IntOrDec where
parseField s = case runParser (parseField s :: Parser Int) of
Left _err -> IntOrDec . floor <$> (parseField s :: Parser Double)
Right n -> pure $ IntOrDec n
instance ToField IntOrDec where
toField (IntOrDec i) = toField i
fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
defaultYear :: Int
defaultYear = 1973
defaultMonth :: Int
defaultMonth = 1
defaultDay :: Int
defaultDay = 1
data TsvDoc = TsvDoc
  { tsv_title :: !Text
  , tsv_source :: !Text
  , tsv_publication_year :: !(Maybe IntOrDec)
  , tsv_publication_month :: !(Maybe Int)
  , tsv_publication_day :: !(Maybe Int)
  , tsv_abstract :: !Text
  , tsv_authors :: !Text
  }
  deriving (Show)
-instance FromNamedRecord TsvDoc where
+instance CSV.FromNamedRecord TsvDoc where
  parseNamedRecord r = do
    tsv_title <- r .: "title" <|> r .: "Title"
    tsv_source <- r .: "source" <|> r .: "Source"
    tsv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
    tsv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
    tsv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
    tsv_abstract <- r .: "abstract" <|> r .: "Abstract"
    tsv_authors <- r .: "authors" <|> r .: "Authors"
    pure $ TsvDoc { .. }
-instance ToNamedRecord TsvDoc where
+instance CSV.ToNamedRecord TsvDoc where
  toNamedRecord (TsvDoc{ .. }) =
-    namedRecord [ "title" .= tsv_title
+    CSV.namedRecord [ "title" .= tsv_title
                , "source" .= tsv_source
                , "publication_year" .= tsv_publication_year
                , "publication_month" .= tsv_publication_month
@@ -149,278 +181,398 @@ instance ToNamedRecord TsvDoc where
                , "authors" .= tsv_authors
                ]
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
tsv2doc :: TsvDoc -> HyperdataDocument
tsv2doc (TsvDoc { .. })
= HyperdataDocument { _hd_bdd = Just "TsvHal"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just tsv_title
, _hd_authors = Just tsv_authors
, _hd_institutes = Nothing
, _hd_source = Just tsv_source
, _hd_abstract = Just tsv_abstract
, _hd_publication_date = Just $ T.pack . show $ jour (fromIntegral pubYear)
pubMonth
pubDay
, _hd_publication_year = Just pubYear
, _hd_publication_month = Just pubMonth
, _hd_publication_day = Just pubDay
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
where
pubYear = maybe defaultYear unIntOrDec tsv_publication_year
pubMonth = fromMaybe defaultMonth tsv_publication_month
pubDay = fromMaybe defaultDay tsv_publication_day
-- | Minimal header for a working TSV import
-- TODO why is this so much different from `minimalExportHeader`?
minimalTsvImportHeaders :: [Text]
minimalTsvImportHeaders = [ "Publication Day"
, "Publication Month"
, "Publication Year"
, "Authors"
, "Title"
, "Source"
, "Abstract"
]
-- | Minimal header for a working TSV export
-- TODO why is this so much different from `minimalImportHeader`?
minimalTsvExportHeader :: CSV.Header
minimalTsvExportHeader =
CSV.header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
-- | Parse a TSV file into the native HyperdataDocument datatype.
-- Returns `Left <error message>` upon failure
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv filepath = do
eitherTsvFile <- readTSVFile filepath
return $ do
(_header, tsvDocs) <- eitherTsvFile
let hyperdataDocs = tsv2doc <$> tsvDocs
return $ V.toList hyperdataDocs
-- | Conduit version of `parseTsv`
parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity (), WarningDiagnostic)
parseTsvC tsvFile =
(\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc, warning)) <$> eResult
where
(eResult, warning) = case testCorrectFile tsvFile of
Left err -> (Left err, Warn.MalformedCorpus "")
Right (del,warn) -> (readTsvLazyBS del tsvFile, Warn.MalformedCorpus (T.intercalate (T.pack "\n") warn))
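One possible way to consume the conduit returned by `parseTsvC` (a sketch only; `collectTsvDocs` is a hypothetical helper, not part of this module):

import Conduit (runConduitPure, sinkList, (.|))

collectTsvDocs :: BL.ByteString -> Either Text [HyperdataDocument]
collectTsvDocs bs =
  (\(_count, docsC, _warn) -> runConduitPure (docsC .| sinkList)) <$> parseTsvC bs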
-- | TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (CSV.Header, V.Vector TsvDoc))
readTSVFile fp = do
file <- BL.readFile fp
case testCorrectFile file of
Left _err -> pure $ Left _err
Right (del,_) -> pure $ readTsvLazyBS del file
-- | Write a TSV file containing a collection of hyperdata documents
writeDocs2Tsv :: FilePath -- ^ The output filepath
-> [HyperdataDocument] -- ^ The hyperdata documents to write
-> IO ()
writeDocs2Tsv filepath hyperdataDocs = do
let tsvDocs = hyperdataDocument2tsvDoc <$> hyperdataDocs
let fileToWrite = CSV.encodeByNameWith (defaultEncodingOptionsWithDelimiter Tab)
minimalTsvExportHeader
tsvDocs
BL.writeFile filepath fileToWrite
-- | Convert between the native hyperdata datatype
-- and the transition TsvDoc datatype
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h =
TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
  where
    m = maybe "" identity
    mI = maybe 0 identity
-tsvDecodeOptions :: ColumnDelimiter -> DecodeOptions
-tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = toWord8 d}
-tsvEncodeOptions :: ColumnDelimiter -> EncodeOptions
-tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = toWord8 d}
+-------------------------------------
------------------------------------------------------------------------
readTsvLazyBS :: ColumnDelimiter
-> BL.ByteString
-> Either Text (CSV.Header, V.Vector TsvDoc)
readTsvLazyBS d bs = first T.pack $ CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter d) bs
testDelimiter :: ColumnDelimiter -> BL.ByteString -> Bool
testDelimiter del bs =
let x = BL.splitWith (== toWord8 Newline) bs
vec = V.fromList x in
case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 0) of
Nothing -> False
Just e -> case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 1) of
Nothing -> False
Just f -> length e == length f && length e > 2
testCorrectFile :: BL.ByteString -> Either Text (ColumnDelimiter, [Text])
testCorrectFile file = do
delimiter <- findColumnDelimiter file
let rows = BL.splitWith (== newline) file
headers <- getHeaders rows delimiter
(\content -> (delimiter, content)) <$> checkRows rows delimiter headers
-- | Given a delimiter type and a list of headers, checks that the rows
-- of the input CSV/TSV file, split into lines, are valid. If they are, return
-- the list of warnings encountered during checking.
-- Note that the first line (the header line) must be included,
-- but will not be checked here.
checkRows :: [BL.ByteString] -- ^ The file, split in rows
-> ColumnDelimiter -- ^ The purported delimiter type
-> [Text] -- ^ The headers
-> Either Text [Text]
checkRows fileLines delimiter headers =
checkFromLine 2 (V.fromList fileLines) delimiter headers
-- | Detect which delimiter a file is using; if detection fails, an error message
-- is output inside the `Left` constructor
findColumnDelimiter :: BL.ByteString -> Either Text ColumnDelimiter
findColumnDelimiter bs
  | testDelimiter Tab bs = Right Tab
  | testDelimiter Comma bs = Right Comma
-  | otherwise = Left (pack "Problem with the delimiter : be sure that the delimiter is a tabulation for each line")
+  | testDelimiter Semicolon bs = Right Semicolon
+  | otherwise = Left (T.pack "Couldn't interpret file as a valid CSV/TSV file, no matter the delimiter (comma, tab or semicolon)")
isNumeric :: Text -> Either Bool Int
isNumeric str = case DTR.decimal str of
Right (x,y) -> if y == ""
then Right x
else Left False
Left _ -> Left False
-lBLToText :: BL.ByteString -> Text
-lBLToText b = TL.toStrict $ TL.decodeUtf8 b
-validNumber :: BL.ByteString -> Text -> Int -> [Text] -> Either Text [Text]
-validNumber x columnHeader ligne warn = do
-  let number = T.replace (T.pack "\"") (T.pack "") (lBLToText x)
-  case isNumeric number of
-    Right val
-      | val < 0 -> Left $ ("Value of column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is negative")
-      | otherwise -> Right warn
-    Left _ -> Left $ ("Error in column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " : value is not a number ")
+-- | Get a TSV file's header fields
+getHeaders :: [BL.ByteString]    -- ^ The input file, split into lines
+           -> ColumnDelimiter    -- ^ This CSV/TSV's delimiter
+           -> Either Text [Text] -- ^ Left: error message; Right: the input file's headers
+getHeaders rows delimiter =
+  case fmap lBLToText . BL.splitWith (== toWord8 delimiter) <$> V.fromList rows !? 0 of
+    Nothing -> Left "Error in function `getHeaders`: Empty file"
+    Just headers -> case testAllHeadersPresence $ removeText "\"" <$> headers of
+      Left missing -> Left $ "Error in function `getHeaders`: Missing headers: " <> T.intercalate ", " missing
+      Right () -> Right headers
validTextField :: BL.ByteString -> Text -> Int -> [Text] -> Either Text [Text]
validTextField x columnHeader ligne warn = do
let xs = T.replace (T.pack "\"\"") (T.pack "") (lBLToText x) in
if not (T.null xs)
then
if (T.length xs > 0) && ((T.length (T.filter (== '\"') xs) == 0) || ((T.head xs == '"') && (T.last xs == '"') && (T.length (T.filter (== '\"') xs) == 2)))
then Right warn
else Left $ ("Encapsulation problem at line " <> pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field ")
else Right ( ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty") : warn)
-- else Left $ ("The column '" <> columnHeader <> "' at line " <> pack (show ligne) <> " is empty")
-- Put a warning for the user to know their is a problem (empty column)
testValue :: BL.ByteString -> Text -> Int -> [Text] -> Either Text [Text]
testValue val columnHeader ligne warn = case columnHeader of
"Publication Day" -> validNumber val columnHeader ligne warn
"Publication Month" -> validNumber val columnHeader ligne warn
"Publication Year" -> validNumber val columnHeader ligne warn
"Authors" -> validTextField val columnHeader ligne warn
"Title" -> validTextField val columnHeader ligne warn
"Source" -> validTextField val columnHeader ligne warn
"Abstract" -> validTextField val columnHeader ligne warn
_ -> Right warn
testErrorPerLine :: [BL.ByteString] -> ColumnDelimiter -> [Text] -> Int -> [Text] -> Either Text [Text]
testErrorPerLine [] _ [] _ warn = Right warn
testErrorPerLine _ del [] l _ | del == Comma = Left (pack $ "Too much field at line " <> show l <> ". Try using tabulation as a delimiter. Other delimiter like comma (,) may appear in some text.")
| otherwise = Left (pack $ "Too much field at line " <> show l)
testErrorPerLine [] _ _ l _ = Left (pack $ "Missing one field at line " <> show l)
testErrorPerLine (v:val) del (h:headers) ligne warn =
case testValue v h ligne warn of
Left _err -> Left _err
Right warning -> testErrorPerLine val del headers ligne warning
checkNextLine :: Vector BL.ByteString -> ColumnDelimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
checkNextLine bl del headers res x = do
case BL.splitWith (== toWord8 del) <$> ((V.!?) bl (x+1)) of
Nothing -> Right (x, (BL.splitWith (== toWord8 del) res))
Just value -> if length value > 1
then Right (x, (BL.splitWith (== toWord8 del) res))
else case BL.append res <$> ((V.!?) bl (x+1)) of
Nothing -> Left "checkNextLine2"
Just val -> checkNextLine bl del headers val (x+1)
getMultipleLinefile :: Vector BL.ByteString -> ColumnDelimiter -> [Text] -> BL.ByteString -> Int -> Either Text (Int,[BL.ByteString])
getMultipleLinefile bl del headers res x = do
let tmp = BL.splitWith (== toWord8 del) res in
if length tmp == length headers
then checkNextLine bl del headers res x
else
if (length tmp > length headers) || (V.length bl == (x + 1))
then Left (pack $ "Cannot parse the file at line " <> show x <> ". Maybe because of a delimiter")
else do
case BL.append res <$> ((V.!?) bl (x+1)) of
Nothing -> Left "getMultipleLinefile"
Just val -> getMultipleLinefile bl del headers val (x+1)
-- | Check that the file is well-formed and throw warnings/errors accordingly
checkIntegrity :: Vector BL.ByteString -- ^ Individual lines of the TSV file
-> ColumnDelimiter -- ^ The delimiter used in this file (comma, tab, ...)
-> [Text] -- ^ Expected TSV headers
-> Int -- ^ Current line number
-> [Text] -- ^ Warning accumulator
-> Either Text [Text] -- ^ Left error if an error has occured, otherwise Right (list of warnings)
checkIntegrity tsvLines delim headers lineNumber warnings
| length tsvLines == lineNumber = Right warnings
| otherwise =
case (V.!?) tsvLines lineNumber of
Nothing -> Left "Gargantext.Core.Text.Corpus.Parsers.Tsv: error in function checkTsv"
Just currentLine ->
case getMultipleLinefile tsvLines delim headers currentLine lineNumber of
Left _err -> Left _err
Right (y, val) -> case testErrorPerLine val delim headers (lineNumber + 1) warnings of
Left _err -> Left _err
Right warning -> checkIntegrity tsvLines delim headers (y+1) warning
testIfErrorInFile :: [BL.ByteString] -> ColumnDelimiter -> [Text] -> Either Text [Text]
testIfErrorInFile bl del headers = checkIntegrity (V.fromList bl) del headers 2 []
testCorrectFile :: BL.ByteString -> Either Text (ColumnDelimiter, [Text])
testCorrectFile bs =
case findColumnDelimiter bs of
Left _err -> Left _err
Right del -> do
let bl = BL.splitWith (== toWord8 Newline) bs in
case getHeaders bl del of
Left _err -> Left _err
Right headers -> (\content -> (del, content)) <$> testIfErrorInFile bl del headers
-- | Check that all required headers are present for the basic TSV type
testAllHeadersPresence :: [Text] -- ^ List of headers
-> Either [Text] () -- ^ Left: list of missing headers, if any; Right: success
testAllHeadersPresence headers = do
-- Remove the \r character that sometimes appears at the end of a line
let cleanHeaders = removeText "\r" <$> headers
let missingHeaders = filter (`notElem` cleanHeaders) minimalTsvImportHeaders
if null missingHeaders
then Right ()
else Left missingHeaders
----------Test headers added to ggt
-- | Test whether the given file has the given character as a delimiter
testDelimiter :: ColumnDelimiter -> BL.ByteString -> Bool
testDelimiter delimiter file = fromMaybe False $ do
-- do-notation in the Maybe monad for nice pattern matching
let rows = V.fromList $ BL.splitWith (== newline) file
headers <- BL.splitWith (== toWord8 delimiter) <$> rows !? 0
firstRow <- BL.splitWith (== toWord8 delimiter) <$> rows !? 1
return $ length headers == length firstRow && length headers > 2
-- | Check a TSV file's rows starting at the given line number
checkFromLine :: Int -- ^ The line number from which to check
-> V.Vector BL.ByteString -- ^ Individual lines of the TSV file
-> ColumnDelimiter -- ^ The delimiter used in this file (comma, tab, ...)
-> [Text] -- ^ Expected TSV headers
-> Either Text [Text]
-- ^ `Left <error>` if an error has occured, otherwise `Right <list of warnings>`
checkFromLine lineNumber tsvLines delim headers = do
assertE (length tsvLines == lineNumber)
"Error in function checkIntegrity: queried line number is greater than actual number of lines"
currentLine <- tryE (tsvLines !? lineNumber)
"Error in function checkIntegrity: no line at queried number"
(y, val) <- checkPartialRow tsvLines delim headers currentLine lineNumber
currentLineWarnings <- checkRow val delim headers (lineNumber + 1)
restOfWarnings <- checkFromLine (y+1) tsvLines delim headers
return $ currentLineWarnings <> restOfWarnings
-- | Parse the row given as input. If the input is not a whole row, only part of a row,
-- fetch the next line and interpret it as the rest of the row
checkPartialRow :: V.Vector BL.ByteString -- ^ The TSV file, split into lines
-> ColumnDelimiter -- ^ The type of delimiter
-> [Text] -- ^ Headers
-> BL.ByteString -- ^ Input row, or part of a row
-> Int -- ^ Current line number
-> Either Text (Int, [BL.ByteString])
-- ^ Left: Error message; Right: (Line number after the row, list of cells)
checkPartialRow fileLines delimiter headers partialRow lineNum = do
let partialCells = split delimiter partialRow
if length partialCells == length headers
-- We've got a full row, we can check its contents:
then Right $ checkNextLine fileLines delimiter headers partialRow lineNum
else do
assertE ((length partialCells <= length headers) && (V.length fileLines < (lineNum + 1)))
(T.pack $ "Cannot parse the file at line " <> show lineNum <> ". Maybe because of a delimiter")
val <- tryE ((BL.append partialRow <$> fileLines) !? (lineNum+1))
"checkPartialRow"
checkPartialRow fileLines delimiter headers val (lineNum+1)
-- | Look at the line after the current row: if it starts a new row, the current
-- row is complete; otherwise append it to the current row and keep looking
checkNextLine :: V.Vector BL.ByteString -- ^ All rows
-> ColumnDelimiter -- ^ Type of delimiter
-> [Text] -- ^ Headers
-> BL.ByteString -- ^ The row accumulated so far
-> Int -- ^ Line number
-> (Int, [BL.ByteString])
checkNextLine allLines delimiter headers res lineNumber =
case allLines !? (lineNumber + 1) of
Nothing -> (lineNumber, split delimiter res)
Just nextLine -> let nextRow = split delimiter nextLine in
if length nextRow > 1
then (lineNumber, split delimiter res)
else checkNextLine allLines delimiter headers (res <> nextLine) (lineNumber + 1)
-- | Check for any errors or warnings in a given row
checkRow :: [BL.ByteString] -- ^ The row to check, split into cells
-> ColumnDelimiter -- ^ The type of delimiter
-> [Text] -- ^ The list of headers
-> Int -- ^ Current line; used for logging purposes
-> Either Text [Text] -- ^ List of warnings
-- Empty row & no headers: nothing to check:
checkRow [] _ [] _ = Right []
-- More cells than headers: error
checkRow _ delimiter [] line = Left $
"Too many fields at line " <> (T.pack . show) line <> "." <>
if delimiter /= Comma
then ""
else "Try using tabulation as a delimiter. Other delimiters like comma (,) may appear in some texts."
-- More headers than cells: error
checkRow [] _ _ line = Left $ "Missing one field on line " <> (T.pack . show) line
-- Still some fields and headers to consume
checkRow (v:val) delimiter (h:headers) line = do
warning <- checkValue v h line
warnings <- checkRow val delimiter headers line
return $ warning <> warnings
-- | Tests whether a given chunk of text is a valid field
checkValue :: BL.ByteString -- ^ The field to check
-> Text -- ^ Current column header; passed as input for logging purposes
-> Int -- ^ Current line number; passed as input for logging purposes
-> Either Text [Text]
checkValue field columnHeader lineNumber = case columnHeader of
"Publication Day" -> checkNumber field columnHeader lineNumber
"Publication Month" -> checkNumber field columnHeader lineNumber
"Publication Year" -> checkNumber field columnHeader lineNumber
"Authors" -> checkTextField field columnHeader lineNumber
"Title" -> checkTextField field columnHeader lineNumber
"Source" -> checkTextField field columnHeader lineNumber
"Abstract" -> checkTextField field columnHeader lineNumber
_ -> Right []
-- | Checks whether a given chunk of text is a valid number
checkNumber :: BL.ByteString -- ^ The text to parse as a number
-> Text -- ^ Current header; passed as input for logging purposes
-> Int -- ^ Current line; passed as input for logging purposes
-> Either Text [Text] -- ^ List of warnings
checkNumber field columnHeader line = do
let cleanField = removeText "\"" $ lBLToText field
value <- tryE (parseNumber cleanField) $
"Error in column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " : value is not a number "
assertE (value >= 0) $
"Value of column '" <> columnHeader <> "' at line " <> T.pack (show line) <> " is negative"
return []
-- | Checks whether a given chunk of text is a valid text field
-- (i.e. does not have parasite quotes)
checkTextField :: BL.ByteString -- ^ The text to parse as a number
-> Text -- ^ Current header; passed as input for logging purposes
-> Int -- ^ Current line; passed as input for logging purposes
-> Either Text [Text] -- ^ List of warnings
checkTextField field columnHeader ligne = do
let cleanField = removeText "\"\"" $ lBLToText field
if T.null cleanField
then Right ["The column '" <> columnHeader <> "' at line " <> T.pack (show ligne) <> " is empty"]
else do
assertE (
-- Field is not empty...
(T.length cleanField > 0) &&
-- ... and either there are no quotes at all...
( T.length (T.filter (== '\"') cleanField) == 0 ||
-- ... or there is only one quote at the beginning and one at the end
((T.head cleanField == '"') &&
(T.last cleanField == '"') &&
(T.length (T.filter (== '\"') cleanField) == 2)
)
)) $
"Encapsulation problem at line " <> T.pack (show ligne) <> " in column '" <> columnHeader <> "' : the caracter \" must only appear at the beginning and the end of a field "
return []
---------------------------------------
-- | Tsv v3 weighted for phylo
data Tsv' = Tsv'
{ tsv'_title :: !Text
, tsv'_source :: !Text
, tsv'_publication_year :: !Int
, tsv'_publication_month :: !Int
, tsv'_publication_day :: !Int
, tsv'_abstract :: !Text
, tsv'_authors :: !Text
, tsv'_weight :: !Double } deriving (Show)
-- use a map to remove \r that sometimes appear at the end of a line
-testAllHeadersPresence :: [Text] -> Either Text [Text]
-testAllHeadersPresence headers = do
-  let listHeaders = filter (`notElem` (map (T.replace (T.pack "\r") (T.pack ""))headers)) ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
-  if null listHeaders
-    then Right headers
-    else Left ((pack " Missing column : ") <> T.intercalate ", " listHeaders)
-getHeaders :: [BL.ByteString] -> ColumnDelimiter -> Either Text [Text]
-getHeaders bl del = do
-  let vec = V.fromList bl in
-    case BL.splitWith (== toWord8 del) <$> ((V.!?) vec 0) of
-      Nothing -> Left "Error getHeaders"
-      Just headers -> testAllHeadersPresence (map (\x -> T.replace (T.pack "\"") (T.pack "") (lBLToText x)) headers)
+instance CSV.FromNamedRecord Tsv' where
+  parseNamedRecord r = do
+    tsv'_title <- r .: "title"
+    tsv'_source <- r .: "source"
+    tsv'_publication_year <- r .: "publication_year"
+    tsv'_publication_month <- r .: "publication_month"
+    tsv'_publication_day <- r .: "publication_day"
+    tsv'_abstract <- r .: "abstract"
+    tsv'_authors <- r .: "authors"
+    tsv'_weight <- r .: "weight"
+    pure $ Tsv' { .. }
------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a)
=> proxy a
-> ColumnDelimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
readFileStrict :: (FromNamedRecord a)
=> proxy a
-> ColumnDelimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a)
=> proxy a
-> ColumnDelimiter
-> BL.ByteString
-> Either Text (Header, Vector a)
readByteStringLazy _f d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
readByteStringStrict :: (FromNamedRecord a)
=> proxy a
-> ColumnDelimiter
-> BS.ByteString
-> Either Text (Header, Vector a)
readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
------------------------------------------------------------------------
-- | TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (Header, Vector TsvDoc))
readTSVFile fp = do
file <- BL.readFile fp
case (testCorrectFile file) of
Left _err -> pure $ Left _err
Right (del,_) -> pure $ readTsvLazyBS del file
-- | TODO use readByteStringLazy
readTsvLazyBS :: ColumnDelimiter
-> BL.ByteString
-> Either Text (Header, Vector TsvDoc)
readTsvLazyBS d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
-------------------------------------------------------------------------
--- | TODO use readFileLazy
-readTsvHal :: FilePath -> IO (Either Text (Header, Vector TsvHal))
-readTsvHal fp = do
-  c <- BL.readFile fp
-  pure $ readTsvHalLazyBS c
+readWeightedTsv :: FilePath -> IO (CSV.Header, V.Vector Tsv')
+readWeightedTsv fp =
+  (\bs ->
+    case CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) bs of
+      Left e -> panicTrace (T.pack e)
+      Right corpus -> corpus
+  ) <$> BL.readFile fp
-- | TODO use readByteStringLazy
readTsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalLazyBS bs = first pack $ decodeByNameWith (tsvDecodeOptions Tab) bs
readTsvHalBSStrict :: BS.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalBSStrict bs = readTsvHalLazyBS $ BL.fromStrict bs
------------------------------------------------------------------------
writeFile :: FilePath -> (Header, Vector TsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp $
encodeByNameWith (tsvEncodeOptions Tab) h (V.toList vs)
writeDocs2Tsv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Tsv fp hs = BL.writeFile fp $ hyperdataDocument2tsv hs
-hyperdataDocument2tsv :: [HyperdataDocument] -> BL.ByteString
-hyperdataDocument2tsv hs = encodeByNameWith (tsvEncodeOptions Tab) minimalTsvHeader (map hyperdataDocument2tsvDoc hs)
-------------------------------------------------------------------------
--- Hal Format
+-----------------------
+-- | The data of a HAL document
data TsvHal = TsvHal
  { tsvHal_title :: !Text
  , tsvHal_source :: !Text
  , tsvHal_publication_year :: !Integer
  , tsvHal_publication_month :: !Int
  , tsvHal_publication_day :: !Int
  , tsvHal_abstract :: !Text
  , tsvHal_authors :: !Text
  , tsvHal_url :: !Text
  , tsvHal_isbn_s :: !Text
  , tsvHal_issue_s :: !Text
  , tsvHal_journalPublisher_s:: !Text
  , tsvHal_language_s :: !Text
  , tsvHal_doiId_s :: !Text
  , tsvHal_authId_i :: !Text
  , tsvHal_instStructId_i :: !Text
  , tsvHal_deptStructId_i :: !Text
  , tsvHal_labStructId_i :: !Text
  , tsvHal_rteamStructId_i :: !Text
  , tsvHal_docType_s :: !Text
  }
  deriving (Show)
-instance FromNamedRecord TsvHal where
+instance CSV.FromNamedRecord TsvHal where
  parseNamedRecord r = do
    tsvHal_title <- r .: "title"
    tsvHal_source <- r .: "source"
@@ -443,10 +595,9 @@ instance FromNamedRecord TsvHal where
    tsvHal_docType_s <- r .: "docType_s"
    pure $ TsvHal { .. }
-instance ToNamedRecord TsvHal where
---toNamedRecord (TsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
+instance CSV.ToNamedRecord TsvHal where
  toNamedRecord (TsvHal { .. }) =
-    namedRecord [ "title" .= tsvHal_title
+    CSV.namedRecord [ "title" .= tsvHal_title
                , "source" .= tsvHal_source
                , "publication_year" .= tsvHal_publication_year
@@ -472,6 +623,21 @@ instance ToNamedRecord TsvHal where
                , "docType_s" .= tsvHal_docType_s
                ]
readTsvHal :: FilePath
-> IO (Either Text (CSV.Header, V.Vector TsvHal))
readTsvHal path = do
file <- BL.readFile path
return . first T.pack $ -- Applying `first` to the error message because we need a `Text`
CSV.decodeByNameWith (defaultDecodingOptionsWithDelimiter Tab) file
parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
parseHal fp = do
r <- readTsvHal fp
pure $ V.toList . V.map tsvHal2doc . snd <$> r
tsvHal2doc :: TsvHal -> HyperdataDocument
tsvHal2doc (TsvHal { .. }) =
  HyperdataDocument { _hd_bdd = Just "TsvHal"
@@ -483,7 +649,7 @@ tsvHal2doc (TsvHal { .. }) =
                    , _hd_institutes = Just tsvHal_instStructId_i
                    , _hd_source = Just tsvHal_source
                    , _hd_abstract = Just tsvHal_abstract
-                    , _hd_publication_date = Just $ pack . show $ jour tsvHal_publication_year
+                    , _hd_publication_date = Just $ T.pack . show $ jour tsvHal_publication_year
                                                     tsvHal_publication_month
                                                     tsvHal_publication_day
                    , _hd_publication_year = Just $ fromIntegral tsvHal_publication_year
@@ -495,100 +661,3 @@ tsvHal2doc (TsvHal { .. }) =
                    , _hd_language_iso2 = Nothing
                    , _hd_institutes_tree = Nothing }
tsv2doc :: TsvDoc -> HyperdataDocument
tsv2doc (TsvDoc { .. })
= HyperdataDocument { _hd_bdd = Just "TsvHal"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just tsv_title
, _hd_authors = Just tsv_authors
, _hd_institutes = Nothing
, _hd_source = Just tsv_source
, _hd_abstract = Just tsv_abstract
, _hd_publication_date = Just $ pack . show $ jour (fromIntegral pubYear)
pubMonth
pubDay
, _hd_publication_year = Just pubYear
, _hd_publication_month = Just pubMonth
, _hd_publication_day = Just pubDay
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
where
pubYear = fromMIntOrDec defaultYear tsv_publication_year
pubMonth = fromMaybe defaultMonth tsv_publication_month
pubDay = fromMaybe defaultDay tsv_publication_day
------------------------------------------------------------------------
parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
parseHal fp = do
r <- readTsvHal fp
pure $ V.toList . V.map tsvHal2doc . snd <$> r
parseHal' :: BL.ByteString -> Either Text [HyperdataDocument]
parseHal' bs = V.toList . V.map tsvHal2doc . snd <$> readTsvHalLazyBS bs
------------------------------------------------------------------------
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv fp = fmap (V.toList . V.map tsv2doc . snd) <$> readTSVFile fp
{-
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = (V.toList . V.map tsv2doc . snd) <$> readTsvLazyBS Comma bs
-}
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = do
let
result = case (testCorrectFile bs) of
Left _err -> Left _err
Right (del,_) -> readTsvLazyBS del bs
V.toList . V.map tsv2doc . snd <$> result
parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity (), WarningDiagnostic)
parseTsvC bs =
(\(_h, rs) -> (fromIntegral $ V.length rs, yieldMany rs .| mapC tsv2doc, warning)) <$> eResult
where
(eResult, warning) = case (testCorrectFile bs) of
Left _err -> (Left _err, Warn.MalformedCorpus "")
Right (del,warn) -> (readTsvLazyBS del bs, Warn.MalformedCorpus (T.intercalate (pack "\n") warn))
------------------------------------------------------------------------
-- Tsv v3 weighted for phylo
data Tsv' = Tsv'
{ tsv'_title :: !Text
, tsv'_source :: !Text
, tsv'_publication_year :: !Int
, tsv'_publication_month :: !Int
, tsv'_publication_day :: !Int
, tsv'_abstract :: !Text
, tsv'_authors :: !Text
, tsv'_weight :: !Double } deriving (Show)
instance FromNamedRecord Tsv' where
parseNamedRecord r = do
tsv'_title <- r .: "title"
tsv'_source <- r .: "source"
tsv'_publication_year <- r .: "publication_year"
tsv'_publication_month <- r .: "publication_month"
tsv'_publication_day <- r .: "publication_day"
tsv'_abstract <- r .: "abstract"
tsv'_authors <- r .: "authors"
tsv'_weight <- r .: "weight"
pure $ Tsv' { .. }
readWeightedTsv :: FilePath -> IO (Header, Vector Tsv')
readWeightedTsv fp =
fmap (\bs ->
case decodeByNameWith (tsvDecodeOptions Tab) bs of
Left e -> panicTrace (pack e)
Right corpus -> corpus
) $ BL.readFile fp
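Taken together, a typical use of the reorganized module might look like the following sketch (illustrative only; the file paths and the `roundTrip` helper are placeholders, not part of the commit):

import qualified Data.Text as T
import Gargantext.Core.Text.Corpus.Parsers.TSV (parseTsv, writeDocs2Tsv)

-- Parse a TSV corpus and write it back out with the default (tab) encoding.
roundTrip :: IO ()
roundTrip = do
  eDocs <- parseTsv "corpus.tsv"            -- placeholder input path
  case eDocs of
    Left err   -> putStrLn ("TSV parsing failed: " <> T.unpack err)
    Right docs -> writeDocs2Tsv "corpus-roundtrip.tsv" docs  -- placeholder output path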