[tsv] migrate tsvv3 to separate module, separate tsv hal, tsv phylo

parent c8185ce7
Pipeline #7985 passed with stages
in 58 minutes and 39 seconds
...@@ -20,7 +20,8 @@ import Data.Tuple.Extra (both) ...@@ -20,7 +20,8 @@ import Data.Tuple.Extra (both)
import Data.Vector qualified as DV import Data.Vector qualified as DV
import GHC.Generics import GHC.Generics
import Gargantext.Core.Text.Context (TermList) import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, tsv_title, tsv_abstract, tsv_publication_year, fromMIntOrDec, defaultYear) import Gargantext.Core.Text.Corpus.Parsers.TSV (readTSVFile, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 (tsv_title, tsv_abstract, tsv_publication_year)
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList) import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs) import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList ) import Gargantext.Core.Text.Terms.WithList ( Patterns, buildPatterns, extractTermsWithList )
......
...@@ -14,7 +14,8 @@ import Gargantext.API.Ngrams.Prelude (toTermList) ...@@ -14,7 +14,8 @@ import Gargantext.API.Ngrams.Prelude (toTermList)
import Gargantext.API.Ngrams.Types import Gargantext.API.Ngrams.Types
import Gargantext.Core.Text.Context (TermList) import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile) import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), parseFile)
import Gargantext.Core.Text.Corpus.Parsers.TSV (tsv_title, tsv_abstract, tsv_publication_year, tsv_publication_month, tsv_publication_day, tsv'_source, tsv'_title, tsv'_abstract, tsv'_publication_year, tsv'_publication_month, tsv'_publication_day, tsv'_weight) import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 (tsv_title, tsv_abstract, tsv_publication_year, tsv_publication_month, tsv_publication_day)
import Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo (tsv'_source, tsv'_title, tsv'_abstract, tsv'_publication_year, tsv'_publication_month, tsv'_publication_day, tsv'_weight)
import Gargantext.Core.Text.Corpus.Parsers.TSV qualified as Tsv import Gargantext.Core.Text.Corpus.Parsers.TSV qualified as Tsv
import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList) import Gargantext.Core.Text.List.Formats.TSV (tsvMapTermList)
import Gargantext.Core.Text.Ngrams (NgramsType(..)) import Gargantext.Core.Text.Ngrams (NgramsType(..))
...@@ -95,15 +96,17 @@ tsvToDocs parser patterns time path = ...@@ -95,15 +96,17 @@ tsvToDocs parser patterns time path =
, hour = Def.hour , hour = Def.hour
, minute = Def.minute , minute = Def.minute
, sec = Def.second } , sec = Def.second }
Tsv' limit -> Vector.toList Tsv' limit -> do
<$> Vector.take limit (Right contents) <- Tsv.readTSVFileDelim path Tsv.Tab
<$> Vector.map (\row -> Document { date = toPhyloDate (rowToUTCTimeR row) time pure $ Vector.toList
, date' = toPhyloDate' (rowToUTCTimeR row) time $ Vector.take limit
, text = termsInText patterns $ (tsv'_title row) <> " " <> (tsv'_abstract row) (Vector.map (\row -> Document { date = toPhyloDate (rowToUTCTimeR row) time
, weight = Just $ tsv'_weight row , date' = toPhyloDate' (rowToUTCTimeR row) time
, sources = map (T.strip . pack) $ splitOn ";" (unpack $ (tsv'_source row)) , text = termsInText patterns $ tsv'_title row <> " " <> tsv'_abstract row
, docTime = time } , weight = Just $ tsv'_weight row
) <$> snd <$> Tsv.readWeightedTsv path , sources = map (T.strip . pack) $ splitOn ";" (unpack $ tsv'_source row)
, docTime = time }
) $ snd contents)
where where
rowToUTCTimeR row = UTCTimeR { year = tsv'_publication_year row rowToUTCTimeR row = UTCTimeR { year = tsv'_publication_year row
, month = tsv'_publication_month row , month = tsv'_publication_month row
......
...@@ -233,6 +233,10 @@ library ...@@ -233,6 +233,10 @@ library
Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4 Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo
Gargantext.Core.Text.Corpus.Parsers.TSV.Types
Gargantext.Core.Text.Corpus.Parsers.TSV.Utils
Gargantext.Core.Text.Corpus.Query Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.List Gargantext.Core.Text.List
Gargantext.Core.Text.List.Formats.TSV Gargantext.Core.Text.List.Formats.TSV
......
...@@ -22,11 +22,12 @@ import Data.Text qualified as Text ...@@ -22,11 +22,12 @@ import Data.Text qualified as Text
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers (cleanText) import Gargantext.Core.Text.Corpus.Parsers (cleanText)
import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv) import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv)
import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 qualified as TSVv3
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (get) import Gargantext.Prelude hiding (get)
import Isidore qualified as Isidore import Isidore qualified
import Isidore.Client import Isidore.Client
import Servant.Client ( ClientError(DecodeFailure) ) import Servant.Client ( ClientError(DecodeFailure) )
...@@ -47,15 +48,14 @@ get lang l q a = do ...@@ -47,15 +48,14 @@ get lang l q a = do
iDocs <- either printErr _content <$> Isidore.get l q a iDocs <- either printErr _content <$> Isidore.get l q a
hDocs <- mapM (isidoreToDoc lang) (toIsidoreDocs iDocs) mapM (isidoreToDoc lang) (toIsidoreDocs iDocs)
pure hDocs
isidore2tsvFile :: FilePath -> Lang -> Maybe Isidore.Limit isidore2tsvFile :: FilePath -> Lang -> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery -> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
-> IO () -> IO ()
isidore2tsvFile fp lang li tq aq = do isidore2tsvFile fp lang li tq aq = do
hdocs <- get lang li tq aq hdocs <- get lang li tq aq
writeDocs2Tsv fp hdocs writeDocs2Tsv TSVv3.headerTsvGargV3 TSVv3.doc2tsv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc lang (IsidoreDoc t a d u s as) = do isidoreToDoc lang (IsidoreDoc t a d u s as) = do
...@@ -73,7 +73,7 @@ isidoreToDoc lang (IsidoreDoc t a d u s as) = do ...@@ -73,7 +73,7 @@ isidoreToDoc lang (IsidoreDoc t a d u s as) = do
langText (OnlyText t2 ) = t2 langText (OnlyText t2 ) = t2
langText (ArrayText ts ) = Text.unwords $ map langText ts langText (ArrayText ts ) = Text.unwords $ map langText ts
let mDateS = maybe (Just $ Text.pack $ show Defaults.year) Just d let mDateS = d <|> Just (Text.pack $ show Defaults.year)
let (utcTime, (pub_year, pub_month, pub_day)) = Date.mDateSplit mDateS let (utcTime, (pub_year, pub_month, pub_day)) = Date.mDateSplit mDateS
pure HyperdataDocument pure HyperdataDocument
......
...@@ -54,6 +54,8 @@ import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC, parseIstex) ...@@ -54,6 +54,8 @@ import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC, parseIstex)
import Gargantext.Core.Text.Corpus.Parsers.RIS qualified as RIS import Gargantext.Core.Text.Corpus.Parsers.RIS qualified as RIS
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich) import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Core.Text.Corpus.Parsers.TSV (parseHal, parseTsv, parseTsvC) import Gargantext.Core.Text.Corpus.Parsers.TSV (parseHal, parseTsv, parseTsvC)
import Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal qualified as TsvHal
import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 qualified as TSVv3
import Gargantext.Core.Text.Corpus.Parsers.Types import Gargantext.Core.Text.Corpus.Parsers.Types
import Gargantext.Core.Text.Corpus.Parsers.WOS qualified as WOS import Gargantext.Core.Text.Corpus.Parsers.WOS qualified as WOS
import Gargantext.Core.Text.Ngrams (NgramsType(..)) import Gargantext.Core.Text.Ngrams (NgramsType(..))
...@@ -111,10 +113,10 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0 ...@@ -111,10 +113,10 @@ parseFormatC ft ff bs0 = first ParseFormatError <$> do_parse ft ff bs0
-> DB.ByteString -> DB.ByteString
-> m (Either DT.Text (Integer, ConduitT () (ParseCorpusResult HyperdataDocument) IO ())) -> m (Either DT.Text (Integer, ConduitT () (ParseCorpusResult HyperdataDocument) IO ()))
do_parse TsvGargV3 Plain bs = do do_parse TsvGargV3 Plain bs = do
let eParsedC = parseTsvC $ DBL.fromStrict bs let eParsedC = parseTsvC TSVv3.tsv2doc $ DBL.fromStrict bs
pure (second (transPipe (pure . runIdentity)) <$> eParsedC) pure (second (transPipe (pure . runIdentity)) <$> eParsedC)
do_parse TsvHal Plain bs = do do_parse TsvHal Plain bs = do
let eParsedC = parseTsvC $ DBL.fromStrict bs let eParsedC = parseTsvC TsvHal.tsv2doc $ DBL.fromStrict bs
pure (second (transPipe (pure . runIdentity)) <$> eParsedC) pure (second (transPipe (pure . runIdentity)) <$> eParsedC)
do_parse Istex Plain bs = do do_parse Istex Plain bs = do
ep <- liftBase $ parseIstex EN $ DBL.fromStrict bs ep <- liftBase $ parseIstex EN $ DBL.fromStrict bs
...@@ -221,7 +223,7 @@ parseFile :: FileType ...@@ -221,7 +223,7 @@ parseFile :: FileType
-> FileFormat -> FileFormat
-> FilePath -> FilePath
-> IO (Either Text [HyperdataDocument]) -> IO (Either Text [HyperdataDocument])
parseFile TsvGargV3 Plain p = parseTsv p parseFile TsvGargV3 Plain p = parseTsv TSVv3.tsv2doc p
parseFile TsvHal Plain p = parseHal p parseFile TsvHal Plain p = parseHal p
parseFile RisPresse Plain p = do parseFile RisPresse Plain p = do
docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
......
{-# LANGUAGE BangPatterns #-} {-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 where module Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 where
import Data.Csv import Data.Csv
import Data.Text (pack)
import Data.Time.Segment (jour)
import Gargantext.Core.Text.Corpus.Parsers.TSV.Diagnostics
import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude import Gargantext.Prelude
...@@ -45,6 +57,34 @@ data TsvDoc = TsvDoc ...@@ -45,6 +57,34 @@ data TsvDoc = TsvDoc
deriving (Show) deriving (Show)
tsv2doc :: TsvDoc -> HyperdataDocument
tsv2doc (TsvDoc { .. })
= HyperdataDocument { _hd_bdd = Just "TsvHal"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just tsv_title
, _hd_authors = Just tsv_authors
, _hd_institutes = Nothing
, _hd_source = Just tsv_source
, _hd_abstract = Just tsv_abstract
, _hd_publication_date = Just $ pack . show $ jour (fromIntegral pubYear)
pubMonth
pubDay
, _hd_publication_year = Just pubYear
, _hd_publication_month = Just pubMonth
, _hd_publication_day = Just pubDay
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
where
pubYear = fromMIntOrDec defaultYear tsv_publication_year
pubMonth = fromMaybe defaultMonth tsv_publication_month
pubDay = fromMaybe defaultDay tsv_publication_day
instance FromNamedRecord TsvDoc where instance FromNamedRecord TsvDoc where
parseNamedRecord r = do parseNamedRecord r = do
...@@ -69,15 +109,29 @@ instance ToNamedRecord TsvDoc where ...@@ -69,15 +109,29 @@ instance ToNamedRecord TsvDoc where
] ]
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc doc2tsv :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h doc2tsv h = TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h , tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h , tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h , tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h , tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h , tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h } , tsv_authors = m $ _hd_authors h }
where where
m = maybe "" identity m = maybe "" identity
mI = maybe 0 identity mI = maybe 0 identity
validators :: [ColumnValidator]
validators =
[ ("Publication Day", validNumber)
, ("Publication Month", validNumber)
, ("Publication Year", validNumber)
, ("Authors", validTextField)
, ("Title", validTextField)
, ("Source", validTextField)
, ("Abstract", validTextField)
-- TODO validDateField
-- , ("Publication Date", validTextField)
]
{-# LANGUAGE BangPatterns #-} {-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4 where module Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4 where
import Data.Csv import Data.Csv
import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec -- import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude import Gargantext.Prelude
......
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal where
import Data.ByteString.Lazy qualified as BL
import Data.Csv
import Data.Text (pack)
import Data.Time.Segment (jour)
import Data.Vector (Vector)
import Data.Vector qualified as V
import Gargantext.Core.Text.Corpus.Parsers.TSV.Types (Delimiter(..))
import Gargantext.Core.Text.Corpus.Parsers.TSV.Utils (readTsvLazyBS)
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude
------------------------------------------------------------------------
-- Hal Format
data TsvHal = TsvHal
{ tsvHal_title :: !Text
, tsvHal_source :: !Text
, tsvHal_publication_year :: !Integer
, tsvHal_publication_month :: !Int
, tsvHal_publication_day :: !Int
, tsvHal_abstract :: !Text
, tsvHal_authors :: !Text
, tsvHal_url :: !Text
, tsvHal_isbn_s :: !Text
, tsvHal_issue_s :: !Text
, tsvHal_journalPublisher_s:: !Text
, tsvHal_language_s :: !Text
, tsvHal_doiId_s :: !Text
, tsvHal_authId_i :: !Text
, tsvHal_instStructId_i :: !Text
, tsvHal_deptStructId_i :: !Text
, tsvHal_labStructId_i :: !Text
, tsvHal_rteamStructId_i :: !Text
, tsvHal_docType_s :: !Text
}
deriving (Show)
instance FromNamedRecord TsvHal where
parseNamedRecord r = do
tsvHal_title <- r .: "title"
tsvHal_source <- r .: "source"
tsvHal_publication_year <- r .: "publication_year"
tsvHal_publication_month <- r .: "publication_month"
tsvHal_publication_day <- r .: "publication_day"
tsvHal_abstract <- r .: "abstract"
tsvHal_authors <- r .: "authors"
tsvHal_url <- r .: "url"
tsvHal_isbn_s <- r .: "isbn_s"
tsvHal_issue_s <- r .: "issue_s"
tsvHal_journalPublisher_s <- r .: "journalPublisher_s"
tsvHal_language_s <- r .: "language_s"
tsvHal_doiId_s <- r .: "doiId_s"
tsvHal_authId_i <- r .: "authId_i"
tsvHal_instStructId_i <- r .: "instStructId_i"
tsvHal_deptStructId_i <- r .: "deptStructId_i"
tsvHal_labStructId_i <- r .: "labStructId_i"
tsvHal_rteamStructId_i <- r .: "rteamStructId_i"
tsvHal_docType_s <- r .: "docType_s"
pure $ TsvHal { .. }
instance ToNamedRecord TsvHal where
--toNamedRecord (TsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
toNamedRecord (TsvHal { .. }) =
namedRecord [ "title" .= tsvHal_title
, "source" .= tsvHal_source
, "publication_year" .= tsvHal_publication_year
, "publication_month" .= tsvHal_publication_month
, "publication_day" .= tsvHal_publication_day
, "abstract" .= tsvHal_abstract
, "authors" .= tsvHal_authors
, "url" .= tsvHal_url
, "isbn_s" .= tsvHal_isbn_s
, "issue_s" .= tsvHal_issue_s
, "journalPublisher_s" .= tsvHal_journalPublisher_s
, "language_s" .= tsvHal_language_s
, "doiId_s" .= tsvHal_doiId_s
, "authId_i" .= tsvHal_authId_i
, "instStructId_i" .= tsvHal_instStructId_i
, "deptStructId_i" .= tsvHal_deptStructId_i
, "labStructId_i" .= tsvHal_labStructId_i
, "rteamStructId_i" .= tsvHal_rteamStructId_i
, "docType_s" .= tsvHal_docType_s
]
tsv2doc :: TsvHal -> HyperdataDocument
tsv2doc (TsvHal { .. }) =
HyperdataDocument { _hd_bdd = Just "TsvHal"
, _hd_doi = Just tsvHal_doiId_s
, _hd_url = Just tsvHal_url
, _hd_page = Nothing
, _hd_title = Just tsvHal_title
, _hd_authors = Just tsvHal_authors
, _hd_institutes = Just tsvHal_instStructId_i
, _hd_source = Just tsvHal_source
, _hd_abstract = Just tsvHal_abstract
, _hd_publication_date = Just $ pack . show $ jour tsvHal_publication_year
tsvHal_publication_month
tsvHal_publication_day
, _hd_publication_year = Just $ fromIntegral tsvHal_publication_year
, _hd_publication_month = Just tsvHal_publication_month
, _hd_publication_day = Just tsvHal_publication_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
------------------------------------------------------------------------
-- | TODO use readFileLazy
readTsvHal :: FilePath -> IO (Either Text (Header, Vector TsvHal))
readTsvHal fp = do
c <- BL.readFile fp
pure $ readTsvLazyBS Tab c
------------------------------------------------------------------------
parseHal :: FilePath -> IO (Either Text [HyperdataDocument])
parseHal fp = do
r <- readTsvHal fp
pure $ V.toList . V.map tsv2doc . snd <$> r
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo where
import Data.Csv
import Gargantext.Prelude
------------------------------------------------------------------------
-- Tsv v3 weighted for phylo
data Tsv' = Tsv'
{ tsv'_title :: !Text
, tsv'_source :: !Text
, tsv'_publication_year :: !Int
, tsv'_publication_month :: !Int
, tsv'_publication_day :: !Int
, tsv'_abstract :: !Text
, tsv'_authors :: !Text
, tsv'_weight :: !Double } deriving (Show)
instance FromNamedRecord Tsv' where
parseNamedRecord r = do
tsv'_title <- r .: "title"
tsv'_source <- r .: "source"
tsv'_publication_year <- r .: "publication_year"
tsv'_publication_month <- r .: "publication_month"
tsv'_publication_day <- r .: "publication_day"
tsv'_abstract <- r .: "abstract"
tsv'_authors <- r .: "authors"
tsv'_weight <- r .: "weight"
pure $ Tsv' { .. }
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV.Utils
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.Parsers.TSV.Utils where
import Data.ByteString.Lazy qualified as BL
import Data.ByteString.Lazy.Char8 qualified as B8L
import Data.Csv
import Data.Text (pack)
import Data.Vector (Vector)
import Data.Vector qualified as V
import Gargantext.Core.Text.Corpus.Parsers.TSV.Types
import Gargantext.Prelude
parseTsv :: FromNamedRecord tsvDoc
=> (tsvDoc -> result)
-> FilePath
-> IO (Either Text [result])
parseTsv tsv2doc fp = fmap (V.toList . V.map tsv2doc . snd) <$> readTSVFile fp
-- | TODO use readByteStringLazy
readTsvLazyBS :: FromNamedRecord tsvDoc
=> Delimiter
-> BL.ByteString
-> Either Text (Header, Vector tsvDoc)
readTsvLazyBS d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
-- | TODO use readFileLazy
readTSVFile :: FromNamedRecord tsvDoc
=> FilePath
-> IO (Either Text (Header, Vector tsvDoc))
readTSVFile fp = do
file <- BL.readFile fp
case detectDelimiter file of
Left err -> pure $ Left err
Right del -> pure $ readTsvLazyBS del file
readTSVFileDelim :: FromNamedRecord tsvDoc
=> FilePath
-> Delimiter
-> IO (Either Text (Header, Vector tsvDoc))
readTSVFileDelim fp del = do
file <- BL.readFile fp
pure $ readTsvLazyBS del file
-- Detects delimiter based on the first line
detectDelimiter :: BL.ByteString -> Either Text Delimiter
detectDelimiter input =
case B8L.lines input of
(firstLine : _) ->
let candidates = [(',', count ',' firstLine), ('\t', count '\t' firstLine)]
in case fst $ maximumBy (comparing snd) candidates of
'\n' -> Right Line
'\t' -> Right Tab
',' -> Right Comma
_ -> Left $ "Invalid delimiter detected for input tsv document."
_ -> Left "Couldn't detect a valid delimiter for the input document."
-- Count occurrences of a character in a ByteString
count :: Char -> BL.ByteString -> Int64
count c = BL.count (fromIntegral (fromEnum c))
------------------------------------------------------------------------
writeDocs2Tsv :: ToNamedRecord tsvDoc
=> Header
-> (doc -> tsvDoc)
-> FilePath
-> [doc]
-> IO ()
writeDocs2Tsv hdr doc2tsv fp hs =
BL.writeFile fp $ encodeByNameWith (tsvEncodeOptions Tab) hdr (doc2tsv <$> hs)
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
module Test.Core.Text.Corpus.TSV (tests) where module Test.Core.Text.Corpus.TSV (tests) where
import Gargantext.Core.Text.Corpus.Parsers.TSV import Gargantext.Core.Text.Corpus.Parsers.TSV
import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 qualified as TSVv3
import Test.QuickCheck import Test.QuickCheck
import Test.QuickCheck.Instances () import Test.QuickCheck.Instances ()
import Data.ByteString.Lazy.UTF8 as BLU import Data.ByteString.Lazy.UTF8 as BLU
...@@ -182,7 +183,7 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do ...@@ -182,7 +183,7 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
let line = createLineFromCorpus tsv del let line = createLineFromCorpus tsv del
let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"] let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let splitLine = BL.splitWith (==delimiter del) line let splitLine = BL.splitWith (==delimiter del) line
case testErrorPerLine splitLine del headers 1 of case testErrorPerLine TSVv3.validators splitLine del headers 1 of
Right _ -> True Right _ -> True
Left _ -> validRandomCorpus tsv del) Left _ -> validRandomCorpus tsv del)
...@@ -194,13 +195,13 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do ...@@ -194,13 +195,13 @@ testTestErrorPerLine = forAll generateRandomCorpus (\tsv -> do
testTestCorrectFile :: Property testTestCorrectFile :: Property
testTestCorrectFile = forAll generateFile (\file -> do testTestCorrectFile = forAll generateFile (\file -> do
let tsv = createFile file let tsv = createFile file
case testCorrectFile tsv of case testCorrectFile TSVv3.validators tsv of
Right del -> del == fDelimiter file Right del -> del == fDelimiter file
Left _ -> Prelude.all (\x -> do Left _ -> Prelude.all (\x -> do
let del = fDelimiter file let del = fDelimiter file
let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"] let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del
case testErrorPerLine splitLine del headers 1 of case testErrorPerLine TSVv3.validators splitLine del headers 1 of
Right _ -> True Right _ -> True
Left _ -> validRandomCorpus x del) (allCorpus file)) Left _ -> validRandomCorpus x del) (allCorpus file))
...@@ -209,13 +210,13 @@ testTestCorrectFile = forAll generateFile (\file -> do ...@@ -209,13 +210,13 @@ testTestCorrectFile = forAll generateFile (\file -> do
testTestCorrectFileWithNewLine :: Property testTestCorrectFileWithNewLine :: Property
testTestCorrectFileWithNewLine = forAll generateFile (\file -> do testTestCorrectFileWithNewLine = forAll generateFile (\file -> do
let tsv = createFileWithNewLine file let tsv = createFileWithNewLine file
case testCorrectFile tsv of case testCorrectFile TSVv3.validators tsv of
Right _ -> True Right _ -> True
Left _ -> Prelude.all (\x -> do Left _ -> Prelude.all (\x -> do
let del = fDelimiter file let del = fDelimiter file
let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"] let headers = Prelude.map DT.pack ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del let splitLine = BL.splitWith (==delimiter del) $ createLineFromCorpus x del
case testErrorPerLine splitLine del headers 1 of case testErrorPerLine TSVv3.validators splitLine del headers 1 of
Right _ -> True Right _ -> True
Left _ -> validRandomCorpus x del) (allCorpus file)) Left _ -> validRandomCorpus x del) (allCorpus file))
...@@ -233,7 +234,7 @@ testFindDelimiter = forAll generateFileDelimiter (\file -> do ...@@ -233,7 +234,7 @@ testFindDelimiter = forAll generateFileDelimiter (\file -> do
testGetHeader :: Property testGetHeader :: Property
testGetHeader = forAll randomHeaderList (\headers -> do testGetHeader = forAll randomHeaderList (\headers -> do
let headersLines = (BL.intercalate (delimiterBS Tab) $ Prelude.map BLU.fromString headers):[] let headersLines = (BL.intercalate (delimiterBS Tab) $ Prelude.map BLU.fromString headers):[]
case getHeaders headersLines Tab of case getHeaders TSVv3.validators headersLines Tab of
Right _ -> True Right _ -> True
Left _ | not ("Publication Day" `Prelude.elem` headers) -> True Left _ | not ("Publication Day" `Prelude.elem` headers) -> True
| not ("Publication Month" `Prelude.elem` headers) -> True | not ("Publication Month" `Prelude.elem` headers) -> True
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment