Commit b0d6365f authored by Alfredo Di Napoli

Remove unused code in Parsers.TSV module

parent 622f7062
@@ -15,10 +15,38 @@ TSV parser for Gargantext corpus files.
-}
module Gargantext.Core.Text.Corpus.Parsers.TSV where
module Gargantext.Core.Text.Corpus.Parsers.TSV (
readTSVFile
, Delimiter(..)
, parseHal
, parseTsv
, parseTsvC
, tsvDecodeOptions
, writeDocs2Tsv
, readTsvHal
, TsvHal(..)
, AtRow(..)
, ParseTsvResult(..)
, parseTsvWithDiagnostics
-- * Used in tests
, testCorrectFile
, testErrorPerLine
, validTextField
, validNumber
, Tsv'(..)
, TsvDoc(..)
, getHeaders
, findDelimiter
, delimiter
-- * Used in the main executables
, readWeightedTsv
, defaultDay
, defaultYear
, fromMIntOrDec
, defaultMonth
) where
import Conduit ( ConduitT, (.|), yieldMany, mapC )
import Data.ByteString qualified as BS
import Data.ByteString.Lazy qualified as BL
import Data.Csv
import Data.Text (pack)
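A minimal sketch of how the trimmed public API might be driven end to end; the function name and file paths below are illustrative assumptions, not code from this module:

import Gargantext.Core.Text.Corpus.Parsers.TSV (parseTsv, writeDocs2Tsv)

-- Parse a TSV corpus and write the resulting documents straight back out.
roundTripCorpus :: IO ()
roundTripCorpus = do
  parsed <- parseTsv "corpus.tsv"              -- Either Text [HyperdataDocument]
  case parsed of
    Left err   -> putStrLn ("TSV parse failure: " <> show err)
    Right docs -> writeDocs2Tsv "corpus-out.tsv" docs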
@@ -29,8 +29,6 @@ import Data.Text.Read qualified as DTR
import Data.Time.Segment (jour)
import Data.Vector (Vector)
import Data.Vector qualified as V
import Gargantext.Core.Text ( sentences, unsentences )
import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding (length, show)
import Protolude
@@ -59,91 +85,6 @@ data TsvGargV3 = TsvGargV3
, d_authors :: !Text
}
deriving (Show)
---------------------------------------------------------------
-- | Doc 2 HyperdataDocument
toDoc :: TsvGargV3 -> HyperdataDocument
toDoc (TsvGargV3 did dt _ dpy dpm dpd dab dau) =
HyperdataDocument { _hd_bdd = Just "TSV"
, _hd_doi = Just . pack . show $ did
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just dt
, _hd_authors = Nothing
, _hd_institutes = Just dau
, _hd_source = Just dab
, _hd_abstract = Nothing
, _hd_publication_date = Nothing
, _hd_publication_year = Just dpy
, _hd_publication_month = Just dpm
, _hd_publication_day = Just dpd
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
---------------------------------------------------------------
-- | Types Conversions
toDocs :: Vector TsvDoc -> [TsvGargV3]
toDocs v = V.toList
$ V.zipWith (\nId (TsvDoc { .. }) -- (TsvDoc t s mPy pm pd abst auth)
-> TsvGargV3 { d_docId = nId
, d_title = tsv_title
, d_source = tsv_source
, d_publication_year = fromMIntOrDec defaultYear tsv_publication_year
, d_publication_month = fromMaybe defaultMonth tsv_publication_month
, d_publication_day = fromMaybe defaultDay tsv_publication_day
, d_abstract = tsv_abstract
, d_authors = tsv_authors })
(V.enumFromN 1 (V.length v'')) v''
where
v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
seps= V.fromList [Paragraphs 1, Sentences 3, Chars 3]
---------------------------------------------------------------
fromDocs :: Vector TsvGargV3 -> Vector TsvDoc
fromDocs = V.map fromDocs'
where
fromDocs' (TsvGargV3 { .. }) = TsvDoc { tsv_title = d_title
, tsv_source = d_source
, tsv_publication_year = Just $ IntOrDec d_publication_year
, tsv_publication_month = Just d_publication_month
, tsv_publication_day = Just d_publication_day
, tsv_abstract = d_abstract
, tsv_authors = d_authors }
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc m splt doc =
let docSize = (T.length $ tsv_abstract doc) in
if (docSize > 1000) && (mod (round m) docSize >= 10)
then splitDoc' splt doc
else V.fromList [doc]
where
splitDoc' :: SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc' contextSize (TsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
where
firstDoc = TsvDoc { tsv_abstract = firstAbstract, .. }
firstAbstract = head' "splitDoc'1" abstracts
nextDocs = map (\txt -> TsvDoc { tsv_title = head' "splitDoc'2" $ sentences txt
, tsv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
, .. }
) (tail' "splitDoc'2" abstracts)
abstracts = (splitBy $ contextSize) tsv_abstract
---------------------------------------------------------------
---------------------------------------------------------------
type Mean = Double
docsSize :: Vector TsvDoc -> Mean
docsSize tsvDoc = mean ls
where
ls = V.toList $ V.map (fromIntegral . T.length . tsv_abstract) tsvDoc
---------------------------------------------------------------
newtype IntOrDec = IntOrDec Int
@@ -371,36 +312,6 @@ getHeaders bl del = do
Just headers -> testAllHeadersPresence (map (\x -> T.replace (T.pack "\"") (T.pack "") (lBLToText x)) headers)
------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
readFileStrict :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> BL.ByteString
-> Either Text (Header, Vector a)
readByteStringLazy _f d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
readByteStringStrict :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> BS.ByteString
-> Either Text (Header, Vector a)
readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
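With the generic readFileLazy/readByteStringLazy helpers removed, equivalent decoding can be inlined via cassava; the sketch below mirrors the dropped readByteStringLazy and only assumes the module's existing imports plus the still-exported tsvDecodeOptions:

-- Decode a lazy ByteString of TSV rows by header, as the removed helper did.
decodeTsvDocs :: Delimiter -> BL.ByteString -> Either Text (Header, Vector TsvDoc)
decodeTsvDocs del bs = first pack $ decodeByNameWith (tsvDecodeOptions del) bs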
------------------------------------------------------------------------
-- | TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (Header, Vector TsvDoc))
@@ -459,14 +370,6 @@ readTsvHal fp = do
readTsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalLazyBS bs = first pack $ decodeByNameWith (tsvDecodeOptions Tab) bs
readTsvHalBSStrict :: BS.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalBSStrict bs = readTsvHalLazyBS $ BL.fromStrict bs
------------------------------------------------------------------------
writeFile :: FilePath -> (Header, Vector TsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp $
encodeByNameWith (tsvEncodeOptions Tab) h (V.toList vs)
writeDocs2Tsv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Tsv fp hs = BL.writeFile fp $ hyperdataDocument2tsv hs
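The raw (Header, Vector TsvDoc) writer above is dropped; callers that still need it can rebuild it from cassava alone. The explicit tab delimiter is an assumption standing in for the module-internal tsvEncodeOptions Tab:

-- Re-serialise a header plus rows as tab-separated values, like the removed writeFile.
writeTsvDocs :: FilePath -> (Header, Vector TsvDoc) -> IO ()
writeTsvDocs fp (h, vs) =
  BL.writeFile fp $
    encodeByNameWith (defaultEncodeOptions { encDelimiter = fromIntegral (fromEnum '\t') })
                     h (V.toList vs)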
@@ -610,27 +513,11 @@ parseHal fp = do
r <- readTsvHal fp
pure $ V.toList . V.map tsvHal2doc . snd <$> r
parseHal' :: BL.ByteString -> Either Text [HyperdataDocument]
parseHal' bs = V.toList . V.map tsvHal2doc . snd <$> readTsvHalLazyBS bs
------------------------------------------------------------------------
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv fp = fmap (V.toList . V.map tsv2doc . snd) <$> readTSVFile fp
{-
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = (V.toList . V.map tsv2doc . snd) <$> readTsvLazyBS Comma bs
-}
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = do
let
result = case testCorrectFile bs of
Left _err -> Left _err
Right del -> readTsvLazyBS del bs
V.toList . V.map tsv2doc . snd <$> result
parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
parseTsvC bs =
......
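For orientation, a hedged sketch of consuming parseTsvC's streaming result; the driver function is illustrative and assumes Conduit's runConduitPure and sinkList are brought into scope alongside the module's current Conduit import:

-- Materialise the conduit into a plain list of parsed documents.
parseTsvAll :: BL.ByteString -> Either Text (Integer, [HyperdataDocument])
parseTsvAll bs = do
  (count, source) <- parseTsvC bs
  pure (count, runConduitPure (source .| sinkList))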