Commit b0d6365f authored by Alfredo Di Napoli

Remove unused code in Parsers.TSV module

parent 622f7062
@@ -15,10 +15,38 @@ TSV parser for Gargantext corpus files.
-}
module Gargantext.Core.Text.Corpus.Parsers.TSV where
module Gargantext.Core.Text.Corpus.Parsers.TSV (
readTSVFile
, Delimiter(..)
, parseHal
, parseTsv
, parseTsvC
, tsvDecodeOptions
, writeDocs2Tsv
, readTsvHal
, TsvHal(..)
, AtRow(..)
, ParseTsvResult(..)
, parseTsvWithDiagnostics
-- * Used in tests
, testCorrectFile
, testErrorPerLine
, validTextField
, validNumber
, Tsv'(..)
, TsvDoc(..)
, getHeaders
, findDelimiter
, delimiter
-- * Used in the main executables
, readWeightedTsv
, defaultDay
, defaultYear
, fromMIntOrDec
, defaultMonth
) where
import Conduit ( ConduitT, (.|), yieldMany, mapC )
import Data.ByteString qualified as BS
import Data.ByteString.Lazy qualified as BL
import Data.Csv
import Data.Text (pack)
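A minimal sketch of how the trimmed public API might be driven end to end; the function name and file paths below are illustrative assumptions, not code from this module:

import Gargantext.Core.Text.Corpus.Parsers.TSV (parseTsv, writeDocs2Tsv)

-- Parse a TSV corpus and write the resulting documents straight back out.
roundTripCorpus :: IO ()
roundTripCorpus = do
  parsed <- parseTsv "corpus.tsv"              -- Either Text [HyperdataDocument]
  case parsed of
    Left err   -> putStrLn ("TSV parse failure: " <> show err)
    Right docs -> writeDocs2Tsv "corpus-out.tsv" docs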
@@ -29,8 +29,6 @@ import Data.Text.Read qualified as DTR
import Data.Time.Segment (jour)
import Data.Vector (Vector)
import Data.Vector qualified as V
import Gargantext.Core.Text ( sentences, unsentences )
import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding (length, show)
import Protolude
@@ -59,91 +85,6 @@ data TsvGargV3 = TsvGargV3
, d_authors :: !Text
}
deriving (Show)
---------------------------------------------------------------
-- | Doc 2 HyperdataDocument
toDoc :: TsvGargV3 -> HyperdataDocument
toDoc (TsvGargV3 did dt _ dpy dpm dpd dab dau) =
HyperdataDocument { _hd_bdd = Just "TSV"
, _hd_doi = Just . pack . show $ did
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just dt
, _hd_authors = Nothing
, _hd_institutes = Just dau
, _hd_source = Just dab
, _hd_abstract = Nothing
, _hd_publication_date = Nothing
, _hd_publication_year = Just dpy
, _hd_publication_month = Just dpm
, _hd_publication_day = Just dpd
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
---------------------------------------------------------------
-- | Types Conversions
toDocs :: Vector TsvDoc -> [TsvGargV3]
toDocs v = V.toList
$ V.zipWith (\nId (TsvDoc { .. }) -- (TsvDoc t s mPy pm pd abst auth)
-> TsvGargV3 { d_docId = nId
, d_title = tsv_title
, d_source = tsv_source
, d_publication_year = fromMIntOrDec defaultYear tsv_publication_year
, d_publication_month = fromMaybe defaultMonth tsv_publication_month
, d_publication_day = fromMaybe defaultDay tsv_publication_day
, d_abstract = tsv_abstract
, d_authors = tsv_authors })
(V.enumFromN 1 (V.length v'')) v''
where
v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
seps= V.fromList [Paragraphs 1, Sentences 3, Chars 3]
---------------------------------------------------------------
fromDocs :: Vector TsvGargV3 -> Vector TsvDoc
fromDocs = V.map fromDocs'
where
fromDocs' (TsvGargV3 { .. }) = TsvDoc { tsv_title = d_title
, tsv_source = d_source
, tsv_publication_year = Just $ IntOrDec d_publication_year
, tsv_publication_month = Just d_publication_month
, tsv_publication_day = Just d_publication_day
, tsv_abstract = d_abstract
, tsv_authors = d_authors }
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc m splt doc =
let docSize = (T.length $ tsv_abstract doc) in
if (docSize > 1000) && (mod (round m) docSize >= 10)
then splitDoc' splt doc
else V.fromList [doc]
where
splitDoc' :: SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc' contextSize (TsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
where
firstDoc = TsvDoc { tsv_abstract = firstAbstract, .. }
firstAbstract = head' "splitDoc'1" abstracts
nextDocs = map (\txt -> TsvDoc { tsv_title = head' "splitDoc'2" $ sentences txt
, tsv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
, .. }
) (tail' "splitDoc'2" abstracts)
abstracts = (splitBy $ contextSize) tsv_abstract
---------------------------------------------------------------
---------------------------------------------------------------
type Mean = Double
docsSize :: Vector TsvDoc -> Mean
docsSize tsvDoc = mean ls
where
ls = V.toList $ V.map (fromIntegral . T.length . tsv_abstract) tsvDoc
---------------------------------------------------------------
newtype IntOrDec = IntOrDec Int
@@ -371,36 +312,6 @@ getHeaders bl del = do
Just headers -> testAllHeadersPresence (map (\x -> T.replace (T.pack "\"") (T.pack "") (lBLToText x)) headers)
------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
readFileStrict :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> BL.ByteString
-> Either Text (Header, Vector a)
readByteStringLazy _f d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
readByteStringStrict :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> BS.ByteString
-> Either Text (Header, Vector a)
readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
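With the generic readFileLazy/readByteStringLazy helpers removed, equivalent decoding can be inlined via cassava; the sketch below mirrors the dropped readByteStringLazy and only assumes the module's existing imports plus the still-exported tsvDecodeOptions:

-- Decode a lazy ByteString of TSV rows by header, as the removed helper did.
decodeTsvDocs :: Delimiter -> BL.ByteString -> Either Text (Header, Vector TsvDoc)
decodeTsvDocs del bs = first pack $ decodeByNameWith (tsvDecodeOptions del) bs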
------------------------------------------------------------------------
-- | TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (Header, Vector TsvDoc))
@@ -459,14 +370,6 @@ readTsvHal fp = do
readTsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalLazyBS bs = first pack $ decodeByNameWith (tsvDecodeOptions Tab) bs
readTsvHalBSStrict :: BS.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalBSStrict bs = readTsvHalLazyBS $ BL.fromStrict bs
------------------------------------------------------------------------
writeFile :: FilePath -> (Header, Vector TsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp $
encodeByNameWith (tsvEncodeOptions Tab) h (V.toList vs)
writeDocs2Tsv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Tsv fp hs = BL.writeFile fp $ hyperdataDocument2tsv hs
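The raw (Header, Vector TsvDoc) writer above is dropped; callers that still need it can rebuild it from cassava alone. The explicit tab delimiter is an assumption standing in for the module-internal tsvEncodeOptions Tab:

-- Re-serialise a header plus rows as tab-separated values, like the removed writeFile.
writeTsvDocs :: FilePath -> (Header, Vector TsvDoc) -> IO ()
writeTsvDocs fp (h, vs) =
  BL.writeFile fp $
    encodeByNameWith (defaultEncodeOptions { encDelimiter = fromIntegral (fromEnum '\t') })
                     h (V.toList vs)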
@@ -610,27 +513,11 @@ parseHal fp = do
r <- readTsvHal fp
pure $ V.toList . V.map tsvHal2doc . snd <$> r
parseHal' :: BL.ByteString -> Either Text [HyperdataDocument]
parseHal' bs = V.toList . V.map tsvHal2doc . snd <$> readTsvHalLazyBS bs
------------------------------------------------------------------------
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv fp = fmap (V.toList . V.map tsv2doc . snd) <$> readTSVFile fp
{-
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = (V.toList . V.map tsv2doc . snd) <$> readTsvLazyBS Comma bs
-}
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = do
let
result = case testCorrectFile bs of
Left _err -> Left _err
Right del -> readTsvLazyBS del bs
V.toList . V.map tsv2doc . snd <$> result
parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
parseTsvC bs =
......
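For orientation, a hedged sketch of consuming parseTsvC's streaming result; the driver function is illustrative and assumes Conduit's runConduitPure and sinkList are brought into scope alongside the module's current Conduit import:

-- Materialise the conduit into a plain list of parsed documents.
parseTsvAll :: BL.ByteString -> Either Text (Integer, [HyperdataDocument])
parseTsvAll bs = do
  (count, source) <- parseTsvC bs
  pure (count, runConduitPure (source .| sinkList))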