Commit 094b511b authored by Yoelis Acourt's avatar Yoelis Acourt

chore: remove dead code

parent 82ffefa1
...@@ -14,14 +14,11 @@ Portability : POSIX ...@@ -14,14 +14,11 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API.Isidore ( module Gargantext.Core.Text.Corpus.API.Isidore (
get get
-- * Internals (possibly unused?)
, isidore2tsvFile
) where ) where
import Data.Text qualified as Text import Data.Text qualified as Text
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers (cleanText) import Gargantext.Core.Text.Corpus.Parsers (cleanText)
import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults import Gargantext.Defaults qualified as Defaults
...@@ -50,13 +47,6 @@ get lang l q a = do ...@@ -50,13 +47,6 @@ get lang l q a = do
hDocs <- mapM (isidoreToDoc lang) (toIsidoreDocs iDocs) hDocs <- mapM (isidoreToDoc lang) (toIsidoreDocs iDocs)
pure hDocs pure hDocs
isidore2tsvFile :: FilePath -> Lang -> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
-> IO ()
isidore2tsvFile fp lang li tq aq = do
hdocs <- get lang li tq aq
writeDocs2Tsv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc lang (IsidoreDoc t a d u s as) = do isidoreToDoc lang (IsidoreDoc t a d u s as) = do
let let
......
...@@ -15,7 +15,6 @@ TSV parser for Gargantext corpus files. ...@@ -15,7 +15,6 @@ TSV parser for Gargantext corpus files.
module Gargantext.Core.Text.Corpus.Parsers.TSV where module Gargantext.Core.Text.Corpus.Parsers.TSV where
import Conduit ( ConduitT, (.|), yieldMany, mapC ) import Conduit ( ConduitT, (.|), yieldMany, mapC )
import Data.ByteString qualified as BS
import Data.ByteString.Lazy qualified as BL import Data.ByteString.Lazy qualified as BL
import Data.Csv import Data.Csv
import Data.Text (pack) import Data.Text (pack)
...@@ -26,8 +25,6 @@ import Data.Text.Read qualified as DTR ...@@ -26,8 +25,6 @@ import Data.Text.Read qualified as DTR
import Data.Time.Segment (jour) import Data.Time.Segment (jour)
import Data.Vector (Vector) import Data.Vector (Vector)
import Data.Vector qualified as V import Data.Vector qualified as V
import Gargantext.Core.Text ( sentences, unsentences )
import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding (length, show) import Gargantext.Prelude hiding (length, show)
import Protolude import Protolude
...@@ -42,6 +39,7 @@ headerTsvGargV3 = ...@@ -42,6 +39,7 @@ headerTsvGargV3 =
, "publication_day" , "publication_day"
, "abstract" , "abstract"
, "authors" , "authors"
] ]
--------------------------------------------------------------- ---------------------------------------------------------------
data TsvGargV3 = TsvGargV3 data TsvGargV3 = TsvGargV3
...@@ -56,91 +54,9 @@ data TsvGargV3 = TsvGargV3 ...@@ -56,91 +54,9 @@ data TsvGargV3 = TsvGargV3
} }
deriving (Show) deriving (Show)
--------------------------------------------------------------- ---------------------------------------------------------------
-- | Doc 2 HyperdataDocument
toDoc :: TsvGargV3 -> HyperdataDocument
toDoc (TsvGargV3 did dt _ dpy dpm dpd dab dau) =
HyperdataDocument { _hd_bdd = Just "TSV"
, _hd_doi = Just . pack . show $ did
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just dt
, _hd_authors = Nothing
, _hd_institutes = Just dau
, _hd_source = Just dab
, _hd_abstract = Nothing
, _hd_publication_date = Nothing
, _hd_publication_year = Just dpy
, _hd_publication_month = Just dpm
, _hd_publication_day = Just dpd
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
---------------------------------------------------------------
-- | Types Conversions
toDocs :: Vector TsvDoc -> [TsvGargV3]
toDocs v = V.toList
$ V.zipWith (\nId (TsvDoc { .. }) -- (TsvDoc t s mPy pm pd abst auth)
-> TsvGargV3 { d_docId = nId
, d_title = tsv_title
, d_source = tsv_source
, d_publication_year = fromMIntOrDec defaultYear tsv_publication_year
, d_publication_month = fromMaybe defaultMonth tsv_publication_month
, d_publication_day = fromMaybe defaultDay tsv_publication_day
, d_abstract = tsv_abstract
, d_authors = tsv_authors })
(V.enumFromN 1 (V.length v'')) v''
where
v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
seps= V.fromList [Paragraphs 1, Sentences 3, Chars 3]
---------------------------------------------------------------
fromDocs :: Vector TsvGargV3 -> Vector TsvDoc
fromDocs = V.map fromDocs'
where
fromDocs' (TsvGargV3 { .. }) = TsvDoc { tsv_title = d_title
, tsv_source = d_source
, tsv_publication_year = Just $ IntOrDec d_publication_year
, tsv_publication_month = Just d_publication_month
, tsv_publication_day = Just d_publication_day
, tsv_abstract = d_abstract
, tsv_authors = d_authors }
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc m splt doc =
let docSize = (T.length $ tsv_abstract doc) in
if (docSize > 1000) && (mod (round m) docSize >= 10)
then splitDoc' splt doc
else V.fromList [doc]
where
splitDoc' :: SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc' contextSize (TsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
where
firstDoc = TsvDoc { tsv_abstract = firstAbstract, .. }
firstAbstract = head' "splitDoc'1" abstracts
nextDocs = map (\txt -> TsvDoc { tsv_title = head' "splitDoc'2" $ sentences txt
, tsv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
, .. }
) (tail' "splitDoc'2" abstracts)
abstracts = (splitBy $ contextSize) tsv_abstract
---------------------------------------------------------------
--------------------------------------------------------------- ---------------------------------------------------------------
type Mean = Double type Mean = Double
docsSize :: Vector TsvDoc -> Mean
docsSize tsvDoc = mean ls
where
ls = V.toList $ V.map (fromIntegral . T.length . tsv_abstract) tsvDoc
--------------------------------------------------------------- ---------------------------------------------------------------
newtype IntOrDec = IntOrDec Int newtype IntOrDec = IntOrDec Int
deriving (Show, Eq, Read) deriving (Show, Eq, Read)
...@@ -195,28 +111,12 @@ instance ToNamedRecord TsvDoc where ...@@ -195,28 +111,12 @@ instance ToNamedRecord TsvDoc where
, "authors" .= tsv_authors , "authors" .= tsv_authors
] ]
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
where
m = maybe "" identity
mI = maybe 0 identity
data Delimiter = Tab | Comma | Line deriving (Eq, Show) data Delimiter = Tab | Comma | Line deriving (Eq, Show)
tsvDecodeOptions :: Delimiter -> DecodeOptions tsvDecodeOptions :: Delimiter -> DecodeOptions
tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d} tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
tsvEncodeOptions :: Delimiter -> EncodeOptions
tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
delimiter :: Delimiter -> Word8 delimiter :: Delimiter -> Word8
delimiter Tab = fromIntegral $ ord '\t' delimiter Tab = fromIntegral $ ord '\t'
delimiter Comma = fromIntegral $ ord ',' delimiter Comma = fromIntegral $ ord ','
...@@ -368,33 +268,6 @@ getHeaders bl del = do ...@@ -368,33 +268,6 @@ getHeaders bl del = do
------------------------------------------------------------------------ ------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
readFileStrict :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> BL.ByteString
-> Either Text (Header, Vector a)
readByteStringLazy _f d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
readByteStringStrict :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> BS.ByteString
-> Either Text (Header, Vector a)
readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO use readFileLazy -- | TODO use readFileLazy
...@@ -424,19 +297,8 @@ readTsvHal fp = do ...@@ -424,19 +297,8 @@ readTsvHal fp = do
readTsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector TsvHal) readTsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalLazyBS bs = first pack $ decodeByNameWith (tsvDecodeOptions Tab) bs readTsvHalLazyBS bs = first pack $ decodeByNameWith (tsvDecodeOptions Tab) bs
readTsvHalBSStrict :: BS.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalBSStrict bs = readTsvHalLazyBS $ BL.fromStrict bs
------------------------------------------------------------------------
writeFile :: FilePath -> (Header, Vector TsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp $
encodeByNameWith (tsvEncodeOptions Tab) h (V.toList vs)
writeDocs2Tsv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Tsv fp hs = BL.writeFile fp $ hyperdataDocument2tsv hs
hyperdataDocument2tsv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2tsv hs = encodeByNameWith (tsvEncodeOptions Tab) headerTsvGargV3 (map hyperdataDocument2tsvDoc hs)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Hal Format -- Hal Format
...@@ -577,27 +439,12 @@ parseHal fp = do ...@@ -577,27 +439,12 @@ parseHal fp = do
r <- readTsvHal fp r <- readTsvHal fp
pure $ V.toList . V.map tsvHal2doc . snd <$> r pure $ V.toList . V.map tsvHal2doc . snd <$> r
parseHal' :: BL.ByteString -> Either Text [HyperdataDocument]
parseHal' bs = V.toList . V.map tsvHal2doc . snd <$> readTsvHalLazyBS bs
------------------------------------------------------------------------ ------------------------------------------------------------------------
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument]) parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv fp = fmap (V.toList . V.map tsv2doc . snd) <$> readTSVFile fp parseTsv fp = fmap (V.toList . V.map tsv2doc . snd) <$> readTSVFile fp
{-
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = (V.toList . V.map tsv2doc . snd) <$> readTsvLazyBS Comma bs
-}
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = do
let
result = case (testCorrectFile bs) of
Left _err -> Left _err
Right del -> readTsvLazyBS del bs
V.toList . V.map tsv2doc . snd <$> result
parseTsvC :: BL.ByteString parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity ()) -> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
parseTsvC bs = parseTsvC bs =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment