Commit 094b511b authored by Yoelis Acourt's avatar Yoelis Acourt

chore: remove dead code

parent 82ffefa1
......@@ -14,14 +14,11 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API.Isidore (
get
-- * Internals (possibly unused?)
, isidore2tsvFile
) where
import Data.Text qualified as Text
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers (cleanText)
import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv)
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults
......@@ -50,13 +47,6 @@ get lang l q a = do
hDocs <- mapM (isidoreToDoc lang) (toIsidoreDocs iDocs)
pure hDocs
isidore2tsvFile :: FilePath -> Lang -> Maybe Isidore.Limit
-> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
-> IO ()
isidore2tsvFile fp lang li tq aq = do
hdocs <- get lang li tq aq
writeDocs2Tsv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc lang (IsidoreDoc t a d u s as) = do
let
......
......@@ -15,7 +15,6 @@ TSV parser for Gargantext corpus files.
module Gargantext.Core.Text.Corpus.Parsers.TSV where
import Conduit ( ConduitT, (.|), yieldMany, mapC )
import Data.ByteString qualified as BS
import Data.ByteString.Lazy qualified as BL
import Data.Csv
import Data.Text (pack)
......@@ -26,8 +25,6 @@ import Data.Text.Read qualified as DTR
import Data.Time.Segment (jour)
import Data.Vector (Vector)
import Data.Vector qualified as V
import Gargantext.Core.Text ( sentences, unsentences )
import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding (length, show)
import Protolude
......@@ -42,6 +39,7 @@ headerTsvGargV3 =
, "publication_day"
, "abstract"
, "authors"
]
---------------------------------------------------------------
data TsvGargV3 = TsvGargV3
......@@ -56,91 +54,9 @@ data TsvGargV3 = TsvGargV3
}
deriving (Show)
---------------------------------------------------------------
-- | Doc 2 HyperdataDocument
toDoc :: TsvGargV3 -> HyperdataDocument
toDoc (TsvGargV3 did dt _ dpy dpm dpd dab dau) =
HyperdataDocument { _hd_bdd = Just "TSV"
, _hd_doi = Just . pack . show $ did
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just dt
, _hd_authors = Nothing
, _hd_institutes = Just dau
, _hd_source = Just dab
, _hd_abstract = Nothing
, _hd_publication_date = Nothing
, _hd_publication_year = Just dpy
, _hd_publication_month = Just dpm
, _hd_publication_day = Just dpd
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
---------------------------------------------------------------
-- | Types Conversions
toDocs :: Vector TsvDoc -> [TsvGargV3]
toDocs v = V.toList
$ V.zipWith (\nId (TsvDoc { .. }) -- (TsvDoc t s mPy pm pd abst auth)
-> TsvGargV3 { d_docId = nId
, d_title = tsv_title
, d_source = tsv_source
, d_publication_year = fromMIntOrDec defaultYear tsv_publication_year
, d_publication_month = fromMaybe defaultMonth tsv_publication_month
, d_publication_day = fromMaybe defaultDay tsv_publication_day
, d_abstract = tsv_abstract
, d_authors = tsv_authors })
(V.enumFromN 1 (V.length v'')) v''
where
v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
seps= V.fromList [Paragraphs 1, Sentences 3, Chars 3]
---------------------------------------------------------------
fromDocs :: Vector TsvGargV3 -> Vector TsvDoc
fromDocs = V.map fromDocs'
where
fromDocs' (TsvGargV3 { .. }) = TsvDoc { tsv_title = d_title
, tsv_source = d_source
, tsv_publication_year = Just $ IntOrDec d_publication_year
, tsv_publication_month = Just d_publication_month
, tsv_publication_day = Just d_publication_day
, tsv_abstract = d_abstract
, tsv_authors = d_authors }
---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc m splt doc =
let docSize = (T.length $ tsv_abstract doc) in
if (docSize > 1000) && (mod (round m) docSize >= 10)
then splitDoc' splt doc
else V.fromList [doc]
where
splitDoc' :: SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc' contextSize (TsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
where
firstDoc = TsvDoc { tsv_abstract = firstAbstract, .. }
firstAbstract = head' "splitDoc'1" abstracts
nextDocs = map (\txt -> TsvDoc { tsv_title = head' "splitDoc'2" $ sentences txt
, tsv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
, .. }
) (tail' "splitDoc'2" abstracts)
abstracts = (splitBy $ contextSize) tsv_abstract
---------------------------------------------------------------
---------------------------------------------------------------
type Mean = Double
docsSize :: Vector TsvDoc -> Mean
docsSize tsvDoc = mean ls
where
ls = V.toList $ V.map (fromIntegral . T.length . tsv_abstract) tsvDoc
---------------------------------------------------------------
newtype IntOrDec = IntOrDec Int
deriving (Show, Eq, Read)
......@@ -195,28 +111,12 @@ instance ToNamedRecord TsvDoc where
, "authors" .= tsv_authors
]
hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h = TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
where
m = maybe "" identity
mI = maybe 0 identity
data Delimiter = Tab | Comma | Line deriving (Eq, Show)
tsvDecodeOptions :: Delimiter -> DecodeOptions
tsvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
tsvEncodeOptions :: Delimiter -> EncodeOptions
tsvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
delimiter :: Delimiter -> Word8
delimiter Tab = fromIntegral $ ord '\t'
delimiter Comma = fromIntegral $ ord ','
......@@ -368,33 +268,6 @@ getHeaders bl del = do
------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
readFileStrict :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> FilePath
-> IO (Either Text (Header, Vector a))
readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> BL.ByteString
-> Either Text (Header, Vector a)
readByteStringLazy _f d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs
readByteStringStrict :: (FromNamedRecord a)
=> proxy a
-> Delimiter
-> BS.ByteString
-> Either Text (Header, Vector a)
readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict
------------------------------------------------------------------------
-- | TODO use readFileLazy
......@@ -424,19 +297,8 @@ readTsvHal fp = do
readTsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalLazyBS bs = first pack $ decodeByNameWith (tsvDecodeOptions Tab) bs
readTsvHalBSStrict :: BS.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalBSStrict bs = readTsvHalLazyBS $ BL.fromStrict bs
------------------------------------------------------------------------
writeFile :: FilePath -> (Header, Vector TsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp $
encodeByNameWith (tsvEncodeOptions Tab) h (V.toList vs)
writeDocs2Tsv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Tsv fp hs = BL.writeFile fp $ hyperdataDocument2tsv hs
hyperdataDocument2tsv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2tsv hs = encodeByNameWith (tsvEncodeOptions Tab) headerTsvGargV3 (map hyperdataDocument2tsvDoc hs)
------------------------------------------------------------------------
-- Hal Format
......@@ -577,27 +439,12 @@ parseHal fp = do
r <- readTsvHal fp
pure $ V.toList . V.map tsvHal2doc . snd <$> r
parseHal' :: BL.ByteString -> Either Text [HyperdataDocument]
parseHal' bs = V.toList . V.map tsvHal2doc . snd <$> readTsvHalLazyBS bs
------------------------------------------------------------------------
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv fp = fmap (V.toList . V.map tsv2doc . snd) <$> readTSVFile fp
{-
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = (V.toList . V.map tsv2doc . snd) <$> readTsvLazyBS Comma bs
-}
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = do
let
result = case (testCorrectFile bs) of
Left _err -> Left _err
Right del -> readTsvLazyBS del bs
V.toList . V.map tsv2doc . snd <$> result
parseTsvC :: BL.ByteString
-> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
parseTsvC bs =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment