[tsv] separate tsvIsidore format, some refactoring

parent 2ceb742b
Pipeline #7988 passed with stages
in 49 minutes and 24 seconds
......@@ -234,6 +234,7 @@ library
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvIsidore
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo
Gargantext.Core.Text.Corpus.Parsers.TSV.Types
Gargantext.Core.Text.Corpus.Parsers.TSV.Utils
......
......@@ -11,15 +11,12 @@ Portability : POSIX
module Gargantext.API.Node.Document.Export
( documentExportAPI
-- * Internals
, get_document_json
, getDocumentExport
)
where
import Control.Lens (view)
import Data.ByteString.Lazy.Char8 qualified as BSC
import Data.Csv (encodeDefaultOrderedByNameWith, defaultEncodeOptions, encDelimiter, encQuoting, Quoting(..))
import Data.Text qualified as T
import Data.Text.Encoding qualified as TE
import Data.Time.Clock.System (getSystemTime, systemSeconds)
import Data.Time.LocalTime (getCurrentTimeZone, TimeZone (timeZoneMinutes))
import Data.Version (showVersion)
......@@ -27,6 +24,8 @@ import Gargantext.API.Node.Document.Export.Types
import Gargantext.API.Prelude (IsGargServer)
import Gargantext.API.Routes.Named.Document qualified as Named
import Gargantext.Core (toDBid)
import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 qualified as TSVv3
import Gargantext.Core.Text.Corpus.Parsers.TSV.Utils (writeDocs2TsvOrdered)
import Gargantext.Database.Admin.Types.Node (DocId, NodeId, NodeType(..))
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Facet (runViewDocuments, Facet(..))
......@@ -35,7 +34,7 @@ import Gargantext.Database.Query.Table.Node.User ( getNodeUser )
import Gargantext.Database.Schema.Node (NodePoly(..), node_user_id)
import Gargantext.Prelude
import Paths_gargantext qualified as PG -- cabal magic build module
import Servant ( addHeader, Header, Headers(getResponse) )
import Servant ( addHeader, Header, Headers )
import Servant.Server.Generic (AsServerT)
documentExportAPI :: IsGargServer env err m
......@@ -57,50 +56,21 @@ getDocumentsJSON :: IsGargServer env err m
-> DocId
-> m (Headers '[Header "Content-Disposition" T.Text] DocumentExport)
getDocumentsJSON nodeUserId pId = do
dexp <- get_document_json nodeUserId pId
dexp <- getDocumentExport nodeUserId pId
pure $ addHeader (T.concat [ "attachment; filename="
, "GarganText_DocsList-"
, T.pack (show pId)
, ".json" ]) dexp
get_document_json :: IsGargServer err env m => NodeId -> DocId -> m DocumentExport
get_document_json nodeUserId pId = do
runDBQuery $ do
uId <- view node_user_id <$> getNodeUser nodeUserId
mcId <- getClosestParentIdByType pId NodeCorpus
let cId = maybe (panicTrace "[G.A.N.D.Export] Node has no parent") identity mcId
docs <- runViewDocuments cId False Nothing Nothing Nothing Nothing Nothing
pure DocumentExport { _de_documents = mapFacetDoc uId <$> docs
, _de_garg_version = T.pack $ showVersion PG.version }
where
mapFacetDoc uId (FacetDoc { .. }) =
Document { _d_document =
Node { _node_id = facetDoc_id
, _node_hash_id = Nothing
, _node_typename = toDBid NodeDocument
, _node_user_id = uId
, _node_parent_id = Nothing
, _node_name = facetDoc_title
, _node_date = facetDoc_created
, _node_hyperdata = facetDoc_hyperdata }
, _d_ngrams = Ngrams { _ng_ngrams = []
, _ng_hash = "" }
, _d_hash = "" }
_mapDoc d = Document { _d_document = d
, _d_ngrams = Ngrams { _ng_ngrams = []
, _ng_hash = "" }
, _d_hash = ""}
getDocumentsJSONZip :: IsGargServer env err m
=> NodeId
-- ^ The Node ID of the target user
-> DocId
-> m (Headers '[Header "Content-Disposition" T.Text] DocumentExportZIP) -- [Document]
getDocumentsJSONZip userNodeId pId = do
dJSON <- getDocumentsJSON userNodeId pId
dexp <- getDocumentExport userNodeId pId
systime <- liftBase getSystemTime
tz <- liftBase getCurrentTimeZone
let dexp = getResponse dJSON
let dexpz = DocumentExportZIP { _dez_dexp = dexp
, _dez_doc_id = pId
-- see https://github.com/jgm/zip-archive/commit/efe4423a9a2b1dc2a4d413917a933828d3f8dc0f
......@@ -116,11 +86,43 @@ getDocumentsTSV :: IsGargServer err env m
-> DocId
-> m (Headers '[Header "Content-Disposition" T.Text] T.Text) -- [Document]
getDocumentsTSV userNodeId pId = do
dJSON <- getDocumentsJSON userNodeId pId
let DocumentExport { _de_documents } = getResponse dJSON
let ret = TE.decodeUtf8 $ BSC.toStrict $ encodeDefaultOrderedByNameWith (defaultEncodeOptions {encDelimiter = fromIntegral $ ord '\t', encQuoting = QuoteAll }) _de_documents
dexp <- getDocumentExport userNodeId pId
let ret = writeDocs2TsvOrdered TSVv3.doc2tsv (deHyperdata dexp)
pure $ addHeader (T.concat [ "attachment; filename=GarganText_DocsList-"
, T.pack $ show pId
, ".tsv"])
ret
getDocumentExport :: IsGargServer err env m => NodeId -> DocId -> m DocumentExport
getDocumentExport nodeUserId pId = do
runDBQuery $ do
uId <- view node_user_id <$> getNodeUser nodeUserId
mcId <- getClosestParentIdByType pId NodeCorpus
let cId = maybe (panicTrace "[G.A.N.D.Export] Node has no parent") identity mcId
docs <- runViewDocuments cId False Nothing Nothing Nothing Nothing Nothing
pure DocumentExport { _de_documents = mapFacetDoc uId <$> docs
, _de_garg_version = T.pack $ showVersion PG.version }
where
mapFacetDoc uId (FacetDoc { .. }) =
Document { _d_document =
Node { _node_id = facetDoc_id
, _node_hash_id = Nothing
, _node_typename = toDBid NodeDocument
, _node_user_id = uId
, _node_parent_id = Nothing
, _node_name = facetDoc_title
, _node_date = facetDoc_created
, _node_hyperdata = facetDoc_hyperdata }
, _d_ngrams = Ngrams { _ng_ngrams = []
, _ng_hash = "" }
, _d_hash = "" }
_mapDoc d = Document { _d_document = d
, _d_ngrams = Ngrams { _ng_ngrams = []
, _ng_hash = "" }
, _d_hash = ""}
......@@ -16,9 +16,7 @@ module Gargantext.API.Node.Document.Export.Types where
import Codec.Serialise.Class hiding (encode)
import Data.Aeson (encode)
import Data.Aeson.TH (deriveJSON)
import Data.Csv (DefaultOrdered(..), ToNamedRecord(..), (.=), header, namedRecord)
import Data.Swagger ( genericDeclareNamedSchema, ToParamSchema(..), ToSchema(..) )
import Data.Text.Encoding qualified as TE
import Data.Text qualified as T
import Gargantext.Core.Types ( Node, TODO )
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
......@@ -40,6 +38,10 @@ data DocumentExport =
instance Serialise DocumentExport where
deHyperdata :: DocumentExport -> [HyperdataDocument]
deHyperdata (DocumentExport { _de_documents }) =
(\(Document { _d_document }) -> _node_hyperdata _d_document) <$> _de_documents
-- | This is to represent a zipped document export. We want to have doc_id in zipped file name.
data DocumentExportZIP =
DocumentExportZIP { _dez_dexp :: DocumentExport
......@@ -60,26 +62,6 @@ instance Show Document where
show (Document _ _ h1) = "Document " <> Prelude.show h1
instance Serialise Document where
--instance Read Document where
-- read "" = panic "not implemented"
instance DefaultOrdered Document where
headerOrder _ = header ["Publication Day"
, "Publication Month"
, "Publication Year"
, "Authors"
, "Title"
, "Source"
, "Abstract"]
instance ToNamedRecord Document where
toNamedRecord (Document { _d_document = Node { .. }}) =
namedRecord
[ "Publication Day" .= _hd_publication_day _node_hyperdata
, "Publication Month" .= _hd_publication_month _node_hyperdata
, "Publication Year" .= _hd_publication_year _node_hyperdata
, "Authors" .= _hd_authors _node_hyperdata
, "Title" .= _hd_title _node_hyperdata
, "Source" .= (TE.encodeUtf8 <$> _hd_source _node_hyperdata)
, "Abstract" .= (TE.encodeUtf8 <$> _hd_abstract _node_hyperdata) ]
data Ngrams =
Ngrams { _ng_ngrams :: [Text]
......
......@@ -37,7 +37,7 @@ import Gargantext.API.Auth.PolicyCheck (remoteExportChecks)
import Gargantext.API.Errors.Types
import Gargantext.API.Ngrams.Prelude (getNgramsList)
import Gargantext.API.Ngrams.Types (NgramsList)
import Gargantext.API.Node.Document.Export (get_document_json)
import Gargantext.API.Node.Document.Export (getDocumentExport)
import Gargantext.API.Node.Document.Export.Types ( DocumentExport(_de_documents) )
import Gargantext.API.Prelude (IsGargServer)
import Gargantext.API.Routes.Client (remoteImportClient)
......@@ -239,7 +239,7 @@ makeExportable userNodeId (TreeN x xs)
NodeCorpus -> EN_corpus <$> pure x
NodeGraph -> EN_graph <$> pure x
NodePhylo -> EN_phylo <$> pure x
NodeTexts -> EN_document <$> pure x <*> get_document_json userNodeId (_node_id x)
NodeTexts -> EN_document <$> pure x <*> getDocumentExport userNodeId (_node_id x)
NodeList -> EN_terms <$> pure x <*> runDBQuery (getNgramsList env (_node_id x))
Notes -> case JS.parseMaybe JS.parseJSON (_node_hyperdata x) of
Nothing
......
......@@ -22,7 +22,7 @@ import Data.Text qualified as Text
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers (cleanText)
import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv)
import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 qualified as TSVv3
import Gargantext.Core.Text.Corpus.Parsers.TSV.TsvIsidore qualified as TsvIsidore
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults
......@@ -55,7 +55,7 @@ isidore2tsvFile :: FilePath -> Lang -> Maybe Isidore.Limit
-> IO ()
isidore2tsvFile fp lang li tq aq = do
hdocs <- get lang li tq aq
writeDocs2Tsv TSVv3.headerTsvGargV3 TSVv3.doc2tsv fp hdocs
writeDocs2Tsv TsvIsidore.headerIsidore TsvIsidore.doc2tsv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc lang (IsidoreDoc t a d u s as) = do
......
......@@ -20,17 +20,6 @@ import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..
import Gargantext.Prelude
headerTsvGargV3 :: Header
headerTsvGargV3 =
header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
data TsvGargV3 = TsvGargV3
{ d_docId :: !Int
, d_title :: !Text
......@@ -97,15 +86,24 @@ instance FromNamedRecord TsvDoc where
tsv_authors <- r .: "authors" <|> r .: "Authors"
pure $ TsvDoc { .. }
instance DefaultOrdered TsvDoc where
headerOrder _ = header [ "Publication Day"
, "Publication Month"
, "Publication Year"
, "Authors"
, "Title"
, "Source"
, "Abstract"]
instance ToNamedRecord TsvDoc where
toNamedRecord (TsvDoc{ .. }) =
namedRecord [ "title" .= tsv_title
, "source" .= tsv_source
, "publication_year" .= tsv_publication_year
, "publication_month" .= tsv_publication_month
, "publication_day" .= tsv_publication_day
, "abstract" .= tsv_abstract
, "authors" .= tsv_authors
namedRecord [ "Title" .= tsv_title
, "Source" .= tsv_source
, "Publication Year" .= tsv_publication_year
, "Publication Month" .= tsv_publication_month
, "Publication Day" .= tsv_publication_day
, "Abstract" .= tsv_abstract
, "Authors" .= tsv_authors
]
......
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV.TsvIsidore
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.Parsers.TSV.TsvIsidore where
import Data.Csv
import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude
headerIsidore :: Header
headerIsidore =
header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
data TsvDoc = TsvDoc
{ tsv_title :: !Text
, tsv_source :: !Text
, tsv_publication_year :: !(Maybe IntOrDec)
, tsv_publication_month :: !(Maybe Int)
, tsv_publication_day :: !(Maybe Int)
, tsv_abstract :: !Text
, tsv_authors :: !Text
}
deriving (Show)
instance ToNamedRecord TsvDoc where
toNamedRecord (TsvDoc{ .. }) =
namedRecord [ "title" .= tsv_title
, "source" .= tsv_source
, "publication_year" .= tsv_publication_year
, "publication_month" .= tsv_publication_month
, "publication_day" .= tsv_publication_day
, "abstract" .= tsv_abstract
, "authors" .= tsv_authors
]
doc2tsv :: HyperdataDocument -> TsvDoc
doc2tsv h = TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
where
m = maybe "" identity
mI = maybe 0 identity
......@@ -15,6 +15,8 @@ import Data.ByteString.Lazy qualified as BL
import Data.ByteString.Lazy.Char8 qualified as B8L
import Data.Csv
import Data.Text (pack)
import Data.Text qualified as T
import Data.Text.Encoding qualified as TE
import Data.Vector (Vector)
import Data.Vector qualified as V
import Gargantext.Core.Text.Corpus.Parsers.TSV.Types
......@@ -82,7 +84,14 @@ writeDocs2Tsv :: ToNamedRecord tsvDoc
-> FilePath
-> [doc]
-> IO ()
writeDocs2Tsv hdr doc2tsv fp hs =
BL.writeFile fp $ encodeByNameWith (tsvEncodeOptions Tab) hdr (doc2tsv <$> hs)
writeDocs2Tsv hdr doc2tsv fp docs =
BL.writeFile fp $ encodeByNameWith (tsvEncodeOptions Tab) hdr (doc2tsv <$> docs)
writeDocs2TsvOrdered :: (ToNamedRecord tsvDoc, DefaultOrdered tsvDoc)
=> (doc -> tsvDoc)
-> [doc]
-> T.Text
writeDocs2TsvOrdered doc2tsv docs =
TE.decodeUtf8 $ B8L.toStrict $
encodeDefaultOrderedByNameWith ((tsvEncodeOptions Tab) { encQuoting = QuoteAll }) (doc2tsv <$> docs)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment