[tsv] separate tsvIsidore format, some refactoring

parent 2ceb742b
Pipeline #7988 passed with stages
in 49 minutes and 24 seconds
...@@ -234,6 +234,7 @@ library ...@@ -234,6 +234,7 @@ library
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3
Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4 Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv4
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal Gargantext.Core.Text.Corpus.Parsers.TSV.TsvHal
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvIsidore
Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo Gargantext.Core.Text.Corpus.Parsers.TSV.TsvPhylo
Gargantext.Core.Text.Corpus.Parsers.TSV.Types Gargantext.Core.Text.Corpus.Parsers.TSV.Types
Gargantext.Core.Text.Corpus.Parsers.TSV.Utils Gargantext.Core.Text.Corpus.Parsers.TSV.Utils
......
...@@ -11,15 +11,12 @@ Portability : POSIX ...@@ -11,15 +11,12 @@ Portability : POSIX
module Gargantext.API.Node.Document.Export module Gargantext.API.Node.Document.Export
( documentExportAPI ( documentExportAPI
-- * Internals -- * Internals
, get_document_json , getDocumentExport
) )
where where
import Control.Lens (view) import Control.Lens (view)
import Data.ByteString.Lazy.Char8 qualified as BSC
import Data.Csv (encodeDefaultOrderedByNameWith, defaultEncodeOptions, encDelimiter, encQuoting, Quoting(..))
import Data.Text qualified as T import Data.Text qualified as T
import Data.Text.Encoding qualified as TE
import Data.Time.Clock.System (getSystemTime, systemSeconds) import Data.Time.Clock.System (getSystemTime, systemSeconds)
import Data.Time.LocalTime (getCurrentTimeZone, TimeZone (timeZoneMinutes)) import Data.Time.LocalTime (getCurrentTimeZone, TimeZone (timeZoneMinutes))
import Data.Version (showVersion) import Data.Version (showVersion)
...@@ -27,6 +24,8 @@ import Gargantext.API.Node.Document.Export.Types ...@@ -27,6 +24,8 @@ import Gargantext.API.Node.Document.Export.Types
import Gargantext.API.Prelude (IsGargServer) import Gargantext.API.Prelude (IsGargServer)
import Gargantext.API.Routes.Named.Document qualified as Named import Gargantext.API.Routes.Named.Document qualified as Named
import Gargantext.Core (toDBid) import Gargantext.Core (toDBid)
import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 qualified as TSVv3
import Gargantext.Core.Text.Corpus.Parsers.TSV.Utils (writeDocs2TsvOrdered)
import Gargantext.Database.Admin.Types.Node (DocId, NodeId, NodeType(..)) import Gargantext.Database.Admin.Types.Node (DocId, NodeId, NodeType(..))
import Gargantext.Database.Prelude import Gargantext.Database.Prelude
import Gargantext.Database.Query.Facet (runViewDocuments, Facet(..)) import Gargantext.Database.Query.Facet (runViewDocuments, Facet(..))
...@@ -35,7 +34,7 @@ import Gargantext.Database.Query.Table.Node.User ( getNodeUser ) ...@@ -35,7 +34,7 @@ import Gargantext.Database.Query.Table.Node.User ( getNodeUser )
import Gargantext.Database.Schema.Node (NodePoly(..), node_user_id) import Gargantext.Database.Schema.Node (NodePoly(..), node_user_id)
import Gargantext.Prelude import Gargantext.Prelude
import Paths_gargantext qualified as PG -- cabal magic build module import Paths_gargantext qualified as PG -- cabal magic build module
import Servant ( addHeader, Header, Headers(getResponse) ) import Servant ( addHeader, Header, Headers )
import Servant.Server.Generic (AsServerT) import Servant.Server.Generic (AsServerT)
documentExportAPI :: IsGargServer env err m documentExportAPI :: IsGargServer env err m
...@@ -57,50 +56,21 @@ getDocumentsJSON :: IsGargServer env err m ...@@ -57,50 +56,21 @@ getDocumentsJSON :: IsGargServer env err m
-> DocId -> DocId
-> m (Headers '[Header "Content-Disposition" T.Text] DocumentExport) -> m (Headers '[Header "Content-Disposition" T.Text] DocumentExport)
getDocumentsJSON nodeUserId pId = do getDocumentsJSON nodeUserId pId = do
dexp <- get_document_json nodeUserId pId dexp <- getDocumentExport nodeUserId pId
pure $ addHeader (T.concat [ "attachment; filename=" pure $ addHeader (T.concat [ "attachment; filename="
, "GarganText_DocsList-" , "GarganText_DocsList-"
, T.pack (show pId) , T.pack (show pId)
, ".json" ]) dexp , ".json" ]) dexp
get_document_json :: IsGargServer err env m => NodeId -> DocId -> m DocumentExport
get_document_json nodeUserId pId = do
runDBQuery $ do
uId <- view node_user_id <$> getNodeUser nodeUserId
mcId <- getClosestParentIdByType pId NodeCorpus
let cId = maybe (panicTrace "[G.A.N.D.Export] Node has no parent") identity mcId
docs <- runViewDocuments cId False Nothing Nothing Nothing Nothing Nothing
pure DocumentExport { _de_documents = mapFacetDoc uId <$> docs
, _de_garg_version = T.pack $ showVersion PG.version }
where
mapFacetDoc uId (FacetDoc { .. }) =
Document { _d_document =
Node { _node_id = facetDoc_id
, _node_hash_id = Nothing
, _node_typename = toDBid NodeDocument
, _node_user_id = uId
, _node_parent_id = Nothing
, _node_name = facetDoc_title
, _node_date = facetDoc_created
, _node_hyperdata = facetDoc_hyperdata }
, _d_ngrams = Ngrams { _ng_ngrams = []
, _ng_hash = "" }
, _d_hash = "" }
_mapDoc d = Document { _d_document = d
, _d_ngrams = Ngrams { _ng_ngrams = []
, _ng_hash = "" }
, _d_hash = ""}
getDocumentsJSONZip :: IsGargServer env err m getDocumentsJSONZip :: IsGargServer env err m
=> NodeId => NodeId
-- ^ The Node ID of the target user -- ^ The Node ID of the target user
-> DocId -> DocId
-> m (Headers '[Header "Content-Disposition" T.Text] DocumentExportZIP) -- [Document] -> m (Headers '[Header "Content-Disposition" T.Text] DocumentExportZIP) -- [Document]
getDocumentsJSONZip userNodeId pId = do getDocumentsJSONZip userNodeId pId = do
dJSON <- getDocumentsJSON userNodeId pId dexp <- getDocumentExport userNodeId pId
systime <- liftBase getSystemTime systime <- liftBase getSystemTime
tz <- liftBase getCurrentTimeZone tz <- liftBase getCurrentTimeZone
let dexp = getResponse dJSON
let dexpz = DocumentExportZIP { _dez_dexp = dexp let dexpz = DocumentExportZIP { _dez_dexp = dexp
, _dez_doc_id = pId , _dez_doc_id = pId
-- see https://github.com/jgm/zip-archive/commit/efe4423a9a2b1dc2a4d413917a933828d3f8dc0f -- see https://github.com/jgm/zip-archive/commit/efe4423a9a2b1dc2a4d413917a933828d3f8dc0f
...@@ -116,11 +86,43 @@ getDocumentsTSV :: IsGargServer err env m ...@@ -116,11 +86,43 @@ getDocumentsTSV :: IsGargServer err env m
-> DocId -> DocId
-> m (Headers '[Header "Content-Disposition" T.Text] T.Text) -- [Document] -> m (Headers '[Header "Content-Disposition" T.Text] T.Text) -- [Document]
getDocumentsTSV userNodeId pId = do getDocumentsTSV userNodeId pId = do
dJSON <- getDocumentsJSON userNodeId pId dexp <- getDocumentExport userNodeId pId
let DocumentExport { _de_documents } = getResponse dJSON let ret = writeDocs2TsvOrdered TSVv3.doc2tsv (deHyperdata dexp)
let ret = TE.decodeUtf8 $ BSC.toStrict $ encodeDefaultOrderedByNameWith (defaultEncodeOptions {encDelimiter = fromIntegral $ ord '\t', encQuoting = QuoteAll }) _de_documents
pure $ addHeader (T.concat [ "attachment; filename=GarganText_DocsList-" pure $ addHeader (T.concat [ "attachment; filename=GarganText_DocsList-"
, T.pack $ show pId , T.pack $ show pId
, ".tsv"]) , ".tsv"])
ret ret
getDocumentExport :: IsGargServer err env m => NodeId -> DocId -> m DocumentExport
getDocumentExport nodeUserId pId = do
runDBQuery $ do
uId <- view node_user_id <$> getNodeUser nodeUserId
mcId <- getClosestParentIdByType pId NodeCorpus
let cId = maybe (panicTrace "[G.A.N.D.Export] Node has no parent") identity mcId
docs <- runViewDocuments cId False Nothing Nothing Nothing Nothing Nothing
pure DocumentExport { _de_documents = mapFacetDoc uId <$> docs
, _de_garg_version = T.pack $ showVersion PG.version }
where
mapFacetDoc uId (FacetDoc { .. }) =
Document { _d_document =
Node { _node_id = facetDoc_id
, _node_hash_id = Nothing
, _node_typename = toDBid NodeDocument
, _node_user_id = uId
, _node_parent_id = Nothing
, _node_name = facetDoc_title
, _node_date = facetDoc_created
, _node_hyperdata = facetDoc_hyperdata }
, _d_ngrams = Ngrams { _ng_ngrams = []
, _ng_hash = "" }
, _d_hash = "" }
_mapDoc d = Document { _d_document = d
, _d_ngrams = Ngrams { _ng_ngrams = []
, _ng_hash = "" }
, _d_hash = ""}
...@@ -16,9 +16,7 @@ module Gargantext.API.Node.Document.Export.Types where ...@@ -16,9 +16,7 @@ module Gargantext.API.Node.Document.Export.Types where
import Codec.Serialise.Class hiding (encode) import Codec.Serialise.Class hiding (encode)
import Data.Aeson (encode) import Data.Aeson (encode)
import Data.Aeson.TH (deriveJSON) import Data.Aeson.TH (deriveJSON)
import Data.Csv (DefaultOrdered(..), ToNamedRecord(..), (.=), header, namedRecord)
import Data.Swagger ( genericDeclareNamedSchema, ToParamSchema(..), ToSchema(..) ) import Data.Swagger ( genericDeclareNamedSchema, ToParamSchema(..), ToSchema(..) )
import Data.Text.Encoding qualified as TE
import Data.Text qualified as T import Data.Text qualified as T
import Gargantext.Core.Types ( Node, TODO ) import Gargantext.Core.Types ( Node, TODO )
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
...@@ -40,6 +38,10 @@ data DocumentExport = ...@@ -40,6 +38,10 @@ data DocumentExport =
instance Serialise DocumentExport where instance Serialise DocumentExport where
deHyperdata :: DocumentExport -> [HyperdataDocument]
deHyperdata (DocumentExport { _de_documents }) =
(\(Document { _d_document }) -> _node_hyperdata _d_document) <$> _de_documents
-- | This is to represent a zipped document export. We want to have doc_id in zipped file name. -- | This is to represent a zipped document export. We want to have doc_id in zipped file name.
data DocumentExportZIP = data DocumentExportZIP =
DocumentExportZIP { _dez_dexp :: DocumentExport DocumentExportZIP { _dez_dexp :: DocumentExport
...@@ -60,26 +62,6 @@ instance Show Document where ...@@ -60,26 +62,6 @@ instance Show Document where
show (Document _ _ h1) = "Document " <> Prelude.show h1 show (Document _ _ h1) = "Document " <> Prelude.show h1
instance Serialise Document where instance Serialise Document where
--instance Read Document where
-- read "" = panic "not implemented"
instance DefaultOrdered Document where
headerOrder _ = header ["Publication Day"
, "Publication Month"
, "Publication Year"
, "Authors"
, "Title"
, "Source"
, "Abstract"]
instance ToNamedRecord Document where
toNamedRecord (Document { _d_document = Node { .. }}) =
namedRecord
[ "Publication Day" .= _hd_publication_day _node_hyperdata
, "Publication Month" .= _hd_publication_month _node_hyperdata
, "Publication Year" .= _hd_publication_year _node_hyperdata
, "Authors" .= _hd_authors _node_hyperdata
, "Title" .= _hd_title _node_hyperdata
, "Source" .= (TE.encodeUtf8 <$> _hd_source _node_hyperdata)
, "Abstract" .= (TE.encodeUtf8 <$> _hd_abstract _node_hyperdata) ]
data Ngrams = data Ngrams =
Ngrams { _ng_ngrams :: [Text] Ngrams { _ng_ngrams :: [Text]
......
...@@ -37,7 +37,7 @@ import Gargantext.API.Auth.PolicyCheck (remoteExportChecks) ...@@ -37,7 +37,7 @@ import Gargantext.API.Auth.PolicyCheck (remoteExportChecks)
import Gargantext.API.Errors.Types import Gargantext.API.Errors.Types
import Gargantext.API.Ngrams.Prelude (getNgramsList) import Gargantext.API.Ngrams.Prelude (getNgramsList)
import Gargantext.API.Ngrams.Types (NgramsList) import Gargantext.API.Ngrams.Types (NgramsList)
import Gargantext.API.Node.Document.Export (get_document_json) import Gargantext.API.Node.Document.Export (getDocumentExport)
import Gargantext.API.Node.Document.Export.Types ( DocumentExport(_de_documents) ) import Gargantext.API.Node.Document.Export.Types ( DocumentExport(_de_documents) )
import Gargantext.API.Prelude (IsGargServer) import Gargantext.API.Prelude (IsGargServer)
import Gargantext.API.Routes.Client (remoteImportClient) import Gargantext.API.Routes.Client (remoteImportClient)
...@@ -239,7 +239,7 @@ makeExportable userNodeId (TreeN x xs) ...@@ -239,7 +239,7 @@ makeExportable userNodeId (TreeN x xs)
NodeCorpus -> EN_corpus <$> pure x NodeCorpus -> EN_corpus <$> pure x
NodeGraph -> EN_graph <$> pure x NodeGraph -> EN_graph <$> pure x
NodePhylo -> EN_phylo <$> pure x NodePhylo -> EN_phylo <$> pure x
NodeTexts -> EN_document <$> pure x <*> get_document_json userNodeId (_node_id x) NodeTexts -> EN_document <$> pure x <*> getDocumentExport userNodeId (_node_id x)
NodeList -> EN_terms <$> pure x <*> runDBQuery (getNgramsList env (_node_id x)) NodeList -> EN_terms <$> pure x <*> runDBQuery (getNgramsList env (_node_id x))
Notes -> case JS.parseMaybe JS.parseJSON (_node_hyperdata x) of Notes -> case JS.parseMaybe JS.parseJSON (_node_hyperdata x) of
Nothing Nothing
......
...@@ -22,7 +22,7 @@ import Data.Text qualified as Text ...@@ -22,7 +22,7 @@ import Data.Text qualified as Text
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers (cleanText) import Gargantext.Core.Text.Corpus.Parsers (cleanText)
import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv) import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv)
import Gargantext.Core.Text.Corpus.Parsers.TSV.TSVv3 qualified as TSVv3 import Gargantext.Core.Text.Corpus.Parsers.TSV.TsvIsidore qualified as TsvIsidore
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) ) import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults import Gargantext.Defaults qualified as Defaults
...@@ -55,7 +55,7 @@ isidore2tsvFile :: FilePath -> Lang -> Maybe Isidore.Limit ...@@ -55,7 +55,7 @@ isidore2tsvFile :: FilePath -> Lang -> Maybe Isidore.Limit
-> IO () -> IO ()
isidore2tsvFile fp lang li tq aq = do isidore2tsvFile fp lang li tq aq = do
hdocs <- get lang li tq aq hdocs <- get lang li tq aq
writeDocs2Tsv TSVv3.headerTsvGargV3 TSVv3.doc2tsv fp hdocs writeDocs2Tsv TsvIsidore.headerIsidore TsvIsidore.doc2tsv fp hdocs
isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc lang (IsidoreDoc t a d u s as) = do isidoreToDoc lang (IsidoreDoc t a d u s as) = do
......
...@@ -20,17 +20,6 @@ import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(.. ...@@ -20,17 +20,6 @@ import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..
import Gargantext.Prelude import Gargantext.Prelude
headerTsvGargV3 :: Header
headerTsvGargV3 =
header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
data TsvGargV3 = TsvGargV3 data TsvGargV3 = TsvGargV3
{ d_docId :: !Int { d_docId :: !Int
, d_title :: !Text , d_title :: !Text
...@@ -97,15 +86,24 @@ instance FromNamedRecord TsvDoc where ...@@ -97,15 +86,24 @@ instance FromNamedRecord TsvDoc where
tsv_authors <- r .: "authors" <|> r .: "Authors" tsv_authors <- r .: "authors" <|> r .: "Authors"
pure $ TsvDoc { .. } pure $ TsvDoc { .. }
instance DefaultOrdered TsvDoc where
headerOrder _ = header [ "Publication Day"
, "Publication Month"
, "Publication Year"
, "Authors"
, "Title"
, "Source"
, "Abstract"]
instance ToNamedRecord TsvDoc where instance ToNamedRecord TsvDoc where
toNamedRecord (TsvDoc{ .. }) = toNamedRecord (TsvDoc{ .. }) =
namedRecord [ "title" .= tsv_title namedRecord [ "Title" .= tsv_title
, "source" .= tsv_source , "Source" .= tsv_source
, "publication_year" .= tsv_publication_year , "Publication Year" .= tsv_publication_year
, "publication_month" .= tsv_publication_month , "Publication Month" .= tsv_publication_month
, "publication_day" .= tsv_publication_day , "Publication Day" .= tsv_publication_day
, "abstract" .= tsv_abstract , "Abstract" .= tsv_abstract
, "authors" .= tsv_authors , "Authors" .= tsv_authors
] ]
......
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.TSV.TsvIsidore
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.Parsers.TSV.TsvIsidore where
import Data.Csv
import Gargantext.Core.Text.Corpus.Parsers.TSV.IntOrDec
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude
headerIsidore :: Header
headerIsidore =
header [ "title"
, "source"
, "publication_year"
, "publication_month"
, "publication_day"
, "abstract"
, "authors"
]
data TsvDoc = TsvDoc
{ tsv_title :: !Text
, tsv_source :: !Text
, tsv_publication_year :: !(Maybe IntOrDec)
, tsv_publication_month :: !(Maybe Int)
, tsv_publication_day :: !(Maybe Int)
, tsv_abstract :: !Text
, tsv_authors :: !Text
}
deriving (Show)
instance ToNamedRecord TsvDoc where
toNamedRecord (TsvDoc{ .. }) =
namedRecord [ "title" .= tsv_title
, "source" .= tsv_source
, "publication_year" .= tsv_publication_year
, "publication_month" .= tsv_publication_month
, "publication_day" .= tsv_publication_day
, "abstract" .= tsv_abstract
, "authors" .= tsv_authors
]
doc2tsv :: HyperdataDocument -> TsvDoc
doc2tsv h = TsvDoc { tsv_title = m $ _hd_title h
, tsv_source = m $ _hd_source h
, tsv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
, tsv_publication_month = Just $ mI $ _hd_publication_month h
, tsv_publication_day = Just $ mI $ _hd_publication_day h
, tsv_abstract = m $ _hd_abstract h
, tsv_authors = m $ _hd_authors h }
where
m = maybe "" identity
mI = maybe 0 identity
...@@ -15,6 +15,8 @@ import Data.ByteString.Lazy qualified as BL ...@@ -15,6 +15,8 @@ import Data.ByteString.Lazy qualified as BL
import Data.ByteString.Lazy.Char8 qualified as B8L import Data.ByteString.Lazy.Char8 qualified as B8L
import Data.Csv import Data.Csv
import Data.Text (pack) import Data.Text (pack)
import Data.Text qualified as T
import Data.Text.Encoding qualified as TE
import Data.Vector (Vector) import Data.Vector (Vector)
import Data.Vector qualified as V import Data.Vector qualified as V
import Gargantext.Core.Text.Corpus.Parsers.TSV.Types import Gargantext.Core.Text.Corpus.Parsers.TSV.Types
...@@ -82,7 +84,14 @@ writeDocs2Tsv :: ToNamedRecord tsvDoc ...@@ -82,7 +84,14 @@ writeDocs2Tsv :: ToNamedRecord tsvDoc
-> FilePath -> FilePath
-> [doc] -> [doc]
-> IO () -> IO ()
writeDocs2Tsv hdr doc2tsv fp hs = writeDocs2Tsv hdr doc2tsv fp docs =
BL.writeFile fp $ encodeByNameWith (tsvEncodeOptions Tab) hdr (doc2tsv <$> hs) BL.writeFile fp $ encodeByNameWith (tsvEncodeOptions Tab) hdr (doc2tsv <$> docs)
writeDocs2TsvOrdered :: (ToNamedRecord tsvDoc, DefaultOrdered tsvDoc)
=> (doc -> tsvDoc)
-> [doc]
-> T.Text
writeDocs2TsvOrdered doc2tsv docs =
TE.decodeUtf8 $ B8L.toStrict $
encodeDefaultOrderedByNameWith ((tsvEncodeOptions Tab) { encQuoting = QuoteAll }) (doc2tsv <$> docs)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment