From d5a4b1f9851fe7ca975ecdb5ec48b33f1fdbc3e3 Mon Sep 17 00:00:00 2001 From: Przemek Kaminski <pk@intrepidus.pl> Date: Thu, 30 Sep 2021 07:19:36 +0200 Subject: [PATCH] [frame write] implement flowDataText to insert docs into corpus --- nix/pkgs.nix | 1 + .../API/Node/DocumentsFromWriteNodes.hs | 57 +++++++++++++++++++ src/Gargantext/Database/Action/Flow.hs | 3 +- .../Database/Admin/Types/Hyperdata/Frame.hs | 12 +++- src/Gargantext/Database/Query/Table/Node.hs | 18 ++++++ 5 files changed, 89 insertions(+), 2 deletions(-) diff --git a/nix/pkgs.nix b/nix/pkgs.nix index c7ff4275..8c4bb364 100644 --- a/nix/pkgs.nix +++ b/nix/pkgs.nix @@ -9,6 +9,7 @@ rec { ]; nonhsBuildInputs = with pkgs; [ bzip2 + docker-compose git gmp gsl diff --git a/src/Gargantext/API/Node/DocumentsFromWriteNodes.hs b/src/Gargantext/API/Node/DocumentsFromWriteNodes.hs index 32d57e86..272e22d1 100644 --- a/src/Gargantext/API/Node/DocumentsFromWriteNodes.hs +++ b/src/Gargantext/API/Node/DocumentsFromWriteNodes.hs @@ -17,13 +17,25 @@ Portability : POSIX module Gargantext.API.Node.DocumentsFromWriteNodes where +import Control.Lens ((^.)) import Data.Aeson +import Data.Either (Either(..), rights) import Data.Swagger +import qualified Data.Text as T import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs) import Gargantext.API.Admin.Types (HasSettings) import Gargantext.API.Prelude (GargServer) +import Gargantext.Core (Lang(..)) +import Gargantext.Core.Text.Corpus.Parsers.FrameWrite +import Gargantext.Core.Text.Terms (TermType(..)) +import Gargantext.Core.Types.Individu (User(..)) +import Gargantext.Database.Action.Flow (flowDataText, DataText(..)) import Gargantext.Database.Action.Flow.Types (FlowCmdM) +import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..)) +import Gargantext.Database.Admin.Types.Hyperdata.Frame import Gargantext.Database.Admin.Types.Node +import Gargantext.Database.Query.Table.Node (getChildrenByType, getNodeWith) +import Gargantext.Database.Schema.Node (node_hyperdata) import Gargantext.Prelude import GHC.Generics (Generic) import Servant @@ -71,9 +83,54 @@ documentsFromWriteNodes uId nId p logStatus = do _ <- printDebug "[documentsFromWriteNodes] inside job, nId" nId _ <- printDebug "[documentsFromWriteNodes] inside job, p" p + frameWriteIds <- getChildrenByType nId NodeFrameWrite + _ <- printDebug "[documentsFromWriteNodes] children" frameWriteIds + + -- https://write.frame.gargantext.org/<frame_id>/download + frameWrites <- mapM (\id -> getNodeWith id (Proxy :: Proxy HyperdataFrame)) frameWriteIds + + frameWritesWithContents <- liftBase $ + mapM (\node -> do + contents <- getHyperdataFrameContents (node ^. node_hyperdata) + pure (node, contents) + ) frameWrites + _ <- printDebug "[documentsFromWriteNodes] frameWritesWithContents" frameWritesWithContents + + let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite (node ^. node_hyperdata, contents)) <$> frameWritesWithContents + let parsed = rights parsedE + _ <- printDebug "[documentsFromWriteNodes] parsed" parsed + + _ <- flowDataText (RootId (NodeId uId)) (DataNew [parsed]) (Multi EN) nId Nothing + pure JobLog { _scst_succeeded = Just 2 , _scst_failed = Just 0 , _scst_remaining = Just 0 , _scst_events = Just [] } ------------------------------------------------------------------------ +hyperdataDocumentFromFrameWrite :: (HyperdataFrame, T.Text) -> Either T.Text HyperdataDocument +hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, contents) = + case parseLines contents of + Left _ -> Left "Error parsing node" + Right (Parsed { authors, contents = c, date, source, title = t }) -> + let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ] in + let authors' = T.concat $ authorJoinSingle <$> authors in + Right HyperdataDocument { _hd_bdd = Just "FrameWrite" + , _hd_doi = Nothing + , _hd_url = Nothing + , _hd_uniqId = Nothing + , _hd_uniqIdBdd = Nothing + , _hd_page = Nothing + , _hd_title = Just t + , _hd_authors = Just authors' + , _hd_institutes = Nothing + , _hd_source = source + , _hd_abstract = Just c + , _hd_publication_date = date + , _hd_publication_year = Nothing -- TODO + , _hd_publication_month = Nothing -- TODO + , _hd_publication_day = Nothing -- TODO + , _hd_publication_hour = Nothing + , _hd_publication_minute = Nothing + , _hd_publication_second = Nothing + , _hd_language_iso2 = Just $ T.pack $ show EN } diff --git a/src/Gargantext/Database/Action/Flow.hs b/src/Gargantext/Database/Action/Flow.hs index 57bb1849..9fe79748 100644 --- a/src/Gargantext/Database/Action/Flow.hs +++ b/src/Gargantext/Database/Action/Flow.hs @@ -24,7 +24,8 @@ Portability : POSIX {-# LANGUAGE TemplateHaskell #-} module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list) - ( getDataText + ( DataText(..) + , getDataText , flowDataText , flow diff --git a/src/Gargantext/Database/Admin/Types/Hyperdata/Frame.hs b/src/Gargantext/Database/Admin/Types/Hyperdata/Frame.hs index c1bfdec8..42e5fc69 100644 --- a/src/Gargantext/Database/Admin/Types/Hyperdata/Frame.hs +++ b/src/Gargantext/Database/Admin/Types/Hyperdata/Frame.hs @@ -21,15 +21,20 @@ Portability : POSIX module Gargantext.Database.Admin.Types.Hyperdata.Frame where +import Control.Lens +import Data.ByteString.Lazy (toStrict) +import qualified Data.Text as T +import Data.Text.Encoding (decodeUtf8) import Gargantext.Prelude import Gargantext.Database.Admin.Types.Hyperdata.Prelude +import qualified Network.Wreq as Wreq ------------------------------------------------------------------------ data HyperdataFrame = HyperdataFrame { _hf_base :: !Text , _hf_frame_id :: !Text } - deriving (Generic) + deriving (Generic, Show) defaultHyperdataFrame :: HyperdataFrame @@ -63,3 +68,8 @@ instance ToSchema HyperdataFrame where & mapped.schema.description ?~ "Frame Hyperdata" & mapped.schema.example ?~ toJSON defaultHyperdataFrame +getHyperdataFrameContents :: HyperdataFrame -> IO Text +getHyperdataFrameContents (HyperdataFrame { _hf_base, _hf_frame_id }) = do + let path = T.concat [_hf_base, "/", _hf_frame_id, "/download"] + r <- Wreq.get $ T.unpack path + pure $ decodeUtf8 $ toStrict $ r ^. Wreq.responseBody diff --git a/src/Gargantext/Database/Query/Table/Node.hs b/src/Gargantext/Database/Query/Table/Node.hs index f3544efe..9945bb1c 100644 --- a/src/Gargantext/Database/Query/Table/Node.hs +++ b/src/Gargantext/Database/Query/Table/Node.hs @@ -136,6 +136,24 @@ getClosestParentIdByType nId nType = do WHERE n1.id = ? AND 0 = ?; |] +-- | Given a node id, find all it's children (no matter how deep) of +-- given node type. +getChildrenByType :: HasDBid NodeType + => NodeId + -> NodeType + -> Cmd err [NodeId] +getChildrenByType nId nType = do + result <- runPGSQuery query (nId, 0 :: Int) + children_lst <- mapM (\(id, _) -> getChildrenByType id nType) result + pure $ concat $ [fst <$> filter (\(_, pTypename) -> pTypename == toDBid nType) result] ++ children_lst + where + query :: DPS.Query + query = [sql| + SELECT n.id, n.typename + FROM nodes n + WHERE n.parent_id = ? AND 0 = ?; + |] + ------------------------------------------------------------------------ getDocumentsV3WithParentId :: HasDBid NodeType => NodeId -> Cmd err [Node HyperdataDocumentV3] getDocumentsV3WithParentId n = runOpaQuery $ selectNodesWith' n (Just NodeDocument) -- 2.21.0