From d5a4b1f9851fe7ca975ecdb5ec48b33f1fdbc3e3 Mon Sep 17 00:00:00 2001
From: Przemek Kaminski <pk@intrepidus.pl>
Date: Thu, 30 Sep 2021 07:19:36 +0200
Subject: [PATCH] [frame write] implement flowDataText to insert docs into
 corpus

---
 nix/pkgs.nix                                  |  1 +
 .../API/Node/DocumentsFromWriteNodes.hs       | 57 +++++++++++++++++++
 src/Gargantext/Database/Action/Flow.hs        |  3 +-
 .../Database/Admin/Types/Hyperdata/Frame.hs   | 12 +++-
 src/Gargantext/Database/Query/Table/Node.hs   | 18 ++++++
 5 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/nix/pkgs.nix b/nix/pkgs.nix
index c7ff4275..8c4bb364 100644
--- a/nix/pkgs.nix
+++ b/nix/pkgs.nix
@@ -9,6 +9,7 @@ rec {
   ];
   nonhsBuildInputs = with pkgs; [
     bzip2
+    docker-compose
     git
     gmp
     gsl
diff --git a/src/Gargantext/API/Node/DocumentsFromWriteNodes.hs b/src/Gargantext/API/Node/DocumentsFromWriteNodes.hs
index 32d57e86..272e22d1 100644
--- a/src/Gargantext/API/Node/DocumentsFromWriteNodes.hs
+++ b/src/Gargantext/API/Node/DocumentsFromWriteNodes.hs
@@ -17,13 +17,25 @@ Portability : POSIX
 module Gargantext.API.Node.DocumentsFromWriteNodes
       where
 
+import Control.Lens ((^.))
 import Data.Aeson
+import Data.Either (Either(..), rights)
 import Data.Swagger
+import qualified Data.Text as T
 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
 import Gargantext.API.Admin.Types (HasSettings)
 import Gargantext.API.Prelude (GargServer)
+import Gargantext.Core (Lang(..))
+import Gargantext.Core.Text.Corpus.Parsers.FrameWrite
+import Gargantext.Core.Text.Terms (TermType(..))
+import Gargantext.Core.Types.Individu (User(..))
+import Gargantext.Database.Action.Flow (flowDataText, DataText(..))
 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
+import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
+import Gargantext.Database.Admin.Types.Hyperdata.Frame
 import Gargantext.Database.Admin.Types.Node
+import Gargantext.Database.Query.Table.Node (getChildrenByType, getNodeWith)
+import Gargantext.Database.Schema.Node (node_hyperdata)
 import Gargantext.Prelude
 import GHC.Generics (Generic)
 import Servant
@@ -71,9 +83,54 @@ documentsFromWriteNodes uId nId p logStatus = do
   _ <- printDebug "[documentsFromWriteNodes] inside job, nId" nId
   _ <- printDebug "[documentsFromWriteNodes] inside job, p" p
 
+  frameWriteIds <- getChildrenByType nId NodeFrameWrite
+  _ <- printDebug "[documentsFromWriteNodes] children" frameWriteIds
+
+  -- https://write.frame.gargantext.org/<frame_id>/download
+  frameWrites <- mapM (\id -> getNodeWith id (Proxy :: Proxy HyperdataFrame)) frameWriteIds
+
+  frameWritesWithContents <- liftBase $
+    mapM (\node -> do
+             contents <- getHyperdataFrameContents (node ^. node_hyperdata)
+             pure (node, contents)
+         ) frameWrites
+  _ <- printDebug "[documentsFromWriteNodes] frameWritesWithContents" frameWritesWithContents
+
+  let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
+  let parsed = rights parsedE
+  _ <- printDebug "[documentsFromWriteNodes] parsed" parsed
+
+  _ <- flowDataText (RootId (NodeId uId)) (DataNew [parsed]) (Multi EN) nId Nothing
+
   pure  JobLog { _scst_succeeded = Just 2
                , _scst_failed    = Just 0
                , _scst_remaining = Just 0
                , _scst_events    = Just []
                }
 ------------------------------------------------------------------------
+hyperdataDocumentFromFrameWrite :: (HyperdataFrame, T.Text) -> Either T.Text HyperdataDocument
+hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, contents) =
+  case parseLines contents of
+    Left _ -> Left "Error parsing node"
+    Right (Parsed { authors, contents = c, date, source, title = t }) ->
+      let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ] in
+      let authors' = T.concat $ authorJoinSingle <$> authors in
+      Right HyperdataDocument { _hd_bdd = Just "FrameWrite"
+                                     , _hd_doi = Nothing
+                                     , _hd_url = Nothing
+                                     , _hd_uniqId = Nothing
+                                     , _hd_uniqIdBdd = Nothing
+                                     , _hd_page = Nothing
+                                     , _hd_title = Just t
+                                     , _hd_authors = Just authors'
+                                     , _hd_institutes = Nothing
+                                     , _hd_source = source
+                                     , _hd_abstract = Just c
+                                     , _hd_publication_date = date
+                                     , _hd_publication_year = Nothing  -- TODO
+                                     , _hd_publication_month = Nothing  -- TODO
+                                     , _hd_publication_day = Nothing  -- TODO
+                                     , _hd_publication_hour = Nothing
+                                     , _hd_publication_minute = Nothing
+                                     , _hd_publication_second = Nothing
+                                     , _hd_language_iso2 = Just $ T.pack $ show EN }
diff --git a/src/Gargantext/Database/Action/Flow.hs b/src/Gargantext/Database/Action/Flow.hs
index 57bb1849..9fe79748 100644
--- a/src/Gargantext/Database/Action/Flow.hs
+++ b/src/Gargantext/Database/Action/Flow.hs
@@ -24,7 +24,8 @@ Portability : POSIX
 {-# LANGUAGE TemplateHaskell         #-}
 
 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
-  ( getDataText
+  ( DataText(..)
+  , getDataText
   , flowDataText
   , flow
 
diff --git a/src/Gargantext/Database/Admin/Types/Hyperdata/Frame.hs b/src/Gargantext/Database/Admin/Types/Hyperdata/Frame.hs
index c1bfdec8..42e5fc69 100644
--- a/src/Gargantext/Database/Admin/Types/Hyperdata/Frame.hs
+++ b/src/Gargantext/Database/Admin/Types/Hyperdata/Frame.hs
@@ -21,15 +21,20 @@ Portability : POSIX
 module Gargantext.Database.Admin.Types.Hyperdata.Frame
   where
 
+import Control.Lens
+import Data.ByteString.Lazy (toStrict)
+import qualified Data.Text as T
+import Data.Text.Encoding (decodeUtf8)
 import Gargantext.Prelude
 import Gargantext.Database.Admin.Types.Hyperdata.Prelude
+import qualified Network.Wreq as Wreq
 
 ------------------------------------------------------------------------
 data HyperdataFrame =
   HyperdataFrame { _hf_base     :: !Text
                  , _hf_frame_id :: !Text
                  }
-    deriving (Generic)
+    deriving (Generic, Show)
 
 
 defaultHyperdataFrame :: HyperdataFrame
@@ -63,3 +68,8 @@ instance ToSchema HyperdataFrame where
     & mapped.schema.description ?~ "Frame Hyperdata"
     & mapped.schema.example ?~ toJSON defaultHyperdataFrame
 
+getHyperdataFrameContents :: HyperdataFrame -> IO Text
+getHyperdataFrameContents (HyperdataFrame { _hf_base, _hf_frame_id }) = do
+  let path = T.concat [_hf_base, "/", _hf_frame_id, "/download"]
+  r <- Wreq.get $ T.unpack path
+  pure $ decodeUtf8 $ toStrict $ r ^. Wreq.responseBody
diff --git a/src/Gargantext/Database/Query/Table/Node.hs b/src/Gargantext/Database/Query/Table/Node.hs
index f3544efe..9945bb1c 100644
--- a/src/Gargantext/Database/Query/Table/Node.hs
+++ b/src/Gargantext/Database/Query/Table/Node.hs
@@ -136,6 +136,24 @@ getClosestParentIdByType nId nType = do
         WHERE n1.id = ? AND 0 = ?;
     |]
 
+-- | Given a node id, find all it's children (no matter how deep) of
+-- given node type.
+getChildrenByType :: HasDBid NodeType
+                  => NodeId
+                  -> NodeType
+                  -> Cmd err [NodeId]
+getChildrenByType nId nType = do
+  result <- runPGSQuery query (nId, 0 :: Int)
+  children_lst <- mapM (\(id, _) -> getChildrenByType id nType) result
+  pure $ concat $ [fst <$> filter (\(_, pTypename) -> pTypename == toDBid nType) result] ++ children_lst
+  where
+    query :: DPS.Query
+    query = [sql|
+      SELECT n.id, n.typename
+      FROM nodes n
+        WHERE n.parent_id = ? AND 0 = ?;
+    |]
+
 ------------------------------------------------------------------------
 getDocumentsV3WithParentId :: HasDBid NodeType => NodeId -> Cmd err [Node HyperdataDocumentV3]
 getDocumentsV3WithParentId n = runOpaQuery $ selectNodesWith' n (Just NodeDocument)
-- 
2.21.0