Commit d5a4b1f9 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

[frame write] implement flowDataText to insert docs into corpus

parent 852fae28
Pipeline #1899 passed with stage
in 38 minutes and 22 seconds
......@@ -9,6 +9,7 @@ rec {
];
nonhsBuildInputs = with pkgs; [
bzip2
docker-compose
git
gmp
gsl
......
......@@ -17,13 +17,25 @@ Portability : POSIX
module Gargantext.API.Node.DocumentsFromWriteNodes
where
import Control.Lens ((^.))
import Data.Aeson
import Data.Either (Either(..), rights)
import Data.Swagger
import qualified Data.Text as T
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Prelude (GargServer)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite
import Gargantext.Core.Text.Terms (TermType(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Flow (flowDataText, DataText(..))
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getChildrenByType, getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
import GHC.Generics (Generic)
import Servant
......@@ -71,9 +83,54 @@ documentsFromWriteNodes uId nId p logStatus = do
_ <- printDebug "[documentsFromWriteNodes] inside job, nId" nId
_ <- printDebug "[documentsFromWriteNodes] inside job, p" p
frameWriteIds <- getChildrenByType nId NodeFrameWrite
_ <- printDebug "[documentsFromWriteNodes] children" frameWriteIds
-- https://write.frame.gargantext.org/<frame_id>/download
frameWrites <- mapM (\id -> getNodeWith id (Proxy :: Proxy HyperdataFrame)) frameWriteIds
frameWritesWithContents <- liftBase $
mapM (\node -> do
contents <- getHyperdataFrameContents (node ^. node_hyperdata)
pure (node, contents)
) frameWrites
_ <- printDebug "[documentsFromWriteNodes] frameWritesWithContents" frameWritesWithContents
let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
let parsed = rights parsedE
_ <- printDebug "[documentsFromWriteNodes] parsed" parsed
_ <- flowDataText (RootId (NodeId uId)) (DataNew [parsed]) (Multi EN) nId Nothing
pure JobLog { _scst_succeeded = Just 2
, _scst_failed = Just 0
, _scst_remaining = Just 0
, _scst_events = Just []
}
------------------------------------------------------------------------
hyperdataDocumentFromFrameWrite :: (HyperdataFrame, T.Text) -> Either T.Text HyperdataDocument
hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, contents) =
case parseLines contents of
Left _ -> Left "Error parsing node"
Right (Parsed { authors, contents = c, date, source, title = t }) ->
let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ] in
let authors' = T.concat $ authorJoinSingle <$> authors in
Right HyperdataDocument { _hd_bdd = Just "FrameWrite"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just t
, _hd_authors = Just authors'
, _hd_institutes = Nothing
, _hd_source = source
, _hd_abstract = Just c
, _hd_publication_date = date
, _hd_publication_year = Nothing -- TODO
, _hd_publication_month = Nothing -- TODO
, _hd_publication_day = Nothing -- TODO
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ T.pack $ show EN }
......@@ -24,7 +24,8 @@ Portability : POSIX
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
( getDataText
( DataText(..)
, getDataText
, flowDataText
, flow
......
......@@ -21,15 +21,20 @@ Portability : POSIX
module Gargantext.Database.Admin.Types.Hyperdata.Frame
where
import Control.Lens
import Data.ByteString.Lazy (toStrict)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8)
import Gargantext.Prelude
import Gargantext.Database.Admin.Types.Hyperdata.Prelude
import qualified Network.Wreq as Wreq
------------------------------------------------------------------------
data HyperdataFrame =
HyperdataFrame { _hf_base :: !Text
, _hf_frame_id :: !Text
}
deriving (Generic)
deriving (Generic, Show)
defaultHyperdataFrame :: HyperdataFrame
......@@ -63,3 +68,8 @@ instance ToSchema HyperdataFrame where
& mapped.schema.description ?~ "Frame Hyperdata"
& mapped.schema.example ?~ toJSON defaultHyperdataFrame
getHyperdataFrameContents :: HyperdataFrame -> IO Text
getHyperdataFrameContents (HyperdataFrame { _hf_base, _hf_frame_id }) = do
let path = T.concat [_hf_base, "/", _hf_frame_id, "/download"]
r <- Wreq.get $ T.unpack path
pure $ decodeUtf8 $ toStrict $ r ^. Wreq.responseBody
......@@ -136,6 +136,24 @@ getClosestParentIdByType nId nType = do
WHERE n1.id = ? AND 0 = ?;
|]
-- | Given a node id, find all it's children (no matter how deep) of
-- given node type.
getChildrenByType :: HasDBid NodeType
=> NodeId
-> NodeType
-> Cmd err [NodeId]
getChildrenByType nId nType = do
result <- runPGSQuery query (nId, 0 :: Int)
children_lst <- mapM (\(id, _) -> getChildrenByType id nType) result
pure $ concat $ [fst <$> filter (\(_, pTypename) -> pTypename == toDBid nType) result] ++ children_lst
where
query :: DPS.Query
query = [sql|
SELECT n.id, n.typename
FROM nodes n
WHERE n.parent_id = ? AND 0 = ?;
|]
------------------------------------------------------------------------
getDocumentsV3WithParentId :: HasDBid NodeType => NodeId -> Cmd err [Node HyperdataDocumentV3]
getDocumentsV3WithParentId n = runOpaQuery $ selectNodesWith' n (Just NodeDocument)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment