Commit d5a4b1f9 authored by Przemyslaw Kaminski

[frame write] implement flowDataText to insert docs into corpus

parent 852fae28
Pipeline #1899 passed with stage in 38 minutes and 22 seconds
@@ -9,6 +9,7 @@ rec {
  ];
  nonhsBuildInputs = with pkgs; [
    bzip2
    docker-compose
    git
    gmp
    gsl
...
@@ -17,13 +17,25 @@ Portability : POSIX
module Gargantext.API.Node.DocumentsFromWriteNodes
  where
import Control.Lens ((^.))
import Data.Aeson
import Data.Either (Either(..), rights)
import Data.Swagger
import qualified Data.Text as T
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Prelude (GargServer)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite
import Gargantext.Core.Text.Terms (TermType(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Flow (flowDataText, DataText(..))
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getChildrenByType, getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
import GHC.Generics (Generic)
import Servant
@@ -71,9 +83,54 @@ documentsFromWriteNodes uId nId p logStatus = do
  _ <- printDebug "[documentsFromWriteNodes] inside job, nId" nId
  _ <- printDebug "[documentsFromWriteNodes] inside job, p" p
  frameWriteIds <- getChildrenByType nId NodeFrameWrite
  _ <- printDebug "[documentsFromWriteNodes] children" frameWriteIds
  -- https://write.frame.gargantext.org/<frame_id>/download
  frameWrites <- mapM (\id -> getNodeWith id (Proxy :: Proxy HyperdataFrame)) frameWriteIds
  frameWritesWithContents <- liftBase $
    mapM (\node -> do
           contents <- getHyperdataFrameContents (node ^. node_hyperdata)
           pure (node, contents)
         ) frameWrites
  _ <- printDebug "[documentsFromWriteNodes] frameWritesWithContents" frameWritesWithContents
  let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
  let parsed = rights parsedE
  _ <- printDebug "[documentsFromWriteNodes] parsed" parsed
  _ <- flowDataText (RootId (NodeId uId)) (DataNew [parsed]) (Multi EN) nId Nothing
  pure JobLog { _scst_succeeded = Just 2
              , _scst_failed = Just 0
              , _scst_remaining = Just 0
              , _scst_events = Just []
              }
------------------------------------------------------------------------
hyperdataDocumentFromFrameWrite :: (HyperdataFrame, T.Text) -> Either T.Text HyperdataDocument
hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, contents) =
  case parseLines contents of
    Left _ -> Left "Error parsing node"
    Right (Parsed { authors, contents = c, date, source, title = t }) ->
      let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ] in
      let authors' = T.concat $ authorJoinSingle <$> authors in
      Right HyperdataDocument { _hd_bdd = Just "FrameWrite"
                              , _hd_doi = Nothing
                              , _hd_url = Nothing
                              , _hd_uniqId = Nothing
                              , _hd_uniqIdBdd = Nothing
                              , _hd_page = Nothing
                              , _hd_title = Just t
                              , _hd_authors = Just authors'
                              , _hd_institutes = Nothing
                              , _hd_source = source
                              , _hd_abstract = Just c
                              , _hd_publication_date = date
                              , _hd_publication_year = Nothing -- TODO
                              , _hd_publication_month = Nothing -- TODO
                              , _hd_publication_day = Nothing -- TODO
                              , _hd_publication_hour = Nothing
                              , _hd_publication_minute = Nothing
                              , _hd_publication_second = Nothing
                              , _hd_language_iso2 = Just $ T.pack $ show EN }
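A side note on the author formatting above: authorJoinSingle renders one author as "lastName, firstName", and T.concat then glues those renderings together with nothing in between. A standalone sketch of just that step (the Author record below is a local stand-in, with field names taken from the pattern match above, not the real type from Gargantext.Core.Text.Corpus.Parsers.FrameWrite):

{-# LANGUAGE NamedFieldPuns    #-}
{-# LANGUAGE OverloadedStrings #-}
import qualified Data.Text    as T
import qualified Data.Text.IO as TIO

-- Local stand-in for the parser's Author record, for illustration only.
data Author = Author { firstName :: T.Text, lastName :: T.Text }

authorJoinSingle :: Author -> T.Text
authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ]

main :: IO ()
main = TIO.putStrLn $ T.concat $ authorJoinSingle <$>
         [ Author "Jane" "Doe", Author "John" "Smith" ]
-- prints "Doe, JaneSmith, John": no separator between authors, only inside
-- each "lastName, firstName" pair.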
@@ -24,7 +24,8 @@ Portability : POSIX
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( DataText(..)
  , getDataText
  , flowDataText
  , flow
...
@@ -21,15 +21,20 @@ Portability : POSIX
module Gargantext.Database.Admin.Types.Hyperdata.Frame
  where
import Control.Lens
import Data.ByteString.Lazy (toStrict)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8)
import Gargantext.Prelude
import Gargantext.Database.Admin.Types.Hyperdata.Prelude
import qualified Network.Wreq as Wreq
------------------------------------------------------------------------
data HyperdataFrame =
  HyperdataFrame { _hf_base :: !Text
                 , _hf_frame_id :: !Text
                 }
  deriving (Generic, Show)
defaultHyperdataFrame :: HyperdataFrame
@@ -63,3 +68,8 @@ instance ToSchema HyperdataFrame where
    & mapped.schema.description ?~ "Frame Hyperdata"
    & mapped.schema.example ?~ toJSON defaultHyperdataFrame

getHyperdataFrameContents :: HyperdataFrame -> IO Text
getHyperdataFrameContents (HyperdataFrame { _hf_base, _hf_frame_id }) = do
  let path = T.concat [_hf_base, "/", _hf_frame_id, "/download"]
  r <- Wreq.get $ T.unpack path
  pure $ decodeUtf8 $ toStrict $ r ^. Wreq.responseBody
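For reference, a minimal standalone use of the new helper. This is a sketch, not part of the commit: the base URL is the one from the comment in documentsFromWriteNodes, "<frame_id>" remains a placeholder, and the frame service has to be reachable for the request to succeed.

{-# LANGUAGE OverloadedStrings #-}
import qualified Data.Text.IO as TIO
import Gargantext.Database.Admin.Types.Hyperdata.Frame
       (HyperdataFrame(..), getHyperdataFrameContents)

main :: IO ()
main = do
  -- Fetches <base>/<frame_id>/download and prints the frame body.
  contents <- getHyperdataFrameContents
                (HyperdataFrame { _hf_base = "https://write.frame.gargantext.org"
                                , _hf_frame_id = "<frame_id>" })
  TIO.putStrLn contents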
@@ -136,6 +136,24 @@ getClosestParentIdByType nId nType = do
      WHERE n1.id = ? AND 0 = ?;
    |]
-- | Given a node id, find all its children (no matter how deep) of the
-- given node type.
getChildrenByType :: HasDBid NodeType
                  => NodeId
                  -> NodeType
                  -> Cmd err [NodeId]
getChildrenByType nId nType = do
  result <- runPGSQuery query (nId, 0 :: Int)
  children_lst <- mapM (\(id, _) -> getChildrenByType id nType) result
  pure $ concat $ [fst <$> filter (\(_, pTypename) -> pTypename == toDBid nType) result] ++ children_lst
  where
    query :: DPS.Query
    query = [sql|
      SELECT n.id, n.typename
      FROM nodes n
      WHERE n.parent_id = ? AND 0 = ?;
    |]
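A small usage sketch of the new query, mirroring the call in documentsFromWriteNodes. frameWriteIdsOf is a hypothetical name and the import locations of Cmd and HasDBid are assumptions; the rest appears in this diff.

import Gargantext.Core (HasDBid)
import Gargantext.Database.Admin.Types.Node (NodeId, NodeType(..))
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Query.Table.Node (getChildrenByType)

-- Collect every NodeFrameWrite descendant of a node, however deep.
frameWriteIdsOf :: HasDBid NodeType => NodeId -> Cmd err [NodeId]
frameWriteIdsOf nId = getChildrenByType nId NodeFrameWrite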
------------------------------------------------------------------------
getDocumentsV3WithParentId :: HasDBid NodeType => NodeId -> Cmd err [Node HyperdataDocumentV3]
getDocumentsV3WithParentId n = runOpaQuery $ selectNodesWith' n (Just NodeDocument)
...