Commit 9a3c78ea authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Insert function for contexts of text (documents) in the database.

parent a4cdd3f9
......@@ -37,6 +37,7 @@ library:
- Gargantext.Core.Types.Node
- Gargantext.Core.Utils.Prefix
- Gargantext.Database
- Gargantext.Database.Node.Document.Import
- Gargantext.Prelude
- Gargantext.Text
- Gargantext.Text.Context
......@@ -112,6 +113,7 @@ library:
- profunctors
- protolude
- pureMD5
- SHA
- regex-compat
- resourcet
- safe
......
......@@ -42,6 +42,7 @@ import Data.Swagger
import Text.Read (read)
import Text.Show (Show())
import Database.PostgreSQL.Simple.ToField (ToField, toField, toJSONField)
import Servant
import Test.QuickCheck.Arbitrary
......@@ -102,30 +103,34 @@ $(deriveJSON (unPrefix "hyperdataDocumentV3_") ''HyperdataDocumentV3)
------------------------------------------------------------------------
data HyperdataDocument = HyperdataDocument { hyperdataDocument_bdd :: Maybe Text
, hyperdataDocument_doi :: Maybe Int
, hyperdataDocument_url :: Maybe Text
, hyperdataDocument_page :: Maybe Int
, hyperdataDocument_title :: Maybe Text
, hyperdataDocument_authors :: Maybe Text
, hyperdataDocument_source :: Maybe Text
, hyperdataDocument_abstract :: Maybe Text
, hyperdataDocument_statuses :: Maybe [Status]
, hyperdataDocument_publication_date :: Maybe Text
, hyperdataDocument_publication_year :: Maybe Int
, hyperdataDocument_publication_month :: Maybe Int
, hyperdataDocument_publication_hour :: Maybe Int
, hyperdataDocument_publication_minute :: Maybe Int
, hyperdataDocument_publication_second :: Maybe Int
, hyperdataDocument_language_iso2 :: Maybe Text
, hyperdataDocument_language_iso3 :: Maybe Text
data HyperdataDocument = HyperdataDocument { _hyperdataDocument_bdd :: Maybe Text
, _hyperdataDocument_doi :: Maybe Int
, _hyperdataDocument_url :: Maybe Text
, _hyperdataDocument_uniqId :: Maybe Text
, _hyperdataDocument_page :: Maybe Int
, _hyperdataDocument_title :: Maybe Text
, _hyperdataDocument_authors :: Maybe Text
, _hyperdataDocument_source :: Maybe Text
, _hyperdataDocument_abstract :: Maybe Text
, _hyperdataDocument_publication_date :: Maybe Text
, _hyperdataDocument_publication_year :: Maybe Int
, _hyperdataDocument_publication_month :: Maybe Int
, _hyperdataDocument_publication_hour :: Maybe Int
, _hyperdataDocument_publication_minute :: Maybe Int
, _hyperdataDocument_publication_second :: Maybe Int
, _hyperdataDocument_language_iso2 :: Maybe Text
, _hyperdataDocument_language_iso3 :: Maybe Text
} deriving (Show, Generic)
$(deriveJSON (unPrefix "hyperdataDocument_") ''HyperdataDocument)
$(deriveJSON (unPrefix "_hyperdataDocument_") ''HyperdataDocument)
$(makeLenses ''HyperdataDocument)
instance ToField HyperdataDocument where
toField = toJSONField
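-- Note (not part of the original commit): the instance above serializes a
-- HyperdataDocument to JSON so it can be stored directly in a jsonb column
-- (the hyperdata column of the nodes table, as used by the insert query below).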
toHyperdataDocuments :: [(Text, Text)] -> [HyperdataDocument]
toHyperdataDocuments ts = map (\(t1,t2) -> HyperdataDocument Nothing Nothing Nothing Nothing (Just t1)
Nothing (Just t2) Nothing Nothing Nothing
Nothing Nothing Nothing Nothing Nothing Nothing Nothing
toHyperdataDocuments ts = map (\(t1,t2) -> HyperdataDocument Nothing Nothing Nothing Nothing Nothing (Just t1)
Nothing (Just t2) Nothing Nothing Nothing
Nothing Nothing Nothing Nothing Nothing Nothing
) ts
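-- For illustration (not part of the original commit): toHyperdataDocuments
-- [("T1", "S1")] builds a single HyperdataDocument whose title is "T1" and
-- source is "S1", with every other field set to Nothing.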
hyperdataDocuments :: [HyperdataDocument]
......@@ -262,11 +267,11 @@ type NodeName = Text
--type NodeUser = Node HyperdataUser
-- | Then a Node can be either a Folder or a Corpus or a Document
type NodeUser = Node HyperdataUser
type Folder = Node HyperdataFolder
type Project = Folder -- NP Node HyperdataProject ?
type NodeCorpus = Node HyperdataCorpus
type Document = Node HyperdataDocument
type NodeUser = Node HyperdataUser
type Folder = Node HyperdataFolder
type Project = Node HyperdataProject
type NodeCorpus = Node HyperdataCorpus
type Document = Node HyperdataDocument
------------------------------------------------------------------------
data NodeType = NodeUser | Project | Folder | NodeCorpus | Annuaire | Document | UserPage | DocumentCopy | Favorites
......@@ -326,7 +331,7 @@ hyperdataDocument = case decode docExample of
Nothing Nothing Nothing Nothing
Nothing
docExample :: ByteString
docExample = "{\"publication_day\":6,\"language_iso2\":\"en\",\"publication_minute\":0,\"publication_month\":7,\"language_iso3\":\"eng\",\"publication_second\":0,\"authors\":\"Nils Hovdenak, Kjell Haram\",\"publication_year\":2012,\"publication_date\":\"2012-07-06 00:00:00+00:00\",\"language_name\":\"English\",\"statuses\":[],\"realdate_full_\":\"2012 01 12\",\"source\":\"European journal of obstetrics, gynecology, and reproductive biology\",\"abstract\":\"The literature was searched for publications on minerals and vitamins during pregnancy and the possible influence of supplements on pregnancy outcome.\",\"title\":\"Influence of mineral and vitamin supplements on pregnancy outcome.\",\"publication_hour\":0}"
docExample = "{\"doi\":\"sdfds\",\"publication_day\":6,\"language_iso2\":\"en\",\"publication_minute\":0,\"publication_month\":7,\"language_iso3\":\"eng\",\"publication_second\":0,\"authors\":\"Nils Hovdenak, Kjell Haram\",\"publication_year\":2012,\"publication_date\":\"2012-07-06 00:00:00+00:00\",\"language_name\":\"English\",\"realdate_full_\":\"2012 01 12\",\"source\":\"European journal of obstetrics, gynecology, and reproductive biology\",\"abstract\":\"The literature was searched for publications on minerals and vitamins during pregnancy and the possible influence of supplements on pregnancy outcome.\",\"title\":\"Influence of mineral and vitamin supplements on pregnancy outcome.\",\"publication_hour\":0}"
instance ToSchema HyperdataDocument where
......@@ -341,26 +346,26 @@ instance ToSchema Value where
L.& mapped.schema.example ?~ toJSON ("" :: Text)
instance ToSchema (NodePoly NodeId NodeTypeId NodeUserId
instance ToSchema (NodePoly NodeId NodeTypeId NodeUserId
(Maybe NodeParentId) NodeName
UTCTime HyperdataDocument
)
instance ToSchema (NodePoly NodeId NodeTypeId
(Maybe NodeUserId)
NodeParentId NodeName
instance ToSchema (NodePoly NodeId NodeTypeId
(Maybe NodeUserId)
NodeParentId NodeName
UTCTime HyperdataDocument
)
instance ToSchema (NodePoly NodeId NodeTypeId
(Maybe NodeUserId)
NodeParentId NodeName
instance ToSchema (NodePoly NodeId NodeTypeId
(Maybe NodeUserId)
NodeParentId NodeName
UTCTime Value
)
instance ToSchema (NodePoly NodeId NodeTypeId
(NodeUserId)
(Maybe NodeParentId) NodeName
instance ToSchema (NodePoly NodeId NodeTypeId
(NodeUserId)
(Maybe NodeParentId) NodeName
UTCTime Value
)
......
......@@ -7,8 +7,6 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
{-# LANGUAGE NoImplicitPrelude #-}
......
......@@ -176,9 +176,9 @@ selectNodesWith' parentId maybeNodeType = proc () -> do
node <- (proc () -> do
row@(Node _ typeId _ parentId' _ _ _) <- queryNodeTable -< ()
restrict -< parentId' .== (toNullable $ pgInt4 parentId)
let typeId' = maybe 0 nodeTypeId maybeNodeType
restrict -< if typeId' > 0
then typeId .== (pgInt4 (typeId' :: Int))
else (pgBool True)
......@@ -213,7 +213,6 @@ getNodesWithParentId' :: Connection -> Int
getNodesWithParentId' conn n _ = runQuery conn $ selectNodesWithParentID n
------------------------------------------------------------------------
getDocumentsV3WithParentId :: Connection -> Int -> IO [Node HyperdataDocumentV3]
getDocumentsV3WithParentId conn n = runQuery conn $ selectNodesWith' n (Just Document)
......
{-|
Module : Gargantext.Database.Node.Document.Import
Description : Importing contexts of text (documents)
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
* Main purpose of this module: enabling the "common goods" of text data while respecting privacy.
Gargantext shares as a "common good" the links between contexts of text
and terms / words / ngrams.
Basically, a context of text can be defined as a document (see 'Gargantext.Text').
Issue tackled in this module: each global document of Gargantext has to
be unique before being shared, but how can privacy be respected when needed?
* Methodology to get uniqueness and privacy by design
As a consequence, when importing a new document into Gargantext, a policy
for the uniqueness of the inserted documents has to be defined.
That is the purpose of this module, which defines the main concepts.
The unique identifier in the database is a 3-tuple whose components
correspond to 3 policies that together define uniqueness:
- Design policy: the type of the node is needed as the TypenameId,
i.e. a Document, an Individual or something else;
- Privacy policy: with the ParentId, the parent scope becomes part of the
key, which lets users keep their own copy without sharing it with all the
users of the database (in other words, parent_id is necessary to preserve
privacy);
- Hash policy: the UniqId is a SHA-256 hash of the concatenation of the
parameters defined by @hashParameters@.
* Database configuration
The administrator of the database has to create a unique index with the
following SQL command:
`create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`
-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE TypeSynonymInstances #-}
------------------------------------------------------------------------
------------------------------------------------------------------------
module Gargantext.Database.Node.Document.Import where
import Control.Lens (set)
import Control.Monad ((>>=))
import Data.Aeson (toJSON, Value)
import Data.ByteString.Internal (ByteString)
import Data.Maybe (maybe)
import Data.Typeable (Typeable)
import Database.PostgreSQL.Simple (Connection, FromRow, Query, formatQuery, query, Only(..))
import Database.PostgreSQL.Simple.FromRow (fromRow, field)
import Database.PostgreSQL.Simple.SqlQQ
import Database.PostgreSQL.Simple.ToField (toField)
import Database.PostgreSQL.Simple.ToRow (ToRow(..))
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Data.Text (Text)
import qualified Data.Text as DT (pack, unpack, concat)
import qualified Data.Digest.Pure.SHA as SHA (sha256, showDigest)
import qualified Data.ByteString.Lazy.Char8 as DC (pack)
import Gargantext (connectGargandb)
import Gargantext.Core.Types.Main (nodeTypeId)
import Gargantext.Core.Types.Node
-- FIXME : the import of Document constructor below does not work
-- import Gargantext.Core.Types.Node (Document)
--import Gargantext.Core.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
-- , hyperdataDocument_uniqId
-- , hyperdataDocument_title
-- , hyperdataDocument_abstract
-- , hyperdataDocument_source
-- , Node(..), node_typename
-- , node_userId
-- , node_parentId, node_name, node_hyperdata, hyperdataDocuments
-- , NodeTypeId
-- )
import Gargantext.Prelude
import GHC.Generics (Generic)
---------------------------------------------------------------------------
-- * Main Insert functions
-- | Main function to insert documents.
-- UserId : the user who is inserting the documents
-- ParentId : the id of the folder that is the parent of the inserted documents
insertDocuments :: Connection -> UserId -> ParentId -> [HyperdataDocument] -> IO [ReturnId]
insertDocuments conn uId pId hs = query conn queryInsert (Only $ Values fields inputData)
where
fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
inputData = prepare uId pId hs
-- | Debug variant of the insert function:
-- uses @formatQuery@ to print the rendered query (for debugging purposes).
insertDocuments_Debug :: Connection -> UserId -> ParentId -> [HyperdataDocument] -> IO ByteString
insertDocuments_Debug conn uId pId hs = formatQuery conn queryInsert (Only $ Values fields inputData)
where
fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
inputData = prepare uId pId hs
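-- A minimal usage sketch (not part of the original commit; the user id,
-- parent id and document fields below are placeholder values): documents
-- are built from (title, source) pairs with toHyperdataDocuments and
-- inserted under the given folder.
insertDocumentsExample :: Connection -> IO [ReturnId]
insertDocumentsExample conn =
  insertDocuments conn 1 1 $ toHyperdataDocuments [ (DT.pack "A title", DT.pack "A source") ]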
-- | Input table: SQL types of the input columns
inputSqlTypes :: [Text]
inputSqlTypes = map DT.pack ["int4","int4","int4","text","jsonb"]
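-- These types correspond, in order, to the (typename, user_id, parent_id,
-- name, hyperdata) columns of the input_rows relation in queryInsert.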
-- | SQL query to insert documents inside the database
queryInsert :: Query
queryInsert = [sql|
WITH input_rows(typename,user_id,parent_id,name,hyperdata) AS (?)
, ins AS (
INSERT INTO nodes (typename,user_id,parent_id,name,hyperdata)
SELECT * FROM input_rows
ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
RETURNING id,hyperdata
)
SELECT true AS source -- true for 'newly inserted'
, id
, hyperdata ->> 'uniqId' as doi
FROM ins
UNION ALL
SELECT false AS source -- false for 'not inserted'
, c.id
, hyperdata ->> 'uniqId' as doi
FROM input_rows
JOIN nodes c USING (hyperdata); -- columns of unique index
|]
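-- Note on the query above: ON CONFLICT ... DO NOTHING relies on the unique
-- index (typename, parent_id, (hyperdata ->> 'uniqId')) described in the
-- module header. Newly inserted rows are reported by the first branch of
-- the UNION ALL; rows skipped because they already existed are recovered
-- by the second branch, which joins input_rows back to nodes on the full
-- hyperdata payload to retrieve their existing ids.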
prepare :: UserId -> ParentId -> [HyperdataDocument] -> [InputData]
prepare uId pId = map (\h -> InputData tId uId pId (DT.pack "Doc") (toJSON $ unicize h))
where
tId = nodeTypeId Document
------------------------------------------------------------------------
-- * Main Types used
-- ** Return Types
-- | Return type after document insertion:
-- one ReturnId per submitted document.
data ReturnId = ReturnId { reInserted :: Bool -- ^ True if the document was newly inserted, False if it already existed
, reId :: Int -- ^ id of the document in the database, returned whether the
-- document is new or not (this is the node's primary key)
, reUniqId :: Maybe Text -- ^ hash id built from the concatenation of the hash parameters
} deriving (Show, Generic)
instance FromRow ReturnId where
fromRow = ReturnId <$> field <*> field <*> field
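-- The three fields are read, in order, from the (source, id, doi) columns
-- produced by queryInsert, where the doi column actually carries the
-- document's uniqId hash.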
-- ** Insert Types
type UserId = Int
type ParentId = Int
data InputData = InputData { inTypenameId :: NodeTypeId
, inUserId :: UserId
, inParentId :: ParentId
, inName :: Text
, inHyper :: Value
} deriving (Show, Generic, Typeable)
instance ToRow InputData where
toRow inputData = [ toField (inTypenameId inputData)
, toField (inUserId inputData)
, toField (inParentId inputData)
, toField (inName inputData)
, toField (inHyper inputData)
]
---------------------------------------------------------------------------
-- * Definition of document uniqueness
hashParameters :: [(HyperdataDocument -> Text)]
hashParameters = [ \d -> maybe' (_hyperdataDocument_title d)
, \d -> maybe' (_hyperdataDocument_abstract d)
, \d -> maybe' (_hyperdataDocument_source d)
, \d -> maybe' (_hyperdataDocument_publication_date d)
]
where
maybe' = maybe (DT.pack "") identity
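-- For illustration: a document whose title is "T", abstract "A",
-- source "S" and publication date "D" is hashed over the concatenated
-- string "TASD"; a missing (Nothing) field contributes the empty string.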
unicize :: HyperdataDocument -> HyperdataDocument
unicize = unicize' hashParameters
where
unicize' :: [(HyperdataDocument -> Text)] -> HyperdataDocument -> HyperdataDocument
unicize' fields doc = set hyperdataDocument_uniqId (Just hash) doc
where
hash = uniqId $ DT.concat $ map (\f -> f doc) fields
uniqId :: Text -> Text
uniqId txt = (sha256 txt)
where
sha256 :: Text -> Text
sha256 = DT.pack . SHA.showDigest . SHA.sha256 . DC.pack . DT.unpack
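-- A sanity-check sketch in doctest style (assuming a doctest runner); the
-- digest below is the standard SHA-256 test vector for the string "abc".
-- >>> uniqId (DT.pack "abc")
-- "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"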
---------------------------------------------------------------------------
-- * Tests
--insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
insertTest :: IO [ReturnId]
insertTest = connectGargandb "gargantext.ini"
>>= \conn -> insertDocuments conn 1 452162 hyperdataDocuments
......@@ -53,14 +53,14 @@ data Doc = Doc
-- | Doc 2 HyperdataDocument
doc2hyperdataDocument :: Doc -> HyperdataDocument
--doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
HyperdataDocument (Just "CSV")
(Just did)
Nothing
Nothing
Nothing
(Just dt)
(Just dau)
(Just ds)
(Just dab)
(Nothing)
Nothing
......