Insert.hs 9.99 KB
Newer Older
1
{-|
2
Module      : Gargantext.Database.Node.Document.Insert
3 4 5 6 7 8 9
Description : Importing context of texts (documents)
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

10 11 12
* Purpose of this module

Enabling "common goods" of text data and respecting privacy.
13 14 15 16 17 18 19 20 21 22 23 24

Gargantext shares as "common good" the links between context of texts
and terms / words / ngrams.

Basically a context of text can be defined as a document (see 'Gargantext.Text').

Issue to tackle in that module: each global document of Gargantext has
to be unique, then shared, but how to respect privacy if needed ?


* Methodology to get uniqueness and privacy by design

25 26
As a consequence, when importing/inserting a new document in Gargantext,
a policy for the uniqueness of the inserted docuemnts has to be defined.
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43

That is the purpose of this module which defines its main concepts.

Unique identifier in database is of a 3-tuple of 3 policies that
together define uniqueness:

- Design policy: type of node is needed as TypenameId, that is a
Document or Individual or something else;

- Privacy policy: with ParentId, parent becomes unique, then it enables
users to get their own copy without sharing it with all the users of the
database (in others words parent_id is necessary to preserve privacy for
instance).

- Hash policy: this UniqId is a sha256 uniq id which is the result of
the concatenation of the parameters defined by @hashParameters@.

44 45 46
> -- * Example
> insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
> insertTest :: IO [ReturnId]
47
> insertTest = runCmdDev $ insertDocuments 1 452162 hyperdataDocuments
48

49
-}
50
------------------------------------------------------------------------
51
{-# LANGUAGE DeriveGeneric        #-}
52
{-# LANGUAGE FlexibleInstances    #-}
53
{-# LANGUAGE NoImplicitPrelude    #-}
54
{-# LANGUAGE OverloadedStrings    #-}
55
{-# LANGUAGE QuasiQuotes          #-}
56
{-# LANGUAGE RankNTypes           #-}
57 58
{-# LANGUAGE TypeSynonymInstances #-}
------------------------------------------------------------------------
59
module Gargantext.Database.Node.Document.Insert where
60

61 62 63
import Control.Lens (set, view)
import Control.Lens.Prism
import Control.Lens.Cons
64
import Data.Aeson (toJSON)
65
import Data.Maybe (maybe)
66
import Data.Time.Segment (jour)
67
import Data.Text (Text)
68
import Database.PostgreSQL.Simple (FromRow, Query, Only(..))
69 70
import Database.PostgreSQL.Simple.FromRow (fromRow, field)
import Database.PostgreSQL.Simple.SqlQQ
71
import Database.PostgreSQL.Simple.ToField (toField, Action)
72
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
73
import GHC.Generics (Generic)
74
import Gargantext.Database.Config (nodeTypeId)
75
import Gargantext.Database.Utils (Cmd, runPGSQuery)
76
import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
77
import Gargantext.Database.Types.Node
78 79 80 81
import Gargantext.Prelude
import qualified Data.ByteString.Lazy.Char8  as DC (pack)
import qualified Data.Digest.Pure.SHA        as SHA (sha256, showDigest)
import qualified Data.Text                   as DT (pack, unpack, concat, take)
82
import Gargantext.Prelude.Utils (hash)
83
-- TODO : the import of Document constructor below does not work
84 85
-- import Gargantext.Database.Types.Node (Document)
--import Gargantext.Database.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
86 87 88 89 90 91 92 93 94
--                                  , hyperdataDocument_uniqId
--                                  , hyperdataDocument_title
--                                  , hyperdataDocument_abstract
--                                  , hyperdataDocument_source
--                                  , Node(..), node_typename
--                                            , node_userId
--                                            , node_parentId, node_name, node_hyperdata, hyperdataDocuments
--                                  , NodeTypeId
--                                  )
95 96 97 98
{-| To Print result query
import Data.ByteString.Internal (ByteString)
import Database.PostgreSQL.Simple (formatQuery)
-}
99 100 101 102 103 104 105

---------------------------------------------------------------------------
-- * Main Insert functions

-- | Insert Document main function
-- UserId : user who is inserting the documents
-- ParentId : folder ID which is parent of the inserted documents
106 107 108 109 110 111
-- Administrator of the database has to create a uniq index as following SQL command:
-- `create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`
insertDb :: InsertDb a => UserId -> ParentId -> [a] -> Cmd err [ReturnId]
insertDb u p = runPGSQuery queryInsert . Only . Values fields . map (insertDb' u p)
      where
        fields    = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
112

113 114 115
class InsertDb a
  where
    insertDb' :: UserId -> ParentId -> a -> [Action]
116 117


118
instance InsertDb HyperdataDocument
119
  where
120 121 122 123
    insertDb' u p h = [ toField $ nodeTypeId NodeDocument
                      , toField u
                      , toField p
                      , toField $ maybe "No Title" (DT.take 255)  (_hyperdataDocument_title h)
124
                      , toField $ _hyperdataDocument_publication_date h -- TODO USE UTCTime
125 126 127 128 129 130 131 132 133
                      , (toField . toJSON) h
                      ]

instance InsertDb HyperdataContact
  where
    insertDb' u p h = [ toField $ nodeTypeId NodeContact
                      , toField u
                      , toField p
                      , toField $ maybe "Contact" (DT.take 255) (Just "Name") -- (_hc_name h)
134
                      , toField $ jour 2010 1 1 -- TODO put default date
135 136 137
                      , (toField . toJSON) h
                      ]

138 139 140 141

-- | Debug SQL function
--
-- to print rendered query (Debug purpose) use @formatQuery@ function.
142 143
{-
insertDocuments_Debug :: (Hyperdata a, ToJSON a, ToRow a) => UserId -> ParentId -> [a] -> Cmd ByteString
144
insertDocuments_Debug uId pId hs = formatPGSQuery queryInsert (Only $ Values fields inputData)
145 146 147
  where
    fields    = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
    inputData = prepare uId pId hs
148
-}
149 150 151 152


-- | Input Tables: types of the tables
inputSqlTypes :: [Text]
153
inputSqlTypes = map DT.pack ["int4","int4","int4","text","date","jsonb"]
154 155 156 157

-- | SQL query to insert documents inside the database
queryInsert :: Query
queryInsert = [sql|
158
    WITH input_rows(typename,user_id,parent_id,name,date,hyperdata) AS (?)
159
    , ins AS (
160
       INSERT INTO nodes (typename,user_id,parent_id,name,date,hyperdata)
161
       SELECT * FROM input_rows
162 163
       ON CONFLICT ((hyperdata ->> 'uniqIdBdd')) DO NOTHING -- on unique index
       -- ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
       RETURNING id,hyperdata
       )

    SELECT true AS source                     -- true for 'newly inserted'
         , id
         , hyperdata ->> 'uniqId'  as doi
    FROM   ins
    UNION  ALL
    SELECT false AS source                    -- false for 'not inserted'
         , c.id
         , hyperdata ->> 'uniqId' as doi
    FROM   input_rows
    JOIN   nodes c USING (hyperdata);         -- columns of unique index
           |]

------------------------------------------------------------------------
-- * Main Types used

-- ** Return Types

-- | When documents are inserted
-- ReturnType after insertion
186 187
data ReturnId = ReturnId { reInserted :: Bool -- if the document is inserted (True: is new, False: is not new)
                         , reId       :: NodeId  -- always return the id of the document (even new or not new)
188
                                         --   this is the uniq id in the database
189
                         , reUniqId   :: Text -- Hash Id with concatenation of hash parameters
190 191 192 193 194 195 196 197
                         } deriving (Show, Generic)

instance FromRow ReturnId where
  fromRow = ReturnId <$> field <*> field <*> field

---------------------------------------------------------------------------
-- * Uniqueness of document definition

198
class AddUniqId a
199
  where
200
    addUniqId :: a -> a
201

202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
instance AddUniqId HyperdataDocument
  where
    addUniqId = addUniqIdsDoc
      where
        addUniqIdsDoc :: HyperdataDocument -> HyperdataDocument
        addUniqIdsDoc doc = set hyperdataDocument_uniqIdBdd (Just hashBdd)
                          $ set hyperdataDocument_uniqId    (Just hashUni) doc
          where
            hashUni = hash $ DT.concat $ map ($ doc) hashParametersDoc
            hashBdd = hash $ DT.concat $ map ($ doc) ([(\d -> maybeText (_hyperdataDocument_bdd d))] <> hashParametersDoc)

        hashParametersDoc :: [(HyperdataDocument -> Text)]
        hashParametersDoc = [ \d -> maybeText (_hyperdataDocument_title    d)
                            , \d -> maybeText (_hyperdataDocument_abstract d)
                            , \d -> maybeText (_hyperdataDocument_source   d)
                            , \d -> maybeText (_hyperdataDocument_publication_date   d)
                            ]

    ---------------------------------------------------------------------------
221 222
-- * Uniqueness of document definition
-- TODO factorize with above (use the function below for tests)
223 224 225 226 227

instance AddUniqId HyperdataContact
  where
    addUniqId = addUniqIdsContact

228
addUniqIdsContact :: HyperdataContact -> HyperdataContact
229
addUniqIdsContact hc = set (hc_uniqIdBdd) (Just hashBdd)
230
                     $ set (hc_uniqId   ) (Just hashUni) hc
231
  where
232 233
    hashUni = uniqId $ DT.concat $ map ($ hc) hashParametersContact
    hashBdd = uniqId $ DT.concat $ map ($ hc) ([\d -> maybeText (view hc_bdd d)] <> hashParametersContact)
234 235 236 237

    uniqId :: Text -> Text
    uniqId = DT.pack . SHA.showDigest . SHA.sha256 . DC.pack . DT.unpack

238 239 240 241 242 243
    -- | TODO add more hashparameters
    hashParametersContact :: [(HyperdataContact -> Text)]
    hashParametersContact = [ \d -> maybeText $ view (hc_who . _Just . cw_firstName) d
                            , \d -> maybeText $ view (hc_who . _Just . cw_lastName ) d
                            , \d -> maybeText $ view (hc_where . _head . cw_touch . _Just . ct_mail) d
                            ]
244 245


246

247 248
maybeText :: Maybe Text -> Text
maybeText = maybe (DT.pack "") identity
249

250
---------------------------------------------------------------------------