1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
{-|
Module : Gargantext.API.Node.Corpus.New
Description : New corpus API
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
New corpus means either:
- new corpus
- new data in existing corpus
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.API.Node.Corpus.New
where
import Conduit
import Control.Lens hiding (elements, Empty)
import Control.Monad
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import qualified Data.ByteString.Base64 as BSB64
import Data.Conduit.Internal (zipSources)
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
import Servant
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
import qualified Data.Text.Encoding as TE
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Node.Corpus.New.Types
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith, updateCorpusPubmedAPIKey)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
import Gargantext.Prelude.Config (gc_max_docs_parsers)
import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
import qualified Gargantext.Database.GargDB as GargDB
------------------------------------------------------------------------
{-
data Query = Query { query_query :: Text
, query_node_id :: Int
, query_lang :: Lang
, query_databases :: [DataOrigin]
}
deriving (Eq, Generic)
deriveJSON (unPrefix "query_") 'Query
instance Arbitrary Query where
arbitrary = elements [ Query q n la fs
| q <- ["honeybee* AND collapse"
,"covid 19"
]
, n <- [0..10]
, la <- allLangs
, fs <- take 3 $ repeat allDataOrigins
]
instance ToSchema Query where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
-}
------------------------------------------------------------------------
{-
type Api = PostApi
:<|> GetApi
type PostApi = Summary "New Corpus endpoint"
:> ReqBody '[JSON] Query
:> Post '[JSON] CorpusId
type GetApi = Get '[JSON] ApiInfo
-}
-- | TODO manage several apis
-- TODO-ACCESS
-- TODO this is only the POST
{-
api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
api uid (Query q _ as) = do
cId <- case head as of
Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
Just a -> do
docs <- liftBase $ API.get a q (Just 1000)
cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
pure cId'
pure cId
-}
------------------------------------------------
-- TODO use this route for Client implementation
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
deriving (Generic)
instance Arbitrary ApiInfo where
arbitrary = ApiInfo <$> arbitrary
deriveJSON (unPrefix "") 'ApiInfo
instance ToSchema ApiInfo
info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = do
ext <- API.externalAPIs
pure $ ApiInfo ext
------------------------------------------------------------------------
------------------------------------------------------------------------
data WithQuery = WithQuery
{ _wq_query :: !Text
, _wq_databases :: !Database
, _wq_datafield :: !(Maybe Datafield)
, _wq_lang :: !Lang
, _wq_node_id :: !Int
, _wq_flowListWith :: !FlowSocialListWith
}
deriving Generic
makeLenses ''WithQuery
instance FromJSON WithQuery where
parseJSON = genericParseJSON $ jsonOptions "_wq_"
instance ToJSON WithQuery where
toJSON = genericToJSON $ jsonOptions "_wq_"
instance ToSchema WithQuery where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
------------------------------------------------------------------------
type AddWithQuery = Summary "Add with Query to corpus endpoint"
:> "corpus"
:> Capture "corpus_id" CorpusId
:> "query"
:> AsyncJobs JobLog '[JSON] WithQuery JobLog
{-
type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
:> "corpus"
:> Capture "corpus_id" CorpusId
:> "add"
:> "file"
:> MultipartForm Mem (MultipartData Mem)
:> QueryParam "fileType" FileType
:> "async"
:> AsyncJobs JobLog '[JSON] () JobLog
-}
------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id
addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
=> User
-> CorpusId
-> WithQuery
-> Maybe Integer
-> JobHandle m
-> m ()
addToCorpusWithQuery user cid (WithQuery { _wq_query = q
, _wq_databases = dbs
, _wq_datafield = datafield
, _wq_lang = l
, _wq_flowListWith = flw }) maybeLimit jobHandle = do
-- TODO ...
-- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
-- printDebug "[addToCorpusWithQuery] datafield" datafield
-- printDebug "[addToCorpusWithQuery] flowListWith" flw
case datafield of
Just Web -> do
-- printDebug "[addToCorpusWithQuery] processing web request" datafield
markStarted 1 jobHandle
_ <- triggerSearxSearch user cid q l jobHandle
markComplete jobHandle
_ -> do
case datafield of
Just (External (PubMed { _api_key })) -> do
printDebug "[addToCorpusWithQuery] pubmed api key" _api_key
_ <- updateCorpusPubmedAPIKey cid _api_key
pure ()
_ -> pure ()
markStarted 3 jobHandle
-- TODO add cid
-- TODO if cid is folder -> create Corpus
-- if cid is corpus -> add to corpus
-- if cid is root -> create corpus in Private
-- printDebug "[G.A.N.C.New] getDataText with query" q
db <- database2origin dbs
eTxt <- getDataText db (Multi l) q maybeLimit
-- printDebug "[G.A.N.C.New] lTxts" lTxts
case eTxt of
Right txt -> do
-- TODO Sum lenghts of each txt elements
markProgress 1 jobHandle
void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
-- printDebug "corpus id" cids
-- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user
-- TODO ...
markComplete jobHandle
Left err -> do
-- printDebug "Error: " err
markFailed (Just $ T.pack (show err)) jobHandle
type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
:> "corpus"
:> Capture "corpus_id" CorpusId
:> "add"
:> "form"
:> "async"
:> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
=> User
-> CorpusId
-> NewWithForm
-> JobHandle m
-> m ()
addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
-- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
-- printDebug "[addToCorpusWithForm] fileType" ft
-- printDebug "[addToCorpusWithForm] fileFormat" ff
limit' <- view $ hasConfig . gc_max_docs_parsers
let limit = fromIntegral limit' :: Integer
let
parseC = case ft of
CSV_HAL -> Parser.parseFormatC Parser.CsvHal
CSV -> Parser.parseFormatC Parser.CsvGargV3
WOS -> Parser.parseFormatC Parser.WOS
PresseRIS -> Parser.parseFormatC Parser.RisPresse
Iramuteq -> Parser.parseFormatC Parser.Iramuteq
JSON -> Parser.parseFormatC Parser.JSON
-- TODO granularity of the logStatus
let data' = case ff of
Plain -> cs d
ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
Right decoded -> decoded
eDocsC <- liftBase $ parseC ff data'
case eDocsC of
Right (mCount, docsC) -> do
-- TODO Add progress (jobStatus) update for docs - this is a
-- long action
let docsC' = zipSources (yieldMany [1..]) docsC
.| mapMC (\(idx, doc) ->
if idx > limit then do
--printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
let panicMsg' = [ "[addToCorpusWithForm] number of docs "
, "exceeds the MAX_DOCS_PARSERS limit ("
, show limit
, ")" ]
let panicMsg = T.concat $ T.pack <$> panicMsg'
--logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
panic panicMsg
else
pure doc)
.| mapC toHyperdataDocument
--printDebug "Parsing corpus finished : " cid
--logStatus jobLog2
--printDebug "Starting extraction : " cid
-- TODO granularity of the logStatus
-- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
_cid' <- flowCorpus user
(Right [cid])
(Multi $ fromMaybe EN l)
(Just sel)
--(Just $ fromIntegral $ length docs, docsC')
(mCount, transPipe liftBase docsC') -- TODO fix number of docs
--(map (map toHyperdataDocument) docs)
jobHandle
-- printDebug "Extraction finished : " cid
-- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
-- TODO uncomment this
--sendMail user
markComplete jobHandle
Left e -> do
printDebug "[addToCorpusWithForm] parse error" e
markFailed (Just $ T.pack e) jobHandle
{-
addToCorpusWithFile :: FlowCmdM env err m
=> CorpusId
-> MultipartData Mem
-> Maybe FileType
-> (JobLog -> m ())
-> m JobLog
addToCorpusWithFile cid input filetype logStatus = do
logStatus JobLog { _scst_succeeded = Just 10
, _scst_failed = Just 2
, _scst_remaining = Just 138
, _scst_events = Just []
}
printDebug "addToCorpusWithFile" cid
_h <- postUpload cid filetype input
pure JobLog { _scst_succeeded = Just 137
, _scst_failed = Just 13
, _scst_remaining = Just 0
, _scst_events = Just []
}
-}
type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
:> "corpus"
:> Capture "corpus_id" CorpusId
:> "add"
:> "file"
:> "async"
:> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
=> User
-> CorpusId
-> NewWithFile
-> JobHandle m
-> m ()
addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
markStarted 1 jobHandle
fPath <- GargDB.writeFile nwf
printDebug "[addToCorpusWithFile] File saved as: " fPath
uId <- getUserId user
nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
_ <- case nIds of
[nId] -> do
node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
let hl = node ^. node_hyperdata
_ <- updateHyperdata nId $ hl { _hff_name = fName
, _hff_path = T.pack fPath }
printDebug "[addToCorpusWithFile] Created node with id: " nId
_ -> pure ()
printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user
markComplete jobHandle