1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
{-|
Module : Gargantext.API.Ngrams.List
Description : Get Ngrams (lists)
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ViewPatterns #-}
module Gargantext.API.Ngrams.List
where
import Data.ByteString.Lazy qualified as BSL
import Data.Csv qualified as Tsv
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.Map.Strict qualified as Map
import Data.Map.Strict (toList)
import Data.Set qualified as Set
import Data.Text (concat, pack, splitOn)
import Data.Vector qualified as Vec
import Data.Vector (Vector)
import Gargantext.API.Admin.EnvTypes (Env)
import Gargantext.API.Errors.Types (BackendInternalError(InternalServerError))
import Gargantext.API.Ngrams.List.Types (_wjf_data, _wtf_data)
import Gargantext.API.Ngrams.Prelude (getNgramsList)
import Gargantext.API.Ngrams (setListNgrams)
import Gargantext.API.Ngrams.Types
import Gargantext.API.Prelude (GargM, serverError, HasServerError)
import Gargantext.API.Routes.Named.List qualified as Named
import Gargantext.API.Worker (serveWorkerAPI, serveWorkerAPIEJob)
import Gargantext.Core.NodeStory.Types ( HasNodeStory )
import Gargantext.Core.Text.Ngrams (Ngrams, NgramsType(NgramsTerms))
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Core.Worker.Jobs.Types qualified as Jobs
import Gargantext.Database.Action.Flow (reIndexWith)
import Gargantext.Database.Admin.Types.Node ( NodeId(_NodeId), ListId )
import Gargantext.Database.Query.Table.Node (getNode)
import Gargantext.Database.Schema.Ngrams ( text2ngrams, NgramsId )
import Gargantext.Database.Schema.Node (_node_parent_id)
import Gargantext.Database.Types (Indexed(..))
import Gargantext.Prelude hiding (concat, toList)
import Gargantext.System.Logging (logLocM, MonadLogger)
import Gargantext.Utils.Jobs.Monad (MonadJobStatus(..))
import Prelude qualified
import Protolude qualified as P
import Servant
import Servant.Server.Generic (AsServerT)
import Gargantext.System.Logging (LogLevel(..))
getAPI :: Named.GETAPI (AsServerT (GargM Env BackendInternalError))
getAPI = Named.GETAPI $ \listId -> Named.ListEndpoints
{ listJSONEp = getJson listId
, listJSONZipEp = getJsonZip listId
, listTSVEp = getTsv listId
}
--
-- JSON API
--
jsonAPI :: Named.JSONAPI (AsServerT (GargM Env BackendInternalError))
jsonAPI = jsonPostAsync
------------------------------------------------------------------------
getJson :: HasNodeStory env err m
=> ListId
-> m (Headers '[Header "Content-Disposition" Text] NgramsList)
getJson lId = do
lst <- getNgramsList lId
pure $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
, pack $ show (_NodeId lId)
, ".json"
]
) lst
getJsonZip :: HasNodeStory env err m
=> ListId
-> m (Headers '[Header "Content-Disposition" Text] NgramsListZIP)
getJsonZip lId = do
lst <- getNgramsList lId
let nlz = NgramsListZIP { _nlz_nl = lst, _nlz_list_id = lId}
pure $ addHeader (concat [ "attachment; filename="
, nlzFileName nlz
, ".zip"
]
) nlz
getTsv :: HasNodeStory env err m
=> ListId
-> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
getTsv lId = do
lst <- getNgramsList lId
pure $ case Map.lookup NgramsTerms lst of
Nothing -> noHeader Map.empty
Just (Versioned { _v_data }) ->
addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
, pack $ show (_NodeId lId)
, ".tsv"
]
) _v_data
------------------------------------------------------------------------
jsonPostAsync :: Named.JSONAPI (AsServerT (GargM Env BackendInternalError))
jsonPostAsync = Named.JSONAPI {
updateListJSONEp = \lId -> serveWorkerAPI $ \p ->
Jobs.JSONPost { _jp_list_id = lId
, _jp_ngrams_list = _wjf_data p }
}
------------------------------------------------------------------------
postAsyncJSON :: (HasNodeStory env err m, MonadJobStatus m, MonadLogger m)
=> ListId
-> NgramsList
-> JobHandle m
-> m ()
postAsyncJSON l ngramsList jobHandle = do
markStarted 2 jobHandle
$(logLocM) DEBUG "[postAsyncJSON] Setting the Ngrams list ..."
setList
$(logLocM) DEBUG "[postAsyncJSON] Done."
markProgress 1 jobHandle
corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
let corpus_id = fromMaybe (panicTrace "no parent_id") (_node_parent_id corpus_node)
$(logLocM) DEBUG "[postAsyncJSON] Executing re-indexing..."
_ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])
$(logLocM) DEBUG "[postAsyncJSON] Re-indexing done."
markComplete jobHandle
where
setList :: HasNodeStory env err m => m ()
setList = do
-- TODO check with Version for optim
mapM_ (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList ngramsList
-- TODO reindex
--
-- TSV API
--
tsvAPI :: Named.TSVAPI (AsServerT (GargM Env BackendInternalError))
tsvAPI = tsvPostAsync
------------------------------------------------------------------------
tsvPostAsync :: Named.TSVAPI (AsServerT (GargM Env BackendInternalError))
tsvPostAsync =
Named.TSVAPI {
updateListTSVEp = \lId -> serveWorkerAPIEJob $ \p ->
case ngramsListFromTSVData (_wtf_data p) of
Left err -> Left $ InternalServerError $ err500 { errReasonPhrase = err }
Right ngramsList -> Right $ Jobs.JSONPost { _jp_list_id = lId
, _jp_ngrams_list = ngramsList }
}
-- | Tries converting a text file into an 'NgramList', so that we can reuse the
-- existing JSON endpoint for the TSV upload.
ngramsListFromTSVData :: Text -> Either Prelude.String NgramsList
ngramsListFromTSVData tsvData = case decodeTsv of
-- /NOTE/ The legacy TSV data only supports terms in imports and exports, so this is
-- all we care about.
Left err -> Left $ "Invalid TSV found in ngramsListFromTSVData: " <> err
Right terms -> pure $ Map.fromList [ (NgramsTerms, Versioned 0 $ mconcat . Vec.toList $ terms) ]
where
binaryData = BSL.fromStrict $ P.encodeUtf8 tsvData
decodeTsv :: Either Prelude.String (Vector NgramsTableMap)
decodeTsv = Tsv.decodeWithP tsvToNgramsTableMap
(Tsv.defaultDecodeOptions { Tsv.decDelimiter = fromIntegral (P.ord '\t') })
Tsv.HasHeader
binaryData
-- | Converts a plain TSV 'Record' into an NgramsTableMap
tsvToNgramsTableMap :: Tsv.Record -> Tsv.Parser NgramsTableMap
tsvToNgramsTableMap record = case Vec.toList record of
(map P.decodeUtf8 -> [status, label, forms])
-> pure $ conv status label forms
_ -> Prelude.fail "tsvToNgramsTableMap failed"
where
conv :: Text -> Text -> Text -> NgramsTableMap
conv status label forms = Map.singleton (NgramsTerm label)
$ NgramsRepoElement { _nre_size = 1
, _nre_list = case status == "map" of
True -> MapTerm
False -> case status == "main" of
True -> CandidateTerm
False -> StopTerm
, _nre_root = Nothing
, _nre_parent = Nothing
, _nre_children = MSet
$ Map.fromList
$ map (\form -> (NgramsTerm form, ()))
$ filter (\w -> w /= "" && w /= label)
$ splitOn "|&|" forms
}
------------------------------------------------------------------------
-- | This is for debugging the TSV parser in the REPL
importTsvFile :: forall env err m. (HasNodeStory env err m, HasServerError err, MonadJobStatus m, MonadLogger m)
=> ListId -> P.FilePath -> m ()
importTsvFile lId fp = do
contents <- liftBase $ P.readFile fp
case ngramsListFromTSVData contents of
Left err -> serverError $ err500 { errReasonPhrase = err }
Right ngramsList -> postAsyncJSON lId ngramsList (noJobHandle @m Proxy)
--
-- Utils
--
------------------------------------------------------------------------
toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
toIndexedNgrams m t = Indexed <$> i <*> n
where
i = HashMap.lookup t m
n = Just (text2ngrams t)