List.hs 10.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
{-|
Module      : Gargantext.API.Ngrams.List
Description : Get Ngrams (lists)
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeOperators     #-}

module Gargantext.API.Ngrams.List
  where

18
import Control.Lens hiding (elements, Indexed)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
19
import Data.Either (Either(..))
20
import Data.HashMap.Strict (HashMap)
21
import Data.Map.Strict (Map, toList)
22
import Data.Maybe (catMaybes, fromMaybe)
23
import Data.Set (Set)
24
import Data.Text (Text, concat, pack, splitOn)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
25
import Data.Vector (Vector)
26
import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
27
import Gargantext.API.Admin.Orchestrator.Types
28
import Gargantext.API.Ngrams (setListNgrams)
29 30
import Gargantext.API.Ngrams.List.Types
import Gargantext.API.Ngrams.Prelude (getNgramsList)
31
import Gargantext.API.Ngrams.Tools (getTermsWith)
32
import Gargantext.API.Ngrams.Types
33
import Gargantext.API.Prelude (GargServer, GargM, GargError)
34
import Gargantext.API.Types
35
import Gargantext.Core.NodeStory
36
import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
37 38
import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
import Gargantext.Core.Types.Main (ListType(..))
39
import Gargantext.Database.Action.Flow (saveDocNgramsWith)
40
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
41
import Gargantext.Database.Admin.Types.Hyperdata.Document
42
import Gargantext.Database.Admin.Types.Node
43
import Gargantext.Database.Query.Table.Node (getNode)
44
import Gargantext.Database.Query.Table.NodeContext (selectDocNodes)
45
import Gargantext.Database.Schema.Context
46
import Gargantext.Database.Schema.Ngrams
47
import Gargantext.Database.Schema.Node (_node_parent_id)
48
import Gargantext.Database.Types (Indexed(..))
49
import Gargantext.Prelude
Alfredo Di Napoli's avatar
Alfredo Di Napoli committed
50
import Gargantext.Utils.Jobs (serveJobsAPI, MonadJobStatus(..))
51
import Servant
52
-- import Servant.Job.Async
Alexandre Delanoë's avatar
Alexandre Delanoë committed
53 54
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Csv as Csv
55 56
import qualified Data.HashMap.Strict as HashMap
import qualified Data.List           as List
57
import qualified Data.Map.Strict     as Map
58
import qualified Data.Set            as Set
59
import qualified Data.Text           as Text
Alexandre Delanoë's avatar
Alexandre Delanoë committed
60
import qualified Data.Vector         as Vec
61
import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
62
import qualified Gargantext.Utils.Servant as GUS
63
import qualified Prelude
Alexandre Delanoë's avatar
Alexandre Delanoë committed
64
import qualified Protolude           as P
65
------------------------------------------------------------------------
66 67 68
type GETAPI = Summary "Get List"
            :> "lists"
              :> Capture "listId" ListId
69 70 71 72 73 74
              :> "json"
              :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
            :<|> "lists"
              :> Capture "listId" ListId
              :> "csv"
              :> Get '[GUS.CSV] (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
75
getApi :: GargServer GETAPI
76
getApi = getJson :<|> getCsv
77 78 79 80 81 82 83 84

----------------------
type JSONAPI = Summary "Update List"
          :> "lists"
            :> Capture "listId" ListId
          :> "add"
          :> "form"
          :> "async"
85
            :> AsyncJobs JobLog '[FormUrlEncoded] WithJsonFile JobLog
86

87
jsonApi :: ServerT JSONAPI (GargM Env GargError)
88
jsonApi = jsonPostAsync
89 90 91 92 93 94 95 96 97

----------------------
type CSVAPI = Summary "Update List (legacy v3 CSV)"
          :> "lists"
            :> Capture "listId" ListId
          :> "csv"
          :> "add"
          :> "form"
          :> "async"
Alexandre Delanoë's avatar
Alexandre Delanoë committed
98
            :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
99

100
csvApi :: ServerT CSVAPI (GargM Env GargError)
101 102
csvApi = csvPostAsync

103
------------------------------------------------------------------------
104 105 106
getJson :: HasNodeStory env err m =>
       ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
getJson lId = do
107
  lst <- getNgramsList lId
108
  let (NodeId id') = lId
109
  return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
110
                             , pack $ show id'
111 112 113
                             , ".json"
                             ]
                     ) lst
114 115 116 117

getCsv :: HasNodeStory env err m =>
       ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsTableMap)
getCsv lId = do
118 119
  lst <- getNgramsList lId
  let (NodeId id') = lId
120 121 122 123 124 125 126 127
  return $ case Map.lookup TableNgrams.NgramsTerms lst of
    Nothing -> noHeader Map.empty
    Just (Versioned { _v_data }) ->
      addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
                        , pack $ show id'
                        , ".csv"
                        ]
                ) _v_data
128 129

------------------------------------------------------------------------
130
-- TODO : purge list
131
-- TODO talk
132
setList :: FlowCmdM env err m
133 134 135
    => ListId
    -> NgramsList
    -> m Bool
136
setList l m  = do
137
  -- TODO check with Version for optim
138
  -- printDebug "New list as file" l
139
  _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
140
  -- TODO reindex
141 142
  pure True

143
------------------------------------------------------------------------
Alexandre Delanoë's avatar
Alexandre Delanoë committed
144
-- | Re-index documents of a corpus with new ngrams (called orphans here)
145 146
reIndexWith :: ( HasNodeStory env err m
               , FlowCmdM     env err m
147 148
               )
            => CorpusId
Alexandre Delanoë's avatar
Alexandre Delanoë committed
149 150
            -> ListId
            -> NgramsType
151
            -> Set ListType
152
            -> m ()
153
reIndexWith cId lId nt lts = do
154
  -- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
155

156 157 158 159 160
  -- Getting [NgramsTerm]
  ts <- List.concat
     <$> map (\(k,vs) -> k:vs)
     <$> HashMap.toList
     <$> getTermsWith identity [lId] nt lts
161

162
  -- Get all documents of the corpus
163 164
  docs <- selectDocNodes cId

Alexandre Delanoë's avatar
Alexandre Delanoë committed
165
  let
166
    -- fromListWith (<>)
167
    ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
168 169
                  $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
                  $ map (\doc -> List.zip
170 171 172 173 174
                                 (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts)
                                              $ Text.unlines $ catMaybes
                                              [ doc ^. context_hyperdata . hd_title
                                              , doc ^. context_hyperdata . hd_abstract
                                              ]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
175
                                 )
176
                                 (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
177
                        ) docs
Alexandre Delanoë's avatar
Alexandre Delanoë committed
178

179
  -- printDebug "ngramsByDoc: " ngramsByDoc
180

Alexandre Delanoë's avatar
Alexandre Delanoë committed
181
  -- Saving the indexation in database
182
  _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
183

184
  pure ()
Alexandre Delanoë's avatar
Alexandre Delanoë committed
185

186 187 188 189 190 191
toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
toIndexedNgrams m t = Indexed <$> i <*> n
  where
    i = HashMap.lookup t m
    n = Just (text2ngrams t)

192
------------------------------------------------------------------------
193 194
jsonPostAsync :: ServerT JSONAPI (GargM Env GargError)
jsonPostAsync lId =
195
  serveJobsAPI UpdateNgramsListJobJSON $ \jHandle f ->
Alfredo Di Napoli's avatar
Alfredo Di Napoli committed
196
    postAsync' lId f jHandle
197

Alfredo Di Napoli's avatar
Alfredo Di Napoli committed
198
postAsync' :: (FlowCmdM env err m, MonadJobStatus m)
199
          => ListId
200
          -> WithJsonFile
Alfredo Di Napoli's avatar
Alfredo Di Napoli committed
201 202 203 204 205
          -> JobHandle m
          -> m ()
postAsync' l (WithJsonFile m _) jobHandle = do

  markStarted 2 jobHandle
206
  -- printDebug "New list as file" l
207
  _ <- setList l m
208
  -- printDebug "Done" r
209

Alfredo Di Napoli's avatar
Alfredo Di Napoli committed
210
  markProgress 1 jobHandle
211 212 213 214 215

  corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
  let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
  _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])

Alfredo Di Napoli's avatar
Alfredo Di Napoli committed
216
  markComplete jobHandle
217

218 219
------------------------------------------------------------------------

220
readCsvText :: Text -> Either Text [(Text, Text, Text)]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
221
readCsvText t = case eDec of
222 223
  Left err -> Left $ pack err
  Right dec -> Right $ Vec.toList dec
Alexandre Delanoë's avatar
Alexandre Delanoë committed
224 225 226 227 228 229 230 231 232
  where
    lt = BSL.fromStrict $ P.encodeUtf8 t
    eDec = Csv.decodeWith
             (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
             Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))

parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
parseCsvData lst = Map.fromList $ conv <$> lst
  where
233
    conv (status, label, forms) =
Alexandre Delanoë's avatar
Alexandre Delanoë committed
234
        (NgramsTerm label, NgramsRepoElement { _nre_size = 1
235 236 237 238 239
                                             , _nre_list = case status == "map" of
                                                             True  -> MapTerm
                                                             False -> case status == "main" of
                                                                True  -> CandidateTerm
                                                                False -> StopTerm
Alexandre Delanoë's avatar
Alexandre Delanoë committed
240 241
                                             , _nre_root = Nothing
                                             , _nre_parent = Nothing
242 243 244
                                             , _nre_children = MSet
                                                             $ Map.fromList
                                                             $ map (\form -> (NgramsTerm form, ()))
245
                                                             $ filter (\w -> w /= "" && w /= label)
246 247 248
                                                             $ splitOn "|&|" forms
                                             }
         )
Alexandre Delanoë's avatar
Alexandre Delanoë committed
249 250 251 252

csvPost :: FlowCmdM env err m
        => ListId
        -> Text
253
        -> m (Either Text ())
Alexandre Delanoë's avatar
Alexandre Delanoë committed
254
csvPost l m  = do
255
  -- printDebug "[csvPost] l" l
Alexandre Delanoë's avatar
Alexandre Delanoë committed
256 257
  -- printDebug "[csvPost] m" m
  -- status label forms
258 259 260 261 262 263 264 265 266 267 268 269 270 271
  let eLst = readCsvText m
  case eLst of
    Left err -> pure $ Left err
    Right lst -> do
      let p = parseCsvData lst
      --printDebug "[csvPost] lst" lst
      -- printDebug "[csvPost] p" p
      _ <- setListNgrams l NgramsTerms p
      -- printDebug "ReIndexing List" l
      corpus_node <- getNode l -- (Proxy :: Proxy HyperdataList)
      let corpus_id = fromMaybe (panic "") (_node_parent_id corpus_node)
      _ <- reIndexWith corpus_id l NgramsTerms (Set.fromList [MapTerm, CandidateTerm])

      pure $ Right ()
Alexandre Delanoë's avatar
Alexandre Delanoë committed
272

273
------------------------------------------------------------------------
274
csvPostAsync :: ServerT CSVAPI (GargM Env GargError)
275
csvPostAsync lId =
276
  serveJobsAPI UpdateNgramsListJobCSV $ \jHandle f -> do
Alfredo Di Napoli's avatar
Alfredo Di Napoli committed
277
      markStarted 1 jHandle
278
      ePost <- csvPost lId (_wtf_data f)
Alfredo Di Napoli's avatar
Alfredo Di Napoli committed
279 280 281 282 283
      case ePost of
        Left err -> markFailed (Just err) jHandle
        Right () -> markComplete jHandle

      getLatestJobStatus jHandle >>= printDebug "[csvPostAsync] job ended with joblog: "
284

285
------------------------------------------------------------------------
286 287 288 289 290 291 292 293


-- | This is for debugging the CSV parser in the REPL
importCsvFile :: FlowCmdM env err m
              => ListId -> P.FilePath -> m (Either Text ())
importCsvFile lId fp = do
  contents <- liftBase $ P.readFile fp
  csvPost lId contents