List.hs 9.97 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
{-|
Module      : Gargantext.API.Ngrams.List
Description : Get Ngrams (lists)
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

12
{-# LANGUAGE MonoLocalBinds #-}
13 14 15 16 17 18
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeOperators     #-}

module Gargantext.API.Ngrams.List
  where

19
import Control.Lens hiding (elements, Indexed)
20
import Data.Aeson
21 22 23
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Csv as Csv
import Data.Either (Either(..))
24
import Data.HashMap.Strict (HashMap)
25 26 27 28
import qualified Data.HashMap.Strict as HashMap
import qualified Data.List           as List
import Data.Map (Map, toList, fromList)
import qualified Data.Map            as Map
29 30
import Data.Maybe (catMaybes)
import Data.Set (Set)
31
import Data.Text (Text, concat, pack)
32 33 34
import qualified Data.Text           as Text
import Data.Vector (Vector)
import qualified Data.Vector as Vec
35
import Network.HTTP.Media ((//), (/:))
36
import qualified Prelude as Prelude
37 38
import Servant
import Servant.Job.Async
39 40

import qualified Protolude as P
41

42
import Gargantext.API.Admin.Orchestrator.Types
43
import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
44
import Gargantext.API.Ngrams.List.Types
45
import Gargantext.API.Ngrams.Tools (getTermsWith)
46 47 48
import Gargantext.API.Ngrams.Types
import Gargantext.API.Prelude (GargServer)
import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
49 50
import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
import Gargantext.Core.Types.Main (ListType(..))
51
import Gargantext.Database.Action.Flow (saveDocNgramsWith)
52
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
53 54
import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
import Gargantext.Database.Admin.Types.Hyperdata.Document
55
import Gargantext.Database.Admin.Types.Node
56
import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
57 58 59
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Schema.Node
import Gargantext.Database.Types (Indexed(..))
60
import Gargantext.Prelude
61

62
------------------------------------------------------------------------
Alexandre Delanoë's avatar
Alexandre Delanoë committed
63
{-
64
-- | TODO refactor 
65
type API =  Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
66 67
       -- :<|> ReqBody '[JSON] NgramsList :> Post '[JSON] Bool
       :<|> PostAPI
68
       :<|> CSVPostAPI
69

70 71
api :: ListId -> GargServer API
api l = get l :<|> postAsync l :<|> csvPostAsync l
Alexandre Delanoë's avatar
Alexandre Delanoë committed
72 73
-}

74 75 76 77 78 79 80 81 82

----------------------
type GETAPI = Summary "Get List"
            :> "lists"
              :> Capture "listId" ListId
            :> Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
getApi :: GargServer GETAPI
getApi = get

Alexandre Delanoë's avatar
Alexandre Delanoë committed
83 84 85 86 87 88 89
data HTML
instance Accept HTML where
  contentType _ = "text" // "html" /: ("charset", "utf-8")
instance ToJSON a => MimeRender HTML a where
  mimeRender _ = encode


90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
----------------------
type JSONAPI = Summary "Update List"
          :> "lists"
            :> Capture "listId" ListId
          :> "add"
          :> "form"
          :> "async"
            :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog

jsonApi :: GargServer JSONAPI
jsonApi = postAsync

----------------------
type CSVAPI = Summary "Update List (legacy v3 CSV)"
          :> "lists"
            :> Capture "listId" ListId
          :> "csv"
          :> "add"
          :> "form"
          :> "async"
Alexandre Delanoë's avatar
Alexandre Delanoë committed
110
            :> AsyncJobs JobLog '[FormUrlEncoded] WithTextFile JobLog
111 112 113 114 115 116 117

csvApi :: GargServer CSVAPI
csvApi = csvPostAsync

----------------------


118

119
------------------------------------------------------------------------
120 121
get :: RepoCmdM env err m =>
       ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
122 123
get lId = do
  lst <- get' lId
124
  let (NodeId id') = lId
125
  return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
126
                             , pack $ show id'
127 128 129 130 131
                             , ".json"
                             ]
                     ) lst

get' :: RepoCmdM env err m
132
    => ListId -> m NgramsList
133
get' lId = fromList
134 135 136
       <$> zip ngramsTypes
       <$> mapM (getNgramsTableMap lId) ngramsTypes

137
------------------------------------------------------------------------
138
-- TODO : purge list
139
-- TODO talk
140
post :: FlowCmdM env err m
141 142 143
    => ListId
    -> NgramsList
    -> m Bool
144
post l m  = do
145
  -- TODO check with Version for optim
146
  printDebug "New list as file" l
147
  _ <- mapM (\(nt, Versioned _v ns) -> setListNgrams l nt ns) $ toList m
148
  -- TODO reindex
149 150
  pure True

Alexandre Delanoë's avatar
Alexandre Delanoë committed
151 152 153

-----------------------------------------------------------------------------
-- | Re-index documents of a corpus with new ngrams (called orphans here)
154 155 156 157
reIndexWith :: ( HasRepo env
               , FlowCmdM env err m
               )
            => CorpusId
Alexandre Delanoë's avatar
Alexandre Delanoë committed
158 159
            -> ListId
            -> NgramsType
160
            -> Set ListType
161
            -> m ()
162 163 164 165 166 167 168
reIndexWith cId lId nt lts = do
  -- Getting [NgramsTerm]
  ts <- List.concat
     <$> map (\(k,vs) -> k:vs)
     <$> HashMap.toList
     <$> getTermsWith identity [lId] nt lts
  
169
  -- printDebug "ts" ts
Alexandre Delanoë's avatar
Alexandre Delanoë committed
170 171

  -- Taking the ngrams with 0 occurrences only (orphans)
172 173
  occs <- getOccByNgramsOnlyFast' cId lId nt ts

174 175
  -- printDebug "occs" occs

176 177 178
  let orphans = List.concat 
              $ map (\t -> case HashMap.lookup t occs of
                       Nothing -> [t]
179
                       Just n  -> if n <= 1 then [t] else [ ]
180
                       ) ts
Alexandre Delanoë's avatar
Alexandre Delanoë committed
181

182 183
  -- printDebug "orphans" orphans

184
  -- Get all documents of the corpus
185
  docs <- selectDocNodes cId
186
  -- printDebug "docs length" (List.length docs)
187

Alexandre Delanoë's avatar
Alexandre Delanoë committed
188
  -- Checking Text documents where orphans match
189
  -- TODO Tests here
Alexandre Delanoë's avatar
Alexandre Delanoë committed
190
  let
191 192
    ngramsByDoc = map (HashMap.fromList)
                $ map (map (\(k,v) -> (SimpleNgrams (text2ngrams k), v)))
193
                $  map (\doc -> List.zip
194
                                (termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) orphans)
195 196 197 198
                                             $ Text.unlines $ catMaybes
                                               [ doc ^. node_hyperdata . hd_title
                                               , doc ^. node_hyperdata . hd_abstract
                                               ]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
199
                                 )
200 201
                                (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
                        ) docs
Alexandre Delanoë's avatar
Alexandre Delanoë committed
202

203
  -- printDebug "ngramsByDoc" ngramsByDoc
204

Alexandre Delanoë's avatar
Alexandre Delanoë committed
205
  -- Saving the indexation in database
206
  _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
207 208

  pure () -- ngramsByDoc
Alexandre Delanoë's avatar
Alexandre Delanoë committed
209

210 211 212 213 214 215
toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
toIndexedNgrams m t = Indexed <$> i <*> n
  where
    i = HashMap.lookup t m
    n = Just (text2ngrams t)

216 217 218 219 220
------------------------------------------------------------------------
type PostAPI = Summary "Update List"
        :> "add"
        :> "form"
        :> "async"
221
        :> AsyncJobs JobLog '[FormUrlEncoded] WithFile JobLog
222

223
postAsync :: GargServer JSONAPI
224 225
postAsync lId =
  serveJobsAPI $
226 227 228 229 230 231
    JobFunction (\f log' ->
      let
        log'' x = do
          printDebug "postAsync ListId" x
          liftBase $ log' x
      in postAsync' lId f log'')
232 233 234 235

postAsync' :: FlowCmdM env err m
          => ListId
          -> WithFile
236 237
          -> (JobLog -> m ())
          -> m JobLog
238 239
postAsync' l (WithFile _ m _) logStatus = do

240
  logStatus JobLog { _scst_succeeded = Just 0
241 242 243 244
                   , _scst_failed    = Just 0
                   , _scst_remaining = Just 1
                   , _scst_events    = Just []
                   }
245 246 247
  printDebug "New list as file" l
  _ <- post l m
  -- printDebug "Done" r
248

249
  pure JobLog { _scst_succeeded = Just 1
250 251 252 253
              , _scst_failed    = Just 0
              , _scst_remaining = Just 0
              , _scst_events    = Just []
              }
254
------------------------------------------------------------------------
255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285
readCsvText :: Text -> [(Text, Text, Text)]
readCsvText t = case eDec of
  Left _ -> []
  Right dec -> Vec.toList dec
  where
    lt = BSL.fromStrict $ P.encodeUtf8 t
    eDec = Csv.decodeWith
             (Csv.defaultDecodeOptions { Csv.decDelimiter = fromIntegral (P.ord '\t') })
             Csv.HasHeader lt :: Either Prelude.String (Vector (Text, Text, Text))

parseCsvData :: [(Text, Text, Text)] -> Map NgramsTerm NgramsRepoElement
parseCsvData lst = Map.fromList $ conv <$> lst
  where
    conv (_status, label, _forms) =
        (NgramsTerm label, NgramsRepoElement { _nre_size = 1
                                             , _nre_list = CandidateTerm
                                             , _nre_root = Nothing
                                             , _nre_parent = Nothing
                                             , _nre_children = MSet Map.empty })

csvPost :: FlowCmdM env err m
        => ListId
        -> Text
        -> m Bool
csvPost l m  = do
  printDebug "[csvPost] l" l
  -- printDebug "[csvPost] m" m
  -- status label forms
  let lst = readCsvText m
  let p = parseCsvData lst
  --printDebug "[csvPost] lst" lst
286
  printDebug "[csvPost] p" p
287 288 289
  _ <- setListNgrams l NgramsTerms p
  pure True
------------------------------------------------------------------------
290

Alexandre Delanoë's avatar
Alexandre Delanoë committed
291

292

293
csvPostAsync :: GargServer CSVAPI
294 295
csvPostAsync lId =
  serveJobsAPI $
296 297 298 299 300 301
    JobFunction $ \f@(WithTextFile ft _ n) log' -> do
      let log'' x = do
            printDebug "[csvPostAsync] filetype" ft
            printDebug "[csvPostAsync] name" n
            liftBase $ log' x
      csvPostAsync' lId f log''
302 303 304

csvPostAsync' :: FlowCmdM env err m
             => ListId
305
             -> WithTextFile
306 307
             -> (JobLog -> m ())
             -> m JobLog
308
csvPostAsync' l (WithTextFile _ m _) logStatus = do
309 310 311 312 313
  logStatus JobLog { _scst_succeeded = Just 0
                   , _scst_failed    = Just 0
                   , _scst_remaining = Just 1
                   , _scst_events    = Just []
                   }
314
  _r <- csvPost l m
315 316 317 318 319 320

  pure JobLog { _scst_succeeded = Just 1
              , _scst_failed    = Just 0
              , _scst_remaining = Just 0
              , _scst_events    = Just []
              }
321

322
------------------------------------------------------------------------