Flow.hs 13.4 KB
Newer Older
1 2 3 4 5 6 7 8 9
{-|
Module      : Gargantext.Database.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

10 11 12 13 14

-- TODO-ACCESS:
--   check userId       CanFillUserCorpus   userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId

15 16 17 18
-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes


19 20
-}

21
{-# LANGUAGE ConstraintKinds   #-}
22
{-# LANGUAGE DeriveGeneric     #-}
23
{-# LANGUAGE NoImplicitPrelude #-}
Alexandre Delanoë's avatar
Alexandre Delanoë committed
24
{-# LANGUAGE OverloadedStrings #-}
25
{-# LANGUAGE RankNTypes        #-}
Alexandre Delanoë's avatar
Alexandre Delanoë committed
26
{-# LANGUAGE FlexibleContexts  #-}
27

28
module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
29
    where
30

31
--import Debug.Trace (trace)
32
import Control.Lens ((^.), view, Lens', _Just)
33
import Control.Monad (mapM_)
34
import Control.Monad.IO.Class (liftIO)
35 36
import Data.List (concat)
import Data.Map (Map, lookup, toList)
37
import Data.Maybe (Maybe(..), catMaybes)
38
import Data.Monoid
39
import Data.Text (Text, splitOn, intercalate)
40
import GHC.Show (Show)
41
import Gargantext.API.Ngrams (HasRepoVar)
42
import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
43 44
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (NodePoly(..), Terms(..))
45 46
import Gargantext.Core.Types.Individu (Username)
import Gargantext.Core.Types.Main
47
import Gargantext.Database.Config (userMaster, corpusMasterName)
48
import Gargantext.Database.Flow.Utils (insertDocNgrams)
49
import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
50
import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
51
import Gargantext.Database.Root (getRoot)
52 53
import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams,  NgramsType(..), text2ngrams, ngramsTypeId)
import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
54
import Gargantext.Database.Schema.User (getUser, UserLight(..))
55
import Gargantext.Database.TextSearch (searchInDatabase)
56
import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
57
import Gargantext.Database.Utils (Cmd, CmdM)
58
import Gargantext.Ext.IMT (toSchoolName)
59
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
60
import Gargantext.Prelude
61
import Gargantext.Text.List (buildNgramsLists,StopSize(..))
62
import Gargantext.Text.Parsers (parseDocs, FileFormat)
63 64 65
import Gargantext.Text.Terms (TermType(..), tt_lang)
import Gargantext.Text.Terms (extractTerms)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
66
import Servant (ServantErr)
67
import System.FilePath (FilePath)
68 69
--import qualified Data.List as List
import qualified Data.Map  as Map
70 71
import qualified Data.Text as Text
import qualified Gargantext.Database.Node.Document.Add  as Doc  (add)
72
import qualified Gargantext.Text.Parsers.GrandDebat as GD
73

74
type FlowCmdM env err m =
75
  ( CmdM     env err m
76 77
  , RepoCmdM env err m
  , HasNodeError err
Alexandre Delanoë's avatar
Alexandre Delanoë committed
78
  , HasRepoVar env
79
  )
80

81 82 83 84 85
type FlowCorpus a = ( AddUniqId a
                    , UniqId a
                    , InsertDb a
                    , ExtractNgramsT a
                    )
Alexandre Delanoë's avatar
Alexandre Delanoë committed
86

87
------------------------------------------------------------------------
88

89 90 91 92 93
flowAnnuaire :: FlowCmdM env ServantErr m 
             => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
flowAnnuaire u n l filePath = do
  docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
94 95


96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
flowCorpusDebat :: FlowCmdM env ServantErr m
            => Username -> CorpusName
            -> Limit -> FilePath
            -> m CorpusId
flowCorpusDebat u n l fp = do
  docs <- liftIO ( splitEvery 500
                 <$> take l
                 <$> GD.readFile fp
                 :: IO [[GD.GrandDebatReference ]]
                 )
  flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)


flowCorpusFile :: FlowCmdM env ServantErr m
           => Username -> CorpusName
111
           -> Limit -- Limit the number of docs (for dev purpose)
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
           -> TermType Lang -> FileFormat -> FilePath
           -> m CorpusId
flowCorpusFile u n l la ff fp = do
  docs <- liftIO ( splitEvery 500
                 <$> take l
                 <$> parseDocs ff fp
                 )
  flowCorpus u n la (map (map toHyperdataDocument) docs)

-- TODO query with complex query
flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
          => Username -> Lang -> Text -> m CorpusId
flowCorpusSearchInDatabase u la q = do
  (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
  ids <-  map fst <$> searchInDatabase cId (stemIt q)
  flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
128

129
------------------------------------------------------------------------
130

131 132 133 134 135
flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
     => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
flow c u cn la docs = do
  ids <- mapM (insertMasterDocs c la ) docs
  flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
136

137 138 139
flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
     => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
140 141


142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
flowCorpusUser :: (FlowCmdM env ServantErr m, MkCorpus c)
               => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
flowCorpusUser l userName corpusName ctype ids = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  -- User List Flow
  --{-
  (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
  ngs         <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
  userListId  <- flowList userId userCorpusId ngs
  printDebug "userListId" userListId
  -- User Graph Flow
  _ <- mkGraph  userCorpusId userId
  --}

  -- User Dashboard Flow
161
  _ <- mkDashboard userCorpusId userId
162

163
  -- Annuaire Flow
164
  -- _ <- mkAnnuaire  rootUserId userId
165
  pure userCorpusId
166

167 168 169 170 171 172 173 174 175 176 177 178

insertMasterDocs :: ( FlowCmdM env ServantErr m
                    , FlowCorpus a
                    , MkCorpus   c
                    )
                 => Maybe c -> TermType Lang -> [a] -> m [DocId]
insertMasterDocs c lang hs  =  do
  (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c

  -- TODO Type NodeDocumentUnicised
  let hs' = map addUniqId hs
  ids <- insertDb masterUserId masterCorpusId hs'
179
  let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
180
  
181
  maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang) documentsWithId
182 183
  terms2id <- insertNgrams $ Map.keys maps
  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
184 185 186
  
  lId <- getOrMkList masterCorpusId masterUserId
  _   <- insertDocNgrams lId indexedNgrams
187 188
  pure $ map reId ids

189

190

191 192
type CorpusName = Text

193 194 195 196
getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
              => Username -> CorpusName -> Maybe a
              -> Cmd err (UserId, RootId, CorpusId)
getOrMkRootWithCorpus username cName c = do
197
  maybeUserId <- getUser username
198
  userId <- case maybeUserId of
199
        Nothing   -> nodeError NoUserFound
200
        Just user -> pure $ userLight_id user
201

202
  rootId' <- map _node_id <$> getRoot username
203

204
  rootId'' <- case rootId' of
205
        []  -> mkRoot username userId
206
        n   -> case length n >= 2 of
207
            True  -> nodeError ManyNodeUsers
208
            False -> pure rootId'
209

Alexandre Delanoë's avatar
Alexandre Delanoë committed
210
  rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
211

Alexandre Delanoë's avatar
Alexandre Delanoë committed
212
  corpusId'' <- if username == userMaster
213
                  then do
214
                    ns <- getCorporaWithParentId rootId
215 216 217
                    pure $ map _node_id ns
                  else
                    pure []
218
  
219 220
  corpusId' <- if corpusId'' /= []
                  then pure corpusId''
221
                  else mk (Just cName) c rootId userId
Alexandre Delanoë's avatar
Alexandre Delanoë committed
222

223
  corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
224

Alexandre Delanoë's avatar
Alexandre Delanoë committed
225 226
  pure (userId, rootId, corpusId)

227

228
------------------------------------------------------------------------
229 230 231 232 233 234 235 236 237 238 239 240


class UniqId a
  where
    uniqId :: Lens' a (Maybe HashId)


instance UniqId HyperdataDocument
  where
    uniqId = hyperdataDocument_uniqId

instance UniqId HyperdataContact
241
  where
242 243 244 245 246 247 248
    uniqId = hc_uniqId

viewUniqId' :: UniqId a => a -> (HashId, a)
viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
      where
        err = panic "[ERROR] Database.Flow.toInsert"

249 250

toInserted :: [ReturnId] -> Map HashId ReturnId
251
toInserted = Map.fromList . map    (\r ->  (reUniqId r, r)    )
252
                         . filter (\r -> reInserted r == True)
253

254 255 256 257
data DocumentWithId a = DocumentWithId
  { documentId   :: !NodeId
  , documentData :: !a
  } deriving (Show)
258

259 260 261
mergeData :: Map HashId ReturnId
          -> Map HashId a
          -> [DocumentWithId a]
262
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
263 264 265 266
  where
    toDocumentWithId (hash,hpd) =
      DocumentWithId <$> fmap reId (lookup hash rs)
                     <*> Just hpd
267

268
------------------------------------------------------------------------
269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291
data DocumentIdWithNgrams a = DocumentIdWithNgrams
  { documentWithId  :: !(DocumentWithId a)
  , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
  } deriving (Show)

-- TODO extractNgrams according to Type of Data

class ExtractNgramsT h
  where
    extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))


instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (Map Ngrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                     $ maybe ["Nothing"] (\a -> [a])
                     $ view (hc_who . _Just . cw_lastName) hc'
        
292
          pure $ Map.fromList $ [(a', Map.singleton Authors     1) | a' <- authors    ]
293

294

295

296

297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT = extractNgramsT'

extractNgramsT' :: TermType Lang -> HyperdataDocument
               -> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
  where
    extractNgramsT'' :: TermType Lang -> HyperdataDocument
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT'' lang' doc = do
      let source    = text2ngrams
                    $ maybe "Nothing" identity
                    $ _hyperdataDocument_source doc

          institutes = map text2ngrams
                     $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
                     $ _hyperdataDocument_institutes doc

          authors    = map text2ngrams
                     $ maybe ["Nothing"] (splitOn ", ")
                     $ _hyperdataDocument_authors doc

          leText = catMaybes [ _hyperdataDocument_title    doc
                             , _hyperdataDocument_abstract doc
                             ]

      terms' <- map text2ngrams
             <$> map (intercalate " " . _terms_label)
             <$> concat
             <$> liftIO (extractTerms lang' leText)

329 330 331 332
      pure $ Map.fromList $  [(source, Map.singleton Sources 1)]
                         <> [(i', Map.singleton Institutes  1) | i' <- institutes ]
                         <> [(a', Map.singleton Authors     1) | a' <- authors    ]
                         <> [(t', Map.singleton NgramsTerms 1) | t' <- terms'     ]
333 334 335 336


filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
                     -> Map Ngrams (Map NgramsType Int)
337
filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
338 339 340 341
  where
    filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
          True  -> (ng,y)
          False -> (Ngrams (Text.take s' t) n , y)
342 343


344
documentIdWithNgrams :: HasNodeError err
345 346 347 348
                     => (a
                     -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
349 350 351 352 353
documentIdWithNgrams f = mapM toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ documentData d
      pure $ DocumentIdWithNgrams d e
354

355 356 357


-- FLOW LIST
358
-- | TODO check optimization
359 360
mapNodeIdNgrams :: [DocumentIdWithNgrams a]
                -> Map Ngrams (Map NgramsType (Map NodeId Int))
361
mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
362
  where
363 364
    f :: DocumentIdWithNgrams a
      -> Map Ngrams (Map NgramsType (Map NodeId Int))
365
    f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
366 367
      where
        nId = documentId $ documentWithId d
368

369
------------------------------------------------------------------------
370 371 372 373 374 375 376
listInsert :: FlowCmdM env err m
             => ListId -> Map NgramsType [NgramsElement]
             -> m ()
listInsert lId ngs = mapM_ (\(typeList, ngElmts)
                             -> putListNgrams lId typeList ngElmts
                             ) $ toList ngs

377
flowList :: FlowCmdM env err m => UserId -> CorpusId
378
         -> Map NgramsType [NgramsElement]
379
         -> m ListId
Alexandre Delanoë's avatar
Alexandre Delanoë committed
380
flowList uId cId ngs = do
381
  lId <- getOrMkList cId uId
382
  printDebug "listId flowList" lId
383
  listInsert lId ngs
384
  --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
385
  pure lId
386