Flow.hs 13.8 KB
Newer Older
1 2 3 4 5 6 7 8 9
{-|
Module      : Gargantext.Database.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

10 11 12 13 14

-- TODO-ACCESS:
--   check userId       CanFillUserCorpus   userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId

15 16
-}

17
{-# LANGUAGE ConstraintKinds   #-}
18
{-# LANGUAGE DeriveGeneric     #-}
19
{-# LANGUAGE NoImplicitPrelude #-}
Alexandre Delanoë's avatar
Alexandre Delanoë committed
20
{-# LANGUAGE OverloadedStrings #-}
21
{-# LANGUAGE RankNTypes        #-}
Alexandre Delanoë's avatar
Alexandre Delanoë committed
22
{-# LANGUAGE FlexibleContexts  #-}
23

24
module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
25
    where
26

27 28 29 30 31 32 33 34
--import Gargantext.Database.Metrics.Count (getNgramsElementsWithParentNodeId)
--import Gargantext.Database.Metrics.TFICF (getTficf)
--import Gargantext.Database.Node.Contact (HyperdataContact(..))
--import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
--import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
--import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
--import Gargantext.Text.Metrics.TFICF (Tficf(..))
35
--import Debug.Trace (trace)
36
import Control.Lens ((^.), view, Lens', _Just)
37
import Control.Monad (mapM_)
38
import Control.Monad.IO.Class (liftIO)
39 40
import Data.List (concat)
import Data.Map (Map, lookup, toList)
41
import Data.Maybe (Maybe(..), catMaybes)
42
import Data.Monoid
43
import Data.Text (Text, splitOn, intercalate)
44
import GHC.Show (Show)
45 46 47 48
import Gargantext.API.Ngrams (HasRepoVar)
import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (NodePoly(..), Terms(..))
49 50
import Gargantext.Core.Types.Individu (Username)
import Gargantext.Core.Types.Main
51 52
import Gargantext.Database.TextSearch (searchInDatabase)
import Gargantext.Database.Config (userMaster, corpusMasterName)
53
import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
54 55
import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
56
import Gargantext.Database.Root (getRoot)
57 58
import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams,  NgramsType(..), text2ngrams, ngramsTypeId)
import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
59
import Gargantext.Database.Schema.User (getUser, UserLight(..))
60
import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
61
import Gargantext.Database.Utils (Cmd, CmdM)
62
import Gargantext.Ext.IMT (toSchoolName)
63
import Gargantext.Prelude
64
import Gargantext.Text.List (buildNgramsLists,StopSize(..))
65
import Gargantext.Text.Parsers (parseDocs, FileFormat)
66 67 68 69
import Gargantext.Text.Terms (TermType(..), tt_lang)
import Gargantext.Text.Terms (extractTerms)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import qualified Gargantext.Text.Parsers.GrandDebat as GD
Alexandre Delanoë's avatar
Alexandre Delanoë committed
70
import Servant (ServantErr)
71
import System.FilePath (FilePath)
72
import qualified Data.Map as DM
73 74
import qualified Data.Text as Text
import qualified Gargantext.Database.Node.Document.Add  as Doc  (add)
75

76
type FlowCmdM env err m =
77
  ( CmdM     env err m
78 79
  , RepoCmdM env err m
  , HasNodeError err
Alexandre Delanoë's avatar
Alexandre Delanoë committed
80
  , HasRepoVar env
81
  )
82

83 84 85 86 87
type FlowCorpus a = ( AddUniqId a
                    , UniqId a
                    , InsertDb a
                    , ExtractNgramsT a
                    )
Alexandre Delanoë's avatar
Alexandre Delanoë committed
88

89
------------------------------------------------------------------------
90

91 92 93 94 95
flowAnnuaire :: FlowCmdM env ServantErr m 
             => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
flowAnnuaire u n l filePath = do
  docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
96 97


98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
flowCorpusDebat :: FlowCmdM env ServantErr m
            => Username -> CorpusName
            -> Limit -> FilePath
            -> m CorpusId
flowCorpusDebat u n l fp = do
  docs <- liftIO ( splitEvery 500
                 <$> take l
                 <$> GD.readFile fp
                 :: IO [[GD.GrandDebatReference ]]
                 )
  flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)


flowCorpusFile :: FlowCmdM env ServantErr m
           => Username -> CorpusName
           -> Limit -- ^ Limit the number of docs (for dev purpose)
           -> TermType Lang -> FileFormat -> FilePath
           -> m CorpusId
flowCorpusFile u n l la ff fp = do
  docs <- liftIO ( splitEvery 500
                 <$> take l
                 <$> parseDocs ff fp
                 )
  flowCorpus u n la (map (map toHyperdataDocument) docs)

-- TODO query with complex query
flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
          => Username -> Lang -> Text -> m CorpusId
flowCorpusSearchInDatabase u la q = do
  (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
  ids <-  map fst <$> searchInDatabase cId (stemIt q)
  flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
130

131
------------------------------------------------------------------------
132

133 134
-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
135

136

137 138 139 140 141
flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
     => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
flow c u cn la docs = do
  ids <- mapM (insertMasterDocs c la ) docs
  flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
142

143 144 145
flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
     => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
146 147


148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
flowCorpusUser :: (FlowCmdM env ServantErr m, MkCorpus c)
               => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
flowCorpusUser l userName corpusName ctype ids = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  -- User List Flow
  --{-
  (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
  ngs         <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
  userListId  <- flowList userId userCorpusId ngs
  printDebug "userListId" userListId
  -- User Graph Flow
  _ <- mkGraph  userCorpusId userId
  --}

  -- User Dashboard Flow
167
  _ <- mkDashboard userCorpusId userId
168

169
  -- Annuaire Flow
170
  -- _ <- mkAnnuaire  rootUserId userId
171
  pure userCorpusId
172

173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195

insertMasterDocs :: ( FlowCmdM env ServantErr m
                    , FlowCorpus a
                    , MkCorpus   c
                    )
                 => Maybe c -> TermType Lang -> [a] -> m [DocId]
insertMasterDocs c lang hs  =  do
  (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c

  -- TODO Type NodeDocumentUnicised
  let hs' = map addUniqId hs
  ids <- insertDb masterUserId masterCorpusId hs'
  let documentsWithId = mergeData (toInserted ids) (DM.fromList $ map viewUniqId' hs')
  
  docsWithNgrams     <- documentIdWithNgrams (extractNgramsT lang) documentsWithId

  let maps            = mapNodeIdNgrams docsWithNgrams

  terms2id <- insertNgrams $ DM.keys maps
  let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
  _                <- insertToNodeNgrams indexedNgrams
  pure $ map reId ids

196

197

198 199
type CorpusName = Text

200 201 202 203
getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
              => Username -> CorpusName -> Maybe a
              -> Cmd err (UserId, RootId, CorpusId)
getOrMkRootWithCorpus username cName c = do
204
  maybeUserId <- getUser username
205
  userId <- case maybeUserId of
206
        Nothing   -> nodeError NoUserFound
207
        Just user -> pure $ userLight_id user
208

209
  rootId' <- map _node_id <$> getRoot username
210

211
  rootId'' <- case rootId' of
212
        []  -> mkRoot username userId
213
        n   -> case length n >= 2 of
214
            True  -> nodeError ManyNodeUsers
215
            False -> pure rootId'
216

Alexandre Delanoë's avatar
Alexandre Delanoë committed
217
  rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
218

Alexandre Delanoë's avatar
Alexandre Delanoë committed
219
  corpusId'' <- if username == userMaster
220
                  then do
221
                    ns <- getCorporaWithParentId rootId
222 223 224
                    pure $ map _node_id ns
                  else
                    pure []
225
  
226 227
  corpusId' <- if corpusId'' /= []
                  then pure corpusId''
228
                  else mk (Just cName) c rootId userId
Alexandre Delanoë's avatar
Alexandre Delanoë committed
229

230
  corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
231

Alexandre Delanoë's avatar
Alexandre Delanoë committed
232 233
  pure (userId, rootId, corpusId)

234

235
------------------------------------------------------------------------
236 237 238 239 240 241 242 243 244 245 246 247


class UniqId a
  where
    uniqId :: Lens' a (Maybe HashId)


instance UniqId HyperdataDocument
  where
    uniqId = hyperdataDocument_uniqId

instance UniqId HyperdataContact
248
  where
249 250 251 252 253 254 255
    uniqId = hc_uniqId

viewUniqId' :: UniqId a => a -> (HashId, a)
viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
      where
        err = panic "[ERROR] Database.Flow.toInsert"

256 257

toInserted :: [ReturnId] -> Map HashId ReturnId
258 259
toInserted = DM.fromList . map    (\r ->  (reUniqId r, r)    )
                         . filter (\r -> reInserted r == True)
260

261 262 263 264
data DocumentWithId a = DocumentWithId
  { documentId   :: !NodeId
  , documentData :: !a
  } deriving (Show)
265

266 267 268
mergeData :: Map HashId ReturnId
          -> Map HashId a
          -> [DocumentWithId a]
269 270 271 272 273
mergeData rs = catMaybes . map toDocumentWithId . DM.toList
  where
    toDocumentWithId (hash,hpd) =
      DocumentWithId <$> fmap reId (lookup hash rs)
                     <*> Just hpd
274

275
------------------------------------------------------------------------
276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300
data DocumentIdWithNgrams a = DocumentIdWithNgrams
  { documentWithId  :: !(DocumentWithId a)
  , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
  } deriving (Show)

-- TODO extractNgrams according to Type of Data

class ExtractNgramsT h
  where
    extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))


instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (Map Ngrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                     $ maybe ["Nothing"] (\a -> [a])
                     $ view (hc_who . _Just . cw_lastName) hc'
        
          pure $ DM.fromList $ [(a', DM.singleton Authors     1) | a' <- authors    ]

301

302

303

304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT = extractNgramsT'

extractNgramsT' :: TermType Lang -> HyperdataDocument
               -> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
  where
    extractNgramsT'' :: TermType Lang -> HyperdataDocument
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT'' lang' doc = do
      let source    = text2ngrams
                    $ maybe "Nothing" identity
                    $ _hyperdataDocument_source doc

          institutes = map text2ngrams
                     $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
                     $ _hyperdataDocument_institutes doc

          authors    = map text2ngrams
                     $ maybe ["Nothing"] (splitOn ", ")
                     $ _hyperdataDocument_authors doc

          leText = catMaybes [ _hyperdataDocument_title    doc
                             , _hyperdataDocument_abstract doc
                             ]

      terms' <- map text2ngrams
             <$> map (intercalate " " . _terms_label)
             <$> concat
             <$> liftIO (extractTerms lang' leText)

      pure $ DM.fromList $  [(source, DM.singleton Sources 1)]
                         <> [(i', DM.singleton Institutes  1) | i' <- institutes ]
                         <> [(a', DM.singleton Authors     1) | a' <- authors    ]
                         <> [(t', DM.singleton NgramsTerms 1) | t' <- terms'     ]


filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
                     -> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = DM.fromList $ map (\a -> filter' s a) $ DM.toList ms
  where
    filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
          True  -> (ng,y)
          False -> (Ngrams (Text.take s' t) n , y)
349 350


351
documentIdWithNgrams :: HasNodeError err
352 353 354 355
                     => (a
                     -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
356 357 358 359 360
documentIdWithNgrams f = mapM toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ documentData d
      pure $ DocumentIdWithNgrams d e
361

362 363 364


-- FLOW LIST
365
-- | TODO check optimization
366 367
mapNodeIdNgrams :: [DocumentIdWithNgrams a]
                -> Map Ngrams (Map NgramsType (Map NodeId Int))
368
mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
369
  where
370 371
    f :: DocumentIdWithNgrams a
      -> Map Ngrams (Map NgramsType (Map NodeId Int))
372 373 374
    f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
      where
        nId = documentId $ documentWithId d
375

376
------------------------------------------------------------------------
377 378 379 380 381 382 383
listInsert :: FlowCmdM env err m
             => ListId -> Map NgramsType [NgramsElement]
             -> m ()
listInsert lId ngs = mapM_ (\(typeList, ngElmts)
                             -> putListNgrams lId typeList ngElmts
                             ) $ toList ngs

384
flowList :: FlowCmdM env err m => UserId -> CorpusId
385
         -> Map NgramsType [NgramsElement]
386
         -> m ListId
Alexandre Delanoë's avatar
Alexandre Delanoë committed
387
flowList uId cId ngs = do
388
  lId <- getOrMkList cId uId
389
  printDebug "listId flowList" lId
390
  listInsert lId ngs
391
  pure lId
392