Flow.hs 14.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
{-|
Module      : Gargantext.Database.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

12
{-# LANGUAGE ConstraintKinds   #-}
13
{-# LANGUAGE DeriveGeneric     #-}
14
{-# LANGUAGE NoImplicitPrelude #-}
Alexandre Delanoë's avatar
Alexandre Delanoë committed
15
{-# LANGUAGE OverloadedStrings #-}
16
{-# LANGUAGE RankNTypes        #-}
Alexandre Delanoë's avatar
Alexandre Delanoë committed
17
{-# LANGUAGE FlexibleContexts  #-}
18

19
module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
20
    where
21

Alexandre Delanoë's avatar
Alexandre Delanoë committed
22
--import Control.Lens (view)
23
import Control.Monad (mapM_)
24
import Control.Monad.IO.Class (liftIO)
25 26
--import Gargantext.Core.Types
--import Gargantext.Database.Node.Contact (HyperdataContact(..))
27
import Data.Map (Map, lookup, fromListWith, toList)
28
import Data.Maybe (Maybe(..), catMaybes)
29
import Data.Monoid
30
import Data.Text (Text, splitOn, intercalate)
31
import Data.Tuple.Extra (both)
32
import Data.List (concat)
33
import GHC.Show (Show)
34
import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
35 36
import Gargantext.Core.Types.Individu (Username)
import Gargantext.Core.Types.Main
37
import Gargantext.Core (Lang(..))
38
import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
39
import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
40
import Gargantext.Database.Metrics.TFICF (getTficf)
41
import Gargantext.Text.Terms (extractTerms)
42
import Gargantext.Text.Metrics.TFICF (Tficf(..))
43
import Gargantext.Database.Node.Document.Add    (add)
44
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
45
import Gargantext.Database.Root (getRoot)
46
import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams,  NgramsType(..), text2ngrams, ngramsTypeId)
47
import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
48 49
import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
50
import Gargantext.Database.Schema.User (getUser, UserLight(..))
51
import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
52
import Gargantext.Database.Utils (Cmd, CmdM)
53
import Gargantext.Text.Terms (TermType(..))
54
import Gargantext.Ext.IMT (toSchoolName)
55
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
56
import Gargantext.Prelude
57
import Gargantext.Text.Parsers (parseDocs, FileFormat)
58
import System.FilePath (FilePath)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
59 60
import Gargantext.API.Ngrams (HasRepoVar)
import Servant (ServantErr)
61
import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
62
--import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
63
import qualified Data.Map as DM
64

65
type FlowCmdM env err m =
66
  ( CmdM     env err m
67 68
  , RepoCmdM env err m
  , HasNodeError err
Alexandre Delanoë's avatar
Alexandre Delanoë committed
69
  , HasRepoVar env
70
  )
71

Alexandre Delanoë's avatar
Alexandre Delanoë committed
72 73 74

flowCorpus :: FlowCmdM env ServantErr m
           => FileFormat -> FilePath -> CorpusName -> m CorpusId
Alexandre Delanoë's avatar
Alexandre Delanoë committed
75
flowCorpus ff fp cName = do
Alexandre Delanoë's avatar
Alexandre Delanoë committed
76
  --insertUsers [gargantuaUser, simpleUser]
77
  hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
78 79
  params <- flowInsert NodeCorpus hyperdataDocuments' cName
  flowCorpus' NodeCorpus hyperdataDocuments' params
80

81

Alexandre Delanoë's avatar
Alexandre Delanoë committed
82
flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
83
     -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
84 85
flowInsert _nt hyperdataDocuments cName = do
  let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
86

87
  (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
88
  ids  <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
89

90
  (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
91
  _ <- add userCorpusId (map reId ids)
92

93 94
  pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)

95

96 97 98 99 100 101 102
-- TODO-ACCESS:
--   check userId       CanFillUserCorpus   userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId
--
-- TODO-EVENTS:
--   InsertedNgrams ?
--   InsertedNodeNgrams ?
103
flowCorpus' :: FlowCmdM env err m
104
            => NodeType -> [HyperdataDocument]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
105
            -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
106
            -> m CorpusId
Alexandre Delanoë's avatar
Alexandre Delanoë committed
107
flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
108
--------------------------------------------------
109

110
  let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
111
  --printDebug "documentsWithId" documentsWithId
112
  docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
113
  --printDebug "docsWithNgrams" docsWithNgrams
114
  let maps            = mapNodeIdNgrams docsWithNgrams
115

116
  --printDebug "maps" (maps)
117 118
  terms2id <- insertNgrams $ DM.keys maps
  let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
119
  --printDebug "inserted ngrams" indexedNgrams
120
  _             <- insertToNodeNgrams indexedNgrams
121

122 123 124 125
  -- List Ngrams Flow
  _masterListId <- flowList masterUserId masterCorpusId indexedNgrams
  _userListId    <- flowListUser userId userCorpusId 500
  --printDebug "Working on User ListId : " userListId
126 127
  --}
--------------------------------------------------
128 129
  _ <- mkDashboard userCorpusId userId
  _ <- mkGraph     userCorpusId userId
130

131
  -- Annuaire Flow
132
  -- _ <- mkAnnuaire  rootUserId userId
133

134
  pure userCorpusId
135
  -- del [corpusId2, corpusId]
136

Alexandre Delanoë's avatar
Alexandre Delanoë committed
137 138
flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
flowCorpus' _ _ _ = undefined
139

140

141 142
type CorpusName = Text

Alexandre Delanoë's avatar
Alexandre Delanoë committed
143
subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
144
subFlowCorpus username cName = do
145
  maybeUserId <- getUser username
146
  userId <- case maybeUserId of
147
        Nothing   -> nodeError NoUserFound
148
        -- mk NodeUser gargantua_id "Node Gargantua"
149
        Just user -> pure $ userLight_id user
150

151
  --printDebug "userId" userId
152
  rootId' <- map _node_id <$> getRoot username
153

154
  --printDebug "rootId'" rootId'
155
  rootId'' <- case rootId' of
156
        []  -> mkRoot username userId
157
        n   -> case length n >= 2 of
158
            True  -> nodeError ManyNodeUsers
159
            False -> pure rootId'
160

161
  --printDebug "rootId''" rootId''
Alexandre Delanoë's avatar
Alexandre Delanoë committed
162
  rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
163

Alexandre Delanoë's avatar
Alexandre Delanoë committed
164
  corpusId'' <- if username == userMaster
165
                  then do
166
                    ns <- getCorporaWithParentId rootId
167 168 169
                    pure $ map _node_id ns
                  else
                    pure []
Alexandre Delanoë's avatar
Alexandre Delanoë committed
170

171 172
  corpusId' <- if corpusId'' /= []
                  then pure corpusId''
173
                  else mkCorpus (Just cName) Nothing rootId userId
Alexandre Delanoë's avatar
Alexandre Delanoë committed
174

175
  corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
176

177 178
  --printDebug "(username, userId, rootId, corpusId)"
  --            (username, userId, rootId, corpusId)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
179 180
  pure (userId, rootId, corpusId)

181

182
------------------------------------------------------------------------
183
toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
184
toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
185
  where
186
    err = "Database.Flow.toInsert"
187 188

toInserted :: [ReturnId] -> Map HashId ReturnId
189 190
toInserted = DM.fromList . map    (\r ->  (reUniqId r, r)    )
                         . filter (\r -> reInserted r == True)
191

192
data DocumentWithId =
193 194
     DocumentWithId { documentId   :: !NodeId
                    , documentData :: !HyperdataDocument
195
                    } deriving (Show)
196

197
mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
198 199 200 201 202
mergeData rs = catMaybes . map toDocumentWithId . DM.toList
  where
    toDocumentWithId (hash,hpd) =
      DocumentWithId <$> fmap reId (lookup hash rs)
                     <*> Just hpd
203

204
------------------------------------------------------------------------
205
data DocumentIdWithNgrams =
206
     DocumentIdWithNgrams
207
     { documentWithId  :: !DocumentWithId
208
     , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
209
     } deriving (Show)
210

211
-- TODO group terms
212
extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
213 214 215 216 217
extractNgramsT doc = do
  let source    = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
  let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))  $ _hyperdataDocument_institutes doc
  let authors    = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
  let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
218
  terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
219

220 221 222 223
  pure $ DM.fromList $  [(source, DM.singleton Sources 1)]
                     <> [(i', DM.singleton Institutes  1) | i' <- institutes ]
                     <> [(a', DM.singleton Authors     1) | a' <- authors    ]
                     <> [(t', DM.singleton NgramsTerms 1) | t' <- terms'     ]
224 225 226



227 228
documentIdWithNgrams :: HasNodeError err
                     => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
229
                     -> [DocumentWithId]   -> Cmd err [DocumentIdWithNgrams]
230 231 232 233 234
documentIdWithNgrams f = mapM toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ documentData d
      pure $ DocumentIdWithNgrams d e
235 236

-- | TODO check optimization
237 238
mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
239
  where
240 241 242 243
    f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
    f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
      where
        nId = documentId $ documentWithId d
244

245
------------------------------------------------------------------------
246 247 248
flowList :: FlowCmdM env err m => UserId -> CorpusId
         -> Map NgramsIndexed (Map NgramsType (Map NodeId Int))
         -> m ListId
Alexandre Delanoë's avatar
Alexandre Delanoë committed
249
flowList uId cId ngs = do
250
  --printDebug "ngs:" ngs
251
  lId <- getOrMkList cId uId
252
  --printDebug "ngs" (DM.keys ngs)
253
  -- TODO add stemming equivalence of 2 ngrams
254 255 256
  -- TODO needs rework
  -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
  -- _ <- insertGroups lId groupEd
257 258

-- compute Candidate / Map
259 260
  _is <- mapM_ (\(typeList, ngElements) -> putListNgrams lId typeList ngElements) $ toList $ ngrams2list' ngs
  --printDebug "listNgrams inserted :" is
261 262 263

  pure lId

264
flowListUser :: FlowCmdM env err m
265
             => UserId -> CorpusId -> Int -> m ListId
266 267 268
flowListUser uId cId n = do
  lId <- getOrMkList cId uId

269 270
  ngs <- take n <$> sortWith tficf_score
                <$> getTficf userMaster cId lId NgramsTerms
271

272
  putListNgrams lId NgramsTerms $
273 274
    [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
    | ng <- ngs ]
275 276

  pure lId
277

278
------------------------------------------------------------------------
279

280 281 282 283
{-
  TODO rework:
    * quadratic
    * DM.keys called twice
284
groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
285 286
              -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
              -> Map NgramsIndexed NgramsIndexed
287
groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
288
-}
289

290

291
-- TODO check: do not insert duplicates
Alexandre Delanoë's avatar
Alexandre Delanoë committed
292
insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
293 294
insertGroups lId ngrs =
  insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
295 296
                            | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
                            , ng1 /= ng2
297
                            ]
298 299

------------------------------------------------------------------------
300
ngrams2list :: Map NgramsIndexed (Map NgramsType a)
301
            -> [(ListType, (NgramsType, NgramsIndexed))]
302
ngrams2list m =
303
  [ (CandidateList, (t, ng))
304 305 306
  | (ng, tm) <- DM.toList m
  , t <- DM.keys tm
  ]
307

308 309 310 311 312 313 314 315 316 317 318 319
ngrams2list' :: Map NgramsIndexed (Map NgramsType a)
            -> Map NgramsType [NgramsElement]
ngrams2list' m = fromListWith (<>)
  [ (t, [NgramsElement (_ngramsTerms $ _ngrams ng) CandidateList 1 Nothing mempty])
  | (ng, tm) <- DM.toList m
  , t <- DM.keys tm
  ]





320
-- | TODO: weight of the list could be a probability
321
insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
322
insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
323
                     | (l,(ngt, ng)) <- lngs
324 325
                   ]
------------------------------------------------------------------------
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379


-- | Annuaire

flowAnnuaire :: FlowCmdM env ServantErr m => FilePath -> m ()
flowAnnuaire filePath = do
  contacts <- liftIO $ deserialiseImtUsersFromFile filePath
  ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
  printDebug "length annuaire" ps


flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
                    -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
flowInsertAnnuaire name children = do

  (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
  ids  <- insertDocuments masterUserId masterCorpusId NodeContact children

  (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
  _ <- add userCorpusId (map reId ids)

  printDebug "AnnuaireID" userCorpusId

  pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)


subFlowAnnuaire :: HasNodeError err =>
  Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
subFlowAnnuaire username _cName = do
  maybeUserId <- getUser username

  userId <- case maybeUserId of
        Nothing   -> nodeError NoUserFound
        -- mk NodeUser gargantua_id "Node Gargantua"
        Just user -> pure $ userLight_id user

  rootId' <- map _node_id <$> getRoot username

  rootId'' <- case rootId' of
        []  -> mkRoot username userId
        n   -> case length n >= 2 of
            True  -> nodeError ManyNodeUsers
            False -> pure rootId'
  rootId <- maybe (nodeError NoRootFound) pure (head rootId'')

  corpusId' <- mkAnnuaire rootId userId

  corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')

  printDebug "(username, userId, rootId, corpusId)"
              (username, userId, rootId, corpusId)
  pure (userId, rootId, corpusId)