{-|
Module      : Gargantext.Core.Text.List
Description : Tools to build lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}
{-# LANGUAGE BangPatterns        #-}

module Gargantext.Core.Text.List
  where

import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
import Data.HashMap.Strict (HashMap)
import Data.HashSet (HashSet)
import Data.Map.Strict (Map)
import Data.Monoid (mempty)
import Data.Ord (Down(..))
import Data.Set (Set)
import Data.Text (Text)
import Data.Tuple.Extra (both)
import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
import Gargantext.Core.NodeStory
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
import Gargantext.Database.Query.Table.Node (defaultList)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Data.HashSet as HashSet
import qualified Data.List    as List
import qualified Data.Map.Strict as Map
import qualified Data.Set     as Set
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap

{-
-- TODO maybe useful for later
isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  where
    isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
-}


-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: ( HasNodeStory env err m
                    , CmdM     env err m
                    , HasTreeError err
                    , HasNodeError err
                    )
                 => User
                 -> UserCorpusId
                 -> MasterCorpusId
                 -> Maybe FlowSocialListWith
                 -> GroupParams
                 -> m (Map NgramsType [NgramsElement])
buildNgramsLists user uCid mCid mfslw gp = do
  ngTerms     <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
  othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
                      [ (Authors   , MapListSize 9, MaxListSize 1000)
                      , (Sources   , MapListSize 9, MaxListSize 1000)
                      , (Institutes, MapListSize 9, MaxListSize 1000)
                      ]

  pure $ Map.unions $ [ngTerms] <> othersTerms
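
-- Usage sketch (not part of the original module; the identifiers someUser,
-- someUserCorpusId and someMasterCorpusId are hypothetical placeholders for
-- values obtained elsewhere in a flow action):
--
-- > lists <- buildNgramsLists someUser someUserCorpusId someMasterCorpusId Nothing GroupIdentity
-- > -- lists :: Map NgramsType [NgramsElement], one entry per NgramsType built above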

-- | Target number of terms promoted to the Map list.
data MapListSize = MapListSize { unMapListSize :: !Int }
-- | Hard cap on the total number of terms kept in a list.
data MaxListSize = MaxListSize { unMaxListSize :: !Int }

buildNgramsOthersList :: ( HasNodeError err
                         , CmdM     env err m
                         , HasNodeStory env err m
                         , HasTreeError err
                         )
                      => User
                      -> UserCorpusId
                      -> Maybe FlowSocialListWith
                      -> GroupParams
                      -> (NgramsType, MapListSize, MaxListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize, MaxListSize maxListSize) = do
  allTerms  :: HashMap NgramsTerm (Set NodeId) <- getContextsByNgramsUser uCid nt

  -- PrivateFirst for first developments since Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
                                                      $ HashMap.fromList
                                                      $ List.zip (HashMap.keys allTerms)
                                                                 (List.cycle [mempty])
                                    )
  let
    groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms

    -- terms already flagged by the social lists: stop terms are set aside first
    (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                           $ view flc_scores groupedWithList

    (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm)  . viewListType) tailTerms

    -- fill the Map list up to mapListSize with the best-scored remaining terms;
    -- the next ones (up to maxListSize) become candidates
    listSize = mapListSize - (List.length mapTerms)
    (mapTerms', candiTerms) = both HashMap.fromList
                            $ List.splitAt listSize
                            $ List.take maxListSize
                            $ List.sortOn (Down . viewScore . snd)
                            $ HashMap.toList tailTerms'


  pure $ Map.fromList [( nt, List.take maxListSize $ (toNgramsElement stopTerms)
                          <> (toNgramsElement mapTerms )
                          <> (toNgramsElement $ setListType (Just MapTerm      ) mapTerms' )
                          <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
                          )]
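
-- A minimal, self-contained sketch (not part of the original code) of the
-- "sort by score, then split" pattern used above, with plain Double scores
-- instead of grouped tree scores:
--
-- > let scores       = HashMap.fromList [("a", 3.0), ("b", 1.0), ("c", 2.0)]
-- >     (kept, rest) = both HashMap.fromList
-- >                  $ List.splitAt 2
-- >                  $ List.sortOn (Down . snd)
-- >                  $ HashMap.toList (scores :: HashMap Text Double)
-- > -- kept holds the two best-scored keys ("a" and "c"), rest holds "b".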


getGroupParams :: ( HasNodeError err
                  , CmdM     env err m
                  , HasNodeStory env err m
                  , HasTreeError err
                  )
               => GroupParams -> HashSet Ngrams -> m GroupParams
getGroupParams gp@(GroupWithPosTag l a _m) ng = do
  !hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
  -- printDebug "hashMap" hashMap
  pure $ over gwl_map (\x -> x <> hashMap) gp
getGroupParams gp _ = pure gp
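
-- Behaviour sketch (not part of the original code; the values are hypothetical):
-- for 'GroupWithPosTag', the lemma pairs returned by 'selectLems' are appended
-- to the existing grouping map; any other 'GroupParams' is returned unchanged.
--
-- > gp' <- getGroupParams gp (HashSet.fromList [text2ngrams "cats"])
-- > -- gp' now also maps "cats" to its lemma (e.g. "cat") in 'gwl_map'.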


-- TODO use ListIds
buildNgramsTermsList :: ( HasNodeError err
                        , CmdM     env err m
                        , HasNodeStory env err m
                        , HasTreeError err
                        )
                     => User
                     -> UserCorpusId
                     -> MasterCorpusId
                     -> Maybe FlowSocialListWith
                     -> GroupParams
                     -> (NgramsType, MapListSize)
                     -> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSize) = do

-- Filter 0 With Double
-- Computing global speGen score
  printDebug "[buildNgramsTermsList: Sample List / start]" nt
  !(allTerms :: HashMap NgramsTerm Double) <- getTficf_withSample uCid mCid nt

  printDebug "[buildNgramsTermsList: Sample List / end]" (nt, HashMap.size allTerms)

  printDebug "[buildNgramsTermsList: Flow Social List / start]" nt

  -- PrivateFirst for first developments since Public NodeMode is not implemented yet
  !(socialLists :: FlowCont NgramsTerm FlowListScores)
    <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
                                                      $ HashMap.fromList
                                                      $ List.zip (HashMap.keys   allTerms)
                                                                 (List.cycle     [mempty])
                                    )
  printDebug "[buildNgramsTermsList: Flow Social List / end]" nt

  let !ngramsKeys = HashSet.fromList
                  $ List.take mapListSize
                  $ HashSet.toList
                  $ HashMap.keysSet allTerms

  printDebug "[buildNgramsTermsList: ngramsKeys]" (HashSet.size ngramsKeys)

  !groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)

  printDebug "[buildNgramsTermsList: groupParams']" ("" :: Text)

  let
    !socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
    !groupedWithList = toGroupedTree socialLists_Stemmed allTerms
    !(stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                                 $ HashMap.filter (\g -> (view gts'_score g) > 1)
                                 $ view flc_scores groupedWithList

    !(groupedMono, groupedMult)  = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms

  printDebug "[buildNgramsTermsList] socialLists" socialLists
  printDebug "[buildNgramsTermsList] socialLists with scores" socialLists_Stemmed
  printDebug "[buildNgramsTermsList] groupedWithList" groupedWithList
  printDebug "[buildNgramsTermsList] stopTerms" stopTerms

  -- splitting mono-terms and multi-terms to take a proportional number of candidates
    -- use a % of the list if it is too big, or a fixed Int if it is too small
  let
    !listSizeGlobal = 2000 :: Double
    !monoSize = 0.4  :: Double
    !multSize = 1 - monoSize

    splitAt n' ns = both (HashMap.fromListWith (<>))
                     $ List.splitAt (round $ n' * listSizeGlobal)
                     $ List.sortOn (viewScore . snd)
                     $ HashMap.toList ns

    !(groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
    !(groupedMultHead, groupedMultTail)  = splitAt multSize groupedMult
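
    -- For illustration (not in the original): with listSizeGlobal = 2000 and
    -- monoSize = 0.4, splitAt keeps round (0.4 * 2000) = 800 mono-term groups in
    -- groupedMonoHead and round (0.6 * 2000) = 1200 multi-term groups in
    -- groupedMultHead; the remaining groups fall into the corresponding tails.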

-------------------------
-- Filter 1 With Set NodeId and SpeGen
    !selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)

  printDebug "[buildNgramsTermsList: selectedTerms]" selectedTerms

 -- TODO remove (and remove HasNodeError instance)
  !userListId    <- defaultList uCid
  !masterListId  <- defaultList mCid

  !mapTextDocIds <- getContextsByNgramsOnlyUser uCid
                                            [userListId, masterListId]
                                            nt
                                            selectedTerms


  printDebug "[buildNgramsTermsList: mapTextDocIds]" mapTextDocIds

  let
    groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
    !groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
                                $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)


  printDebug "[buildNgramsTermsList: groupedTreeScores_SetNodeId]" groupedTreeScores_SetNodeId

  -- Cooccurrences computation
  --, t1 >= t2 -- permute byAxis diag  -- since matrix symmetric
  let !mapCooc = HashMap.filter (>1) -- removing cooc of 1
              $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
                           | (t1, s1) <- mapStemNodeIds
                           , (t2, s2) <- mapStemNodeIds
                           ]
          where
            mapStemNodeIds = HashMap.toList
                           $ HashMap.map viewScores
                           $ groupedTreeScores_SetNodeId
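
  -- Illustration (not in the original): each entry of mapCooc counts the
  -- documents shared by two selected terms, cooc(t1, t2) = |docs(t1) ∩ docs(t2)|.
  -- E.g. if "cat" occurs in documents {1,2,3} and "dog" in {2,3,5}, then
  -- cooc("cat","dog") = 2; pairs with a count of 1 are dropped by the filter above.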
  let
    -- computing scores
    mapScores f = HashMap.fromList
                $ map (\g -> (view scored_terms g, f g))
                $ normalizeGlobal
                $ map normalizeLocal
                $ scored'
                $ Map.fromList -- TODO remove this
                $ HashMap.toList mapCooc
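
  -- Reading aid (not in the original): 'scored'' turns the cooccurrence matrix
  -- into one 'Scored' value per term; its 'scored_genInc' and 'scored_speExc'
  -- fields are used below as genericity/inclusion and specificity/exclusion
  -- measures when partitioning the terms.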
  let
    groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
    !groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId

  let
    -- sort / partition / split
    -- filter mono/multi again
    !(monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen

      -- filter with max score
    partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
                                                   > (view scored_speExc $ view gts'_score g)
                                              )

    !(monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
    !(multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored

  -- splitAt
  let
    -- use a % of the list if it is too big, or a fixed Int if it is too small
    !mapSize = 1000 :: Double
    !canSize = mapSize * 2 :: Double

    !inclSize = 0.4  :: Double
    !exclSize = 1 - inclSize

    splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
    sortOn   f       = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList

    monoInc_size n = splitAt' n $ monoSize * inclSize / 2
    multExc_size n = splitAt' n $ multSize * exclSize / 2


    !(mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
    !(mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl

    !(mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
    !(mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl


    !(canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
    !(canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail

    !(canMulScoredInclHead, _)  = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
    !(canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
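
    -- Worked example (not in the original): with mapSize = 1000, monoSize = 0.4
    -- and inclSize = 0.4, monoInc_size mapSize keeps round (0.4 * 0.4 / 2 * 1000)
    -- = 80 mono-terms per branch for the Map list, and monoInc_size canSize keeps
    -- round (0.4 * 0.4 / 2 * 2000) = 160 further ones as candidates.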

------------------------------------------------------------
    -- Final Step building the Typed list
    -- Candidates Terms need to be filtered
  let
    !maps = setListType (Just MapTerm)
        $  mapMonoScoredInclHead
        <> mapMonoScoredExclHead
        <> mapMultScoredInclHead
        <> mapMultScoredExclHead

    -- An original way to filter to start with
    !cands = setListType (Just CandidateTerm)
          $ canMonoScoredIncHead
          <> canMonoScoredExclHead
          <> canMulScoredInclHead
          <> canMultScoredExclHead

  -- TODO count it too
    !cands' = setListType (Just CandidateTerm)
          {-$  groupedMonoTail
          <>-} groupedMultTail

    -- Quick FIX
    !candNgramsElement = List.take 1000
                      $ toNgramsElement cands <> toNgramsElement cands'

    !result = Map.unionsWith (<>)
       [ Map.fromList [( nt, toNgramsElement maps
                          <> toNgramsElement stopTerms
                          <> candNgramsElement
                      )]
       ]

  pure result