Count.hs 5.92 KB
Newer Older
1
{-|
2
Module      : Gargantext.Text.Metrics.Count
3 4 5 6 7 8 9
Description : 
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

10 11 12 13 14 15 16 17 18 19 20 21 22
Token and occurrence

An occurrence is not necessarily a token. Considering the sentence:
"A rose is a rose is a rose". We may equally correctly state that there
are eight or three words in the sentence. There are, in fact, three word
types in the sentence: "rose", "is" and "a". There are eight word tokens
in a token copy of the line. The line itself is a type. There are not
eight word types in the line. It contains (as stated) only the three
word types, 'a', 'is' and 'rose', each of which is unique. So what do we
call what there are eight of? They are occurrences of words. There are
three occurrences of the word type 'a', two of 'is' and three of 'rose'.
Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences

23 24 25
-}

{-# LANGUAGE NoImplicitPrelude #-}
26 27
{-# LANGUAGE OverloadedStrings #-}

28
module Gargantext.Text.Metrics.Count
29
  where
30

31
import Data.Text (Text)
32
import Control.Arrow (Arrow(..), (***))
33
import qualified Data.List as List
34 35 36

import qualified Data.Map.Strict as DMS
import Data.Map.Strict  ( Map, empty, singleton
37
                        , insertWith, unionWith, unionsWith
38
                        , mapKeys
39
                        )
40
import Data.Set (Set)
41 42
import Data.Text (pack)

43

44 45 46 47 48 49 50
------------------------------------------------------------------------
import Gargantext.Prelude
import Gargantext.Core.Types
------------------------------------------------------------------------
type Occ  a = Map      a  Int
type Cooc a = Map (a,  a) Int
type FIS  a = Map (Set a) Int
51

52 53 54 55 56
data Group = ByStem | ByOntology

type Grouped = Stems


57
{-
58 59 60 61 62 63 64 65 66 67 68 69
-- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
-- >> map occurrences <$> Prelude.mapM (terms Mono EN) 
-- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
--λ:   cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
--fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
---- 
70 71
           -}

72 73
type Occs      = Int
type Coocs     = Int
74
type Threshold = Int
75

76
removeApax :: Threshold -> Map ([Text], [Text]) Int -> Map ([Text], [Text]) Int
77
removeApax t = DMS.filter (> t)
78

79
cooc :: [[Terms]] -> Map ([Text], [Text]) Int
80
cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
81
  where
82
    terms_occs = occurrencesOn _terms_stem (List.concat tss)
83
    label_policy = mkLabelPolicy terms_occs
84 85 86 87


coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
                                      -> [[a]] -> Map (label, label) Coocs
88
coocOnWithLabel on' policy tss = mapKeys (delta policy) $ coocOn on' tss
89
  where
90
    delta :: Arrow a => a b' c' -> a (b', b') (c', c')
91 92 93
    delta f = f *** f


94
mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped [Text]
95 96 97 98
mkLabelPolicy = DMS.map f where
  f = _terms_label . fst . maximumWith snd . DMS.toList
     -- TODO use the Foldable instance of Map instead of building a list

99
useLabelPolicy :: Map Grouped [Text] -> Grouped -> [Text]
100 101 102
useLabelPolicy m g = case DMS.lookup g m of
  Just label -> label
  Nothing    -> panic $ "Label of Grouped not found: " <> (pack $ show g)
103
                -- TODO: use a non-fatal error if this can happen in practice
104
{-
105 106 107 108
labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
labelPolicy m g =  case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
                     Just label -> label
                     Nothing    -> panic $ "Label of Grouped not found: " <> (pack $ show g)
109
-}
110

111
coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Int
112 113
coocOn f as = DMS.unionsWith (+) $ map (coocOn' f) as

114
coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Int
115 116 117 118 119 120 121 122 123
coocOn' fun ts = DMS.fromListWith (+) xs
  where
      ts' = List.nub $ map fun ts
      xs = [ ((x, y), 1)
           | x <- ts'
           , y <- ts'
           , x >= y
           ]

124

125 126
------------------------------------------------------------------------

127 128 129 130 131
coocOnContexts :: (a -> [Text]) -> [[a]] -> Map ([Text], [Text]) Int
coocOnContexts fun = DMS.fromListWith (+) . List.concat . map (coocOnSingleContext fun)

coocOnSingleContext :: (a -> [Text]) -> [a] -> [(([Text], [Text]), Int)]
coocOnSingleContext fun ts = xs
132
  where
133 134 135 136 137 138
      ts' = List.nub $ map fun ts
      xs = [ ((x, y), 1)
           | x <- ts'
           , y <- ts'
           , x >= y
           ]
139
------------------------------------------------------------------------
140 141 142


-- | Compute the grouped occurrences (occ)
143
occurrences :: [Terms] -> Map Grouped (Map Terms Int)
144
occurrences = occurrencesOn _terms_stem
145

146 147
occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
148

149 150 151
occurrencesWith :: (Foldable list, Ord k, Num a) => (b -> k) -> list b -> Map k a
occurrencesWith f xs = foldl' (\x y -> insertWith (+) (f y) 1 x) empty xs

152
-- TODO add groups and filter stops
153

154
sumOcc :: Ord a => [Occ a] -> Occ a
155
sumOcc xs = unionsWith (+) xs
156 157