1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
{-|
Module      : Gargantext.Text.Metrics.FrequentItemSet
Description : Ngrams tools
Copyright   : (c) CNRS, 2018
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Domain Specific Language to manage Frequent Item Sets (FIS).

-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Metrics.FrequentItemSet
( Fis, Size(..)
, occ_hlcm, cooc_hlcm
, allFis, between
, fisWithSize
, fisWith
, fisWithSizePoly
, fisWithSizePoly2
, fisWithSizePolyMap
, module HLCM
)
where
import Prelude (Functor(..)) -- TODO
import qualified Data.Map.Strict as Map
import Data.Map.Strict (Map)
import qualified Data.Set as Set
import Data.Set (Set)
import qualified Data.Vector as V
import Data.List (filter, concat, null)
import Data.Maybe (catMaybes)
import HLCM
import Gargantext.Prelude
-- | Size constraint used to select frequent item sets.
--
-- How the numbers are compared against the mined rows is decided by
-- 'fisWithSize'.
data Size
  = Point Int        -- ^ one single size
  | Segment Int Int  -- ^ a range of sizes: lower bound, then upper bound
------------------------------------------------------------------------
-- | Occurrences: the frequent item sets of size 1.
--
-- Thin wrapper around 'fisWithSize' with @Point 1@; the 'Frequency' and the
-- rows of items are forwarded unchanged.
occ_hlcm :: Frequency -> [[Item]] -> [Fis]
occ_hlcm freq rows = fisWithSize (Point 1) freq rows
-- | Cooccurrences: the frequent item sets of size 2.
--
-- Thin wrapper around 'fisWithSize' with @Point 2@; the 'Frequency' and the
-- rows of items are forwarded unchanged.
cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
cooc_hlcm freq rows = fisWithSize (Point 2) freq rows
-- | Every frequent item set found, without any size filter
-- (delegates to 'fisWith' with no predicate).
allFis :: Frequency -> [[Item]] -> [Fis]
allFis freq rows = fisWith Nothing freq rows
------------------------------------------------------------------------
-- | Frequent item sets selected by a pair of bounds.
--
-- The pair is turned into a 'Segment' (first component, then second) and
-- handed to 'fisWithSize', which decides how the bounds are interpreted.
between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
between bounds = case bounds of
  (lo, hi) -> fisWithSize (Segment lo hi)
--maximum :: Int -> Frequency -> [[Item]] -> [Fis]
--maximum m = between (0,m)
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Frequent item set specialised to HLCM's 'Item'.
--
-- TODO: replace the list with a Set in '_fisItemSet'.
-- Be careful: this risks erasing HLCM behavior.
type Fis = Fis' Item

-- | A frequent item set: a count together with the items of the set.
data Fis' a = Fis'
  { _fisCount   :: Int  -- ^ count attached to the item set
  , _fisItemSet :: [a]  -- ^ the items of the set, kept as a list
  } deriving (Show)
-- | Mapping over a 'Fis'' transforms every item of the set and leaves the
-- count untouched.
instance Functor Fis' where
  fmap g (Fis' { _fisCount = count, _fisItemSet = items }) =
    Fis' { _fisCount = count, _fisItemSet = g <$> items }
-- | Sugar from a raw row of items to a 'Fis'.
--
-- The head of the row becomes the count and the remaining elements become
-- the item set; an empty row gives 'Nothing'.
items2fis :: [Item] -> Maybe Fis
items2fis row = case row of
  count : members -> Just (Fis' { _fisCount = count, _fisItemSet = members })
  []              -> Nothing
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Mine the frequent item sets and keep those matching a 'Size' constraint.
--
-- The predicate given to 'fisWith' runs on the raw mined rows, whose head is
-- the count and whose tail is the item set (see 'items2fis').  Sizes are
-- therefore measured on the tail only, for both constructors:
--
-- * @Point k@ keeps the sets made of exactly @k@ items;
-- * @Segment lo hi@ keeps the sets made of @lo@ to @hi@ items, inclusive.
--
-- Previously only @Point@ compensated for the leading count (@k + 1@), while
-- @Segment@ compared its bounds with the full row length and so selected
-- sets one item smaller than requested.
fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
fisWithSize n f is = fisWith (Just keep) f is
  where
    keep row = fits (length (itemsOf row))

    -- Strip the leading count.  An empty row has no count and is discarded
    -- later by 'items2fis', so treating it as "no items" is harmless.
    itemsOf row = case row of
      _ : items -> items
      []        -> []

    -- No arithmetic on the bounds, so extreme values cannot overflow.
    fits k = case n of
      Point n'    -> k == n'
      Segment a b -> a <= k && k <= b
-- | Run the HLCM miner and turn its rows into 'Fis' values, optionally
-- filtering the rows first.
--
-- Empty input rows are removed before mining; if nothing is left the result
-- is @[]@ and the miner is not called at all.  The optional predicate is
-- applied to the raw mined rows, i.e. before 'items2fis' splits off the
-- count.
--
-- TODO (original notes): filter on 'Fis' and not on @[Item]@; drop unMaybe.
fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis]
fisWith s f is = case nonEmptyRows of
    [] -> []
    _  -> catMaybes (map items2fis (select (runLCMmatrix nonEmptyRows f)))
  where
    nonEmptyRows = filter (not . null) is

    -- Without a predicate every mined row is kept.
    select mined = case s of
      Just keep -> filter keep mined
      Nothing   -> mined
-- | 'fisWithSize' for any ordered element type.
--
-- Every element is encoded as its position in the ascending list of @ks@,
-- the encoded rows are mined with 'fisWithSize', and the items of each
-- result are decoded back through the same table.
--
-- The keys are taken as a 'Set' only to state that duplicates are not
-- wanted.  An element of the rows that is missing from @ks@ makes the
-- 'Map.!' lookup fail.
fisWithSizePoly :: Ord a => Size -> Frequency -> Set a -> [[a]] -> [Fis' a]
fisWithSizePoly n f ks rows =
    [ fmap decode fis | fis <- fisWithSize n f encodedRows ]
  where
    keys        = Set.toList ks
    decodeTable = V.fromList keys
    encodeTable = Map.fromList (zip keys [0..])

    encode k = encodeTable Map.! k
    decode i = decodeTable V.! i

    encodedRows = [ [ encode k | k <- row ] | row <- rows ]
-- | Like 'fisWithSizePoly', but the key set is computed from the rows
-- themselves (every element occurring in any row).
fisWithSizePoly2 :: Ord a => Size -> Frequency -> [[a]] -> [Fis' a]
fisWithSizePoly2 n f is = fisWithSizePoly n f (Set.fromList (concat is)) is
-- | Result of 'fisWithSizePoly2' as a map from each item set (as a 'Set')
-- to its count.
--
-- Built with 'Map.fromList', so if several results share the same set of
-- items the last one in the list wins.
fisWithSizePolyMap :: Ord a => Size -> Frequency -> [[a]] -> Map (Set a) Int
fisWithSizePolyMap n f is =
  Map.fromList
    [ (Set.fromList (_fisItemSet fis), _fisCount fis)
    | fis <- fisWithSizePoly2 n f is
    ]
------------------------------------------------------------------------
------------------------------------------------------------------------
--
---- | /!\ indexes are not the same:
--
---- | Index ngrams from Map
----indexNgram :: Ord a => Map a Occ -> Map Index a
----indexNgram m = fromList (zip [1..] (keys m))
--
---- | Index ngrams from Map
----ngramIndex :: Ord a => Map a Occ -> Map a Index
----ngramIndex m = fromList (zip (keys m) [1..])
--
--indexWith :: Ord a => Map a Occ -> [a] -> [Int]
--indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
--
--indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
--indexIt xs = (m, is)
-- where
-- m = sumOcc (map occ xs)
-- is = map (indexWith m) xs
--
--list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
--list2fis n xs = (m', fs)
-- where
-- (m, is) = indexIt xs
-- m' = M.filter (>50000) m
-- fs = FIS.all n is
--
--text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fis n xs = list2fis n (map terms xs)
--
----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
----text2fisWith = undefined
--