Commit 9f29cddb authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIS] Frequent Item Set DSL.

parent b6df8e42
...@@ -84,6 +84,7 @@ library: ...@@ -84,6 +84,7 @@ library:
- http-api-data - http-api-data
- http-types - http-types
- hxt - hxt
- hlcm
- ini - ini
- jose-jwt - jose-jwt
- lens - lens
......
{-| {-|
Module : Gargantext.Ngrams Module : Gargantext.Ngrams
Description : Ngrams tools Description : Ngrams tools
Copyright : (c) CNRS, 2017 Copyright : (c) CNRS, 2018
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
...@@ -21,7 +21,7 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters ...@@ -21,7 +21,7 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
, module Gargantext.Ngrams.Occurrences , module Gargantext.Ngrams.Occurrences
, module Gargantext.Ngrams.TextMining , module Gargantext.Ngrams.TextMining
, module Gargantext.Ngrams.Metrics , module Gargantext.Ngrams.Metrics
, ngrams, occurrences , ngrams, occ, sumOcc, text2fis
--, module Gargantext.Ngrams.Words --, module Gargantext.Ngrams.Words
) where ) where
...@@ -30,26 +30,34 @@ import Gargantext.Ngrams.Letters ...@@ -30,26 +30,34 @@ import Gargantext.Ngrams.Letters
import Gargantext.Ngrams.CoreNLP import Gargantext.Ngrams.CoreNLP
import Gargantext.Ngrams.Parser import Gargantext.Ngrams.Parser
import Gargantext.Ngrams.Occurrences import Gargantext.Ngrams.Occurrences
import Gargantext.Ngrams.TextMining import Gargantext.Ngrams.TextMining
--import Gargantext.Ngrams.Words --import Gargantext.Ngrams.Words
import Gargantext.Ngrams.Metrics import Gargantext.Ngrams.Metrics
import qualified Gargantext.Ngrams.FrequentItemSet as FIS
----------------------------------------------------------------- -----------------------------------------------------------------
import Data.Char (Char, isAlpha, isSpace) import Data.Char (Char, isAlpha, isSpace)
import Data.Text (Text, words, filter, toLower) import Data.Text (Text, words, filter, toLower)
import Data.Map.Strict (Map, empty, insertWith) import Data.Map.Strict (Map, empty, keys
, insertWith, unionWith
, fromList
, lookupIndex
)
import qualified Data.Map.Strict as M (filter)
import Data.Foldable (foldl') import Data.Foldable (foldl')
import Gargantext.Prelude hiding (filter) import Gargantext.Prelude hiding (filter)
import qualified Data.List as L (filter)
-- Maybe useful later: -- Maybe useful later:
--import NLP.Stemmer (stem, Stemmer(..)) --import NLP.Stemmer (stem, Stemmer(..))
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions) --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..)) --import Language.Aspell.Options (ACOption(..))
type Occ = Int
type Index = Int
type FreqMin = Int
ngrams :: Text -> [Text] ngrams :: Text -> [Text]
ngrams xs = monograms $ toLower $ filter isGram xs ngrams xs = monograms $ toLower $ filter isGram xs
...@@ -59,12 +67,48 @@ monograms = words ...@@ -59,12 +67,48 @@ monograms = words
isGram :: Char -> Bool isGram :: Char -> Bool
isGram '-' = True isGram '-' = True
isGram '/' = True
isGram c = isAlpha c || isSpace c isGram c = isAlpha c || isSpace c
-- | Compute the occurrences -- | Compute the occurrences (occ)
occurrences :: Ord a => [a] -> Map a Int occ :: Ord a => [a] -> Map a Occ
occurrences xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
-- TODO add groups and filter stops
sumOcc :: Ord a => [Map a Occ] -> Map a Occ
sumOcc xs = foldl' (\x y -> unionWith (+) x y) empty xs
noApax :: Ord a => Map a Occ -> Map a Occ
noApax m = M.filter (>1) m
-- | /!\ indexes are not the same:
-- | Index ngrams from Map
indexNgram :: Ord a => Map a Occ -> Map Index a
indexNgram m = fromList (zip [1..] (keys m))
-- | Index ngrams from Map
ngramIndex :: Ord a => Map a Occ -> Map a Index
ngramIndex m = fromList (zip (keys m) [1..])
indexWith :: Ord a => Map a Occ -> [a] -> [Int]
indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
indexIt xs = (m, is)
where
m = sumOcc (map occ xs)
is = map (indexWith m) xs
list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
list2fis n xs = (m, fs)
where
(m, is) = indexIt xs
fs = FIS.all n is
text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fis n xs = list2fis n (map ngrams xs)
text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fisWith = undefined
{-|
Module : Gargantext.Ngrams.FrequentItemSet
Description : Ngrams tools
Copyright : (c) CNRS, 2018
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Domain Specific Language to manage Frequent Item Set (FIS)
-}
module Gargantext.Ngrams.FrequentItemSet
( Fis, Size
, occ, cooc
, all, between
, module HLCM
)
where
import Data.List (tail, filter)
import Data.Either
import HLCM
import Gargantext.Prelude
type Size = Either Int (Int, Int)
------------------------------------------------------------------------
-- | Occurrence is Frequent Item Set of size 1
occ :: Frequency -> [[Item]] -> [Fis]
occ f is = fisWithSize (Left 1) f is
-- | Cooccurrence is Frequent Item Set of size 2
cooc :: Frequency -> [[Item]] -> [Fis]
cooc f is = fisWithSize (Left 2) f is
all :: Frequency -> [[Item]] -> [Fis]
all f is = fisWith Nothing f is
------------------------------------------------------------------------
between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
between (x,y) f is = fisWithSize (Right (x,y)) f is
maximum :: Int -> Frequency -> [[Item]] -> [Fis]
maximum m f is = between (0,m) f is
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Data type to type the Frequent Item Set
-- TODO replace List with Set in fisItemSet
-- be careful : risks to erase HLCM behavior
type Fis = Fis' Item
data Fis' a = Fis' { _fisCount :: Int
, _fisItemSet :: [a]
} deriving (Show)
-- | Sugar from items to FIS
items2fis :: [Item] -> Maybe Fis
items2fis is = case head is of
Nothing -> Nothing
Just h -> Just (Fis' h (tail is))
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
fisWithSize n f is = case n of
Left n' -> fisWith (Just (\x -> length x == (n'+1) )) f is
Right (a,b) -> fisWith (Just (\x -> cond1 a x && cond2 b x)) f is
where
cond1 a' x = length x >= a'
cond2 b' x = length x <= b'
fisWith :: Maybe ([Item] -> Bool) -> Frequency -> [[Item]] -> [Fis]
fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f
where
filter' = case s of
Nothing -> identity
Just fun -> filter fun
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
...@@ -15,17 +15,19 @@ module Gargantext.Prelude ...@@ -15,17 +15,19 @@ module Gargantext.Prelude
) )
where where
import Data.Maybe (isJust, fromJust)
import Protolude ( Bool(True, False), Int, Double, Integer import Protolude ( Bool(True, False), Int, Double, Integer
, Fractional, Num, Maybe(Just,Nothing) , Fractional, Num, Maybe(Just,Nothing)
, Floating, Char, IO , Floating, Char, IO
, pure, (<$>), panic , pure, (<$>), panic
, head, flip
, Ord, Integral, Foldable, RealFrac, Monad, filter , Ord, Integral, Foldable, RealFrac, Monad, filter
, reverse, map, zip, drop, take, zipWith , reverse, map, zip, drop, take, zipWith
, sum, fromIntegral, length, fmap , sum, fromIntegral, length, fmap
, takeWhile, sqrt, undefined, identity , takeWhile, sqrt, undefined, identity
, abs, maximum, minimum, return, snd, truncate , abs, maximum, minimum, return, snd, truncate
, (+), (*), (/), (-), (.), (>=), ($), (**), (^), (<), (>) , (+), (*), (/), (-), (.), ($), (**), (^), (<), (>)
, Eq, (==), (<>) , Eq, (==), (>=), (<=), (<>)
, (&&), (||), not , (&&), (||), not
, fst, snd, toS , fst, snd, toS
) )
...@@ -208,3 +210,9 @@ zipFst f xs = zip (f xs) xs ...@@ -208,3 +210,9 @@ zipFst f xs = zip (f xs) xs
zipSnd :: ([a] -> [b]) -> [a] -> [(a, b)] zipSnd :: ([a] -> [b]) -> [a] -> [(a, b)]
zipSnd f xs = zip xs (f xs) zipSnd f xs = zip xs (f xs)
-- Just
unMaybe :: [Maybe a] -> [a]
unMaybe = map fromJust . L.filter isJust
...@@ -6,6 +6,8 @@ allow-newer: true ...@@ -6,6 +6,8 @@ allow-newer: true
extra-deps: extra-deps:
- git: https://github.com/delanoe/data-time-segment.git - git: https://github.com/delanoe/data-time-segment.git
commit: 4e3d57d80e9dfe6624c8eeaa8595fc8fe64d8723 commit: 4e3d57d80e9dfe6624c8eeaa8595fc8fe64d8723
- git: https://gitlab.iscpif.fr/gargantext/hlcm.git
commit: 6f0595d2421005837d59151a8b26eee83ebb67b5
- git: https://github.com/delanoe/servant-static-th.git - git: https://github.com/delanoe/servant-static-th.git
commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434 commit: fff77e79fe94d563ab5cae2609b78c17b5c1f434
- aeson-1.2.4.0 - aeson-1.2.4.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment