Commit cddf0f9e authored by Alexandre Delanoë

[FEAT] [LEARN] REFACTO.

parent 2ecc0288
@@ -69,7 +69,7 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
langText (OnlyText t2 ) = t2
langText (ArrayText ts ) = Text.intercalate " " $ map langText ts
-(utcTime, (pub_year, pub_month, pub_day)) <- Date.split l (maybe (Just "2019") (Just) d)
+(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit l (maybe (Just "2019") (Just) d)
pure $ HyperdataDocument (Just "IsidoreApi")
Nothing
...
@@ -53,7 +53,7 @@ import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import qualified Gargantext.Text.Corpus.Parsers.Date as Date
import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseCsv)
-import Gargantext.Text.Terms.Stop (detectLang)
+import Gargantext.Text.Terms.Learn (detectLangDefault)
------------------------------------------------------------------------
type ParseError = String
@@ -97,11 +97,11 @@ toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
let abstract = lookup "abstract" d
-let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
+let lang = maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
-(utcTime, (pub_year, pub_month, pub_day)) <- Date.split lang dateToParse
+(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
pure $ HyperdataDocument (Just $ DT.pack $ show ff)
(lookup "doi" d)
...
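The new call keeps the old fallback behaviour: EN is used whenever the abstract is missing or detection is inconclusive. A minimal sketch of that idiom in isolation (hypothetical helper, not part of the commit; Lang/EN, identity and join are assumed to be in scope via Gargantext's prelude and core types):

import Data.Text (Text)
import qualified Data.Text as DT
import Gargantext.Text.Terms.Learn (detectLangDefault)

-- Hypothetical helper (sketch only): guess the language of an optional
-- abstract from its first 50 characters, defaulting to EN on failure.
langOrDefault :: Maybe Text -> Lang
langOrDefault abstract =
  maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))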
@@ -18,7 +18,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
-module Gargantext.Text.Corpus.Parsers.Date (parse, parseRaw, split) where
+module Gargantext.Text.Corpus.Parsers.Date (parse, parseRaw, dateSplit) where
import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn, pack)
@@ -40,9 +40,9 @@ import qualified Duckling.Core as DC
------------------------------------------------------------------------
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
-split :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
-split _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
-split l (Just txt) = do
+dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
+dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
+dateSplit l (Just txt) = do
utcTime <- parse l txt
let (y, m, d) = split' utcTime
pure (Just utcTime, (Just y, Just m,Just d))
...
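For reference, a minimal usage sketch of the renamed entry point (not part of the commit; it mirrors the FR example quoted in the module header above and assumes FR :: Lang is in scope):

{-# LANGUAGE OverloadedStrings #-}
import qualified Gargantext.Text.Corpus.Parsers.Date as Date

-- Sketch: dateSplit returns the full UTCTime together with its (year, month, day) parts.
dateDemo :: IO ()
dateDemo = do
  (mUtc, ymd) <- Date.dateSplit FR (Just "12 avril 2010")
  print (mUtc, ymd)   -- expected: the 2010-04-12 timestamp and (Just 2010, Just 4, Just 12)
  noDate <- Date.dateSplit FR Nothing
  print noDate        -- (Nothing, (Nothing, Nothing, Nothing))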
@@ -22,4 +22,89 @@ import Data.String (String)
textSample :: String
textSample = "Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning. Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 'High quality' in text mining usually refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities). Text analysis involves information retrieval, lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, information extraction, data mining techniques including link and association analysis, visualization, and predictive analytics. The overarching goal is, essentially, to turn text into data for analysis, via application of natural language processing (NLP) and analytical methods. A typical application is to scan a set of documents written in a natural language and either model the document set for predictive classification purposes or populate a database or search index with the information extracted."
stopList :: [String]
stopList =
["a", "a's", "able", "about", "above", "according", "accordingly"
, "across", "actually", "after", "afterwards", "again", "against"
, "ain't", "all", "allow", "allows", "almost", "alone", "along"
, "already", "also", "although", "always", "am", "among", "amongst", "an"
, "analyze", "and", "another", "any", "anybody", "anyhow", "anyone"
, "anything", "anyway", "anyways", "anywhere", "apart", "appear"
, "apply", "appreciate", "appropriate", "are", "aren't", "around"
, "as", "aside", "ask", "asking", "associated", "at", "available"
, "away", "awfully", "b", "based", "be", "became", "because", "become"
, "becomes", "becoming", "been", "before", "beforehand", "behind"
, "being", "believe", "below", "beside", "besides", "best", "better"
, "between", "beyond", "both", "brief", "but", "by", "c", "c'mon", "c's"
, "came", "can", "can't", "cannot", "cant", "cause", "causes", "certain"
, "certainly", "changes", "clearly", "co", "com", "come", "comes"
, "common", "concerning", "consequently", "consider", "considering"
, "contain", "containing", "contains", "corresponding", "could"
, "couldn't", "course", "currently", "d", "definitely", "described"
, "despite", "detecting", "detects", "did", "didn't", "different", "do"
, "does", "doesn't", "doing", "don't", "done", "down", "downwards"
, "during", "e", "each", "edu", "eg", "eight", "either", "else"
, "elsewhere", "enough", "entirely", "especially", "et", "etc", "even"
, "ever", "every", "everybody", "everyone", "everything", "everywhere"
, "ex", "exactly", "example", "except", "f", "far", "few", "fifth"
, "find", "first", "five", "followed", "following", "follows", "for"
, "former", "formerly", "forth", "four", "from", "further", "furthermore"
, "g", "get", "gets", "getting", "gif", "given", "gives", "go", "goes"
, "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't"
, "happens", "hardly", "has", "hasn't", "have", "haven't", "having"
, "he", "he'd", "he'll", "he's", "hello", "help", "hence", "her"
, "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers"
, "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how"
, "how's", "howbeit", "however", "i", "i'd", "i'll", "i'm", "i've"
, "identify", "ie", "if", "ignored", "immediate", "in", "inasmuch"
, "inc", "indeed", "indicate", "indicated", "indicates", "inner"
, "insofar", "instead", "into", "involves", "inward", "is", "isn't"
, "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k"
, "keep", "keeps", "kept", "know", "known", "knows", "l", "last"
, "late", "lately", "later", "latter", "latterly", "least", "less"
, "lest", "let", "let's", "like", "liked", "likely", "little", "look"
, "looking", "looks", "ltd", "m", "main", "mainly", "many", "may"
, "maybe", "me", "mean", "meanwhile", "merely", "might", "min", "more"
, "moreover", "most", "mostly", "much", "must", "mustn't", "my", "myself"
, "n", "name", "namely", "nd", "near", "nearly", "necessary", "need"
, "needs", "neither", "never", "nevertheless", "new", "next", "nine"
, "no", "nobody", "non", "none", "noone", "nor", "normally", "not"
, "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off"
, "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones"
, "only", "onto", "or", "other", "others", "otherwise", "ought", "our"
, "ours", "ourselves", "out", "outside", "over", "overall", "own", "p"
, "particular", "particularly", "per", "perhaps", "placed", "please"
, "plus", "possible", "presents", "presumably", "probably", "provides"
, "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really"
, "reasonably", "regarding", "regardless", "regards", "relatively"
, "respectively", "right", "s", "said", "same", "saw", "say", "saying"
, "says", "sds", "second", "secondly", "see", "seeing", "seem", "seemed"
, "seeming", "seems", "seen", "self", "selves", "sensible", "sent"
, "serious", "seriously", "seven", "several", "shall", "shan't"
, "she", "she'd", "she'll", "she's", "should", "shouldn't", "since"
, "six", "so", "some", "somebody", "somehow", "someone", "something"
, "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry"
, "specified", "specify", "specifying", "still", "sub", "such", "sup"
, "sure", "t", "t's", "take", "taken", "tell", "tends", "th", "than"
, "thank", "thanks", "thanx", "that", "that's", "thats", "the", "their"
, "theirs", "them", "themselves", "then", "thence", "there", "there's"
, "thereafter", "thereby", "therefore", "therein", "theres", "thereupon"
, "these", "they", "they'd", "they'll", "they're", "they've", "think"
, "third", "this", "thorough", "thoroughly", "those", "though", "three"
, "through", "throughout", "thru", "thus", "to", "together", "too"
, "took", "toward", "towards", "tried", "tries", "truly", "try"
, "trying", "twice", "two", "u", "un", "under", "unfortunately"
, "unless", "unlikely", "until", "unto", "up", "upon", "us", "use"
, "used", "useful", "uses", "using", "usually", "uucp", "v", "value"
, "various", "very", "via", "viz", "vs", "w", "want", "wants", "was"
, "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "welcome"
, "well", "went", "were", "weren't", "what", "what's", "whatever", "when"
, "when's", "whence", "whenever", "where", "where's", "whereafter"
, "whereas", "whereby", "wherein", "whereupon", "wherever", "whether"
, "which", "while", "whither", "who", "who's", "whoever", "whole", "whom"
, "whose", "why", "why's", "will", "willing", "wish", "with", "within"
, "without", "won't", "wonder", "would", "wouldn't", "x", "y", "yes"
, "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours"
, "yourself", "yourselves", "z", "zero"]
@@ -16,14 +16,10 @@ Main type here is String.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
-module Gargantext.Text.Terms.Stop -- (detectLang, detectLangs, stopList)
+module Gargantext.Text.Terms.Learn -- (detectLang, detectLangs, stopList)
where
-import GHC.Base (Functor)
-import Numeric.Probability.Distribution ((??))
-import qualified Numeric.Probability.Distribution as D
-import Data.Char (toLower)
+--import Data.Char (toLower)
import qualified Data.List as DL
import Data.Maybe (maybe)
@@ -33,7 +29,7 @@ import qualified Data.Map.Strict as DM
import Data.String (String)
import Data.Text (Text)
-import Data.Text (pack, unpack)
+import Data.Text (pack, unpack, toLower)
import Data.Tuple.Extra (both)
import Gargantext.Prelude
@@ -52,34 +48,6 @@ data Candidate = Candidate { stop :: Double
, noStop :: Double
} deriving (Show)
-- * String preparation
-- | String prepare
blanks :: String -> String
blanks [] = []
blanks xs = [' '] <> xs <> [' ']
-- | Blocks increase the size of the word to ease computations
-- some border and unexepected effects can happen, need to be tested
blockOf :: Int -> String -> String
blockOf n = DL.concat . DL.take n . DL.repeat
-- | Chunks is the same function as splitBy in Context but for Strings,
-- not Text (without pack and unpack operations that are not needed).
chunks :: Int -> Int -> String -> [String]
chunks n m = DL.take m . filter (not . all (== ' '))
. chunkAlong (n+1) 1
. DL.concat
. DL.take 1000
. DL.repeat
. blanks
allChunks :: [Int] -> Int -> String -> [String]
allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
allChunks' :: [Int] -> Int -> String -> [[String]]
allChunks' ns m st = map (\n -> chunks n m st) ns
------------------------------------------------------------------------
-- * Analyze candidate
type StringSize = Int
@@ -87,119 +55,121 @@ type TotalFreq = Int
type Freq = Int
type Word = String
-data LangWord = LangWord Lang Word
+data CatWord a = CatWord a Word
+type CatProb a = Map a Double
-type LangProba = Map Lang Double
+type Events a = Map a EventBook
------------------------------------------------------------------------
-detectLang :: Text -> Maybe Lang
-detectLang = head . map fst . detectLangs
+detectLangDefault :: Text -> Maybe Lang
+detectLangDefault = detectCat 99 eventLang
+where
+eventLang :: Events Lang
+eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
+langWord :: Lang -> CatWord Lang
+langWord l = CatWord l (textSample l)
-detectLangs :: Text -> [(Lang, Double)]
-detectLangs = detectLangs' . unpack
+textSample :: Lang -> String
+textSample EN = EN.textSample
+textSample FR = FR.textSample
+--textSample DE = DE.textSample
+--textSample SP = SP.textSample
+--textSample CH = CH.textSample
-detectLangs' :: String -> [(Lang, Double)]
-detectLangs' s = DL.reverse $ DL.sortOn snd
-$ toList
-$ detect (wordsToBook [0..2] s) eventLang
+detectStopDefault :: Text -> Maybe Bool
+detectStopDefault = undefined
-part :: (Eq p, Fractional p) => p -> p -> p
-part 0 _ = 0
-part _ 0 = 0
-part x y = x / y
+detectDefault :: [(Bool, Text)] -> Text -> Maybe Bool
+detectDefault events = detectCat 99 (priorEvents events)
+where
+priorEvents events' = toEvents True [0..2] 10 (map (\(a,b) -> CatWord a (unpack $ toLower b)) events')
------------------------------------------------------------------------
detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
detectCat n es = head . map fst . (detectCat' n es) . unpack
where
detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
detectCat' n' es' s = DL.reverse $ DL.sortOn snd
$ toList
$ detectWith n' es' (wordsToBook [0..2] n' s)
detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
detectWith n'' el (EventBook mapFreq _) =
DM.unionsWith (+)
$ map DM.fromList
$ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
$ filter (\x -> fst x /= " ")
$ DM.toList mapFreq
-- | TODO: monoids (but proba >= 0)
toPrior :: Int -> String -> Events a -> [(a, Double)]
toPrior n'' s el = prior n'' $ pebLang s el
where
pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
peb :: String -> EventBook -> (Freq, TotalFreq)
peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
where
a = maybe 0 identity $ DM.lookup st mapFreq
b = maybe 1 identity $ DM.lookup (length st) mapN
prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
(map (\(a,b) -> a / b) ps')
where
(ls, ps'') = DL.unzip ps
ps' = map (both fromIntegral) ps''
part :: (Eq p, Fractional p) => p -> p -> p
part 0 _ = 0
part _ 0 = 0
part x y = x / y
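detectCat is the generic core: it scores a text against each category's EventBook and keeps the best match. detectLangDefault is just detectCat applied to the bundled language samples, and detectDefault builds the same machinery from ad-hoc labelled examples. A minimal sketch of the intended calls (not part of the commit; outputs depend on the training samples and are only indicative):

{-# LANGUAGE OverloadedStrings #-}
import Gargantext.Text.Terms.Learn (detectLangDefault, detectDefault)

learnDemo :: IO ()
learnDemo = do
  -- language guess from a short snippet (e.g. Just EN; assumes Lang has a Show instance)
  print $ detectLangDefault "the overarching goal is to turn text into data for analysis"
  -- a Bool classifier trained on two hand-labelled snippets
  let labelled = [ (True , "le la les des une un du")
                 , (False, "the of and to in is are") ]
  print $ detectDefault labelled "des les du la"   -- e.g. Just True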
{-
toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
t (a, b) -> t (a, b)
toProba xs = map (\(a,b) -> (a, part b total)) xs
where
total = sum $ map snd xs
-}
textSample :: Lang -> String
textSample EN = EN.textSample
textSample FR = FR.textSample
--textSample DE = DE.textSample
--textSample SP = SP.textSample
--textSample CH = CH.textSample
langWord :: Lang -> LangWord
langWord l = LangWord l (textSample l)
eventLang :: EventLang
eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]
detect :: EventBook -> EventLang -> LangProba
detect (EventBook mapFreq _) el =
DM.unionsWith (+)
$ map DM.fromList
$ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
$ filter (\x -> fst x /= " ")
$ DM.toList mapFreq
------------------------------------------------------------------------
-- | TODO: monoids
-type EventLang = Map Lang EventBook
+toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
+toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
toEventLangs :: [Int] -> [LangWord] -> EventLang
toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
emptyEventLang :: [Int] -> EventLang
emptyEventLang ns = toLang ns (LangWord FR "")
toLang :: [Int] -> LangWord -> EventLang
toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
opLang f = DM.unionWith (op f)
------------------------------------------------------------------------
-- | TODO: monoids (but proba >= 0)
peb :: String -> EventBook -> Double
peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
where
-a = maybe 0 identity $ DM.lookup st mapFreq
-b = maybe 1 identity $ DM.lookup (length st) mapN
+emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
+emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
-peb' :: String -> EventBook -> (Freq, TotalFreq)
-peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
+toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
+toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
where
a = maybe 0 identity $ DM.lookup st mapFreq
b = maybe 1 identity $ DM.lookup (length st) mapN
-------------------------------------------------------------------------
-toPrior :: String -> EventLang -> [(Lang, Double)]
+opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
+opEvent f = DM.unionWith (op f)
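The net effect of this block is that the old per-language EventLang becomes Events a for any ordered category type. A small hypothetical example of building such a table over invented categories, with the same n-gram sizes [0..2] and chunk budget 10 as eventLang above:

import Gargantext.Text.Terms.Learn (Events, CatWord(..), toEvents)

-- Hypothetical categories, purely for illustration.
data Topic = Sport | Cooking deriving (Eq, Ord, Show)

topicEvents :: Events Topic
topicEvents = toEvents Sport [0..2] 10
  [ CatWord Sport   "match goal team score player referee"
  , CatWord Cooking "oven flour butter sugar bake whisk" ]

-- detectCat 99 topicEvents "bake a sugar tart" should then return the closer
-- category (expected: Just Cooking, though that depends on the tiny samples).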
toPrior s el = prior $ pebLang s el
pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
pebLang st = map (\(l,eb) -> (l, peb' st eb)) . DM.toList
------------------------------------------------------------------------
prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
prior ps = zip ls $ zipWith (\x y -> x^(99::Int) * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
(map (\(a,b) -> a / b) ps')
where
(ls, ps'') = DL.unzip ps
ps' = map (both fromIntegral) ps''
------------------------------------------------------------------------
data EventBook = EventBook { events_freq :: Map String Freq
, events_n :: Map StringSize TotalFreq
}
deriving (Show)
-emptyEventBook :: [Int] -> EventBook
-emptyEventBook ns = wordToBook ns " "
-wordsToBook :: [Int] -> String -> EventBook
-wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
+emptyEventBook :: [Int] -> Int -> EventBook
+emptyEventBook ns n = wordToBook ns n " "
+wordsToBook :: [Int] -> Int -> String -> EventBook
+wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
where
ws = map unpack $ words $ pack txt
-eventsBook = map (wordToBook ns) ws
+eventsBook = map (wordToBook ns n) ws
-wordToBook :: [Int] -> Word -> EventBook
-wordToBook ns txt = EventBook ef en
+wordToBook :: [Int] -> Int -> Word -> EventBook
+wordToBook ns n txt = EventBook ef en
where
-chks = allChunks' ns 10 txt
-en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
+chks = allChunks ns n txt
+en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
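An EventBook is the per-category statistic everything above is built on: counts of character chunks of the sizes listed in ns, with at most n chunks of each size kept per word. A short sketch (not part of the commit):

import Gargantext.Text.Terms.Learn (EventBook(..), wordsToBook)

-- Sketch: count 1-, 2- and 3-character chunks over a two-word string,
-- keeping at most 10 chunks per word, as eventLang does above.
book :: EventBook
book = wordsToBook [0..2] 10 "text mining"

-- events_freq book maps each chunk (for instance " t", "te", "ex") to its frequency;
-- events_n book maps each size index in ns to how many chunks of that size were kept.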
@@ -209,19 +179,49 @@ op f (EventBook ef1 en1)
------------------------------------------------------------------------
------------------------------------------------------------------------
allChunks :: [Int] -> Int -> String -> [[String]]
allChunks ns m st = map (\n -> chunks n m st) ns
-- | Chunks is the same function as splitBy in Context but for Strings,
-- not Text (without pack and unpack operations that are not needed).
chunks :: Int -> Int -> String -> [String]
chunks n m = DL.take m . filter (not . all (== ' '))
. chunkAlong (n+1) 1
. DL.concat
. DL.take 1000
. DL.repeat
. blanks
-- | String preparation
blanks :: String -> String
blanks [] = []
blanks xs = [' '] <> xs <> [' ']
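These helpers are what wordToBook relies on: blanks pads a word with spaces so border chunks are counted too, and chunks n m slides a window of n+1 characters over the padded word, keeping at most m non-blank chunks. A tiny sketch (not part of the commit):

import Gargantext.Text.Terms.Learn (chunks)

-- Sketch: padded bigrams of a single word (window size n+1 = 2, at most 10 kept).
bigramsOfAbc :: [String]
bigramsOfAbc = chunks 1 10 "abc"
-- contains chunks such as " a", "ab", "bc", "c " from the blank-padded word " abc "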
{-
-- Some previous tests to be removed
--import GHC.Base (Functor)
--import Numeric.Probability.Distribution ((??))
--import qualified Numeric.Probability.Distribution as D
-- | Blocks increase the size of the word to ease computations
-- some border and unexepected effects can happen, need to be tested
blockOf :: Int -> String -> String
blockOf n = DL.concat . DL.take n . DL.repeat
-- * Make the distributions
makeDist :: [String] -> D.T Double String
-makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
+makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
stopDist :: D.T Double String
-stopDist = makeDist stopList
+stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
candDist :: D.T Double String
candDist = makeDist candList
------------------------------------------------------------------------
sumProba :: Num a => D.T a String -> [Char] -> a
-sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
+sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
-- | Get probability according a distribution
(~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
@@ -236,93 +236,4 @@ candList :: [String]
candList = [ "france", "alexandre", "mael", "constitution"
, "etats-unis", "associes", "car", "train", "spam"]
--}
stopList :: [String]
stopList = map show ([0..9]::[Int]) <> stopListWords
stopListWords :: [String]
stopListWords =
["a", "a's", "able", "about", "above", "according", "accordingly"
, "across", "actually", "after", "afterwards", "again", "against"
, "ain't", "all", "allow", "allows", "almost", "alone", "along"
, "already", "also", "although", "always", "am", "among", "amongst", "an"
, "analyze", "and", "another", "any", "anybody", "anyhow", "anyone"
, "anything", "anyway", "anyways", "anywhere", "apart", "appear"
, "apply", "appreciate", "appropriate", "are", "aren't", "around"
, "as", "aside", "ask", "asking", "associated", "at", "available"
, "away", "awfully", "b", "based", "be", "became", "because", "become"
, "becomes", "becoming", "been", "before", "beforehand", "behind"
, "being", "believe", "below", "beside", "besides", "best", "better"
, "between", "beyond", "both", "brief", "but", "by", "c", "c'mon", "c's"
, "came", "can", "can't", "cannot", "cant", "cause", "causes", "certain"
, "certainly", "changes", "clearly", "co", "com", "come", "comes"
, "common", "concerning", "consequently", "consider", "considering"
, "contain", "containing", "contains", "corresponding", "could"
, "couldn't", "course", "currently", "d", "definitely", "described"
, "despite", "detecting", "detects", "did", "didn't", "different", "do"
, "does", "doesn't", "doing", "don't", "done", "down", "downwards"
, "during", "e", "each", "edu", "eg", "eight", "either", "else"
, "elsewhere", "enough", "entirely", "especially", "et", "etc", "even"
, "ever", "every", "everybody", "everyone", "everything", "everywhere"
, "ex", "exactly", "example", "except", "f", "far", "few", "fifth"
, "find", "first", "five", "followed", "following", "follows", "for"
, "former", "formerly", "forth", "four", "from", "further", "furthermore"
, "g", "get", "gets", "getting", "gif", "given", "gives", "go", "goes"
, "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't"
, "happens", "hardly", "has", "hasn't", "have", "haven't", "having"
, "he", "he'd", "he'll", "he's", "hello", "help", "hence", "her"
, "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers"
, "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how"
, "how's", "howbeit", "however", "i", "i'd", "i'll", "i'm", "i've"
, "identify", "ie", "if", "ignored", "immediate", "in", "inasmuch"
, "inc", "indeed", "indicate", "indicated", "indicates", "inner"
, "insofar", "instead", "into", "involves", "inward", "is", "isn't"
, "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k"
, "keep", "keeps", "kept", "know", "known", "knows", "l", "last"
, "late", "lately", "later", "latter", "latterly", "least", "less"
, "lest", "let", "let's", "like", "liked", "likely", "little", "look"
, "looking", "looks", "ltd", "m", "main", "mainly", "many", "may"
, "maybe", "me", "mean", "meanwhile", "merely", "might", "min", "more"
, "moreover", "most", "mostly", "much", "must", "mustn't", "my", "myself"
, "n", "name", "namely", "nd", "near", "nearly", "necessary", "need"
, "needs", "neither", "never", "nevertheless", "new", "next", "nine"
, "no", "nobody", "non", "none", "noone", "nor", "normally", "not"
, "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off"
, "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones"
, "only", "onto", "or", "other", "others", "otherwise", "ought", "our"
, "ours", "ourselves", "out", "outside", "over", "overall", "own", "p"
, "particular", "particularly", "per", "perhaps", "placed", "please"
, "plus", "possible", "presents", "presumably", "probably", "provides"
, "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really"
, "reasonably", "regarding", "regardless", "regards", "relatively"
, "respectively", "right", "s", "said", "same", "saw", "say", "saying"
, "says", "sds", "second", "secondly", "see", "seeing", "seem", "seemed"
, "seeming", "seems", "seen", "self", "selves", "sensible", "sent"
, "serious", "seriously", "seven", "several", "shall", "shan't"
, "she", "she'd", "she'll", "she's", "should", "shouldn't", "since"
, "six", "so", "some", "somebody", "somehow", "someone", "something"
, "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry"
, "specified", "specify", "specifying", "still", "sub", "such", "sup"
, "sure", "t", "t's", "take", "taken", "tell", "tends", "th", "than"
, "thank", "thanks", "thanx", "that", "that's", "thats", "the", "their"
, "theirs", "them", "themselves", "then", "thence", "there", "there's"
, "thereafter", "thereby", "therefore", "therein", "theres", "thereupon"
, "these", "they", "they'd", "they'll", "they're", "they've", "think"
, "third", "this", "thorough", "thoroughly", "those", "though", "three"
, "through", "throughout", "thru", "thus", "to", "together", "too"
, "took", "toward", "towards", "tried", "tries", "truly", "try"
, "trying", "twice", "two", "u", "un", "under", "unfortunately"
, "unless", "unlikely", "until", "unto", "up", "upon", "us", "use"
, "used", "useful", "uses", "using", "usually", "uucp", "v", "value"
, "various", "very", "via", "viz", "vs", "w", "want", "wants", "was"
, "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "welcome"
, "well", "went", "were", "weren't", "what", "what's", "whatever", "when"
, "when's", "whence", "whenever", "where", "where's", "whereafter"
, "whereas", "whereby", "wherein", "whereupon", "wherever", "whether"
, "which", "while", "whither", "who", "who's", "whoever", "whole", "whom"
, "whose", "why", "why's", "will", "willing", "wish", "with", "within"
, "without", "won't", "wonder", "would", "wouldn't", "x", "y", "yes"
, "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours"
, "yourself", "yourselves", "z", "zero"]
@@ -36,7 +36,7 @@ import GHC.Real (round)
import Data.Text (Text)
import NLP.RAKE.Text
-import Gargantext.Text.Terms.Stop (stopList)
+import Gargantext.Text.Samples.EN (stopList)
import Gargantext.Prelude
select :: Double -> [a] -> [a]
...