Commit b025b4fd authored by Alexandre Delanoë's avatar Alexandre Delanoë

[ML] first naive bayes implementation for stop words detection.

parent 0f0205a3
......@@ -53,6 +53,7 @@ library:
- Gargantext.Text.Parsers.WOS
- Gargantext.Text.Search
- Gargantext.Text.Terms
- Gargantext.Text.Terms.Stop
- Gargantext.Text.Terms.Mono
- Gargantext.Text.Terms.Multi.Lang.En
- Gargantext.Text.Terms.Multi.Lang.Fr
......@@ -112,6 +113,7 @@ library:
- path-io
- postgresql-simple
- pretty
- probability
- product-profunctors
- profunctors
- protolude
......@@ -29,13 +29,22 @@ list quality in time.
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Multi.RAKE (multiterms_rake)
module Gargantext.Text.Terms.Multi.RAKE (multiterms_rake, select, hardStopList)
import Data.Text (Text)
import GHC.Real (round)
import Data.Text (Text, pack)
import NLP.RAKE.Text
import Gargantext.Text.Terms.Stop (stopList)
import Gargantext.Prelude
-- | Keep the leading fraction @part@ of the list @ns@.
--
-- The number of elements kept is @round (part * length ns)@. Note that
-- 'round' rounds halves to the nearest even integer, and that a count
-- that is zero or negative yields the empty list (behaviour of 'take').
select :: Double -> [a] -> [a]
select part ns = take n ns
  where
    -- 'n' must be local: it depends on both arguments of 'select'.
    n = round $ part * (fromIntegral $ length ns)
-- | Run 'candidates' on a text, using 'hardStopList' as the stop-word map.
multiterms_rake :: Text -> [WordScore]
multiterms_rake txt = candidates hardStopList txt
......@@ -43,74 +52,4 @@ multiterms_rake = candidates hardStopList
-- | StopList
hardStopList :: StopwordsMap
hardStopList = mkStopwordsStr [
"available","away","awfully","based", "b","be","became","because","become",
hardStopList = mkStopwordsStr stopList
Module : Gargantext.Text.Terms.Stop
Description : Stop words detection
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer :
Stability : experimental
Portability : POSIX
Stop words and how to learn them.
Main type here is String.
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Stop
import Numeric.Probability.Distribution ((??))
import qualified Numeric.Probability.Distribution as D
import Data.String (String)
import Data.Char (toLower)
import qualified Data.List as DL
-- import qualified Data.Map as M
import Gargantext.Prelude
-- | Pair of scores computed for a word by 'candidate'.
data Candidate = Candidate
  { stop   :: Double -- ^ 'sumProba' of the word under 'stopDist'
  , noStop :: Double -- ^ 'sumProba' of the word under 'candDist'
  } deriving (Show)
-- * String preparation
-- | Surround a non-empty string with one space on each side.
-- The empty string is returned unchanged.
blanks :: String -> String
blanks str = case str of
  [] -> []
  _  -> ' ' : (str <> [' '])
-- | Concatenate @n@ copies of the string @st@.
-- Blocks increase the size of the word to ease computations; border and
-- other unexpected effects can happen and still need to be tested.
blockOf :: Int -> String -> String
blockOf n st = DL.concat (DL.replicate n st)
-- | String counterpart of @splitBy@ from Context (which works on Text),
-- avoiding pack and unpack operations that are not needed here.
--
-- The input is padded with 'blanks', repeated 1000 times and concatenated,
-- then handed to 'chunkAlong' with arguments @n + 1@ and @1@; only the
-- first @m@ results are kept.
-- NOTE(review): presumably 'chunkAlong' yields windows of length @n + 1@
-- advancing by one character - confirm against its definition.
chunks :: Int -> Int -> String -> [String]
chunks n m st = DL.take m (chunkAlong (n+1) 1 repeated)
  where
    repeated = DL.concat (DL.replicate 1000 (blanks st))
-- | Collect the 'chunks' of @st@ for every size in @ns@, in the order of
-- @ns@, keeping at most @m@ chunks per size.
allChunks :: [Int] -> Int -> String -> [String]
allChunks ns m st = [ c | size <- ns, c <- chunks size m st ]
-- * Make the distributions
-- | Build a uniform distribution ('D.uniform') over every chunk obtained
-- by applying @allChunks [0,2] 10@ to each word of the given list.
makeDist :: [String] -> D.T Double String
makeDist ws = D.uniform (DL.concatMap (allChunks [0,2] 10) ws)
-- | Chunk distribution built by 'makeDist' from 'stopList'.
stopDist :: D.T Double String
stopDist = makeDist stopList
-- | Chunk distribution built by 'makeDist' from 'candList'.
candDist :: D.T Double String
candDist = makeDist candList
-- * Analyze candidate
-- | Lower-case the word @x@, cut it with @allChunks [0,2] 10@ and add up
-- the probability ('~?') of each chunk under the distribution @ds@.
sumProba :: Num a => D.T a String -> [Char] -> a
sumProba ds x = sum [ ds ~? piece | piece <- allChunks [0,2] 10 lowered ]
  where
    lowered = map toLower x
-- | Probability of the value @x@ according to the distribution @ds@:
-- queries @ds@ with '??' for the event "equals @x@".
(~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
ds ~? x = (== x) ?? ds
-- | Score a word against both reference distributions: 'stop' receives
-- its 'sumProba' under 'stopDist', 'noStop' its 'sumProba' under 'candDist'.
candidate :: String -> Candidate
candidate x = Candidate { stop   = sumProba stopDist x
                        , noStop = sumProba candDist x
                        }
-- | Sample words from which 'candDist' is built.
candList :: [String]
candList =
  [ "france"
  , "alexandre"
  , "mael"
  , "constitution"
  , "delanoe"
  , "etats-unis"
  , "associes"
  , "car"
  , "train"
  , "spam"
  ]
stopList :: [String]
stopList = map show ([0..9]::[Int]) <> [
"involves", "already","also","although","always","am","among","amongst",
"available","away","awfully","based", "b","be","became","because","become",
