Commit cddf0f9e authored by Alexandre Delanoë

[FEAT] [LEARN] REFACTO.

parent 2ecc0288
@@ -69,7 +69,7 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
langText (OnlyText t2 ) = t2
langText (ArrayText ts ) = Text.intercalate " " $ map langText ts
-(utcTime, (pub_year, pub_month, pub_day)) <- Date.split l (maybe (Just "2019") (Just) d)
+(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit l (maybe (Just "2019") (Just) d)
pure $ HyperdataDocument (Just "IsidoreApi")
Nothing
...
@@ -53,7 +53,7 @@ import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import qualified Gargantext.Text.Corpus.Parsers.Date as Date
import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseCsv)
-import Gargantext.Text.Terms.Stop (detectLang)
+import Gargantext.Text.Terms.Learn (detectLangDefault)
------------------------------------------------------------------------
type ParseError = String
@@ -97,11 +97,11 @@ toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
let abstract = lookup "abstract" d
-let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
+let lang = maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
-(utcTime, (pub_year, pub_month, pub_day)) <- Date.split lang dateToParse
+(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
pure $ HyperdataDocument (Just $ DT.pack $ show ff)
(lookup "doi" d)
...
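The new call keeps the old fallback behaviour: EN is used whenever the abstract is missing or detection is inconclusive. A minimal sketch of that idiom in isolation (hypothetical helper, not part of the commit; Lang/EN, identity and join are assumed to be in scope via Gargantext's prelude and core types):

import Data.Text (Text)
import qualified Data.Text as DT
import Gargantext.Text.Terms.Learn (detectLangDefault)

-- Hypothetical helper (sketch only): guess the language of an optional
-- abstract from its first 50 characters, defaulting to EN on failure.
langOrDefault :: Maybe Text -> Lang
langOrDefault abstract =
  maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))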
@@ -18,7 +18,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
-module Gargantext.Text.Corpus.Parsers.Date (parse, parseRaw, split) where
+module Gargantext.Text.Corpus.Parsers.Date (parse, parseRaw, dateSplit) where
import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn, pack)
@@ -40,9 +40,9 @@ import qualified Duckling.Core as DC
------------------------------------------------------------------------
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
-split :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
-split _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
-split l (Just txt) = do
+dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
+dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
+dateSplit l (Just txt) = do
utcTime <- parse l txt
let (y, m, d) = split' utcTime
pure (Just utcTime, (Just y, Just m,Just d))
...
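For reference, a minimal usage sketch of the renamed entry point (not part of the commit; it mirrors the FR example quoted in the module header above and assumes FR :: Lang is in scope):

{-# LANGUAGE OverloadedStrings #-}
import qualified Gargantext.Text.Corpus.Parsers.Date as Date

-- Sketch: dateSplit returns the full UTCTime together with its (year, month, day) parts.
dateDemo :: IO ()
dateDemo = do
  (mUtc, ymd) <- Date.dateSplit FR (Just "12 avril 2010")
  print (mUtc, ymd)   -- expected: the 2010-04-12 timestamp and (Just 2010, Just 4, Just 12)
  noDate <- Date.dateSplit FR Nothing
  print noDate        -- (Nothing, (Nothing, Nothing, Nothing))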
@@ -22,4 +22,89 @@ import Data.String (String)
textSample :: String
textSample = "Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning. Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 'High quality' in text mining usually refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities). Text analysis involves information retrieval, lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, information extraction, data mining techniques including link and association analysis, visualization, and predictive analytics. The overarching goal is, essentially, to turn text into data for analysis, via application of natural language processing (NLP) and analytical methods. A typical application is to scan a set of documents written in a natural language and either model the document set for predictive classification purposes or populate a database or search index with the information extracted."
stopList :: [String]
stopList =
["a", "a's", "able", "about", "above", "according", "accordingly"
, "across", "actually", "after", "afterwards", "again", "against"
, "ain't", "all", "allow", "allows", "almost", "alone", "along"
, "already", "also", "although", "always", "am", "among", "amongst", "an"
, "analyze", "and", "another", "any", "anybody", "anyhow", "anyone"
, "anything", "anyway", "anyways", "anywhere", "apart", "appear"
, "apply", "appreciate", "appropriate", "are", "aren't", "around"
, "as", "aside", "ask", "asking", "associated", "at", "available"
, "away", "awfully", "b", "based", "be", "became", "because", "become"
, "becomes", "becoming", "been", "before", "beforehand", "behind"
, "being", "believe", "below", "beside", "besides", "best", "better"
, "between", "beyond", "both", "brief", "but", "by", "c", "c'mon", "c's"
, "came", "can", "can't", "cannot", "cant", "cause", "causes", "certain"
, "certainly", "changes", "clearly", "co", "com", "come", "comes"
, "common", "concerning", "consequently", "consider", "considering"
, "contain", "containing", "contains", "corresponding", "could"
, "couldn't", "course", "currently", "d", "definitely", "described"
, "despite", "detecting", "detects", "did", "didn't", "different", "do"
, "does", "doesn't", "doing", "don't", "done", "down", "downwards"
, "during", "e", "each", "edu", "eg", "eight", "either", "else"
, "elsewhere", "enough", "entirely", "especially", "et", "etc", "even"
, "ever", "every", "everybody", "everyone", "everything", "everywhere"
, "ex", "exactly", "example", "except", "f", "far", "few", "fifth"
, "find", "first", "five", "followed", "following", "follows", "for"
, "former", "formerly", "forth", "four", "from", "further", "furthermore"
, "g", "get", "gets", "getting", "gif", "given", "gives", "go", "goes"
, "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't"
, "happens", "hardly", "has", "hasn't", "have", "haven't", "having"
, "he", "he'd", "he'll", "he's", "hello", "help", "hence", "her"
, "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers"
, "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how"
, "how's", "howbeit", "however", "i", "i'd", "i'll", "i'm", "i've"
, "identify", "ie", "if", "ignored", "immediate", "in", "inasmuch"
, "inc", "indeed", "indicate", "indicated", "indicates", "inner"
, "insofar", "instead", "into", "involves", "inward", "is", "isn't"
, "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k"
, "keep", "keeps", "kept", "know", "known", "knows", "l", "last"
, "late", "lately", "later", "latter", "latterly", "least", "less"
, "lest", "let", "let's", "like", "liked", "likely", "little", "look"
, "looking", "looks", "ltd", "m", "main", "mainly", "many", "may"
, "maybe", "me", "mean", "meanwhile", "merely", "might", "min", "more"
, "moreover", "most", "mostly", "much", "must", "mustn't", "my", "myself"
, "n", "name", "namely", "nd", "near", "nearly", "necessary", "need"
, "needs", "neither", "never", "nevertheless", "new", "next", "nine"
, "no", "nobody", "non", "none", "noone", "nor", "normally", "not"
, "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off"
, "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones"
, "only", "onto", "or", "other", "others", "otherwise", "ought", "our"
, "ours", "ourselves", "out", "outside", "over", "overall", "own", "p"
, "particular", "particularly", "per", "perhaps", "placed", "please"
, "plus", "possible", "presents", "presumably", "probably", "provides"
, "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really"
, "reasonably", "regarding", "regardless", "regards", "relatively"
, "respectively", "right", "s", "said", "same", "saw", "say", "saying"
, "says", "sds", "second", "secondly", "see", "seeing", "seem", "seemed"
, "seeming", "seems", "seen", "self", "selves", "sensible", "sent"
, "serious", "seriously", "seven", "several", "shall", "shan't"
, "she", "she'd", "she'll", "she's", "should", "shouldn't", "since"
, "six", "so", "some", "somebody", "somehow", "someone", "something"
, "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry"
, "specified", "specify", "specifying", "still", "sub", "such", "sup"
, "sure", "t", "t's", "take", "taken", "tell", "tends", "th", "than"
, "thank", "thanks", "thanx", "that", "that's", "thats", "the", "their"
, "theirs", "them", "themselves", "then", "thence", "there", "there's"
, "thereafter", "thereby", "therefore", "therein", "theres", "thereupon"
, "these", "they", "they'd", "they'll", "they're", "they've", "think"
, "third", "this", "thorough", "thoroughly", "those", "though", "three"
, "through", "throughout", "thru", "thus", "to", "together", "too"
, "took", "toward", "towards", "tried", "tries", "truly", "try"
, "trying", "twice", "two", "u", "un", "under", "unfortunately"
, "unless", "unlikely", "until", "unto", "up", "upon", "us", "use"
, "used", "useful", "uses", "using", "usually", "uucp", "v", "value"
, "various", "very", "via", "viz", "vs", "w", "want", "wants", "was"
, "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "welcome"
, "well", "went", "were", "weren't", "what", "what's", "whatever", "when"
, "when's", "whence", "whenever", "where", "where's", "whereafter"
, "whereas", "whereby", "wherein", "whereupon", "wherever", "whether"
, "which", "while", "whither", "who", "who's", "whoever", "whole", "whom"
, "whose", "why", "why's", "will", "willing", "wish", "with", "within"
, "without", "won't", "wonder", "would", "wouldn't", "x", "y", "yes"
, "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours"
, "yourself", "yourselves", "z", "zero"]
@@ -16,14 +16,10 @@ Main type here is String.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
-module Gargantext.Text.Terms.Stop -- (detectLang, detectLangs, stopList)
+module Gargantext.Text.Terms.Learn -- (detectLang, detectLangs, stopList)
where
-import GHC.Base (Functor)
-import Numeric.Probability.Distribution ((??))
-import qualified Numeric.Probability.Distribution as D
-import Data.Char (toLower)
+--import Data.Char (toLower)
import qualified Data.List as DL
import Data.Maybe (maybe)
@@ -33,7 +29,7 @@ import qualified Data.Map.Strict as DM
import Data.String (String)
import Data.Text (Text)
-import Data.Text (pack, unpack)
+import Data.Text (pack, unpack, toLower)
import Data.Tuple.Extra (both)
import Gargantext.Prelude
@@ -52,34 +48,6 @@ data Candidate = Candidate { stop :: Double
, noStop :: Double
} deriving (Show)
-- * String preparation
-- | String prepare
blanks :: String -> String
blanks [] = []
blanks xs = [' '] <> xs <> [' ']
-- | Blocks increase the size of the word to ease computations
-- some border and unexepected effects can happen, need to be tested
blockOf :: Int -> String -> String
blockOf n = DL.concat . DL.take n . DL.repeat
-- | Chunks is the same function as splitBy in Context but for Strings,
-- not Text (without pack and unpack operations that are not needed).
chunks :: Int -> Int -> String -> [String]
chunks n m = DL.take m . filter (not . all (== ' '))
. chunkAlong (n+1) 1
. DL.concat
. DL.take 1000
. DL.repeat
. blanks
allChunks :: [Int] -> Int -> String -> [String]
allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
allChunks' :: [Int] -> Int -> String -> [[String]]
allChunks' ns m st = map (\n -> chunks n m st) ns
------------------------------------------------------------------------
-- * Analyze candidate
type StringSize = Int
@@ -87,119 +55,121 @@ type TotalFreq = Int
type Freq = Int
type Word = String
-data LangWord = LangWord Lang Word
+data CatWord a = CatWord a Word
+type CatProb a = Map a Double
-type LangProba = Map Lang Double
+type Events a = Map a EventBook
------------------------------------------------------------------------
-detectLang :: Text -> Maybe Lang
-detectLang = head . map fst . detectLangs
+detectLangDefault :: Text -> Maybe Lang
+detectLangDefault = detectCat 99 eventLang
+where
+eventLang :: Events Lang
+eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
+langWord :: Lang -> CatWord Lang
+langWord l = CatWord l (textSample l)
-detectLangs :: Text -> [(Lang, Double)]
-detectLangs = detectLangs' . unpack
+textSample :: Lang -> String
+textSample EN = EN.textSample
+textSample FR = FR.textSample
+--textSample DE = DE.textSample
+--textSample SP = SP.textSample
+--textSample CH = CH.textSample
-detectLangs' :: String -> [(Lang, Double)]
-detectLangs' s = DL.reverse $ DL.sortOn snd
-$ toList
-$ detect (wordsToBook [0..2] s) eventLang
+detectStopDefault :: Text -> Maybe Bool
+detectStopDefault = undefined
-part :: (Eq p, Fractional p) => p -> p -> p
-part 0 _ = 0
-part _ 0 = 0
-part x y = x / y
+detectDefault :: [(Bool, Text)] -> Text -> Maybe Bool
+detectDefault events = detectCat 99 (priorEvents events)
+where
+priorEvents events' = toEvents True [0..2] 10 (map (\(a,b) -> CatWord a (unpack $ toLower b)) events')
------------------------------------------------------------------------
detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
detectCat n es = head . map fst . (detectCat' n es) . unpack
where
detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
detectCat' n' es' s = DL.reverse $ DL.sortOn snd
$ toList
$ detectWith n' es' (wordsToBook [0..2] n' s)
detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
detectWith n'' el (EventBook mapFreq _) =
DM.unionsWith (+)
$ map DM.fromList
$ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
$ filter (\x -> fst x /= " ")
$ DM.toList mapFreq
-- | TODO: monoids (but proba >= 0)
toPrior :: Int -> String -> Events a -> [(a, Double)]
toPrior n'' s el = prior n'' $ pebLang s el
where
pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
peb :: String -> EventBook -> (Freq, TotalFreq)
peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
where
a = maybe 0 identity $ DM.lookup st mapFreq
b = maybe 1 identity $ DM.lookup (length st) mapN
prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
(map (\(a,b) -> a / b) ps')
where
(ls, ps'') = DL.unzip ps
ps' = map (both fromIntegral) ps''
part :: (Eq p, Fractional p) => p -> p -> p
part 0 _ = 0
part _ 0 = 0
part x y = x / y
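detectCat is the generic core: it scores a text against each category's EventBook and keeps the best match. detectLangDefault is just detectCat applied to the bundled language samples, and detectDefault builds the same machinery from ad-hoc labelled examples. A minimal sketch of the intended calls (not part of the commit; outputs depend on the training samples and are only indicative):

{-# LANGUAGE OverloadedStrings #-}
import Gargantext.Text.Terms.Learn (detectLangDefault, detectDefault)

learnDemo :: IO ()
learnDemo = do
  -- language guess from a short snippet (e.g. Just EN; assumes Lang has a Show instance)
  print $ detectLangDefault "the overarching goal is to turn text into data for analysis"
  -- a Bool classifier trained on two hand-labelled snippets
  let labelled = [ (True , "le la les des une un du")
                 , (False, "the of and to in is are") ]
  print $ detectDefault labelled "des les du la"   -- e.g. Just True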
{-
toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
t (a, b) -> t (a, b)
toProba xs = map (\(a,b) -> (a, part b total)) xs
where
total = sum $ map snd xs
-}
textSample :: Lang -> String
textSample EN = EN.textSample
textSample FR = FR.textSample
--textSample DE = DE.textSample
--textSample SP = SP.textSample
--textSample CH = CH.textSample
langWord :: Lang -> LangWord
langWord l = LangWord l (textSample l)
eventLang :: EventLang
eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]
detect :: EventBook -> EventLang -> LangProba
detect (EventBook mapFreq _) el =
DM.unionsWith (+)
$ map DM.fromList
$ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
$ filter (\x -> fst x /= " ")
$ DM.toList mapFreq
------------------------------------------------------------------------
-- | TODO: monoids
-type EventLang = Map Lang EventBook
+toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
+toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
toEventLangs :: [Int] -> [LangWord] -> EventLang
toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
emptyEventLang :: [Int] -> EventLang
emptyEventLang ns = toLang ns (LangWord FR "")
toLang :: [Int] -> LangWord -> EventLang
toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
opLang f = DM.unionWith (op f)
------------------------------------------------------------------------
-- | TODO: monoids (but proba >= 0)
peb :: String -> EventBook -> Double
peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
where
-a = maybe 0 identity $ DM.lookup st mapFreq
-b = maybe 1 identity $ DM.lookup (length st) mapN
+emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
+emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
-peb' :: String -> EventBook -> (Freq, TotalFreq)
-peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
+toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
+toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
where
a = maybe 0 identity $ DM.lookup st mapFreq
b = maybe 1 identity $ DM.lookup (length st) mapN
-------------------------------------------------------------------------
-toPrior :: String -> EventLang -> [(Lang, Double)]
+opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
+opEvent f = DM.unionWith (op f)
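The net effect of this block is that the old per-language EventLang becomes Events a for any ordered category type. A small hypothetical example of building such a table over invented categories, with the same n-gram sizes [0..2] and chunk budget 10 as eventLang above:

import Gargantext.Text.Terms.Learn (Events, CatWord(..), toEvents)

-- Hypothetical categories, purely for illustration.
data Topic = Sport | Cooking deriving (Eq, Ord, Show)

topicEvents :: Events Topic
topicEvents = toEvents Sport [0..2] 10
  [ CatWord Sport   "match goal team score player referee"
  , CatWord Cooking "oven flour butter sugar bake whisk" ]

-- detectCat 99 topicEvents "bake a sugar tart" should then return the closer
-- category (expected: Just Cooking, though that depends on the tiny samples).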
toPrior s el = prior $ pebLang s el
pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
pebLang st = map (\(l,eb) -> (l, peb' st eb)) . DM.toList
------------------------------------------------------------------------
prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
prior ps = zip ls $ zipWith (\x y -> x^(99::Int) * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
(map (\(a,b) -> a / b) ps')
where
(ls, ps'') = DL.unzip ps
ps' = map (both fromIntegral) ps''
------------------------------------------------------------------------
data EventBook = EventBook { events_freq :: Map String Freq
, events_n :: Map StringSize TotalFreq
}
deriving (Show)
-emptyEventBook :: [Int] -> EventBook
-emptyEventBook ns = wordToBook ns " "
-wordsToBook :: [Int] -> String -> EventBook
-wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
+emptyEventBook :: [Int] -> Int -> EventBook
+emptyEventBook ns n = wordToBook ns n " "
+wordsToBook :: [Int] -> Int -> String -> EventBook
+wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
where
ws = map unpack $ words $ pack txt
-eventsBook = map (wordToBook ns) ws
+eventsBook = map (wordToBook ns n) ws
-wordToBook :: [Int] -> Word -> EventBook
-wordToBook ns txt = EventBook ef en
+wordToBook :: [Int] -> Int -> Word -> EventBook
+wordToBook ns n txt = EventBook ef en
where
-chks = allChunks' ns 10 txt
-en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
+chks = allChunks ns n txt
+en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
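An EventBook is the per-category statistic everything above is built on: counts of character chunks of the sizes listed in ns, with at most n chunks of each size kept per word. A short sketch (not part of the commit):

import Gargantext.Text.Terms.Learn (EventBook(..), wordsToBook)

-- Sketch: count 1-, 2- and 3-character chunks over a two-word string,
-- keeping at most 10 chunks per word, as eventLang does above.
book :: EventBook
book = wordsToBook [0..2] 10 "text mining"

-- events_freq book maps each chunk (for instance " t", "te", "ex") to its frequency;
-- events_n book maps each size index in ns to how many chunks of that size were kept.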
@@ -209,19 +179,49 @@ op f (EventBook ef1 en1)
------------------------------------------------------------------------
------------------------------------------------------------------------
allChunks :: [Int] -> Int -> String -> [[String]]
allChunks ns m st = map (\n -> chunks n m st) ns
-- | Chunks is the same function as splitBy in Context but for Strings,
-- not Text (without pack and unpack operations that are not needed).
chunks :: Int -> Int -> String -> [String]
chunks n m = DL.take m . filter (not . all (== ' '))
. chunkAlong (n+1) 1
. DL.concat
. DL.take 1000
. DL.repeat
. blanks
-- | String preparation
blanks :: String -> String
blanks [] = []
blanks xs = [' '] <> xs <> [' ']
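These helpers are what wordToBook relies on: blanks pads a word with spaces so border chunks are counted too, and chunks n m slides a window of n+1 characters over the padded word, keeping at most m non-blank chunks. A tiny sketch (not part of the commit):

import Gargantext.Text.Terms.Learn (chunks)

-- Sketch: padded bigrams of a single word (window size n+1 = 2, at most 10 kept).
bigramsOfAbc :: [String]
bigramsOfAbc = chunks 1 10 "abc"
-- contains chunks such as " a", "ab", "bc", "c " from the blank-padded word " abc "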
{-
-- Some previous tests to be removed
--import GHC.Base (Functor)
--import Numeric.Probability.Distribution ((??))
--import qualified Numeric.Probability.Distribution as D
-- | Blocks increase the size of the word to ease computations
-- some border and unexepected effects can happen, need to be tested
blockOf :: Int -> String -> String
blockOf n = DL.concat . DL.take n . DL.repeat
-- * Make the distributions
makeDist :: [String] -> D.T Double String
-makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
+makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
stopDist :: D.T Double String
-stopDist = makeDist stopList
+stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
candDist :: D.T Double String
candDist = makeDist candList
------------------------------------------------------------------------
sumProba :: Num a => D.T a String -> [Char] -> a
-sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
+sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
-- | Get probability according a distribution
(~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
@@ -236,93 +236,4 @@ candList :: [String]
candList = [ "france", "alexandre", "mael", "constitution"
, "etats-unis", "associes", "car", "train", "spam"]
--}
stopList :: [String]
stopList = map show ([0..9]::[Int]) <> stopListWords
stopListWords :: [String]
stopListWords =
["a", "a's", "able", "about", "above", "according", "accordingly"
, "across", "actually", "after", "afterwards", "again", "against"
, "ain't", "all", "allow", "allows", "almost", "alone", "along"
, "already", "also", "although", "always", "am", "among", "amongst", "an"
, "analyze", "and", "another", "any", "anybody", "anyhow", "anyone"
, "anything", "anyway", "anyways", "anywhere", "apart", "appear"
, "apply", "appreciate", "appropriate", "are", "aren't", "around"
, "as", "aside", "ask", "asking", "associated", "at", "available"
, "away", "awfully", "b", "based", "be", "became", "because", "become"
, "becomes", "becoming", "been", "before", "beforehand", "behind"
, "being", "believe", "below", "beside", "besides", "best", "better"
, "between", "beyond", "both", "brief", "but", "by", "c", "c'mon", "c's"
, "came", "can", "can't", "cannot", "cant", "cause", "causes", "certain"
, "certainly", "changes", "clearly", "co", "com", "come", "comes"
, "common", "concerning", "consequently", "consider", "considering"
, "contain", "containing", "contains", "corresponding", "could"
, "couldn't", "course", "currently", "d", "definitely", "described"
, "despite", "detecting", "detects", "did", "didn't", "different", "do"
, "does", "doesn't", "doing", "don't", "done", "down", "downwards"
, "during", "e", "each", "edu", "eg", "eight", "either", "else"
, "elsewhere", "enough", "entirely", "especially", "et", "etc", "even"
, "ever", "every", "everybody", "everyone", "everything", "everywhere"
, "ex", "exactly", "example", "except", "f", "far", "few", "fifth"
, "find", "first", "five", "followed", "following", "follows", "for"
, "former", "formerly", "forth", "four", "from", "further", "furthermore"
, "g", "get", "gets", "getting", "gif", "given", "gives", "go", "goes"
, "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't"
, "happens", "hardly", "has", "hasn't", "have", "haven't", "having"
, "he", "he'd", "he'll", "he's", "hello", "help", "hence", "her"
, "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers"
, "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how"
, "how's", "howbeit", "however", "i", "i'd", "i'll", "i'm", "i've"
, "identify", "ie", "if", "ignored", "immediate", "in", "inasmuch"
, "inc", "indeed", "indicate", "indicated", "indicates", "inner"
, "insofar", "instead", "into", "involves", "inward", "is", "isn't"
, "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k"
, "keep", "keeps", "kept", "know", "known", "knows", "l", "last"
, "late", "lately", "later", "latter", "latterly", "least", "less"
, "lest", "let", "let's", "like", "liked", "likely", "little", "look"
, "looking", "looks", "ltd", "m", "main", "mainly", "many", "may"
, "maybe", "me", "mean", "meanwhile", "merely", "might", "min", "more"
, "moreover", "most", "mostly", "much", "must", "mustn't", "my", "myself"
, "n", "name", "namely", "nd", "near", "nearly", "necessary", "need"
, "needs", "neither", "never", "nevertheless", "new", "next", "nine"
, "no", "nobody", "non", "none", "noone", "nor", "normally", "not"
, "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off"
, "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones"
, "only", "onto", "or", "other", "others", "otherwise", "ought", "our"
, "ours", "ourselves", "out", "outside", "over", "overall", "own", "p"
, "particular", "particularly", "per", "perhaps", "placed", "please"
, "plus", "possible", "presents", "presumably", "probably", "provides"
, "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really"
, "reasonably", "regarding", "regardless", "regards", "relatively"
, "respectively", "right", "s", "said", "same", "saw", "say", "saying"
, "says", "sds", "second", "secondly", "see", "seeing", "seem", "seemed"
, "seeming", "seems", "seen", "self", "selves", "sensible", "sent"
, "serious", "seriously", "seven", "several", "shall", "shan't"
, "she", "she'd", "she'll", "she's", "should", "shouldn't", "since"
, "six", "so", "some", "somebody", "somehow", "someone", "something"
, "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry"
, "specified", "specify", "specifying", "still", "sub", "such", "sup"
, "sure", "t", "t's", "take", "taken", "tell", "tends", "th", "than"
, "thank", "thanks", "thanx", "that", "that's", "thats", "the", "their"
, "theirs", "them", "themselves", "then", "thence", "there", "there's"
, "thereafter", "thereby", "therefore", "therein", "theres", "thereupon"
, "these", "they", "they'd", "they'll", "they're", "they've", "think"
, "third", "this", "thorough", "thoroughly", "those", "though", "three"
, "through", "throughout", "thru", "thus", "to", "together", "too"
, "took", "toward", "towards", "tried", "tries", "truly", "try"
, "trying", "twice", "two", "u", "un", "under", "unfortunately"
, "unless", "unlikely", "until", "unto", "up", "upon", "us", "use"
, "used", "useful", "uses", "using", "usually", "uucp", "v", "value"
, "various", "very", "via", "viz", "vs", "w", "want", "wants", "was"
, "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "welcome"
, "well", "went", "were", "weren't", "what", "what's", "whatever", "when"
, "when's", "whence", "whenever", "where", "where's", "whereafter"
, "whereas", "whereby", "wherein", "whereupon", "wherever", "whether"
, "which", "while", "whither", "who", "who's", "whoever", "whole", "whom"
, "whose", "why", "why's", "will", "willing", "wish", "with", "within"
, "without", "won't", "wonder", "would", "wouldn't", "x", "y", "yes"
, "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours"
, "yourself", "yourselves", "z", "zero"]
@@ -36,7 +36,7 @@ import GHC.Real (round)
import Data.Text (Text)
import NLP.RAKE.Text
-import Gargantext.Text.Terms.Stop (stopList)
+import Gargantext.Text.Samples.EN (stopList)
import Gargantext.Prelude
select :: Double -> [a] -> [a]
...