[LANG] Detection improved.

9f5a6d1c · Alexandre Delanoë · 58592e6a · 9f5a6d1c · 9f5a6d1c · 9f5a6d1c
Commit 9f5a6d1c authored Oct 09, 2018 by Alexandre Delanoë
6 changed files
--- a/src/Gargantext/Text/Samples/CH.hs
+++ b/src/Gargantext/Text/Samples/CH.hs
+{-|
+Module      : Gargantext.Text.Samples.CH
+Description : Sample of Chinese Text
+Copyright   : (c) CNRS, 2017 - present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+Source: Wikipedia
+Page  : text mining
+-}
 module Gargantext.Text.Samples.CH where
 import Data.String (String)
-textMining :: String
+textSample :: String
-textMining = "文本挖掘有时也被称为文字探勘、文本数据挖掘等，大致相当于文字分析，一般指文本处理过程中产生高质量的信息。高质量的信息通常通过分类和预测来产生，如模式识别。文本挖掘通常涉及输入文本的处理过程（通常进行分析，同时加上一些衍生语言特征以及消除杂音，随后插入到数据库中） ，产生结构化数据，并最终评价和解释输出。'高品质'的文本挖掘通常是指某种组合的相关性，新颖性和趣味性。典型的文本挖掘方法包括文本分类，文本聚类，概念/实体挖掘，生产精确分类，观点分析，文档摘要和实体关系模型（即，学习已命名实体之间的关系） 。 文本分析包括了信息检索、词典分析来研究词语的频数分布、模式识别、标签 注释、信息抽取，数据挖掘技术包括链接和关联分析、可视化和预测分析。本质上，首要的任务是，通过自然语言处理和分析方法，将文本转化为数据进行分析"
+textSample = "文本挖掘有时也被称为文字探勘、文本数据挖掘等，大致相当于文字分析，一般指文本处理过程中产生高质量的信息。高质量的信息通常通过分类和预测来产生，如模式识别。文本挖掘通常涉及输入文本的处理过程（通常进行分析，同时加上一些衍生语言特征以及消除杂音，随后插入到数据库中） ，产生结构化数据，并最终评价和解释输出。'高品质'的文本挖掘通常是指某种组合的相关性，新颖性和趣味性。典型的文本挖掘方法包括文本分类，文本聚类，概念/实体挖掘，生产精确分类，观点分析，文档摘要和实体关系模型（即，学习已命名实体之间的关系） 。 文本分析包括了信息检索、词典分析来研究词语的频数分布、模式识别、标签 注释、信息抽取，数据挖掘技术包括链接和关联分析、可视化和预测分析。本质上，首要的任务是，通过自然语言处理和分析方法，将文本转化为数据进行分析"
--- a/src/Gargantext/Text/Samples/DE.hs
+++ b/src/Gargantext/Text/Samples/DE.hs
+{-|
+Module      : Gargantext.Text.Samples.DE
+Description : Sample of German Text
+Copyright   : (c) CNRS, 2017 - present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+Source: Wikipedia
+Page  : text mining
+-}
 module Gargantext.Text.Samples.DE where
 import Data.String (String)
-textMining :: String
+textSample :: String
-textMining = "Text Mining, seltener auch Textmining, Text Data Mining oder Textual Data Mining, ist ein Bündel von Algorithmus-basierten Analyseverfahren zur Entdeckung von Bedeutungsstrukturen aus un- oder schwachstrukturierten Textdaten. Mit statistischen und linguistischen Mitteln erschließt Text-Mining-Software aus Texten Strukturen, die die Benutzer in die Lage versetzen sollen, Kerninformationen der verarbeiteten Texte schnell zu erkennen. Im Optimalfall liefern Text-Mining-Systeme Informationen, von denen die Benutzer zuvor nicht wissen, ob und dass sie in den verarbeiteten Texten enthalten sind. Bei zielgerichteter Anwendung sind Werkzeuge des Text Mining außerdem in der Lage, Hypothesen zu generieren, diese zu überprüfen und schrittweise zu verfeinern."
+textSample = "Text Mining, seltener auch Textmining, Text Data Mining oder Textual Data Mining, ist ein Bündel von Algorithmus-basierten Analyseverfahren zur Entdeckung von Bedeutungsstrukturen aus un- oder schwachstrukturierten Textdaten. Mit statistischen und linguistischen Mitteln erschließt Text-Mining-Software aus Texten Strukturen, die die Benutzer in die Lage versetzen sollen, Kerninformationen der verarbeiteten Texte schnell zu erkennen. Im Optimalfall liefern Text-Mining-Systeme Informationen, von denen die Benutzer zuvor nicht wissen, ob und dass sie in den verarbeiteten Texten enthalten sind. Bei zielgerichteter Anwendung sind Werkzeuge des Text Mining außerdem in der Lage, Hypothesen zu generieren, diese zu überprüfen und schrittweise zu verfeinern."
--- a/src/Gargantext/Text/Samples/EN.hs
+++ b/src/Gargantext/Text/Samples/EN.hs
+{-|
+Module      : Gargantext.Text.Samples.EN
+Description : Sample of English Text
+Copyright   : (c) CNRS, 2017 - present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+Source: Wikipedia
+Page  : text mining
+-}
 module Gargantext.Text.Samples.EN where
 import Data.String (String)
-textMining :: String
+textSample :: String
-textMining = "Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning. Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 'High quality' in text mining usually refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities). Text analysis involves information retrieval, lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, information extraction, data mining techniques including link and association analysis, visualization, and predictive analytics. The overarching goal is, essentially, to turn text into data for analysis, via application of natural language processing (NLP) and analytical methods. A typical application is to scan a set of documents written in a natural language and either model the document set for predictive classification purposes or populate a database or search index with the information extracted."
+textSample = "Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning. Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 'High quality' in text mining usually refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities). Text analysis involves information retrieval, lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, information extraction, data mining techniques including link and association analysis, visualization, and predictive analytics. The overarching goal is, essentially, to turn text into data for analysis, via application of natural language processing (NLP) and analytical methods. A typical application is to scan a set of documents written in a natural language and either model the document set for predictive classification purposes or populate a database or search index with the information extracted."
--- a/src/Gargantext/Text/Samples/FR.hs
+++ b/src/Gargantext/Text/Samples/FR.hs
+{-|
+Module      : Gargantext.Text.Samples.FR
+Description : Sample of French Text
+Copyright   : (c) CNRS, 2017 - present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+Source: Wikipedia
+Page  : text mining
+-}
 module Gargantext.Text.Samples.FR where
+import Gargantext.Prelude
 import Data.String (String)
-textMining :: String
+textSample :: String
-textMining = "La fouille de textes ou « l'extraction de connaissances » dans les textes est une spécialisation de la fouille de données et fait partie du domaine de l'intelligence artificielle. Cette technique est souvent désignée sous l'anglicisme text mining. Elle désigne un ensemble de traitements informatiques consistant à extraire des connaissances selon un critère de nouveauté ou de similarité dans des textes produits par des humains pour des humains. Dans la pratique, cela revient à mettre en algorithme un modèle simplifié des théories linguistiques dans des systèmes informatiques d'apprentissage et de statistiques. Les disciplines impliquées sont donc la linguistique calculatoire, l'ingénierie des langues, l'apprentissage artificiel, les statistiques et l'informatique."
+textSample = "La fouille de textes ou « l'extraction de connaissances » dans les textes est une spécialisation de la fouille de données et fait partie du domaine de l'intelligence artificielle. Cette technique est souvent désignée sous l'anglicisme text mining. Elle désigne un ensemble de traitements informatiques consistant à extraire des connaissances selon un critère de nouveauté ou de similarité dans des textes produits par des humains pour des humains. Dans la pratique, cela revient à mettre en algorithme un modèle simplifié des théories linguistiques dans des systèmes informatiques d'apprentissage et de statistiques. Les disciplines impliquées sont donc la linguistique calculatoire, l'ingénierie des langues, l'apprentissage artificiel, les statistiques et l'informatique." <> "Je pense donc je suis."
--- a/src/Gargantext/Text/Samples/SP.hs
+++ b/src/Gargantext/Text/Samples/SP.hs
+{-|
+Module      : Gargantext.Text.Samples.SP
+Description : Sample of Spanish Text
+Copyright   : (c) CNRS, 2017 - present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+Source: Wikipedia
+Page  : text mining
+-}
 module Gargantext.Text.Samples.SP where
 import Data.String (String)

--- a/src/Gargantext/Text/Terms/Stop.hs
+++ b/src/Gargantext/Text/Terms/Stop.hs
@@ -18,6 +18,7 @@ Main type here is String.
 module Gargantext.Text.Terms.Stop
  where
+import GHC.Base (Functor)
 import Numeric.Probability.Distribution ((??))
 import qualified Numeric.Probability.Distribution as D
@@ -31,6 +32,7 @@ import qualified Data.Map.Strict as DM
 import Data.String (String)
 import Data.Text (pack, unpack)
+import Data.Tuple.Extra (both)
 import Gargantext.Prelude
 import Gargantext.Core (Lang(..), allLangs)
@@ -84,29 +86,46 @@ type LangProba = Map Lang Double
 ------------------------------------------------------------------------
 detectLangs :: String -> [(Lang, Double)]
-detectLangs s = DL.reverse $ DL.sortOn snd
+detectLangs s =  DL.reverse $ DL.sortOn snd  -- sort ascending on the score, then reverse: highest score first
-                           $ toList
+                            $ toList
-                           $ detect (wordsToBook [0..2] s) testEL
+                            $ detect (wordsToBook [0..2] s) eventLang  -- score the input's EventBook against the sample-derived eventLang
+part :: (Eq p, Fractional p) => p -> p -> p  -- quotient x / y that yields 0 when either argument is 0
+part 0 _ = 0  -- zero numerator: result is 0
+part _ 0 = 0  -- zero denominator: return 0 instead of dividing by zero
+part x y = x / y  -- otherwise the plain quotient
+toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>  -- rescale every second component by the sum of all second components
+                 t (a, b) -> t (a, b)
+toProba xs = map (\(a,b) -> (a, part b total)) xs  -- part yields 0 when b or total is 0; NOTE(review): no caller of toProba is visible in this diff
+  where
+    total = sum $ map snd xs  -- sum of the second components over the whole container
-textMining :: Lang -> String
+textSample :: Lang -> String  -- sample text for a language, delegating to the qualified EN / FR modules
-textMining EN = EN.textMining
+textSample EN = EN.textSample
-textMining FR = FR.textMining
+textSample FR = FR.textSample  -- NOTE(review): only EN and FR are matched; any other Lang constructor would fail the pattern match -- confirm allLangs contains only these
--textMining DE = DE.textMining
+--textSample DE = DE.textSample
--textMining SP = SP.textMining
+--textSample SP = SP.textSample
--textMining CH = CH.textMining
+--textSample CH = CH.textSample
 langWord :: Lang -> LangWord
-langWord l = LangWord l (textMining l)
+langWord l = LangWord l (textSample l)  -- pair the language with its sample text
-testEL :: EventLang
+eventLang :: EventLang  -- EventLang built from the sample text of every language in allLangs
-testEL = toEventLangs [0..2] [ langWord l | l <- allLangs ]
+eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]  -- same [0..2] list that detectLangs passes to wordsToBook
 detect :: EventBook -> EventLang -> LangProba
-detect (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= "  ") $ DM.toList mapFreq
+detect (EventBook mapFreq _) el = 
+  DM.unionsWith (+)  -- add up, per language, the contributions of all events
+  $ map DM.fromList  -- one Lang-keyed map per event
+  $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)  -- weight each language's toPrior value for event s by the event's count n in the input
+  $ filter (\x -> fst x /= "  ")  -- skip the event that is exactly two spaces
+  $ DM.toList mapFreq  -- (event, count) pairs from the input book's frequency map
 ------------------------------------------------------------------------
 -- | TODO: monoids
 type EventLang = Map Lang EventBook
 toEventLangs :: [Int] -> [LangWord] -> EventLang
 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
@@ -128,6 +147,28 @@ peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
    a = maybe 0 identity $ DM.lookup st mapFreq
    b = maybe 1 identity $ DM.lookup (length st) mapN
+peb' :: String -> EventBook -> (Freq, TotalFreq)  -- same lookups as peb, but returns the (count, total) pair instead of their quotient
+peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
+  where
+    a = maybe 0 identity $ DM.lookup st mapFreq  -- count recorded for st, 0 when st is absent
+    b = maybe 1 identity $ DM.lookup (length st) mapN  -- total recorded under st's length, 1 when absent
+------------------------------------------------------------------------
+toPrior :: String -> EventLang -> [(Lang, Double)]  -- per-language score of event s: pebLang counts passed through prior
+toPrior s el = prior $ pebLang s el
+pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]  -- peb' of st for every (language, book) entry of the EventLang map
+pebLang st = map (\(l,eb) -> (l, peb' st eb)) .  DM.toList
+------------------------------------------------------------------------
+prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]  -- per language: (freq / sum of all freqs)^99 * (freq / total)
+prior ps = zip ls $ zipWith (\x y -> x^99 * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')  -- x: this language's share of the summed freqs (0 via part when the sum is 0); NOTE(review): the exponent 99 is unexplained and the sum sits inside the per-element lambda
+                                (map (\(a,b) -> a / b) ps')  -- y: freq / total within the language
+  where
+    (ls, ps'') = DL.unzip ps  -- split the languages from their (Freq, TotalFreq) pairs
+    ps' = map (both fromIntegral) ps''  -- convert both counts with fromIntegral so they can be divided
+------------------------------------------------------------------------
 data EventBook = EventBook { events_freq :: Map String     Freq
                           , events_n    :: Map StringSize TotalFreq
                           }