[FEAT] Lang detect.

704fe86f · Alexandre Delanoë · 652975a0 · 704fe86f · 704fe86f · 704fe86f
Commit 704fe86f authored Sep 24, 2018 by Alexandre Delanoë
5 changed files
--- a/src/Gargantext/Core.hs
+++ b/src/Gargantext/Core.hs
@@ -12,6 +12,7 @@ Portability : POSIX
 module Gargantext.Core
  where
+import Gargantext.Prelude 
 ------------------------------------------------------------------------
 -- | Language of a Text
 -- For simplicity, we suppose text has an homogenous language
@@ -26,3 +27,4 @@ module Gargantext.Core
 --
 --  ... add your language and help us to implement it (:
 data Lang = EN | FR
+  deriving (Show, Eq, Ord)
--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -46,7 +46,7 @@ import Protolude ( Bool(True, False), Int, Int64, Double, Integer
                 , abs, min, max, maximum, minimum, return, snd, truncate
                 , (+), (*), (/), (-), (.), ($), (&), (**), (^), (<), (>), log
                 , Eq, (==), (>=), (<=), (<>), (/=)
-                 , (&&), (||), not, any
+                 , (&&), (||), not, any, all
                 , fst, snd, toS
                 , elem, die, mod, div, const, either
                 , curry, uncurry, repeat

--- a/src/Gargantext/Text/Metrics/Count.hs
+++ b/src/Gargantext/Text/Metrics/Count.hs
@@ -144,6 +144,9 @@ occurrences = occurrencesOn _terms_stem
 occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
 occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
+occurrencesWith :: (Foldable list, Ord k, Num a) => (b -> k) -> list b -> Map k a
+occurrencesWith f xs = foldl' (\x y -> insertWith (+) (f y) 1 x) empty xs
 -- TODO add groups and filter stops
 sumOcc :: Ord a => [Occ a] -> Occ a

--- a/src/Gargantext/Text/Metrics/FrequentItemSet.hs
+++ b/src/Gargantext/Text/Metrics/FrequentItemSet.hs
@@ -16,7 +16,7 @@ Domain Specific Language to manage Frequent Item Set (FIS)
 module Gargantext.Text.Metrics.FrequentItemSet
  ( Fis, Size(..)
  , occ_hlcm, cooc_hlcm
-  , all, between
+  , allFis, between
  , fisWithSize
  , fisWith
  , fisWithSizePoly
@@ -51,8 +51,8 @@ occ_hlcm = fisWithSize (Point 1)
 cooc_hlcm :: Frequency -> [[Item]] -> [Fis]
 cooc_hlcm = fisWithSize (Point 2)
-all :: Frequency -> [[Item]] -> [Fis]
+allFis :: Frequency -> [[Item]] -> [Fis]
-all = fisWith Nothing
+allFis = fisWith Nothing
 ------------------------------------------------------------------------
 between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]

--- a/src/Gargantext/Text/Terms/Stop.hs
+++ b/src/Gargantext/Text/Terms/Stop.hs
@@ -21,14 +21,23 @@ module Gargantext.Text.Terms.Stop
 import Numeric.Probability.Distribution ((??))
 import qualified Numeric.Probability.Distribution as D
-import Data.String (String)
 import Data.Char (toLower)
 import qualified Data.List as DL
-- import qualified Data.Map as M
+import Data.Maybe (maybe)
+import Data.Map.Strict (Map)
+import qualified Data.Map.Strict as DM
+import Data.String (String)
+import Data.Text (pack, unpack)
 import Gargantext.Prelude
+import Gargantext.Core (Lang(..))
+import Gargantext.Text.Terms.Mono (words)
+import Gargantext.Text.Metrics.Count (occurrencesWith)
+------------------------------------------------------------------------
 data Candidate = Candidate { stop :: Double
                           , noStop :: Double
 } deriving (Show)
@@ -48,11 +57,92 @@ blockOf n st = DL.concat $ DL.take n $ DL.repeat st
 -- | Chunks is the same function as splitBy in Context but for Strings,
 -- not Text (without pack and unpack operations that are not needed).
 chunks :: Int -> Int -> String -> [String]
-chunks n m = DL.take m . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
+chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
 allChunks :: [Int] -> Int -> String -> [String]
 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
+allChunks' :: [Int] -> Int -> String -> [[String]]
+allChunks' ns m st = map (\n -> chunks n m st) ns
+------------------------------------------------------------------------
+-- * Analyze candidate
+type StringSize = Int
+type TotalFreq  = Int
+type Freq       = Int
+type Word       = String
+data LangWord = LangWord Lang Word
+type LangProba = Map Lang Double
+------------------------------------------------------------------------
+estimeTest :: String -> LangProba
+estimeTest s = estime (wordsToBook [0..2] s) testEL
+testEL :: EventLang
+testEL = toEventLangs [0,1,2] [ LangWord EN "Lovely day. This day."
+                              , LangWord FR "Belle journée, j'y vais."
+                              , LangWord EN "Hello Sir, how are you doing? I am fine thank you, good bye"
+                              , LangWord FR "Bonjour Monsieur, comment allez-vous? Je vais bien merci."
+                              ]
+estime :: EventBook -> EventLang -> LangProba
+estime (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= "  ") $ DM.toList mapFreq
+------------------------------------------------------------------------
+-- | TODO: monoids
+type EventLang = Map Lang EventBook
+toEventLangs :: [Int] -> [LangWord] -> EventLang
+toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
+emptyEventLang :: [Int] -> EventLang
+emptyEventLang ns = toLang ns (LangWord FR "")
+toLang :: [Int] -> LangWord -> EventLang
+toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
+opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
+opLang f = DM.unionWith (op f)
+------------------------------------------------------------------------
+-- | TODO: monoids (but proba >= 0)
+peb :: String -> EventBook -> Double
+peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
+  where
+    a = maybe 0 identity $ DM.lookup st mapFreq
+    b = maybe 1 identity $ DM.lookup (length st) mapN
+data EventBook = EventBook { events_freq :: Map String     Freq
+                           , events_n    :: Map StringSize TotalFreq
+                           }
+                             deriving (Show)
+emptyEventBook :: [Int] -> EventBook
+emptyEventBook ns = wordToBook ns " "
+wordsToBook :: [Int] -> String -> EventBook
+wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
+  where
+    ws = map unpack $ words $ pack txt
+    eventsBook = map (wordToBook ns) ws
+wordToBook :: [Int] -> Word -> EventBook
+wordToBook ns txt = EventBook ef en
+  where
+    chks = allChunks' ns 10 txt
+    en = DM.fromList $ map (\(n,ns) -> (n, length ns)) $ zip ns chks
+    ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
+op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
+op f (EventBook ef1 en1)
+     (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
+                                     (DM.unionWith f en1 en2)
+------------------------------------------------------------------------
 ------------------------------------------------------------------------
 -- * Make the distributions
 makeDist :: [String] -> D.T Double String
@@ -65,7 +155,6 @@ candDist :: D.T Double String
 candDist = makeDist candList
 ------------------------------------------------------------------------
-- * Analyze candidate
 sumProba :: Num a => D.T a String -> [Char] -> a
 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
@@ -78,7 +167,8 @@ candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 ------------------------------------------------------------------------
 candList :: [String]
-candList = ["france", "alexandre", "mael", "constitution", "delanoe", "etats-unis", "associes", "car", "train", "spam"]
+candList = [ "france", "alexandre", "mael", "constitution"
+           , "etats-unis", "associes", "car", "train", "spam"]
 stopList :: [String]