Commit 3cbf7e51 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[BAYES] detect lang (todo proba density).

parent 704fe86f
......@@ -167,6 +167,7 @@ executables:
- -with-rtsopts=-N
- -O2
- -Wmissing-signatures
- -Wcompat
dependencies:
- base
- containers
......
......@@ -12,7 +12,6 @@ Portability : POSIX
module Gargantext.Core
where
import Gargantext.Prelude
------------------------------------------------------------------------
-- | Language of a Text
-- For simplicity, we suppose text has an homogenous language
......@@ -26,5 +25,5 @@ import Gargantext.Prelude
-- - SP == spanish (not implemented yet)
--
-- ... add your language and help us to implement it (:
data Lang = EN | FR
data Lang = EN | FR | DE | SP | CH
deriving (Show, Eq, Ord)
......@@ -68,7 +68,7 @@ import Text.XML.HXT.DOM.Util (decimalStringToInt)
parserLang :: Lang -> DC.Lang
parserLang FR = DC.FR
parserLang EN = DC.EN
parserLang _ = panic "not implemented"
-- | Final Date parser API
-- IO can be avoided here:
......
......@@ -55,6 +55,7 @@ stem lang = DT.pack . N.stem lang' . DT.unpack
lang' = case lang of
EN -> N.English
FR -> N.French
_ -> panic $ DT.pack "not implemented yet"
......@@ -57,4 +57,4 @@ tokenTags' lang t = map tokens2tokensTags
group :: Lang -> [TokenTag] -> [TokenTag]
group EN = En.group
group FR = Fr.group
group _ = panic $ pack "group :: Lang not implemeted yet"
......@@ -124,6 +124,7 @@ corenlp' lang txt = do
EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
-- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
_ -> panic $ pack "not implemented yet"
url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
let request = setRequestBodyLBS (cs txt) url
httpJSON request
......
......@@ -33,7 +33,7 @@ module Gargantext.Text.Terms.Multi.RAKE (multiterms_rake, select, hardStopList)
where
import GHC.Real (round)
import Data.Text (Text, pack)
import Data.Text (Text)
import NLP.RAKE.Text
import Gargantext.Text.Terms.Stop (stopList)
......
......@@ -37,6 +37,12 @@ import Gargantext.Core (Lang(..))
import Gargantext.Text.Terms.Mono (words)
import Gargantext.Text.Metrics.Count (occurrencesWith)
import Gargantext.Text.Samples.FR as FR
import Gargantext.Text.Samples.EN as EN
import Gargantext.Text.Samples.DE as DE
import Gargantext.Text.Samples.SP as SP
import Gargantext.Text.Samples.CH as CH
------------------------------------------------------------------------
data Candidate = Candidate { stop :: Double
, noStop :: Double
......@@ -78,18 +84,20 @@ type LangProba = Map Lang Double
------------------------------------------------------------------------
estimeTest :: String -> LangProba
estimeTest s = estime (wordsToBook [0..2] s) testEL
detectLangs :: String -> LangProba
detectLangs s = detect (wordsToBook [0..2] s) testEL
testEL :: EventLang
testEL = toEventLangs [0,1,2] [ LangWord EN "Lovely day. This day."
, LangWord FR "Belle journée, j'y vais."
, LangWord EN "Hello Sir, how are you doing? I am fine thank you, good bye"
, LangWord FR "Bonjour Monsieur, comment allez-vous? Je vais bien merci."
testEL = toEventLangs [0..2] [ LangWord EN EN.textMining
, LangWord FR FR.textMining
, LangWord DE DE.textMining
, LangWord SP SP.textMining
, LangWord CH CH.textMining
]
estime :: EventBook -> EventLang -> LangProba
estime (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= " ") $ DM.toList mapFreq
detect :: EventBook -> EventLang -> LangProba
detect (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= " ") $ DM.toList mapFreq
------------------------------------------------------------------------
-- | TODO: monoids
......@@ -133,7 +141,7 @@ wordToBook :: [Int] -> Word -> EventBook
wordToBook ns txt = EventBook ef en
where
chks = allChunks' ns 10 txt
en = DM.fromList $ map (\(n,ns) -> (n, length ns)) $ zip ns chks
en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
......@@ -163,6 +171,7 @@ sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
(~?) ds x = (==x) ?? ds
------------------------------------------------------------------------
candidate :: [Char] -> Candidate
candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
------------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment