Commit 3cbf7e51 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[BAYES] detect lang (todo proba density).

parent 704fe86f
...@@ -167,6 +167,7 @@ executables: ...@@ -167,6 +167,7 @@ executables:
- -with-rtsopts=-N - -with-rtsopts=-N
- -O2 - -O2
- -Wmissing-signatures - -Wmissing-signatures
- -Wcompat
dependencies: dependencies:
- base - base
- containers - containers
......
...@@ -12,7 +12,6 @@ Portability : POSIX ...@@ -12,7 +12,6 @@ Portability : POSIX
module Gargantext.Core module Gargantext.Core
where where
import Gargantext.Prelude
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Language of a Text -- | Language of a Text
-- For simplicity, we suppose text has an homogenous language -- For simplicity, we suppose text has an homogenous language
...@@ -26,5 +25,5 @@ import Gargantext.Prelude ...@@ -26,5 +25,5 @@ import Gargantext.Prelude
-- - SP == spanish (not implemented yet) -- - SP == spanish (not implemented yet)
-- --
-- ... add your language and help us to implement it (: -- ... add your language and help us to implement it (:
data Lang = EN | FR data Lang = EN | FR | DE | SP | CH
deriving (Show, Eq, Ord) deriving (Show, Eq, Ord)
...@@ -68,7 +68,7 @@ import Text.XML.HXT.DOM.Util (decimalStringToInt) ...@@ -68,7 +68,7 @@ import Text.XML.HXT.DOM.Util (decimalStringToInt)
parserLang :: Lang -> DC.Lang parserLang :: Lang -> DC.Lang
parserLang FR = DC.FR parserLang FR = DC.FR
parserLang EN = DC.EN parserLang EN = DC.EN
parserLang _ = panic "not implemented"
-- | Final Date parser API -- | Final Date parser API
-- IO can be avoided here: -- IO can be avoided here:
......
...@@ -55,6 +55,7 @@ stem lang = DT.pack . N.stem lang' . DT.unpack ...@@ -55,6 +55,7 @@ stem lang = DT.pack . N.stem lang' . DT.unpack
lang' = case lang of lang' = case lang of
EN -> N.English EN -> N.English
FR -> N.French FR -> N.French
_ -> panic $ DT.pack "not implemented yet"
...@@ -57,4 +57,4 @@ tokenTags' lang t = map tokens2tokensTags ...@@ -57,4 +57,4 @@ tokenTags' lang t = map tokens2tokensTags
group :: Lang -> [TokenTag] -> [TokenTag] group :: Lang -> [TokenTag] -> [TokenTag]
group EN = En.group group EN = En.group
group FR = Fr.group group FR = Fr.group
group _ = panic $ pack "group :: Lang not implemeted yet"
...@@ -124,6 +124,7 @@ corenlp' lang txt = do ...@@ -124,6 +124,7 @@ corenlp' lang txt = do
EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}" EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
-- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}" -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}" FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
_ -> panic $ pack "not implemented yet"
url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
let request = setRequestBodyLBS (cs txt) url let request = setRequestBodyLBS (cs txt) url
httpJSON request httpJSON request
......
...@@ -33,7 +33,7 @@ module Gargantext.Text.Terms.Multi.RAKE (multiterms_rake, select, hardStopList) ...@@ -33,7 +33,7 @@ module Gargantext.Text.Terms.Multi.RAKE (multiterms_rake, select, hardStopList)
where where
import GHC.Real (round) import GHC.Real (round)
import Data.Text (Text, pack) import Data.Text (Text)
import NLP.RAKE.Text import NLP.RAKE.Text
import Gargantext.Text.Terms.Stop (stopList) import Gargantext.Text.Terms.Stop (stopList)
......
...@@ -37,6 +37,12 @@ import Gargantext.Core (Lang(..)) ...@@ -37,6 +37,12 @@ import Gargantext.Core (Lang(..))
import Gargantext.Text.Terms.Mono (words) import Gargantext.Text.Terms.Mono (words)
import Gargantext.Text.Metrics.Count (occurrencesWith) import Gargantext.Text.Metrics.Count (occurrencesWith)
import Gargantext.Text.Samples.FR as FR
import Gargantext.Text.Samples.EN as EN
import Gargantext.Text.Samples.DE as DE
import Gargantext.Text.Samples.SP as SP
import Gargantext.Text.Samples.CH as CH
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Candidate = Candidate { stop :: Double data Candidate = Candidate { stop :: Double
, noStop :: Double , noStop :: Double
...@@ -78,18 +84,20 @@ type LangProba = Map Lang Double ...@@ -78,18 +84,20 @@ type LangProba = Map Lang Double
------------------------------------------------------------------------ ------------------------------------------------------------------------
estimeTest :: String -> LangProba
estimeTest s = estime (wordsToBook [0..2] s) testEL detectLangs :: String -> LangProba
detectLangs s = detect (wordsToBook [0..2] s) testEL
testEL :: EventLang testEL :: EventLang
testEL = toEventLangs [0,1,2] [ LangWord EN "Lovely day. This day." testEL = toEventLangs [0..2] [ LangWord EN EN.textMining
, LangWord FR "Belle journée, j'y vais." , LangWord FR FR.textMining
, LangWord EN "Hello Sir, how are you doing? I am fine thank you, good bye" , LangWord DE DE.textMining
, LangWord FR "Bonjour Monsieur, comment allez-vous? Je vais bien merci." , LangWord SP SP.textMining
, LangWord CH CH.textMining
] ]
estime :: EventBook -> EventLang -> LangProba detect :: EventBook -> EventLang -> LangProba
estime (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= " ") $ DM.toList mapFreq detect (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= " ") $ DM.toList mapFreq
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | TODO: monoids -- | TODO: monoids
...@@ -133,7 +141,7 @@ wordToBook :: [Int] -> Word -> EventBook ...@@ -133,7 +141,7 @@ wordToBook :: [Int] -> Word -> EventBook
wordToBook ns txt = EventBook ef en wordToBook ns txt = EventBook ef en
where where
chks = allChunks' ns 10 txt chks = allChunks' ns 10 txt
en = DM.fromList $ map (\(n,ns) -> (n, length ns)) $ zip ns chks en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
...@@ -163,6 +171,7 @@ sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x ...@@ -163,6 +171,7 @@ sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
(~?) ds x = (==x) ?? ds (~?) ds x = (==x) ?? ds
------------------------------------------------------------------------ ------------------------------------------------------------------------
candidate :: [Char] -> Candidate
candidate x = Candidate (sumProba stopDist x) (sumProba candDist x) candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
------------------------------------------------------------------------ ------------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment