Commit 7d6d74c4 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[REFACTO] ngrams unsupervised.

parent 0b09d71e
...@@ -46,7 +46,7 @@ main = do ...@@ -46,7 +46,7 @@ main = do
createUsers = insertUsersDemo createUsers = insertUsersDemo
let cmd :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId let cmd :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
cmd = flowCorpusFile (cs user) (cs name) (read limit :: Int) (Unsupervised EN 5 Nothing) CsvHalFormat corpusPath cmd = flowCorpusFile (cs user) (cs name) (read limit :: Int) (Unsupervised EN 5 1 Nothing) CsvHalFormat corpusPath
{- {-
let debatCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId let debatCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
debatCorpus = do debatCorpus = do
......
...@@ -190,7 +190,7 @@ insertMasterDocs c lang hs = do ...@@ -190,7 +190,7 @@ insertMasterDocs c lang hs = do
let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs') let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
let let
fixLang (Unsupervised l n m) = Unsupervised l n m' fixLang (Unsupervised l n s m) = Unsupervised l n s m'
where where
m' = case m of m' = case m of
Nothing -> Just $ buildTries n (fmap toToken $ uniText $ Text.intercalate " " $ List.concat $ map hasText documentsWithId) Nothing -> Just $ buildTries n (fmap toToken $ uniText $ Text.intercalate " " $ List.concat $ map hasText documentsWithId)
......
...@@ -59,7 +59,8 @@ data TermType lang ...@@ -59,7 +59,8 @@ data TermType lang
| Multi { _tt_lang :: lang } | Multi { _tt_lang :: lang }
| MonoMulti { _tt_lang :: lang } | MonoMulti { _tt_lang :: lang }
| Unsupervised { _tt_lang :: lang | Unsupervised { _tt_lang :: lang
, _tt_size :: Int , _tt_windoSize :: Int
, _tt_ngramsSize :: Int
, _tt_model :: Maybe (Tries Token ()) , _tt_model :: Maybe (Tries Token ())
} }
makeLenses ''TermType makeLenses ''TermType
...@@ -74,7 +75,7 @@ makeLenses ''TermType ...@@ -74,7 +75,7 @@ makeLenses ''TermType
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms]) --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]] extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
extractTerms (Unsupervised l n m) xs = mapM (terms (Unsupervised l n (Just m'))) xs extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
where where
m' = case m of m' = case m of
Just m''-> m'' Just m''-> m''
...@@ -94,7 +95,7 @@ terms :: TermType Lang -> Text -> IO [Terms] ...@@ -94,7 +95,7 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoTerms lang txt terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (Unsupervised lang n m) txt = termsUnsupervised m' n lang txt terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
where where
m' = maybe (newTries n txt) identity m m' = maybe (newTries n txt) identity m
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
...@@ -112,15 +113,20 @@ isPunctuation x = List.elem x $ (Text.pack . pure) ...@@ -112,15 +113,20 @@ isPunctuation x = List.elem x $ (Text.pack . pure)
-- language agnostic extraction -- language agnostic extraction
-- TODO: remove IO -- TODO: remove IO
-- TODO: newtype BlockText -- TODO: newtype BlockText
termsUnsupervised :: Tries Token () -> Int -> Lang -> Text -> IO [Terms]
termsUnsupervised m n l = type WindowSize = Int
type MinNgramSize = Int
termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
termsUnsupervised (Unsupervised l n s m) =
pure pure
. map (text2term l) . map (text2term l)
. List.nub . List.nub
. (List.filter (\l' -> List.length l' > 1)) . (List.filter (\l' -> List.length l' > s))
. List.concat . List.concat
. mainEleveWith m n . mainEleveWith (maybe (panic "no model") identity m) n
. uniText . uniText
termsUnsupervised _ = undefined
newTries :: Int -> Text -> Tries Token () newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t) newTries n t = buildTries n (fmap toToken $ uniText t)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment