Commit 7d6d74c4 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[REFACTO] ngrams unsupervised.

parent 0b09d71e
Pipeline #487 failed with stage
......@@ -46,7 +46,7 @@ main = do
createUsers = insertUsersDemo
let cmd :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
cmd = flowCorpusFile (cs user) (cs name) (read limit :: Int) (Unsupervised EN 5 Nothing) CsvHalFormat corpusPath
cmd = flowCorpusFile (cs user) (cs name) (read limit :: Int) (Unsupervised EN 5 1 Nothing) CsvHalFormat corpusPath
{-
let debatCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
debatCorpus = do
......
......@@ -190,7 +190,7 @@ insertMasterDocs c lang hs = do
let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
let
fixLang (Unsupervised l n m) = Unsupervised l n m'
fixLang (Unsupervised l n s m) = Unsupervised l n s m'
where
m' = case m of
Nothing -> Just $ buildTries n (fmap toToken $ uniText $ Text.intercalate " " $ List.concat $ map hasText documentsWithId)
......
......@@ -59,7 +59,8 @@ data TermType lang
| Multi { _tt_lang :: lang }
| MonoMulti { _tt_lang :: lang }
| Unsupervised { _tt_lang :: lang
, _tt_size :: Int
, _tt_windoSize :: Int
, _tt_ngramsSize :: Int
, _tt_model :: Maybe (Tries Token ())
}
makeLenses ''TermType
......@@ -74,7 +75,7 @@ makeLenses ''TermType
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
extractTerms (Unsupervised l n m) xs = mapM (terms (Unsupervised l n (Just m'))) xs
extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
where
m' = case m of
Just m''-> m''
......@@ -94,7 +95,7 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (Unsupervised lang n m) txt = termsUnsupervised m' n lang txt
terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
where
m' = maybe (newTries n txt) identity m
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
......@@ -112,15 +113,20 @@ isPunctuation x = List.elem x $ (Text.pack . pure)
-- language agnostic extraction
-- TODO: remove IO
-- TODO: newtype BlockText
termsUnsupervised :: Tries Token () -> Int -> Lang -> Text -> IO [Terms]
termsUnsupervised m n l =
type WindowSize = Int
type MinNgramSize = Int
termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
termsUnsupervised (Unsupervised l n s m) =
pure
. map (text2term l)
. List.nub
. (List.filter (\l' -> List.length l' > 1))
. (List.filter (\l' -> List.length l' > s))
. List.concat
. mainEleveWith m n
. mainEleveWith (maybe (panic "no model") identity m) n
. uniText
termsUnsupervised _ = undefined
newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment