Commit 07305554 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] Split Corpus size at import.

parent b074d137
......@@ -87,8 +87,8 @@ flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
=> Username -> Text -> m CorpusId
flowCorpusSearchInDatabase u q = do
(_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster ""
ids <- chunkAlong 10000 10000 <$> map fst <$> searchInDatabase cId (stemIt q)
flowCorpusUser u q ids
ids <- map fst <$> searchInDatabase cId (stemIt q)
flowCorpusUser u q [ids]
flowCorpusMaster :: FlowCmdM env ServantErr m => FileFormat -> FilePath -> m [[NodeId]]
......@@ -99,12 +99,7 @@ flowCorpusMaster ff fp = do
-- ChunkAlong needed for big corpora
-- TODO add LANG as parameter
-- TODO uniformize language of corpus
-- TODO ChunkAlong is not the right function here
-- chunkAlong 10 10 [1..15] == [1..10]
-- BUG: what about the rest of (divMod 15 10)?
-- TODO: chunkAlongNoRest or chunkAlongWithRest
-- default behavior: NoRest
ids <- mapM insertMasterDocs $ chunkAlong 10000 10000 docs
ids <- mapM insertMasterDocs $ splitEvery 10000 docs
pure ids
......
......@@ -128,10 +128,15 @@ type Grain = Int
type Step = Int
-- | Function to split a range into chunks
-- if step == grain then linearity
-- if step == grain then linearity (splitEvery)
-- elif step < grain then overlapping
-- else dotted with holes
-- TODO FIX BUG if Steps*Grain /= length l
-- chunkAlong 10 10 [1..15] == [1..10]
-- BUG: what about the rest of (divMod 15 10)?
-- TODO: chunkAlongNoRest or chunkAlongWithRest
-- default behavior: NoRest
chunkAlong :: Eq a => Grain -> Step -> [a] -> [[a]]
chunkAlong a b l = case a >= length l of
True -> [l]
......
......@@ -46,13 +46,16 @@ data FilterConfig = FilterConfig
, fc_defaultValue :: DefaultValue
}
filterCooc :: (Show t, Ord t) => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc :: (Show t, Ord t)
=> FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc
where
ts = map _scored_terms $ takeSome fc $ coocScored cc
filterCooc' :: (Show t, Ord t) => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' :: (Show t, Ord t)
=> FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m =
-- trace ("coocScored " <> show ts) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
......@@ -64,7 +67,8 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m =
-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
-- each parts is then ordered by Inclusion/Exclusion
-- take n scored terms in each parts where n * SampleBins = MapListSize.
takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
takeSome :: Ord t
=> FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
$ takeSample n m
$ L.take l'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment