Commit 07305554 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] Split Corpus size at import.

parent b074d137
...@@ -87,8 +87,8 @@ flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m ...@@ -87,8 +87,8 @@ flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
=> Username -> Text -> m CorpusId => Username -> Text -> m CorpusId
flowCorpusSearchInDatabase u q = do flowCorpusSearchInDatabase u q = do
(_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster ""
ids <- chunkAlong 10000 10000 <$> map fst <$> searchInDatabase cId (stemIt q) ids <- map fst <$> searchInDatabase cId (stemIt q)
flowCorpusUser u q ids flowCorpusUser u q [ids]
flowCorpusMaster :: FlowCmdM env ServantErr m => FileFormat -> FilePath -> m [[NodeId]] flowCorpusMaster :: FlowCmdM env ServantErr m => FileFormat -> FilePath -> m [[NodeId]]
...@@ -99,12 +99,7 @@ flowCorpusMaster ff fp = do ...@@ -99,12 +99,7 @@ flowCorpusMaster ff fp = do
-- ChunkAlong needed for big corpora -- ChunkAlong needed for big corpora
-- TODO add LANG as parameter -- TODO add LANG as parameter
-- TODO uniformize language of corpus -- TODO uniformize language of corpus
-- TODO ChunkAlong is not the right function here ids <- mapM insertMasterDocs $ splitEvery 10000 docs
-- chunkAlong 10 10 [1..15] == [1..10]
-- BUG: what about the rest of (divMod 15 10)?
-- TODO: chunkAlongNoRest or chunkAlongWithRest
-- default behavior: NoRest
ids <- mapM insertMasterDocs $ chunkAlong 10000 10000 docs
pure ids pure ids
......
...@@ -128,10 +128,15 @@ type Grain = Int ...@@ -128,10 +128,15 @@ type Grain = Int
type Step = Int type Step = Int
-- | Function to split a range into chunks -- | Function to split a range into chunks
-- if step == grain then linearity -- if step == grain then linearity (splitEvery)
-- elif step < grain then overlapping -- elif step < grain then overlapping
-- else dotted with holes -- else dotted with holes
-- TODO FIX BUG if Steps*Grain /= length l -- TODO FIX BUG if Steps*Grain /= length l
-- chunkAlong 10 10 [1..15] == [1..10]
-- BUG: what about the rest of (divMod 15 10)?
-- TODO: chunkAlongNoRest or chunkAlongWithRest
-- default behavior: NoRest
chunkAlong :: Eq a => Grain -> Step -> [a] -> [[a]] chunkAlong :: Eq a => Grain -> Step -> [a] -> [[a]]
chunkAlong a b l = case a >= length l of chunkAlong a b l = case a >= length l of
True -> [l] True -> [l]
......
...@@ -46,13 +46,16 @@ data FilterConfig = FilterConfig ...@@ -46,13 +46,16 @@ data FilterConfig = FilterConfig
, fc_defaultValue :: DefaultValue , fc_defaultValue :: DefaultValue
} }
filterCooc :: (Show t, Ord t) => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc :: (Show t, Ord t)
=> FilterConfig -> Map (t, t) Int -> Map (t, t) Int
filterCooc fc cc = (filterCooc' fc) ts cc filterCooc fc cc = (filterCooc' fc) ts cc
where where
ts = map _scored_terms $ takeSome fc $ coocScored cc ts = map _scored_terms $ takeSome fc $ coocScored cc
filterCooc' :: (Show t, Ord t) => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int filterCooc' :: (Show t, Ord t)
=> FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m =
-- trace ("coocScored " <> show ts) $ -- trace ("coocScored " <> show ts) $
foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m') foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
...@@ -64,7 +67,8 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = ...@@ -64,7 +67,8 @@ filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m =
-- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
-- each parts is then ordered by Inclusion/Exclusion -- each parts is then ordered by Inclusion/Exclusion
-- take n scored terms in each parts where n * SampleBins = MapListSize. -- take n scored terms in each parts where n * SampleBins = MapListSize.
takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t] takeSome :: Ord t
=> FilterConfig -> [Scored t] -> [Scored t]
takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
$ takeSample n m $ takeSample n m
$ L.take l' $ L.take l'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment