Commit c6b1adf0 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[DEV] Metrics before list creation.

parent 57cd486c
......@@ -85,6 +85,14 @@ flowCorpus userName ff fp corpusName = do
-- Master Flow
docs <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
-- ChunkAlong needed for big corpora
-- TODO add LANG as parameter
-- TODO uniformize language of corpus
-- TODO ChunkAlong is not the right function here
-- chunkAlong 10 10 [1..15] == [1..10]
-- BUG: what about the rest (divMod 15 10)?
-- but if temporary enables big corpora insert for tests
-- TODO: chunkAlongNoRest or chunkAlongWithRest
-- default: NoRest
ids <- mapM insertMasterDocs $ chunkAlong 10000 10000 docs
-- User Flow
......@@ -95,8 +103,10 @@ flowCorpus userName ff fp corpusName = do
-- User List Flow
(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
-- /!\ this extract NgramsTerms Only
_ngs <- sortTficf <$> getTficf' userCorpusId masterCorpusId (ngramsGroup EN 2)
printDebug "tficf size ngs" (length _ngs)
_ngs <- toTermList (isStopTerm . fst) <$> sortTficf
<$> getTficf' userCorpusId masterCorpusId (ngramsGroup EN 2)
--printDebug "tficf size ngs" (take 100 $ ngs)
-- TODO getNgramsElement of NgramsType...
ngs <- getNgramsElementsWithParentNodeId masterCorpusId
......@@ -347,7 +357,28 @@ ngrams2list' m = fromListWith (<>)
]
------------------------------------------------------------------------
toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
toTermList stop ns = map (toTermList' stop CandidateTerm) xs
<> map (toTermList' stop GraphTerm) ys
<> map (toTermList' stop CandidateTerm) zs
where
toTermList' stop' l n = case stop' n of
True -> (StopTerm, n)
False -> (l, n)
-- TODO use % of size of list
-- TODO user ML
xs = take a ns
ys = take b $ drop a xs
zs = drop b ys
a = 100
b = 1000
isStopTerm :: Text -> Bool
isStopTerm x = Text.length x < 3
------------------------------------------------------------------------
......
......@@ -53,8 +53,8 @@ ngramsGroup l n = Text.intercalate " "
sortTficf :: (Map Text (Double, Set Text))
-> [(Double, Set Text)]
sortTficf = List.sortOn fst . elems
-> [(Text, (Double, Set Text))]
sortTficf = List.sortOn (fst . snd) . toList
getTficf' :: UserCorpusId -> MasterCorpusId -> (Text -> Text)
......
......@@ -130,25 +130,31 @@ type Step = Int
-- if step == grain then linearity
-- elif step < grain then overlapping
-- else dotted with holes
-- TODO FIX BUG if Steps*Grain /= length l
chunkAlong :: Eq a => Grain -> Step -> [a] -> [[a]]
chunkAlong a b l = case a > 0 && b > 0 of
True -> chunkAlong_ a b l
False -> panic "ChunkAlong: Parameters should be > 0 and Grain > Step"
chunkAlong a b l = case a >= length l of
True -> [l]
False -> chunkAlong' a b l
chunkAlong_ :: Eq a => Int -> Int -> [a] -> [[a]]
chunkAlong_ a b l = filter (/= []) $ only (while dropAlong)
where
only = map (take a)
while = takeWhile (\x -> length x >= a)
dropAlong = L.scanl (\x _y -> drop b x) l ([1..] :: [Integer])
chunkAlong' :: Eq a => Grain -> Step -> [a] -> [[a]]
chunkAlong' a b l = case a > 0 && b > 0 of
True -> chunkAlong'' a b l
False -> panic "ChunkAlong: Parameters should be > 0 and Grain > Step"
chunkAlong'' :: Eq a => Int -> Int -> [a] -> [[a]]
chunkAlong'' a b l = filter (/= []) $ only (while dropAlong)
where
only = map (take a)
while = takeWhile (\x -> length x >= a)
dropAlong = L.scanl (\x _y -> drop b x) l ([1..] :: [Integer])
-- | Optimized version (Vector)
chunkAlong' :: Int -> Int -> V.Vector a -> V.Vector (V.Vector a)
chunkAlong' a b l = only (while dropAlong)
where
only = V.map (V.take a)
while = V.takeWhile (\x -> V.length x >= a)
dropAlong = V.scanl (\x _y -> V.drop b x) l (V.fromList [1..])
chunkAlongV :: Int -> Int -> V.Vector a -> V.Vector (V.Vector a)
chunkAlongV a b l = only (while dropAlong)
where
only = V.map (V.take a)
while = V.takeWhile (\x -> V.length x >= a)
dropAlong = V.scanl (\x _y -> V.drop b x) l (V.fromList [1..])
-- | TODO Inverse of chunk ? unchunkAlong ?
-- unchunkAlong :: Int -> Int -> [[a]] -> [a]
......
......@@ -37,10 +37,10 @@ sentences :: Text -> [Text]
sentences txt = map DT.pack $ segment $ DT.unpack txt
sentences' :: Text -> [Text]
sentences' txt = split isStop txt
sentences' txt = split isCharStop txt
isStop :: Char -> Bool
isStop c = c `elem` ['.','?','!']
isCharStop :: Char -> Bool
isCharStop c = c `elem` ['.','?','!']
unsentences :: [Text] -> Text
unsentences txts = DT.intercalate " " txts
......
......@@ -14,6 +14,7 @@ TFICF is a generalization of [TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93id
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Metrics.TFICF ( TFICF
, TficfContext(..)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment