Commit c6b1adf0 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[DEV] Metrics before list creation.

parent 57cd486c
...@@ -85,6 +85,14 @@ flowCorpus userName ff fp corpusName = do ...@@ -85,6 +85,14 @@ flowCorpus userName ff fp corpusName = do
-- Master Flow -- Master Flow
docs <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp) docs <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
-- ChunkAlong needed for big corpora -- ChunkAlong needed for big corpora
-- TODO add LANG as parameter
-- TODO uniformize language of corpus
-- TODO ChunkAlong is not the right function here
-- chunkAlong 10 10 [1..15] == [1..10]
-- BUG: what about the rest (divMod 15 10)?
-- but if temporary enables big corpora insert for tests
-- TODO: chunkAlongNoRest or chunkAlongWithRest
-- default: NoRest
ids <- mapM insertMasterDocs $ chunkAlong 10000 10000 docs ids <- mapM insertMasterDocs $ chunkAlong 10000 10000 docs
-- User Flow -- User Flow
...@@ -95,8 +103,10 @@ flowCorpus userName ff fp corpusName = do ...@@ -95,8 +103,10 @@ flowCorpus userName ff fp corpusName = do
-- User List Flow -- User List Flow
(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
-- /!\ this extract NgramsTerms Only -- /!\ this extract NgramsTerms Only
_ngs <- sortTficf <$> getTficf' userCorpusId masterCorpusId (ngramsGroup EN 2) _ngs <- toTermList (isStopTerm . fst) <$> sortTficf
printDebug "tficf size ngs" (length _ngs) <$> getTficf' userCorpusId masterCorpusId (ngramsGroup EN 2)
--printDebug "tficf size ngs" (take 100 $ ngs)
-- TODO getNgramsElement of NgramsType... -- TODO getNgramsElement of NgramsType...
ngs <- getNgramsElementsWithParentNodeId masterCorpusId ngs <- getNgramsElementsWithParentNodeId masterCorpusId
...@@ -347,7 +357,28 @@ ngrams2list' m = fromListWith (<>) ...@@ -347,7 +357,28 @@ ngrams2list' m = fromListWith (<>)
] ]
------------------------------------------------------------------------
toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
toTermList stop ns = map (toTermList' stop CandidateTerm) xs
<> map (toTermList' stop GraphTerm) ys
<> map (toTermList' stop CandidateTerm) zs
where
toTermList' stop' l n = case stop' n of
True -> (StopTerm, n)
False -> (l, n)
-- TODO use % of size of list
-- TODO user ML
xs = take a ns
ys = take b $ drop a xs
zs = drop b ys
a = 100
b = 1000
isStopTerm :: Text -> Bool
isStopTerm x = Text.length x < 3
------------------------------------------------------------------------ ------------------------------------------------------------------------
......
...@@ -53,8 +53,8 @@ ngramsGroup l n = Text.intercalate " " ...@@ -53,8 +53,8 @@ ngramsGroup l n = Text.intercalate " "
sortTficf :: (Map Text (Double, Set Text)) sortTficf :: (Map Text (Double, Set Text))
-> [(Double, Set Text)] -> [(Text, (Double, Set Text))]
sortTficf = List.sortOn fst . elems sortTficf = List.sortOn (fst . snd) . toList
getTficf' :: UserCorpusId -> MasterCorpusId -> (Text -> Text) getTficf' :: UserCorpusId -> MasterCorpusId -> (Text -> Text)
......
...@@ -130,25 +130,31 @@ type Step = Int ...@@ -130,25 +130,31 @@ type Step = Int
-- if step == grain then linearity -- if step == grain then linearity
-- elif step < grain then overlapping -- elif step < grain then overlapping
-- else dotted with holes -- else dotted with holes
-- TODO FIX BUG if Steps*Grain /= length l
chunkAlong :: Eq a => Grain -> Step -> [a] -> [[a]] chunkAlong :: Eq a => Grain -> Step -> [a] -> [[a]]
chunkAlong a b l = case a > 0 && b > 0 of chunkAlong a b l = case a >= length l of
True -> chunkAlong_ a b l True -> [l]
False -> panic "ChunkAlong: Parameters should be > 0 and Grain > Step" False -> chunkAlong' a b l
chunkAlong_ :: Eq a => Int -> Int -> [a] -> [[a]] chunkAlong' :: Eq a => Grain -> Step -> [a] -> [[a]]
chunkAlong_ a b l = filter (/= []) $ only (while dropAlong) chunkAlong' a b l = case a > 0 && b > 0 of
where True -> chunkAlong'' a b l
only = map (take a) False -> panic "ChunkAlong: Parameters should be > 0 and Grain > Step"
while = takeWhile (\x -> length x >= a)
dropAlong = L.scanl (\x _y -> drop b x) l ([1..] :: [Integer]) chunkAlong'' :: Eq a => Int -> Int -> [a] -> [[a]]
chunkAlong'' a b l = filter (/= []) $ only (while dropAlong)
where
only = map (take a)
while = takeWhile (\x -> length x >= a)
dropAlong = L.scanl (\x _y -> drop b x) l ([1..] :: [Integer])
-- | Optimized version (Vector) -- | Optimized version (Vector)
chunkAlong' :: Int -> Int -> V.Vector a -> V.Vector (V.Vector a) chunkAlongV :: Int -> Int -> V.Vector a -> V.Vector (V.Vector a)
chunkAlong' a b l = only (while dropAlong) chunkAlongV a b l = only (while dropAlong)
where where
only = V.map (V.take a) only = V.map (V.take a)
while = V.takeWhile (\x -> V.length x >= a) while = V.takeWhile (\x -> V.length x >= a)
dropAlong = V.scanl (\x _y -> V.drop b x) l (V.fromList [1..]) dropAlong = V.scanl (\x _y -> V.drop b x) l (V.fromList [1..])
-- | TODO Inverse of chunk ? unchunkAlong ? -- | TODO Inverse of chunk ? unchunkAlong ?
-- unchunkAlong :: Int -> Int -> [[a]] -> [a] -- unchunkAlong :: Int -> Int -> [[a]] -> [a]
......
...@@ -37,10 +37,10 @@ sentences :: Text -> [Text] ...@@ -37,10 +37,10 @@ sentences :: Text -> [Text]
sentences txt = map DT.pack $ segment $ DT.unpack txt sentences txt = map DT.pack $ segment $ DT.unpack txt
sentences' :: Text -> [Text] sentences' :: Text -> [Text]
sentences' txt = split isStop txt sentences' txt = split isCharStop txt
isStop :: Char -> Bool isCharStop :: Char -> Bool
isStop c = c `elem` ['.','?','!'] isCharStop c = c `elem` ['.','?','!']
unsentences :: [Text] -> Text unsentences :: [Text] -> Text
unsentences txts = DT.intercalate " " txts unsentences txts = DT.intercalate " " txts
......
...@@ -14,6 +14,7 @@ TFICF is a generalization of [TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93id ...@@ -14,6 +14,7 @@ TFICF is a generalization of [TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93id
-} -}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Metrics.TFICF ( TFICF module Gargantext.Text.Metrics.TFICF ( TFICF
, TficfContext(..) , TficfContext(..)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment