[DEV] Metrics before list creation.

c6b1adf0 · Alexandre Delanoë · 57cd486c · c6b1adf0 · c6b1adf0 · c6b1adf0
Commit c6b1adf0 authored Mar 04, 2019 by Alexandre Delanoë
5 changed files
--- a/src/Gargantext/Database/Flow.hs
+++ b/src/Gargantext/Database/Flow.hs
@@ -85,6 +85,14 @@ flowCorpus userName ff fp corpusName = do
  -- Master Flow
  docs <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
  -- ChunkAlong needed for big corpora
+  -- TODO add LANG as parameter
+  -- TODO uniformize language of corpus
+                                   -- TODO ChunkAlong is not the right function here
+                                   -- chunkAlong 10 10 [1..15] == [1..10]
+                                   -- BUG: what about the rest (divMod 15 10)?
+                                   -- but if temporary enables big corpora insert for tests
+                                   -- TODO: chunkAlongNoRest or chunkAlongWithRest
+                                   -- default: NoRest
  ids  <- mapM insertMasterDocs $ chunkAlong 10000 10000 docs

  -- User Flow
@@ -95,8 +103,10 @@ flowCorpus userName ff fp corpusName = do
  -- User List Flow
  (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
  -- /!\ this extract NgramsTerms Only
-  _ngs <- sortTficf <$> getTficf' userCorpusId masterCorpusId (ngramsGroup EN 2)
-  printDebug "tficf size ngs" (length _ngs)
+  _ngs <- toTermList (isStopTerm . fst) <$> sortTficf
+                                       <$> getTficf' userCorpusId masterCorpusId (ngramsGroup EN 2)
+  
+  --printDebug "tficf size ngs" (take 100 $ ngs)

  -- TODO getNgramsElement of NgramsType...
  ngs <- getNgramsElementsWithParentNodeId masterCorpusId
@@ -347,7 +357,28 @@ ngrams2list' m = fromListWith (<>)
  ]


+------------------------------------------------------------------------

+toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
+toTermList stop ns =  map (toTermList' stop CandidateTerm) xs
+                <> map (toTermList' stop GraphTerm)     ys
+                <> map (toTermList' stop CandidateTerm) zs
+    where
+      toTermList' stop' l n = case stop' n of
+          True  -> (StopTerm, n)
+          False -> (l, n)
+      
+      -- TODO use % of size of list
+      -- TODO user ML
+      xs = take a ns
+      ys = take b $ drop a xs
+      zs = drop b ys
+
+      a = 100
+      b = 1000
+
+isStopTerm :: Text -> Bool
+isStopTerm x = Text.length x < 3

 ------------------------------------------------------------------------


--- a/src/Gargantext/Database/Metrics/NgramsByNode.hs
+++ b/src/Gargantext/Database/Metrics/NgramsByNode.hs
@@ -53,8 +53,8 @@ ngramsGroup l n = Text.intercalate " "


 sortTficf :: (Map Text (Double, Set Text))
-          -> [(Double, Set Text)]
-sortTficf  = List.sortOn fst . elems
+          -> [(Text, (Double, Set Text))]
+sortTficf  = List.sortOn (fst . snd) . toList


 getTficf' :: UserCorpusId -> MasterCorpusId -> (Text -> Text)

--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -130,25 +130,31 @@ type Step  = Int
 -- if   step == grain then linearity
 -- elif step < grain then overlapping
 -- else dotted with holes
+-- TODO FIX BUG if Steps*Grain /= length l
 chunkAlong :: Eq a => Grain -> Step -> [a] -> [[a]]
-chunkAlong a b l = case a > 0 && b > 0 of
-    True  -> chunkAlong_ a b l
-    False -> panic "ChunkAlong: Parameters should be > 0 and Grain > Step"
+chunkAlong a b l = case a >= length l of
+  True  -> [l]
+  False -> chunkAlong' a b l

-chunkAlong_ :: Eq a => Int -> Int -> [a] -> [[a]]
-chunkAlong_ a b l = filter (/= []) $ only (while dropAlong)
-    where
-        only      = map       (take a)
-        while     = takeWhile (\x -> length x >= a)
-        dropAlong = L.scanl   (\x _y -> drop b x) l ([1..] :: [Integer])
+chunkAlong' :: Eq a => Grain -> Step -> [a] -> [[a]]
+chunkAlong' a b l = case a > 0 && b > 0 of
+  True  -> chunkAlong'' a b l
+  False -> panic "ChunkAlong: Parameters should be > 0 and Grain > Step"
+
+chunkAlong'' :: Eq a => Int -> Int -> [a] -> [[a]]
+chunkAlong'' a b l = filter (/= []) $ only (while dropAlong)
+  where
+    only      = map       (take a)
+    while     = takeWhile (\x -> length x >= a)
+    dropAlong = L.scanl   (\x _y -> drop b x) l ([1..] :: [Integer])

 -- | Optimized version (Vector)
-chunkAlong' :: Int -> Int -> V.Vector a -> V.Vector (V.Vector a)
-chunkAlong' a b l = only (while  dropAlong)
-    where
-        only      = V.map       (V.take a)
-        while     = V.takeWhile (\x -> V.length x >= a)
-        dropAlong = V.scanl     (\x _y -> V.drop b x) l (V.fromList [1..])
+chunkAlongV :: Int -> Int -> V.Vector a -> V.Vector (V.Vector a)
+chunkAlongV a b l = only (while  dropAlong)
+  where
+    only      = V.map       (V.take a)
+    while     = V.takeWhile (\x -> V.length x >= a)
+    dropAlong = V.scanl     (\x _y -> V.drop b x) l (V.fromList [1..])

 -- | TODO Inverse of chunk ? unchunkAlong ?
 -- unchunkAlong :: Int -> Int -> [[a]] -> [a]

--- a/src/Gargantext/Text.hs
+++ b/src/Gargantext/Text.hs
@@ -37,10 +37,10 @@ sentences :: Text -> [Text]
 sentences txt = map DT.pack $ segment $ DT.unpack txt

 sentences' :: Text -> [Text]
-sentences' txt = split isStop txt
+sentences' txt = split isCharStop txt

-isStop :: Char -> Bool
-isStop c = c `elem` ['.','?','!']
+isCharStop :: Char -> Bool
+isCharStop c = c `elem` ['.','?','!']

 unsentences :: [Text] -> Text
 unsentences txts = DT.intercalate " " txts

--- a/src/Gargantext/Text/Metrics/TFICF.hs
+++ b/src/Gargantext/Text/Metrics/TFICF.hs
@@ -14,6 +14,7 @@ TFICF is a generalization of [TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93id
 -}

 {-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}

 module Gargantext.Text.Metrics.TFICF ( TFICF
                                     , TficfContext(..)