[METRICS] FACTO

965531a2 · Alexandre Delanoë · a22e2dc6 · 965531a2 · 965531a2 · 965531a2
Commit 965531a2 authored Jul 15, 2019 by Alexandre Delanoë
5 changed files
--- a/bin/gargantext-import/Main.hs
+++ b/bin/gargantext-import/Main.hs
@@ -48,8 +48,9 @@ main = do
  let
    --tt = (Unsupervised EN 6 0 Nothing)
    tt = (Multi EN)
+    format = WOS -- CsvGargV3
    cmd :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
-    cmd = flowCorpusFile (cs user) (cs name) (read limit :: Int) tt  CsvGargV3 corpusPath
+    cmd = flowCorpusFile (cs user) (cs name) (read limit :: Int) tt  format corpusPath
  {-
  let debatCorpus :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
      debatCorpus = do

--- a/src/Gargantext/Text/List.hs
+++ b/src/Gargantext/Text/List.hs
@@ -60,8 +60,8 @@ data StopSize = StopSize {unStopSize :: Int}
 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
                 -> Cmd err (Map NgramsType [NgramsElement])
 buildNgramsLists l n m s uCid mCid = do
-  ngTerms     <- buildNgramsTermsList l n m s uCid mCid
-  --ngTerms     <- buildNgramsTermsList' uCid (ngramsGroup l n m) (isStopTerm s . fst) 500 50
+  --ngTerms     <- buildNgramsTermsList l n m s uCid mCid
+  ngTerms     <- buildNgramsTermsList' uCid (ngramsGroup l n m) (isStopTerm s . fst) 500 50
  othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
  pure $ Map.unions $ othersTerms <> [ngTerms]


--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -102,13 +102,26 @@ takeScored listSize incSize = both (map _scored_terms)
 linearTakes :: (Ord b1, Ord b2)
            => GraphListSize -> InclusionSize
            -> (a -> b2) -> (a -> b1) -> [a] -> ([a],[a])
-linearTakes gls incSize speGen incExc = (List.splitAt gls)
+linearTakes mls incSize speGen incExc = (List.splitAt mls)
                      . List.concat
                      . map (take $ round
-                                  $ (fromIntegral gls     :: Double)
+                                  $ (fromIntegral mls     :: Double)
                                  / (fromIntegral incSize :: Double)
                             )
                      . map (sortOn speGen)
                      . splitEvery incSize
+                      . take 5000
+                      . takePercent (0.70)
                      . sortOn incExc

+takePercent :: Double -> [a] -> [a]
+takePercent l xs = List.take l' xs
+  where
+    l' = round $ l * (fromIntegral $ List.length xs)
+
+splitTake :: (Int, a -> Bool) -> (Int, a -> Bool) -> [a] -> ([a], [a])
+splitTake (a, af) (b, bf) xs = (mpa <> mpb, ca <> cb)
+  where
+    (mpa, ca) = List.splitAt a $ List.filter af xs
+    (mpb, cb) = List.splitAt b $ List.filter bf xs
+
--- a/src/Gargantext/Text/Terms.hs
+++ b/src/Gargantext/Text/Terms.hs
@@ -129,10 +129,9 @@ termsUnsupervised _ = undefined
 newTries :: Int -> Text -> Tries Token ()
 newTries n t = buildTries n (fmap toToken $ uniText t)

+-- | TODO removing long terms > 24
 uniText :: Text -> [[Text]]
-uniText = 
- --  map (map (Text.toLower))
-         map (List.filter (not . isPunctuation))
+uniText = map (List.filter (not . isPunctuation))
        . map tokenize
        . sentences       -- | TODO get sentences according to lang
        . Text.toLower

--- a/src/Gargantext/Text/Terms/Mono/Token/En.hs
+++ b/src/Gargantext/Text/Terms/Mono/Token/En.hs
 {-|
 Module      : Gargantext.Text.Ngrams.Token.Text
-Description : 
+Description : Tokenizer main functions
 Copyright   : (c) Grzegorz Chrupała first, after: CNRS, 2018-Present
 License     : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
@@ -87,7 +87,7 @@ defaultTokenizer =     whitespace
 uris :: Tokenizer
 uris x | isUri x = E [Left x]
       | True    = E [Right x]
-    where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]
+    where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:","https://"]

 -- | Split off initial and final punctuation
 punctuation :: Tokenizer