Some more refactorings; comments for code I didn't understand

parent 166f9c19
Pipeline #5855 passed with stages in 151 minutes and 22 seconds
@@ -65,7 +65,6 @@ common defaults
build-depends:
base >=4.7 && <5
optimization: 2
common optimized
ghc-options:
-O2
@@ -817,7 +816,6 @@ executable gargantext-server
, text ^>= 1.2.4.1
, unordered-containers ^>= 0.2.16.0
, vector ^>= 0.7.3
optimization: 2
executable gargantext-upgrade
import:
@@ -58,6 +58,11 @@ isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
-}
-- | Good value from users' requests and anthropological analysis
goodMapListSize :: Int
goodMapListSize = 350
-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: ( HasNodeStory env err m
, HasNLPServer env
@@ -71,7 +76,7 @@ buildNgramsLists :: ( HasNodeStory env err m
-> GroupParams
-> m (Map NgramsType [NgramsElement])
buildNgramsLists user uCid mCid mfslw gp = do
ngTerms <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
ngTerms <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize goodMapListSize)
othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
[ (Authors , MapListSize 9, MaxListSize 1000)
, (Sources , MapListSize 9, MaxListSize 1000)
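For orientation, the MapListSize / MaxListSize values above act as caps when ranked candidates are cut down to the final lists; a minimal sketch of that cap (takeMapList is hypothetical, not part of the codebase):

import Data.List (sortOn)
import Data.Ord (Down(..))
import Data.Text (Text)

-- Hypothetical sketch: keep only the k best-scoring candidates,
-- which is what a MapListSize of 350 (goodMapListSize) would bound.
takeMapList :: Int -> [(Text, Double)] -> [Text]
takeMapList k = map fst . take k . sortOn (Down . snd)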
@@ -195,6 +200,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
$ HashMap.filter (\g -> view gts'_score g > 1)
$ view flc_scores groupedWithList
-- | Split candidateTerms into mono-terms and multi-terms.
!(groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
-- void $ panicTrace $ "groupedWithList: " <> show groupedWithList
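As a reading aid: partitionWithKey sends entries satisfying the predicate to the first map, so single-word terms land in groupedMono (assuming the project's `size` counts words); a self-contained toy version:

import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import Data.Text (Text)
import qualified Data.Text as Text

-- Toy version of the mono/multi split above; Text.words stands in for
-- the project's `size`, assumed to count words.
partitionMono :: HashMap Text v -> (HashMap Text v, HashMap Text v)
partitionMono = HashMap.partitionWithKey (\t _ -> length (Text.words t) < 2)

-- ghci> partitionMono (HashMap.fromList [("cat", 1), ("wild cat", 2)])
-- (fromList [("cat",1)], fromList [("wild cat",2)])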
@@ -211,6 +217,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
!monoSize = 0.4 :: Double
!multSize = 1 - monoSize
-- | Splits given hashmap into 2 pieces, based on score
splitAt' n' ns = both (HashMap.fromListWith (<>))
$ List.splitAt (round $ n' * listSizeGlobal)
$ List.sortOn (viewScore . snd)
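The helper cuts the association list, sorted ascending by score, at a fraction of the target list size and rebuilds a map on each side; an approximation under assumed names (the original uses fromListWith (<>) because its values are semigroups):

import Data.Bifunctor (bimap)
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import qualified Data.List as List
import Data.Text (Text)

-- Approximation of splitAt': sort ascending by score, cut after
-- round (frac * total) elements, and collect each side into a map.
splitByScore :: Double -> Double
             -> [(Text, Double)]
             -> (HashMap Text Double, HashMap Text Double)
splitByScore frac total =
    bimap HashMap.fromList HashMap.fromList
  . List.splitAt (round (frac * total))
  . List.sortOn snd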
@@ -254,8 +261,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSi
]
where
mapStemNodeIds = HashMap.toList
$ HashMap.map viewScores
$ groupedTreeScores_SetNodeId
$ HashMap.map viewScores groupedTreeScores_SetNodeId
let
-- computing scores
mapScores f = HashMap.fromList
@@ -14,14 +14,15 @@ TFICF is a generalization of [TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)
-}
module Gargantext.Core.Text.Metrics.TFICF ( TFICF
, TficfContext(..)
, Total(..)
, Count(..)
, tficf
, sortTficf
)
where
module Gargantext.Core.Text.Metrics.TFICF
( TFICF
, TficfContext(..)
, Total(..)
, Count(..)
, tficf
, sortTficf
)
where
import Data.List qualified as List
import Data.Map.Strict (toList)
@@ -34,12 +35,19 @@ path = "[G.T.Metrics.TFICF]"
type TFICF = Double
-- https://www.researchgate.net/publication/221226686_TF-ICF_A_New_Term_Weighting_Scheme_for_Clustering_Dynamic_Data_Streams
-- TficfSupra n m
-- - m is the total number of documents in the corpus
-- - n is the number of documents in which the given term occurred more than once
-- TficfInfra n m
-- -
data TficfContext n m = TficfInfra n m
| TficfSupra n m
deriving (Show)
data Total = Total {unTotal :: !Double}
data Count = Count {unCount :: !Double}
newtype Total = Total { unTotal :: Double }
newtype Count = Count { unCount :: Double }
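Switching from data with a strict field to newtype removes the wrapper at runtime, which also makes conversions free via coerce; a small illustration (totals is hypothetical):

import Data.Coerce (coerce)

-- A newtype shares its runtime representation with the wrapped Double,
-- so a whole list can be converted without allocation:
totals :: [Double] -> [Total]
totals = coerce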
tficf :: TficfContext Count Total
-> TficfContext Count Total
@@ -50,7 +58,11 @@ tficf (TficfInfra (Count ic) (Total it) )
| otherwise = panicTrace
$ "[ERR]"
<> path
<> " Frequency impossible"
<> " Frequency impossible: "
<> "ic = " <> show ic
<> ", it = " <> show it
<> ", sc = " <> show sc
<> ", st = " <> show st
tficf _ _ = panicTrace $ "[ERR]" <> path <> " Undefined for these contexts"
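As a reading aid for the counts in the guards above, TF-ICF weighs a term's local frequency against its global spread; a toy formulation in the spirit of the linked paper (an assumption for illustration, not necessarily the exact formula used by tficf):

-- Toy TF-ICF: local term frequency times log-inverse global frequency.
-- ic / it come from the Infra (local) context,
-- sc / st come from the Supra (global) context.
tficfToy :: Double -> Double -> Double -> Double -> Double
tficfToy ic it sc st = (ic / it) * log (st / sc)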
@@ -114,7 +114,7 @@ extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pat
--------------------------------------------------------------------------
addSpaces :: Text -> Text
addSpaces = (Text.intercalate " ") . (Text.chunksOf 1)
addSpaces = Text.unwords . (Text.chunksOf 1)
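Both versions interleave a single space between characters, since unwords is intercalate " "; for example:

-- ghci> addSpaces "abc"
-- "a b c"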
--------------------------------------------------------------------------
@@ -185,10 +185,13 @@ selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
( int
, toDBid NodeDocument
, cId
-- , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
, DPS.In (unNgramsTerm <$> (List.take 10000 tms))
, cId
, toDBid nt
)
-- where
-- fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
@@ -211,6 +214,27 @@ queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
GROUP BY cng.node_id, ir.terms
|]
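The switch from a VALUES join to DPS.In passes the (up to 10000) terms as a single IN (...) parameter; a minimal standalone sketch of that postgresql-simple pattern (connection string and query are hypothetical):

{-# LANGUAGE OverloadedStrings #-}

import Data.Text (Text)
import Database.PostgreSQL.Simple (Only (..), connectPostgreSQL, query)
import Database.PostgreSQL.Simple.Types (In (..))

-- `In xs` renders as a parenthesised list, e.g. IN ('brown fox','lazy dog').
example :: IO [Only Text]
example = do
  conn <- connectPostgreSQL "dbname=gargantext"  -- hypothetical DSN
  query conn "SELECT terms FROM ngrams WHERE terms IN ?"
        (Only (In ["brown fox" :: Text, "lazy dog"]))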
-- queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
-- queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
-- WITH nodes_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
-- JOIN nodes_contexts nc ON c.id = nc.context_id
-- WHERE c.typename = ?
-- AND nc.node_id = ?),
-- input_rows(terms) AS (?)
-- SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
-- JOIN ngrams ng ON cng.ngrams_id = ng.id
-- JOIN input_rows ir ON ir.terms = ng.terms
-- JOIN nodes_contexts nc ON nc.context_id = cng.context_id
-- JOIN nodes_sample ns ON nc.context_id = ns.id
-- WHERE nc.node_id = ? -- CorpusId
-- AND cng.ngrams_type = ? -- NgramsTypeId
-- AND nc.category > 0
-- -- AND nc.context_id IN (SELECT id FROM nodes_sample)
-- GROUP BY cng.node_id, ng.terms
-- |]
selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
=> CorpusId
-> Int
@@ -70,6 +70,9 @@ getTficf_withSample cId mId nt = do
<$> getOccByNgramsOnlyFast_withSample mId countGlobal nt
(HM.keys mapTextDoubleLocal)
printDebug "[getTficf_withSample] mapTextDoubleLocal: " mapTextDoubleLocal
printDebug "[getTficf_withSample] mapTextDoubleGlobal: " mapTextDoubleGlobal
--printDebug "getTficf_withSample" (mapTextDoubleLocal, mapTextDoubleGlobal, countLocal, countGlobal)
pure $ HM.mapWithKey (\t n ->
tficf (TficfInfra (Count n )
@@ -9,7 +9,6 @@ Portability : POSIX
-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE ConstraintKinds, ScopedTypeVariables #-}
{-# LANGUAGE LambdaCase #-}
@@ -210,9 +209,9 @@ fromField' field mb = do
valueToHyperdata v = case fromJSON v of
Success a -> pure a
Error _err -> returnError ConversionFailed field
$ DL.intercalate " " [ "cannot parse hyperdata for JSON: "
, show v
]
$ DL.unwords [ "cannot parse hyperdata for JSON: "
, show v
]
printSqlOpa :: Default Unpackspec a a => Select a -> IO ()
printSqlOpa = putStrLn . maybe "Empty query" identity . showSql
@@ -164,7 +164,7 @@ querySelectLems = [sql|
AS (SELECT id, terms
FROM ngrams
WHERE terms IN ?)
, input_rows(lang_id, algo_id, terms,n)
, input_rows
AS (SELECT ? as lang_id, ? as algo_id, terms, id
FROM trms)
, lems AS ( select ir.terms as t1, n2.terms as t2, sum(np.score) as score from input_rows ir
@@ -317,6 +317,15 @@ nodeContextsScore inputData = map (\(PGS.Only a) -> a)
------------------------------------------------------------------------
-- | Counts the number of documents in a corpus.
-- Also applies a filter requiring category to be at least 1 (i.e. not in the trash).
-- select count(*)
-- from contexts c
-- join nodes_contexts nc on c.id = nc.context_id
-- where
-- nc.node_id = 88
-- and nc.category >= 1
-- and c.typename = 4
selectCountDocs :: HasDBid NodeType => CorpusId -> DBCmd err Int
selectCountDocs cId = runCountOpaQuery (queryCountDocs cId)
where