[refactor] more explicit type constructors

It's better to enumerate fields than not - easier to search later. Also, a performance improvement for queryNgramsOccurrences (no VALUES).

[refactor] more explicit type constructors
It's better to enumerate fields than not - easier to search later. Also, a performance improvement for queryNgramsOccurrences (no VALUES).
7f8a865a · Przemyslaw Kaminski · 0f038a40 · 7f8a865a · 7f8a865a · 7f8a865a
Verified Commit 7f8a865a authored Mar 27, 2024 by Przemyslaw Kaminski
8 changed files
--- a/src/Gargantext/Core/Text/Ngrams.hs
+++ b/src/Gargantext/Core/Text/Ngrams.hs
@@ -92,7 +92,8 @@ instance Hashable Ngrams

 makeLenses ''Ngrams
 instance PGS.ToRow Ngrams where
-  toRow (UnsafeNgrams t s) = [toField t, toField s]
+  toRow (UnsafeNgrams { .. }) = [ toField _ngramsTerms
+                                , toField _ngramsSize ]

 ------------------------------------------------------------------------
 -------------------------------------------------------------------------

--- a/src/Gargantext/Core/Text/Terms.hs
+++ b/src/Gargantext/Core/Text/Terms.hs
@@ -52,7 +52,7 @@ import Gargantext.Core.Text.Terms.Mono  (monoTerms)
 import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
 import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
 import Gargantext.Core.Text.Terms.Multi (multiterms)
-import Gargantext.Core.Types ( TermsCount, POS, Terms(Terms), TermsWithCount )
+import Gargantext.Core.Types ( TermsCount, POS, Terms(..), TermsWithCount )
 import Gargantext.Core.Utils (groupWithCounts)
 import Gargantext.Database.Prelude (DBCmd)
 import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
@@ -60,6 +60,7 @@ import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag(..), insertNgr
 import Gargantext.Database.Schema.Ngrams (text2ngrams, NgramsId)
 import Gargantext.Prelude

+
 data TermType lang
  = Mono      { _tt_lang :: !lang }
  | Multi     { _tt_lang :: !lang }
@@ -124,15 +125,15 @@ class ExtractNgramsT h
                   -> DBCmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
 ------------------------------------------------------------------------
 enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
-enrichedTerms l pa po (Terms ng1 ng2) =
+enrichedTerms l pa po (Terms { .. }) =
  NgramsPostag { _np_lang = l
               , _np_algo = pa
               , _np_postag = po
               , _np_form = form
               , _np_lem = lem }
    where
-      form = text2ngrams $ Text.intercalate " " ng1
-      lem  = text2ngrams $ Text.intercalate " " $ Set.toList ng2
+      form = text2ngrams $ Text.unwords _terms_label
+      lem  = text2ngrams $ Text.unwords $ Set.toList _terms_stem

 ------------------------------------------------------------------------
 cleanNgrams :: Int -> Ngrams -> Ngrams

--- a/src/Gargantext/Core/Types.hs
+++ b/src/Gargantext/Core/Types.hs
@@ -75,7 +75,7 @@ data Terms = Terms { _terms_label :: Label
                   , _terms_stem  :: Stems
                   } deriving (Ord, Show)
 instance Eq Terms where
-  (==) (Terms _ s1) (Terms _ s2) = s1 == s2
+  (==) (Terms { _terms_stem = s1 }) (Terms { _terms_stem = s2 }) = s1 == s2

 type TermsCount = Int


--- a/src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
+++ b/src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
@@ -185,12 +185,10 @@ selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
                ( int
                , toDBid NodeDocument
                , cId
-                , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
+                , DPS.In (unNgramsTerm <$> (List.take 10000 tms))
                , cId
                , toDBid nt
                )
-    where
-      fields = [QualifiedIdentifier Nothing "text"]

 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
@@ -198,16 +196,19 @@ queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
                          JOIN nodes_contexts nn ON n.id = nn.context_id
                            WHERE n.typename  = ?
                            AND nn.node_id = ?),
-       input_rows(terms) AS (?)
-  SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
-    JOIN ngrams ng      ON cng.ngrams_id = ng.id
-    JOIN input_rows  ir ON ir.terms      = ng.terms
+       input_rows AS (
+           SELECT id, terms
+             FROM ngrams
+             WHERE terms IN ?
+        )
+  SELECT ir.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
+    JOIN input_rows  ir ON cng.ngrams_id = ir.id
    JOIN nodes_contexts nn ON nn.context_id   = cng.context_id
    JOIN nodes_sample n ON nn.context_id   = n.id
    WHERE nn.node_id      = ? -- CorpusId
      AND cng.ngrams_type = ? -- NgramsTypeId
      AND nn.category     > 0
-      GROUP BY cng.node_id, ng.terms
+      GROUP BY cng.node_id, ir.terms
  |]

 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType

--- a/src/Gargantext/Database/Query/Table/Ngrams.hs
+++ b/src/Gargantext/Database/Query/Table/Ngrams.hs
@@ -29,7 +29,7 @@ import Data.HashMap.Strict qualified as HashMap
 import Data.List qualified as List
 import Data.Map.Strict qualified as Map
 import Database.PostgreSQL.Simple qualified as PGS
-import Gargantext.Core.Text.Ngrams (Ngrams, NgramsType)
+import Gargantext.Core.Text.Ngrams (Ngrams(..), NgramsType)
 import Gargantext.Database.Admin.Types.Node ( pgNodeId, CorpusId, ListId, DocId )
 import Gargantext.Database.Prelude (runOpaQuery, formatPGSQuery, runPGSQuery, DBCmd)
 import Gargantext.Database.Query.Join (leftJoin3)
@@ -80,14 +80,15 @@ insertNgrams ns =

 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
 insertNgrams' :: [Ngrams] -> DBCmd err [Indexed Int Text]
-insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
+insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns')
  where
-    fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
+    ns' = (\n -> (_ngramsTerms n, _ngramsSize n)) <$> ns
+    fields = map (QualifiedIdentifier Nothing) ["text", "int4"]

 _insertNgrams_Debug :: [(Text, Size)] -> DBCmd err ByteString
 _insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
  where
-    fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
+    fields = map (QualifiedIdentifier Nothing) ["text", "int4"]

 ----------------------
 queryInsertNgrams :: PGS.Query

--- a/src/Gargantext/Database/Query/Table/NgramsPostag.hs
+++ b/src/Gargantext/Database/Query/Table/NgramsPostag.hs
@@ -53,14 +53,14 @@ type NgramsPostagInsert = ( Int
                          )

 toInsert :: NgramsPostag -> NgramsPostagInsert
-toInsert (NgramsPostag l a p form lem) =
-  ( toDBid l
-  , toDBid a
-  , show p
-  , view ngramsTerms form
-  , view ngramsSize  form
-  , view ngramsTerms lem
-  , view ngramsSize  lem
+toInsert (NgramsPostag { .. }) =
+  ( toDBid _np_lang
+  , toDBid _np_algo
+  , show _np_postag
+  , view ngramsTerms _np_form
+  , view ngramsSize  _np_form
+  , view ngramsTerms _np_lem
+  , view ngramsSize  _np_lem
  )

 insertNgramsPostag :: [NgramsPostag] -> DBCmd err (HashMap Text NgramsId)
@@ -154,17 +154,18 @@ SELECT terms,id FROM ins_form_ret
 -- TODO add lang and postag algo
 -- TODO remove when form == lem in insert
 selectLems :: Lang -> NLPServerConfig -> [Ngrams] -> DBCmd err [(Form, Lem)]
-selectLems l (NLPServerConfig { server }) ns = runPGSQuery querySelectLems (PGS.In (map _ngramsTerms ns), toDBid l, toDBid server)
+selectLems l (NLPServerConfig { server }) ns =
+  runPGSQuery querySelectLems (PGS.In (_ngramsTerms <$> ns), toDBid l, toDBid server)
 ----------------------
 querySelectLems :: PGS.Query
 querySelectLems = [sql|
  WITH
     trms
-       AS (SELECT id, terms, n
+       AS (SELECT id, terms
            FROM ngrams
            WHERE terms IN ?)
   , input_rows(lang_id, algo_id, terms,n)
-    AS (SELECT ? as lang_id, ? as algo_id, terms, n, id
+    AS (SELECT ? as lang_id, ? as algo_id, terms, id
         FROM trms)
    , lems AS ( select ir.terms as t1, n2.terms as t2, sum(np.score) as score from input_rows ir
    JOIN ngrams_postag np ON np.ngrams_id = ir.id
@@ -179,29 +180,29 @@ querySelectLems = [sql|
  |]

 -- | This is the same as 'selectLems', but slower.
-selectLems' :: Lang -> NLPServerConfig -> [Ngrams] -> DBCmd err [(Form, Lem)]
-selectLems' l (NLPServerConfig { server }) ns = runPGSQuery querySelectLems' (PGS.Only $ Values fields datas)
-  where
-    fields = map (QualifiedIdentifier Nothing) ["int4","int4","text", "int4"]
-    datas  = map (\d -> [toField $ toDBid l, toField $ toDBid server] <> toRow d) ns
+-- selectLems' :: Lang -> NLPServerConfig -> [Ngrams] -> DBCmd err [(Form, Lem)]
+-- selectLems' l (NLPServerConfig { server }) ns = runPGSQuery querySelectLems' (PGS.Only $ Values fields datas)
+--   where
+--     fields = map (QualifiedIdentifier Nothing) ["int4","int4","text", "int4"]
+--     datas  = map (\d -> [toField $ toDBid l, toField $ toDBid server] <> toRow d) ns

    
-querySelectLems' :: PGS.Query
-querySelectLems' = [sql|
-  WITH input_rows(lang_id, algo_id, terms,n)
-    AS (?) -- ((VALUES ('automata' :: "text")))
-    , lems AS ( select n1.terms as t1 ,n2.terms as t2 ,sum(np.score) as score from input_rows ir
-    JOIN ngrams        n1 ON ir.terms = n1.terms
-    JOIN ngrams_postag np ON np.ngrams_id = n1.id
-    JOIN ngrams        n2 ON n2.id    = np.lemm_id
-    WHERE np.lang_id = ir.lang_id
-      AND np.algo_id = ir.algo_id
-    GROUP BY n1.terms, n2.terms
-    ORDER BY score DESC
-  )
-
-  SELECT t1,t2 from lems
-  |]
+-- querySelectLems' :: PGS.Query
+-- querySelectLems' = [sql|
+--   WITH input_rows(lang_id, algo_id, terms,n)
+--     AS (?) -- ((VALUES ('automata' :: "text")))
+--     , lems AS ( select n1.terms as t1 ,n2.terms as t2 ,sum(np.score) as score from input_rows ir
+--     JOIN ngrams        n1 ON ir.terms = n1.terms
+--     JOIN ngrams_postag np ON np.ngrams_id = n1.id
+--     JOIN ngrams        n2 ON n2.id    = np.lemm_id
+--     WHERE np.lang_id = ir.lang_id
+--       AND np.algo_id = ir.algo_id
+--     GROUP BY n1.terms, n2.terms
+--     ORDER BY score DESC
+--   )
+
+--   SELECT t1,t2 from lems
+--   |]

 -- | Insert Table
 createTable_NgramsPostag :: DBCmd err [Int]

--- a/src/Gargantext/Database/Schema/Ngrams.hs
+++ b/src/Gargantext/Database/Schema/Ngrams.hs
@@ -39,6 +39,10 @@ import Gargantext.Prelude
 type NgramsId  = Int
 type Size      = Int

+-- | Ngrams table
+-- 'n' is the size, see G.D.Q.T.Ngrams -> insertNgrams'
+-- function. I.e. ngrams with 1 term are of size 1, ngrams with 2
+-- terms are of size 2 etc.
 data NgramsPoly id terms n = NgramsDB { _ngrams_id    :: !id
                                      , _ngrams_terms :: !terms
                                      , _ngrams_n     :: !n
@@ -90,7 +94,8 @@ instance PGS.ToRow Text where
  toRow t = [toField t]

 text2ngrams :: Text -> Ngrams
-text2ngrams txt = UnsafeNgrams txt' $ length $ splitOn " " txt'
+text2ngrams txt = UnsafeNgrams { _ngramsTerms = txt'
+                               , _ngramsSize = length $ splitOn " " txt' }
  where
    txt' = strip txt


--- a/src/Gargantext/Database/Types.hs
+++ b/src/Gargantext/Database/Types.hs
@@ -21,7 +21,9 @@ import Gargantext.Database.Schema.Prelude
 import Gargantext.Prelude


-- | Index memory of any type in Gargantext
+-- | Index memory of any type in Gargantext.
+-- I.e. given entity 'a', we use this type to mark that it has a DB id of type 'i'.
+-- An un-indexed entity 'a' might not have been INSERT-ed yet to the DB.
 data Indexed i a =
  Indexed { _index     :: !i
          , _unIndex   :: !a