[refactor] more explicit type constructors

It's better to enumerate fields than not - easier to search later.

Also, a performance improvement for queryNgramsOccurrences (no VALUES).
parent 0f038a40
Pipeline #5829 passed with stages
......@@ -92,7 +92,8 @@ instance Hashable Ngrams
makeLenses ''Ngrams
instance PGS.ToRow Ngrams where
toRow (UnsafeNgrams t s) = [toField t, toField s]
toRow (UnsafeNgrams { .. }) = [ toField _ngramsTerms
, toField _ngramsSize ]
------------------------------------------------------------------------
-------------------------------------------------------------------------
......
......@@ -52,7 +52,7 @@ import Gargantext.Core.Text.Terms.Mono (monoTerms)
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Core.Text.Terms.Multi (multiterms)
import Gargantext.Core.Types ( TermsCount, POS, Terms(Terms), TermsWithCount )
import Gargantext.Core.Types ( TermsCount, POS, Terms(..), TermsWithCount )
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Database.Prelude (DBCmd)
import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
......@@ -60,6 +60,7 @@ import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag(..), insertNgr
import Gargantext.Database.Schema.Ngrams (text2ngrams, NgramsId)
import Gargantext.Prelude
data TermType lang
= Mono { _tt_lang :: !lang }
| Multi { _tt_lang :: !lang }
......@@ -124,15 +125,15 @@ class ExtractNgramsT h
-> DBCmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms l pa po (Terms ng1 ng2) =
enrichedTerms l pa po (Terms { .. }) =
NgramsPostag { _np_lang = l
, _np_algo = pa
, _np_postag = po
, _np_form = form
, _np_lem = lem }
where
form = text2ngrams $ Text.intercalate " " ng1
lem = text2ngrams $ Text.intercalate " " $ Set.toList ng2
form = text2ngrams $ Text.unwords _terms_label
lem = text2ngrams $ Text.unwords $ Set.toList _terms_stem
------------------------------------------------------------------------
cleanNgrams :: Int -> Ngrams -> Ngrams
......
......@@ -75,7 +75,7 @@ data Terms = Terms { _terms_label :: Label
, _terms_stem :: Stems
} deriving (Ord, Show)
instance Eq Terms where
(==) (Terms _ s1) (Terms _ s2) = s1 == s2
(==) (Terms { _terms_stem = s1 }) (Terms { _terms_stem = s2 }) = s1 == s2
type TermsCount = Int
......
......@@ -185,12 +185,10 @@ selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
( int
, toDBid NodeDocument
, cId
, Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
, DPS.In (unNgramsTerm <$> (List.take 10000 tms))
, cId
, toDBid nt
)
where
fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
......@@ -198,16 +196,19 @@ queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
JOIN nodes_contexts nn ON n.id = nn.context_id
WHERE n.typename = ?
AND nn.node_id = ?),
input_rows(terms) AS (?)
SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
JOIN ngrams ng ON cng.ngrams_id = ng.id
JOIN input_rows ir ON ir.terms = ng.terms
input_rows AS (
SELECT id, terms
FROM ngrams
WHERE terms IN ?
)
SELECT ir.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
JOIN input_rows ir ON cng.ngrams_id = ir.id
JOIN nodes_contexts nn ON nn.context_id = cng.context_id
JOIN nodes_sample n ON nn.context_id = n.id
WHERE nn.node_id = ? -- CorpusId
AND cng.ngrams_type = ? -- NgramsTypeId
AND nn.category > 0
GROUP BY cng.node_id, ng.terms
GROUP BY cng.node_id, ir.terms
|]
selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
......
......@@ -29,7 +29,7 @@ import Data.HashMap.Strict qualified as HashMap
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import Database.PostgreSQL.Simple qualified as PGS
import Gargantext.Core.Text.Ngrams (Ngrams, NgramsType)
import Gargantext.Core.Text.Ngrams (Ngrams(..), NgramsType)
import Gargantext.Database.Admin.Types.Node ( pgNodeId, CorpusId, ListId, DocId )
import Gargantext.Database.Prelude (runOpaQuery, formatPGSQuery, runPGSQuery, DBCmd)
import Gargantext.Database.Query.Join (leftJoin3)
......@@ -80,14 +80,15 @@ insertNgrams ns =
-- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
insertNgrams' :: [Ngrams] -> DBCmd err [Indexed Int Text]
insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns')
where
fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
ns' = (\n -> (_ngramsTerms n, _ngramsSize n)) <$> ns
fields = map (QualifiedIdentifier Nothing) ["text", "int4"]
_insertNgrams_Debug :: [(Text, Size)] -> DBCmd err ByteString
_insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
where
fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
fields = map (QualifiedIdentifier Nothing) ["text", "int4"]
----------------------
queryInsertNgrams :: PGS.Query
......
......@@ -53,14 +53,14 @@ type NgramsPostagInsert = ( Int
)
toInsert :: NgramsPostag -> NgramsPostagInsert
toInsert (NgramsPostag l a p form lem) =
( toDBid l
, toDBid a
, show p
, view ngramsTerms form
, view ngramsSize form
, view ngramsTerms lem
, view ngramsSize lem
toInsert (NgramsPostag { .. }) =
( toDBid _np_lang
, toDBid _np_algo
, show _np_postag
, view ngramsTerms _np_form
, view ngramsSize _np_form
, view ngramsTerms _np_lem
, view ngramsSize _np_lem
)
insertNgramsPostag :: [NgramsPostag] -> DBCmd err (HashMap Text NgramsId)
......@@ -154,17 +154,18 @@ SELECT terms,id FROM ins_form_ret
-- TODO add lang and postag algo
-- TODO remove when form == lem in insert
selectLems :: Lang -> NLPServerConfig -> [Ngrams] -> DBCmd err [(Form, Lem)]
selectLems l (NLPServerConfig { server }) ns = runPGSQuery querySelectLems (PGS.In (map _ngramsTerms ns), toDBid l, toDBid server)
selectLems l (NLPServerConfig { server }) ns =
runPGSQuery querySelectLems (PGS.In (_ngramsTerms <$> ns), toDBid l, toDBid server)
----------------------
querySelectLems :: PGS.Query
querySelectLems = [sql|
WITH
trms
AS (SELECT id, terms, n
AS (SELECT id, terms
FROM ngrams
WHERE terms IN ?)
, input_rows(lang_id, algo_id, terms,n)
AS (SELECT ? as lang_id, ? as algo_id, terms, n, id
AS (SELECT ? as lang_id, ? as algo_id, terms, id
FROM trms)
, lems AS ( select ir.terms as t1, n2.terms as t2, sum(np.score) as score from input_rows ir
JOIN ngrams_postag np ON np.ngrams_id = ir.id
......@@ -179,29 +180,29 @@ querySelectLems = [sql|
|]
-- | This is the same as 'selectLems', but slower.
selectLems' :: Lang -> NLPServerConfig -> [Ngrams] -> DBCmd err [(Form, Lem)]
selectLems' l (NLPServerConfig { server }) ns = runPGSQuery querySelectLems' (PGS.Only $ Values fields datas)
where
fields = map (QualifiedIdentifier Nothing) ["int4","int4","text", "int4"]
datas = map (\d -> [toField $ toDBid l, toField $ toDBid server] <> toRow d) ns
-- selectLems' :: Lang -> NLPServerConfig -> [Ngrams] -> DBCmd err [(Form, Lem)]
-- selectLems' l (NLPServerConfig { server }) ns = runPGSQuery querySelectLems' (PGS.Only $ Values fields datas)
-- where
-- fields = map (QualifiedIdentifier Nothing) ["int4","int4","text", "int4"]
-- datas = map (\d -> [toField $ toDBid l, toField $ toDBid server] <> toRow d) ns
querySelectLems' :: PGS.Query
querySelectLems' = [sql|
WITH input_rows(lang_id, algo_id, terms,n)
AS (?) -- ((VALUES ('automata' :: "text")))
, lems AS ( select n1.terms as t1 ,n2.terms as t2 ,sum(np.score) as score from input_rows ir
JOIN ngrams n1 ON ir.terms = n1.terms
JOIN ngrams_postag np ON np.ngrams_id = n1.id
JOIN ngrams n2 ON n2.id = np.lemm_id
WHERE np.lang_id = ir.lang_id
AND np.algo_id = ir.algo_id
GROUP BY n1.terms, n2.terms
ORDER BY score DESC
)
SELECT t1,t2 from lems
|]
-- querySelectLems' :: PGS.Query
-- querySelectLems' = [sql|
-- WITH input_rows(lang_id, algo_id, terms,n)
-- AS (?) -- ((VALUES ('automata' :: "text")))
-- , lems AS ( select n1.terms as t1 ,n2.terms as t2 ,sum(np.score) as score from input_rows ir
-- JOIN ngrams n1 ON ir.terms = n1.terms
-- JOIN ngrams_postag np ON np.ngrams_id = n1.id
-- JOIN ngrams n2 ON n2.id = np.lemm_id
-- WHERE np.lang_id = ir.lang_id
-- AND np.algo_id = ir.algo_id
-- GROUP BY n1.terms, n2.terms
-- ORDER BY score DESC
-- )
-- SELECT t1,t2 from lems
-- |]
-- | Insert Table
createTable_NgramsPostag :: DBCmd err [Int]
......
......@@ -39,6 +39,10 @@ import Gargantext.Prelude
type NgramsId = Int
type Size = Int
-- | Ngrams table
-- 'n' is the size, see G.D.Q.T.Ngrams -> insertNgrams'
-- function. I.e. ngrams with 1 term are of size 1, ngrams with 2
-- terms are of size 2 etc.
data NgramsPoly id terms n = NgramsDB { _ngrams_id :: !id
, _ngrams_terms :: !terms
, _ngrams_n :: !n
......@@ -90,7 +94,8 @@ instance PGS.ToRow Text where
toRow t = [toField t]
text2ngrams :: Text -> Ngrams
text2ngrams txt = UnsafeNgrams txt' $ length $ splitOn " " txt'
text2ngrams txt = UnsafeNgrams { _ngramsTerms = txt'
, _ngramsSize = length $ splitOn " " txt' }
where
txt' = strip txt
......
......@@ -21,7 +21,9 @@ import Gargantext.Database.Schema.Prelude
import Gargantext.Prelude
-- | Index memory of any type in Gargantext
-- | Index memory of any type in Gargantext.
-- I.e. given entity 'a', we use this type to mark that it has a DB id of type 'i'.
-- An un-indexed entity 'a' might not have been INSERT-ed yet to the DB.
data Indexed i a =
Indexed { _index :: !i
, _unIndex :: !a
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment