[refactor] more explicit type constructors

It's better to enumerate fields than not - easier to search later.

Also, a performance improvement for queryNgramsOccurrences (no VALUES).
parent 0f038a40
Pipeline #5829 passed with stages
...@@ -92,7 +92,8 @@ instance Hashable Ngrams ...@@ -92,7 +92,8 @@ instance Hashable Ngrams
makeLenses ''Ngrams makeLenses ''Ngrams
instance PGS.ToRow Ngrams where instance PGS.ToRow Ngrams where
toRow (UnsafeNgrams t s) = [toField t, toField s] toRow (UnsafeNgrams { .. }) = [ toField _ngramsTerms
, toField _ngramsSize ]
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------- -------------------------------------------------------------------------
......
...@@ -52,7 +52,7 @@ import Gargantext.Core.Text.Terms.Mono (monoTerms) ...@@ -52,7 +52,7 @@ import Gargantext.Core.Text.Terms.Mono (monoTerms)
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..)) import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize) import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Core.Text.Terms.Multi (multiterms) import Gargantext.Core.Text.Terms.Multi (multiterms)
import Gargantext.Core.Types ( TermsCount, POS, Terms(Terms), TermsWithCount ) import Gargantext.Core.Types ( TermsCount, POS, Terms(..), TermsWithCount )
import Gargantext.Core.Utils (groupWithCounts) import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Database.Prelude (DBCmd) import Gargantext.Database.Prelude (DBCmd)
import Gargantext.Database.Query.Table.Ngrams (insertNgrams) import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
...@@ -60,6 +60,7 @@ import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag(..), insertNgr ...@@ -60,6 +60,7 @@ import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag(..), insertNgr
import Gargantext.Database.Schema.Ngrams (text2ngrams, NgramsId) import Gargantext.Database.Schema.Ngrams (text2ngrams, NgramsId)
import Gargantext.Prelude import Gargantext.Prelude
data TermType lang data TermType lang
= Mono { _tt_lang :: !lang } = Mono { _tt_lang :: !lang }
| Multi { _tt_lang :: !lang } | Multi { _tt_lang :: !lang }
...@@ -124,15 +125,15 @@ class ExtractNgramsT h ...@@ -124,15 +125,15 @@ class ExtractNgramsT h
-> DBCmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount)) -> DBCmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
------------------------------------------------------------------------ ------------------------------------------------------------------------
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms l pa po (Terms ng1 ng2) = enrichedTerms l pa po (Terms { .. }) =
NgramsPostag { _np_lang = l NgramsPostag { _np_lang = l
, _np_algo = pa , _np_algo = pa
, _np_postag = po , _np_postag = po
, _np_form = form , _np_form = form
, _np_lem = lem } , _np_lem = lem }
where where
form = text2ngrams $ Text.intercalate " " ng1 form = text2ngrams $ Text.unwords _terms_label
lem = text2ngrams $ Text.intercalate " " $ Set.toList ng2 lem = text2ngrams $ Text.unwords $ Set.toList _terms_stem
------------------------------------------------------------------------ ------------------------------------------------------------------------
cleanNgrams :: Int -> Ngrams -> Ngrams cleanNgrams :: Int -> Ngrams -> Ngrams
......
...@@ -75,7 +75,7 @@ data Terms = Terms { _terms_label :: Label ...@@ -75,7 +75,7 @@ data Terms = Terms { _terms_label :: Label
, _terms_stem :: Stems , _terms_stem :: Stems
} deriving (Ord, Show) } deriving (Ord, Show)
instance Eq Terms where instance Eq Terms where
(==) (Terms _ s1) (Terms _ s2) = s1 == s2 (==) (Terms { _terms_stem = s1 }) (Terms { _terms_stem = s2 }) = s1 == s2
type TermsCount = Int type TermsCount = Int
......
...@@ -185,12 +185,10 @@ selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms = ...@@ -185,12 +185,10 @@ selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
( int ( int
, toDBid NodeDocument , toDBid NodeDocument
, cId , cId
, Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms)) , DPS.In (unNgramsTerm <$> (List.take 10000 tms))
, cId , cId
, toDBid nt , toDBid nt
) )
where
fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
queryNgramsOccurrencesOnlyByContextUser_withSample = [sql| queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
...@@ -198,16 +196,19 @@ queryNgramsOccurrencesOnlyByContextUser_withSample = [sql| ...@@ -198,16 +196,19 @@ queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
JOIN nodes_contexts nn ON n.id = nn.context_id JOIN nodes_contexts nn ON n.id = nn.context_id
WHERE n.typename = ? WHERE n.typename = ?
AND nn.node_id = ?), AND nn.node_id = ?),
input_rows(terms) AS (?) input_rows AS (
SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng SELECT id, terms
JOIN ngrams ng ON cng.ngrams_id = ng.id FROM ngrams
JOIN input_rows ir ON ir.terms = ng.terms WHERE terms IN ?
)
SELECT ir.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
JOIN input_rows ir ON cng.ngrams_id = ir.id
JOIN nodes_contexts nn ON nn.context_id = cng.context_id JOIN nodes_contexts nn ON nn.context_id = cng.context_id
JOIN nodes_sample n ON nn.context_id = n.id JOIN nodes_sample n ON nn.context_id = n.id
WHERE nn.node_id = ? -- CorpusId WHERE nn.node_id = ? -- CorpusId
AND cng.ngrams_type = ? -- NgramsTypeId AND cng.ngrams_type = ? -- NgramsTypeId
AND nn.category > 0 AND nn.category > 0
GROUP BY cng.node_id, ng.terms GROUP BY cng.node_id, ir.terms
|] |]
selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
......
...@@ -29,7 +29,7 @@ import Data.HashMap.Strict qualified as HashMap ...@@ -29,7 +29,7 @@ import Data.HashMap.Strict qualified as HashMap
import Data.List qualified as List import Data.List qualified as List
import Data.Map.Strict qualified as Map import Data.Map.Strict qualified as Map
import Database.PostgreSQL.Simple qualified as PGS import Database.PostgreSQL.Simple qualified as PGS
import Gargantext.Core.Text.Ngrams (Ngrams, NgramsType) import Gargantext.Core.Text.Ngrams (Ngrams(..), NgramsType)
import Gargantext.Database.Admin.Types.Node ( pgNodeId, CorpusId, ListId, DocId ) import Gargantext.Database.Admin.Types.Node ( pgNodeId, CorpusId, ListId, DocId )
import Gargantext.Database.Prelude (runOpaQuery, formatPGSQuery, runPGSQuery, DBCmd) import Gargantext.Database.Prelude (runOpaQuery, formatPGSQuery, runPGSQuery, DBCmd)
import Gargantext.Database.Query.Join (leftJoin3) import Gargantext.Database.Query.Join (leftJoin3)
...@@ -80,14 +80,15 @@ insertNgrams ns = ...@@ -80,14 +80,15 @@ insertNgrams ns =
-- TODO-ACCESS: access must not be checked here but when insertNgrams' is called. -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
insertNgrams' :: [Ngrams] -> DBCmd err [Indexed Int Text] insertNgrams' :: [Ngrams] -> DBCmd err [Indexed Int Text]
insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns) insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns')
where where
fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"] ns' = (\n -> (_ngramsTerms n, _ngramsSize n)) <$> ns
fields = map (QualifiedIdentifier Nothing) ["text", "int4"]
_insertNgrams_Debug :: [(Text, Size)] -> DBCmd err ByteString _insertNgrams_Debug :: [(Text, Size)] -> DBCmd err ByteString
_insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns) _insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
where where
fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"] fields = map (QualifiedIdentifier Nothing) ["text", "int4"]
---------------------- ----------------------
queryInsertNgrams :: PGS.Query queryInsertNgrams :: PGS.Query
......
...@@ -53,14 +53,14 @@ type NgramsPostagInsert = ( Int ...@@ -53,14 +53,14 @@ type NgramsPostagInsert = ( Int
) )
toInsert :: NgramsPostag -> NgramsPostagInsert toInsert :: NgramsPostag -> NgramsPostagInsert
toInsert (NgramsPostag l a p form lem) = toInsert (NgramsPostag { .. }) =
( toDBid l ( toDBid _np_lang
, toDBid a , toDBid _np_algo
, show p , show _np_postag
, view ngramsTerms form , view ngramsTerms _np_form
, view ngramsSize form , view ngramsSize _np_form
, view ngramsTerms lem , view ngramsTerms _np_lem
, view ngramsSize lem , view ngramsSize _np_lem
) )
insertNgramsPostag :: [NgramsPostag] -> DBCmd err (HashMap Text NgramsId) insertNgramsPostag :: [NgramsPostag] -> DBCmd err (HashMap Text NgramsId)
...@@ -154,17 +154,18 @@ SELECT terms,id FROM ins_form_ret ...@@ -154,17 +154,18 @@ SELECT terms,id FROM ins_form_ret
-- TODO add lang and postag algo -- TODO add lang and postag algo
-- TODO remove when form == lem in insert -- TODO remove when form == lem in insert
selectLems :: Lang -> NLPServerConfig -> [Ngrams] -> DBCmd err [(Form, Lem)] selectLems :: Lang -> NLPServerConfig -> [Ngrams] -> DBCmd err [(Form, Lem)]
selectLems l (NLPServerConfig { server }) ns = runPGSQuery querySelectLems (PGS.In (map _ngramsTerms ns), toDBid l, toDBid server) selectLems l (NLPServerConfig { server }) ns =
runPGSQuery querySelectLems (PGS.In (_ngramsTerms <$> ns), toDBid l, toDBid server)
---------------------- ----------------------
querySelectLems :: PGS.Query querySelectLems :: PGS.Query
querySelectLems = [sql| querySelectLems = [sql|
WITH WITH
trms trms
AS (SELECT id, terms, n AS (SELECT id, terms
FROM ngrams FROM ngrams
WHERE terms IN ?) WHERE terms IN ?)
, input_rows(lang_id, algo_id, terms,n) , input_rows(lang_id, algo_id, terms,n)
AS (SELECT ? as lang_id, ? as algo_id, terms, n, id AS (SELECT ? as lang_id, ? as algo_id, terms, id
FROM trms) FROM trms)
, lems AS ( select ir.terms as t1, n2.terms as t2, sum(np.score) as score from input_rows ir , lems AS ( select ir.terms as t1, n2.terms as t2, sum(np.score) as score from input_rows ir
JOIN ngrams_postag np ON np.ngrams_id = ir.id JOIN ngrams_postag np ON np.ngrams_id = ir.id
...@@ -179,29 +180,29 @@ querySelectLems = [sql| ...@@ -179,29 +180,29 @@ querySelectLems = [sql|
|] |]
-- | This is the same as 'selectLems', but slower. -- | This is the same as 'selectLems', but slower.
selectLems' :: Lang -> NLPServerConfig -> [Ngrams] -> DBCmd err [(Form, Lem)] -- selectLems' :: Lang -> NLPServerConfig -> [Ngrams] -> DBCmd err [(Form, Lem)]
selectLems' l (NLPServerConfig { server }) ns = runPGSQuery querySelectLems' (PGS.Only $ Values fields datas) -- selectLems' l (NLPServerConfig { server }) ns = runPGSQuery querySelectLems' (PGS.Only $ Values fields datas)
where -- where
fields = map (QualifiedIdentifier Nothing) ["int4","int4","text", "int4"] -- fields = map (QualifiedIdentifier Nothing) ["int4","int4","text", "int4"]
datas = map (\d -> [toField $ toDBid l, toField $ toDBid server] <> toRow d) ns -- datas = map (\d -> [toField $ toDBid l, toField $ toDBid server] <> toRow d) ns
querySelectLems' :: PGS.Query -- querySelectLems' :: PGS.Query
querySelectLems' = [sql| -- querySelectLems' = [sql|
WITH input_rows(lang_id, algo_id, terms,n) -- WITH input_rows(lang_id, algo_id, terms,n)
AS (?) -- ((VALUES ('automata' :: "text"))) -- AS (?) -- ((VALUES ('automata' :: "text")))
, lems AS ( select n1.terms as t1 ,n2.terms as t2 ,sum(np.score) as score from input_rows ir -- , lems AS ( select n1.terms as t1 ,n2.terms as t2 ,sum(np.score) as score from input_rows ir
JOIN ngrams n1 ON ir.terms = n1.terms -- JOIN ngrams n1 ON ir.terms = n1.terms
JOIN ngrams_postag np ON np.ngrams_id = n1.id -- JOIN ngrams_postag np ON np.ngrams_id = n1.id
JOIN ngrams n2 ON n2.id = np.lemm_id -- JOIN ngrams n2 ON n2.id = np.lemm_id
WHERE np.lang_id = ir.lang_id -- WHERE np.lang_id = ir.lang_id
AND np.algo_id = ir.algo_id -- AND np.algo_id = ir.algo_id
GROUP BY n1.terms, n2.terms -- GROUP BY n1.terms, n2.terms
ORDER BY score DESC -- ORDER BY score DESC
) -- )
SELECT t1,t2 from lems -- SELECT t1,t2 from lems
|] -- |]
-- | Insert Table -- | Insert Table
createTable_NgramsPostag :: DBCmd err [Int] createTable_NgramsPostag :: DBCmd err [Int]
......
...@@ -39,6 +39,10 @@ import Gargantext.Prelude ...@@ -39,6 +39,10 @@ import Gargantext.Prelude
type NgramsId = Int type NgramsId = Int
type Size = Int type Size = Int
-- | Ngrams table
-- 'n' is the size, see G.D.Q.T.Ngrams -> insertNgrams'
-- function. I.e. ngrams with 1 term are of size 1, ngrams with 2
-- terms are of size 2 etc.
data NgramsPoly id terms n = NgramsDB { _ngrams_id :: !id data NgramsPoly id terms n = NgramsDB { _ngrams_id :: !id
, _ngrams_terms :: !terms , _ngrams_terms :: !terms
, _ngrams_n :: !n , _ngrams_n :: !n
...@@ -90,7 +94,8 @@ instance PGS.ToRow Text where ...@@ -90,7 +94,8 @@ instance PGS.ToRow Text where
toRow t = [toField t] toRow t = [toField t]
text2ngrams :: Text -> Ngrams text2ngrams :: Text -> Ngrams
text2ngrams txt = UnsafeNgrams txt' $ length $ splitOn " " txt' text2ngrams txt = UnsafeNgrams { _ngramsTerms = txt'
, _ngramsSize = length $ splitOn " " txt' }
where where
txt' = strip txt txt' = strip txt
......
...@@ -21,7 +21,9 @@ import Gargantext.Database.Schema.Prelude ...@@ -21,7 +21,9 @@ import Gargantext.Database.Schema.Prelude
import Gargantext.Prelude import Gargantext.Prelude
-- | Index memory of any type in Gargantext -- | Index memory of any type in Gargantext.
-- I.e. given entity 'a', we use this type to mark that it has a DB id of type 'i'.
-- An un-indexed entity 'a' might not have been INSERT-ed yet to the DB.
data Indexed i a = data Indexed i a =
Indexed { _index :: !i Indexed { _index :: !i
, _unIndex :: !a , _unIndex :: !a
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment