Commit f8927c17 authored by delanoe

[OPTI] Subqueries are not optimized. Ok for local tests if you prefer the...

[OPTI] Subqueries are not optimized. They are OK for local tests if you prefer the syntax, but for production only inner joins are accepted. Thanks.
parent fd66d6d5
...@@ -103,36 +103,42 @@ def compute_coocs( corpus, ...@@ -103,36 +103,42 @@ def compute_coocs( corpus,
# 1) MAIN DB QUERY # 1) MAIN DB QUERY
coocs_query = ( coocs_query = (
session.query(x1.ngram_id, x2.ngram_id, ucooc) session.query(x1.ngram_id, x2.ngram_id, ucooc)
.join(Node, Node.id == x1.node_id) # <- b/c within corpus
.filter(x1.node_id == x2.node_id) # <- by definition of cooc .join(x2, x1.node_id == Node.id ) # <- b/c within corpus
.filter(x1.ngram_id != x2.ngram_id) # <- b/c not with itself
.filter(x1.node_id.in_(docids_subquery)) # <- b/c within corpus .filter(Node.parent_id == corpus.id) # <- b/c within corpus
.group_by(x1.ngram_id, x2.ngram_id) .filter(Node.typename == "DOCUMENT") # <- b/c within corpus
.filter(x1.node_id == x2.node_id) # <- by definition of cooc
.filter(x1.ngram_id != x2.ngram_id) # <- b/c not with itself
.group_by(x1.ngram_id, x2.ngram_id)
) )
# 2) INPUT FILTERS (reduce N before O(N²)) # 2) INPUT FILTERS (reduce N before O(N²))
if mainlist_id: if mainlist_id:
main_subquery = (
session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == mainlist_id)
.subquery()
)
m1 = aliased(NodeNgram)
m2 = aliased(NodeNgram)
coocs_query = ( coocs_query coocs_query = ( coocs_query
.filter( x1.ngram_id.in_(main_subquery) ) .join(m1, m1.ngram_id == x1.ngram_id)
.filter( x2.ngram_id.in_(main_subquery) ) .join(m2, m2.ngram_id == x2.ngram_id)
.filter( m1.node_id == mainlist_id )
.filter( m2.node_id == mainlist_id )
) )
if stoplist_id: if stoplist_id:
stop_subquery = ( s1 = aliased(NodeNgram)
session.query(NodeNgram.ngram_id) s2 = aliased(NodeNgram)
.filter(NodeNgram.node_id == stoplist_id)
.subquery()
)
coocs_query = ( coocs_query coocs_query = ( coocs_query
.filter( ~ x1.ngram_id.in_(stop_subquery) ) .join(m1, s1.ngram_id == x1.ngram_id)
.filter( ~ x2.ngram_id.in_(stop_subquery) ) .join(m2, s2.ngram_id == x2.ngram_id)
.filter( s1.node_id == mainlist_id )
.filter( s2.node_id == mainlist_id )
) )
if start: if start:
...@@ -142,6 +148,7 @@ def compute_coocs( corpus, ...@@ -142,6 +148,7 @@ def compute_coocs( corpus,
start_str = str(start) start_str = str(start)
# doc_ids matching this limit # doc_ids matching this limit
# TODO s/subqueries/inner joins/ && thanks!
starttime_subquery = (session starttime_subquery = (session
.query(NodeHyperdata.node_id) .query(NodeHyperdata.node_id)
.filter(NodeHyperdata.key=="publication_date") .filter(NodeHyperdata.key=="publication_date")
...@@ -160,6 +167,7 @@ def compute_coocs( corpus, ...@@ -160,6 +167,7 @@ def compute_coocs( corpus,
else: else:
end_str = str(end) end_str = str(end)
# TODO s/subqueries/inner joins/ && thanks!
endtime_subquery = (session endtime_subquery = (session
.query(NodeHyperdata.node_id) .query(NodeHyperdata.node_id)
.filter(NodeHyperdata.key=="publication_date") .filter(NodeHyperdata.key=="publication_date")
......
...@@ -22,18 +22,22 @@ ...@@ -22,18 +22,22 @@
-- create INDEX on nodes (user_id, typename, parent_id) ; -- create INDEX on nodes (user_id, typename, parent_id) ;
-- create INDEX on nodes_hyperdata (node_id, key); -- create INDEX on nodes_hyperdata (node_id, key);
-- create INDEX on ngrams (id, n) ; -- create INDEX on ngrams (id, n) ;
create INDEX on ngrams (n) ;
-- create INDEX on nodes_ngrams (node_id, ngram_id) ; -- create INDEX on nodes_ngrams (node_id, ngram_id) ;
-- create INDEX on nodes_ngrams (node_id) ;
-- create INDEX on nodes_ngrams (ngram_id) ;
-- create INDEX on nodes_ngrams_ngrams (node_id, ngram1_id, ngram2_id) ; -- create INDEX on nodes_ngrams_ngrams (node_id, ngram1_id, ngram2_id) ;
create INDEX on nodes_ngrams_ngrams (node_id) ;
create INDEX on nodes_ngrams_ngrams (ngram1_id) ; -- create INDEX on nodes_ngrams_ngrams (node_id) ;
create INDEX on nodes_ngrams_ngrams (ngram2_id) ; -- create INDEX on nodes_ngrams_ngrams (ngram1_id) ;
-- create INDEX on nodes_ngrams_ngrams (ngram2_id) ;
---------------------------------------------------------------------- ----------------------------------------------------------------------
-- DELETE optimization of Nodes -- todo on dev -- DELETE optimization of Nodes -- todo on dev
-- create INDEX on nodes_nodes_ngrams (node1_id); -- create INDEX on nodes_nodes_ngrams (node1_id);
-- create INDEX on nodes_nodes_ngrams (node2_id); -- create INDEX on nodes_nodes_ngrams (node2_id);
create INDEX on nodes_nodes (node1_id, node2_id); -- create INDEX on nodes_nodes (node1_id, node2_id);
-- Maybe needed soon: -- Maybe needed soon:
-- create INDEX on nodes_nodes_ngrams (node1_id, node2_id); -- create INDEX on nodes_nodes_ngrams (node1_id, node2_id);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment