[OPTI] Subqueries are not optimized. Ok for local tests if you prefer the...

[OPTI] Subqueries are not optimized. They are OK for local tests if you prefer the syntax, but for production only inner joins are accepted. Thanks.

[OPTI] Subqueries are not optimized. Ok for local tests if you prefer the...
[OPTI] Subqueries are not optimized. They are OK for local tests if you prefer the syntax, but for production only inner joins are accepted. Thanks.
f8927c17 · delanoe · fd66d6d5 · f8927c17 · f8927c17
Commit f8927c17 authored May 12, 2016 by delanoe
Hide whitespace changes
Inline Side-by-side

Showing with 35 additions and 23 deletions

ngram_coocs.py gargantext/util/toolchain/ngram_coocs.py +27 -19

indexes.sql install/postgres/indexes.sql +8 -4

No files found.
--- a/gargantext/util/toolchain/ngram_coocs.py
+++ b/gargantext/util/toolchain/ngram_coocs.py
@@ -103,36 +103,42 @@ def compute_coocs(  corpus,
    # 1) MAIN DB QUERY
    coocs_query = (
        session.query(x1.ngram_id, x2.ngram_id, ucooc)
-
-            .filter(x1.node_id  == x2.node_id)       # <- by definition of cooc
-            .filter(x1.ngram_id != x2.ngram_id)      # <- b/c not with itself
-            .filter(x1.node_id.in_(docids_subquery)) # <- b/c within corpus
-            .group_by(x1.ngram_id, x2.ngram_id)
+               .join(Node, Node.id == x1.node_id)   # <- b/c within corpus
+               .join(x2, x1.node_id == Node.id )     # <- b/c within corpus
+               
+               .filter(Node.parent_id == corpus.id) # <- b/c within corpus
+               .filter(Node.typename == "DOCUMENT") # <- b/c within corpus
+
+            
+               .filter(x1.node_id  == x2.node_id)       # <- by definition of cooc
+               .filter(x1.ngram_id != x2.ngram_id)      # <- b/c not with itself
+               .group_by(x1.ngram_id, x2.ngram_id)
           )

    # 2) INPUT FILTERS (reduce N before O(N²))
    if mainlist_id:
-        main_subquery = (
-            session.query(NodeNgram.ngram_id)
-                .filter(NodeNgram.node_id == mainlist_id)
-                .subquery()
-                )

+        m1 = aliased(NodeNgram)
+        m2 = aliased(NodeNgram)
+        
        coocs_query = ( coocs_query
-            .filter( x1.ngram_id.in_(main_subquery) )
-            .filter( x2.ngram_id.in_(main_subquery) )
+            .join(m1, m1.ngram_id == x1.ngram_id)
+            .join(m2, m2.ngram_id == x2.ngram_id)
+
+            .filter( m1.node_id == mainlist_id )
+            .filter( m2.node_id == mainlist_id )
        )

    if stoplist_id:
-        stop_subquery = (
-            session.query(NodeNgram.ngram_id)
-                .filter(NodeNgram.node_id == stoplist_id)
-                .subquery()
-                )
+        s1 = aliased(NodeNgram)
+        s2 = aliased(NodeNgram)

        coocs_query = ( coocs_query
-            .filter( ~ x1.ngram_id.in_(stop_subquery) )
-            .filter( ~ x2.ngram_id.in_(stop_subquery) )
+            .join(m1, s1.ngram_id == x1.ngram_id)
+            .join(m2, s2.ngram_id == x2.ngram_id)
+
+            .filter( s1.node_id == mainlist_id )
+            .filter( s2.node_id == mainlist_id )
        )

    if start:
@@ -142,6 +148,7 @@ def compute_coocs(  corpus,
            start_str = str(start)

        # doc_ids matching this limit
+        # TODO: rewrite this subquery as an inner join (subqueries are not accepted in production).
        starttime_subquery = (session
                                .query(NodeHyperdata.node_id)
                                .filter(NodeHyperdata.key=="publication_date")
@@ -160,6 +167,7 @@ def compute_coocs(  corpus,
        else:
            end_str = str(end)

+        # TODO: rewrite this subquery as an inner join (subqueries are not accepted in production).
        endtime_subquery = (session
                                .query(NodeHyperdata.node_id)
                                .filter(NodeHyperdata.key=="publication_date")

--- a/install/postgres/indexes.sql
+++ b/install/postgres/indexes.sql
@@ -22,18 +22,22 @@
 -- create INDEX on nodes (user_id, typename, parent_id) ;
 -- create INDEX on nodes_hyperdata (node_id, key);
 -- create INDEX on ngrams (id, n) ;
+create INDEX on ngrams (n) ;
 -- create INDEX on nodes_ngrams (node_id, ngram_id) ;
+-- create INDEX on nodes_ngrams (node_id) ;
+-- create INDEX on nodes_ngrams (ngram_id) ;
 -- create INDEX on nodes_ngrams_ngrams (node_id, ngram1_id, ngram2_id) ;
-create INDEX on nodes_ngrams_ngrams (node_id) ;
-create INDEX on nodes_ngrams_ngrams (ngram1_id) ;
-create INDEX on nodes_ngrams_ngrams (ngram2_id) ;
+
+-- create INDEX on nodes_ngrams_ngrams (node_id) ;
+-- create INDEX on nodes_ngrams_ngrams (ngram1_id) ;
+-- create INDEX on nodes_ngrams_ngrams (ngram2_id) ;

 ----------------------------------------------------------------------
 -- DELETE optimization of Nodes -- todo on dev
 -- create INDEX on nodes_nodes_ngrams (node1_id);
 -- create INDEX on nodes_nodes_ngrams (node2_id);

-create INDEX on nodes_nodes (node1_id, node2_id);
+-- create INDEX on nodes_nodes (node1_id, node2_id);

 -- Maybe needed soon:
 -- create INDEX on nodes_nodes_ngrams (node1_id, node2_id);