Commit 049dc862 authored by Romain Loth

remove sql IN from global ranking score

parent 389583e9
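The change described by the commit title amounts to replacing `WHERE node_id IN (subquery)` filters with explicit joins on the same subqueries when building the ranking query. A minimal sketch of that pattern, reusing the models and helpers imported in this file (`docs_sq`, `old_q`, `new_q` are illustrative names; the query bodies are simplified, not copied verbatim from the diff):

    # before: restrict NodeNgram rows to the corpus docs with an SQL IN over a subquery
    docs_sq = (session.query(Node.id)
                      .filter(Node.parent_id == corpus.id)
                      .filter(Node.typename == "DOCUMENT")
                      .subquery())
    old_q = (session.query(NodeNgram.ngram_id,
                           func.sum(NodeNgram.weight),        # tf
                           func.count(NodeNgram.node_id))     # nd
                    .filter(NodeNgram.node_id.in_(docs_sq))   # <- SQL IN (subquery)
                    .group_by(NodeNgram.ngram_id))

    # after: express the same restriction as a JOIN on the subquery
    new_q = (session.query(NodeNgram.ngram_id,
                           func.sum(NodeNgram.weight),
                           func.count(NodeNgram.node_id))
                    .join(docs_sq, docs_sq.c.id == NodeNgram.node_id)
                    .group_by(NodeNgram.ngram_id))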
@@ -9,9 +9,10 @@ FIXME: "having the same source" means we need to select inside hyperdata
"""
from gargantext.models import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
from sqlalchemy import text # for query from raw SQL statement
from gargantext.util.db import session, bulk_insert, aliased, \
func # = sqlalchemy.func like sum() or count()
from sqlalchemy.sql.expression import case # to choose whether the ngram has a mainform or not
from sqlalchemy import distinct # for list of unique ngram_ids within a corpus
from math import log
# £TODO
# from gargantext.util.lists import WeightedContextIndex
@@ -140,7 +141,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
"""
# TODO check if cumulated tfs correspond to app's use cases and intention
Calculates tfidf ranking (cumulated tfidf) within the given scope
Calculates tfidf ranking (cumulated tfidf for each ngram) within the given scope
Parameters:
- the corpus itself
@@ -148,92 +149,110 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
- local <=> frequencies counted in the current corpus
- global <=> frequencies counted in all corpora of this type
when the count_scope is global, there is another parameter:
- termset_scope: {"local" or "global"}
- local <=> output list of terms limited to the current corpus
(SELECT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
(SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
- global <=> output list of terms from all corpora of this type
!!!! (more terms)
!!!! (many more terms)
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
corpus_docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
# MAIN QUERY SKELETON
tf_nd_query = (session
.query(
NodeNgram.ngram_id,
func.sum(NodeNgram.weight), # tf: same as occurrences
# -----------------------
func.count(NodeNgram.node_id) # nd: n docs with term
# --------------------
)
.group_by(NodeNgram.ngram_id)
# optional *count_scope*: in case we restrict the doc nodes
# -------------
# .join(countdocs_subquery,
# countdocs_subquery.c.id == NodeNgram.node_id)
# optional *termset_scope*: in case we restrict the ngrams
# ---------------
# .join(termset_subquery,
# termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
)
# validate string params
if count_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
if termset_scope not in ["local","global"]:
raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
if count_scope == "local" and termset_scope == "global":
raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")
# local <=> within this corpus
if count_scope == "local":
# All docs of this corpus
count_scope_subquery = corpus_docids_subquery
termset_scope_subquery = (session
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id.in_(corpus_docids_subquery))
countdocs_subquery = (session
.query(Node.id)
.filter(Node.typename == "DOCUMENT")
.filter(Node.parent_id == corpus.id)
.subquery()
)
# both scopes are the same: no need to independently restrict the ngrams
tf_nd_query = tf_nd_query.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
# global <=> within all corpora of this source
elif count_scope == "global":
this_source_type = corpus.resources()[0]['type']
# all corpora with the same source type
# (we need raw SQL query for postgres JSON operators) (TODO test speed)
same_source_corpora_query = (session
.query(Node.id)
.from_statement(text(
"""
SELECT id FROM nodes
WHERE hyperdata->'resources' @> '[{\"type\"\:%s}]'
""" % this_source_type
))
)
CorpusNode = aliased(Node)
# All docs **in all corpora of the same source**
ressource_docids_subquery = (session
countdocs_subquery = (session
.query(Node.id)
.filter(Node.parent_id.in_(same_source_corpora_query))
.filter(Node.typename == "DOCUMENT")
# join on parent_id with selected corpora nodes
.join(CorpusNode, CorpusNode.id == Node.parent_id)
.filter(CorpusNode.typename == "CORPUS")
.filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(this_source_type))
.subquery()
)
count_scope_subquery = ressource_docids_subquery
if termset_scope == "global":
termset_scope_subquery = (session
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id.in_(ressource_docids_subquery))
.subquery()
)
else:
termset_scope_subquery = (session
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id.in_(corpus_docids_subquery))
# both scopes are the same: no need to independently restrict the ngrams
tf_nd_query = tf_nd_query.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
elif termset_scope == "local":
# All unique terms in the original corpus
termset_subquery = (session
.query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
.join(Node)
.filter(Node.typename == "DOCUMENT")
.filter(Node.parent_id == corpus.id)
.subquery()
)
# only case of independent restrictions on docs and terms
tf_nd_query = (tf_nd_query
.join(countdocs_subquery,
countdocs_subquery.c.id == NodeNgram.node_id)
.join(termset_subquery,
termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
)
# N
total_docs = session.query(ressource_docids_subquery).count()
total_docs = session.query(countdocs_subquery).count()
# nb: possible to do the occurrences right now at the same time
tf_nd = (session
.query(
NodeNgram.ngram_id,
func.sum(NodeNgram.weight), # tf: same as occnode
func.count(NodeNgram.node_id) # nd: n docs with term
)
.filter(NodeNgram.node_id.in_(count_scope_subquery))
.filter(NodeNgram.ngram_id.in_(termset_scope_subquery))
.group_by(NodeNgram.ngram_id)
.all()
)
# result
tf_nd = tf_nd_query.all()
# -------------------------------------------------
tfidfs = {}
......
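The hunk is collapsed right after `tfidfs = {}`, before the score computation itself. Purely for orientation, a hypothetical sketch of how the collected `(ngram_id, tf, nd)` rows and `total_docs` could be combined into a cumulated tf-idf, assuming the usual tf * log(N/nd) weighting hinted at by the `from math import log` import; the actual formula lives in the collapsed part of the file:

    for (ngram_id, tf, nd) in tf_nd:
        # hypothetical weighting: idf grows when the term appears in few of the N counted docs
        tfidfs[ngram_id] = tf * log(total_docs / nd)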