Commit 2785cf15 authored by Romain Loth

workflow: fix set of terms used in global ranking score (still using IN)

parent 242acca7
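In substance: the cumulated-tfidf step becomes a ranking step whose document counts and scored term set are scoped independently. A minimal sketch of the new call, using only names that appear in this diff:

    # count_scope="global"  => tf and nd counted over all corpora of the same source type
    # termset_scope="local" => but the scored term set is restricted to this corpus's terms
    #                          (currently via a SQL IN subquery, hence the commit title)
    tirank_id = compute_ti_ranking(corpus,
                                   count_scope="global",
                                   termset_scope="local")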
@@ -195,9 +195,9 @@ RESOURCETYPES = [
 ]
 # linguistic extraction parameters ---------------------------------------------
-DEFAULT_TFIDF_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
-DEFAULT_TFIDF_HARD_LIMIT = 5000  # MAINLIST maximum terms abs
+DEFAULT_RANK_CUTOFF_RATIO = .75  # MAINLIST maximum terms in %
+DEFAULT_RANK_HARD_LIMIT = 5000   # MAINLIST maximum terms abs
                                  # (makes COOCS larger ~ O(N²) /!\)
 DEFAULT_COOC_THRESHOLD = 2       # inclusive minimum for COOCS coefs
...
@@ -6,7 +6,7 @@ from .hyperdata_indexing import index_hyperdata
 # in usual run order
 from .list_stop import do_stoplist
-from .metric_tfidf import compute_occs, compute_tfidf_local, compute_cumulated_tfidf
+from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
 from .list_main import do_mainlist
 from .ngram_coocs import compute_coocs
 from .metric_specificity import compute_specificity
@@ -116,13 +116,15 @@ def parse_extract_indexhyperdata(corpus):
     ltfidf_id = compute_tfidf_local(corpus)
     print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
-    # -> write global and cumulated tfidf to Node and NodeNodeNgram
-    gtfidf_id = compute_cumulated_tfidf(corpus, scope="global")
-    print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))
+    # -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
+    tirank_id = compute_ti_ranking(corpus,
+                                   count_scope="global",
+                                   termset_scope="local")
+    print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id))
     # -> mainlist: filter + write (to Node and NodeNgram)
     mainlist_id = do_mainlist(corpus,
-                              tfidf_id = gtfidf_id,
+                              ranking_scores_id = tirank_id,
                               stoplist_id = stop_id)
     print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
@@ -143,7 +145,7 @@ def parse_extract_indexhyperdata(corpus):
     print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
     print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))
     corpus.status('Lists', progress=0, complete=True)
     corpus.save_hyperdata()
     session.commit()
...
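For orientation, the ngram-lists phase runs its steps in the "usual run order" given by the imports above; a condensed sketch (the exact call shapes of steps not shown in this diff are assumptions):

    stop_id     = do_stoplist(corpus)            # stoplist (assumed call shape)
    occ_id      = compute_occs(corpus)           # occurrence counts
    ltfidf_id   = compute_tfidf_local(corpus)    # local tfidf
    tirank_id   = compute_ti_ranking(corpus,     # global ranking (this commit)
                                     count_scope="global",
                                     termset_scope="local")
    mainlist_id = do_mainlist(corpus, ranking_scores_id=tirank_id,
                                      stoplist_id=stop_id)
    cooc_id     = compute_coocs(corpus)          # coocs on mainlist (assumed call shape)
    spec_id     = compute_specificity(corpus, cooc_id=cooc_id)
    # ... then the maplist, as printed in the hunk above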
@@ -2,14 +2,14 @@ from gargantext.models import Node, NodeNgram, NodeNodeNgram
 from gargantext.util.db import session
 from gargantext.util.lists import UnweightedList
 from sqlalchemy import desc
-from gargantext.constants import DEFAULT_TFIDF_CUTOFF_RATIO, \
-                                 DEFAULT_TFIDF_HARD_LIMIT
+from gargantext.constants import DEFAULT_RANK_CUTOFF_RATIO, \
+                                 DEFAULT_RANK_HARD_LIMIT

 def do_mainlist(corpus,
                 overwrite_id = None,
-                tfidf_id=None, stoplist_id=None,
-                hard_limit=DEFAULT_TFIDF_HARD_LIMIT,
-                ratio_limit=DEFAULT_TFIDF_CUTOFF_RATIO
+                ranking_scores_id=None, stoplist_id=None,
+                hard_limit=DEFAULT_RANK_HARD_LIMIT,
+                ratio_limit=DEFAULT_RANK_CUTOFF_RATIO
                 ):
""" """
Select top n terms according to a global tfidf ranking and stoplist filter. Select top n terms according to a global tfidf ranking and stoplist filter.
...@@ -18,7 +18,7 @@ def do_mainlist(corpus, ...@@ -18,7 +18,7 @@ def do_mainlist(corpus,
min(hard_limit, number_of_terms * ratio_limit) min(hard_limit, number_of_terms * ratio_limit)
NB : We use a global tfidf node where the values are global but the ngrams NB : We use a global tfidf node where the values are global but the ngrams
are already selected (== only within this corpus documents). are already selected (termset_scope == only within this corpus docs).
TO DISCUSS: allow influence of the local tfidf scores too TO DISCUSS: allow influence of the local tfidf scores too
Parameters: Parameters:
@@ -37,12 +37,12 @@ def do_mainlist(corpus,
     """
     # retrieve helper nodes if not provided
-    if not tfidf_id:
-        tfidf_id = session.query(Node.id).filter(
+    if not ranking_scores_id:
+        ranking_scores_id = session.query(Node.id).filter(
             Node.typename == "TFIDF-GLOBAL",
             Node.parent_id == corpus.id
         ).first()
-        if not tfidf_id:
+        if not ranking_scores_id:
             raise ValueError("MAINLIST: TFIDF node needed for mainlist creation")

     if not stoplist_id:
@@ -64,7 +64,7 @@ def do_mainlist(corpus,
     # tfidf-ranked query
     ordered_filtered_tfidf = (session
         .query(NodeNodeNgram.ngram_id)
-        .filter(NodeNodeNgram.node1_id == tfidf_id)
+        .filter(NodeNodeNgram.node1_id == ranking_scores_id)
         .filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
         .order_by(desc(NodeNodeNgram.score))
     )
...
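The cut itself is outside this hunk; per the docstring rule min(hard_limit, number_of_terms * ratio_limit), the ordered query is presumably truncated along these lines (the .count()/.limit() continuation is an assumption, not shown in the diff):

    # hypothetical continuation of do_mainlist, applying the renamed constants
    n_ranked = ordered_filtered_tfidf.count()
    n_kept = min(hard_limit, round(n_ranked * ratio_limit))
    top_ngram_ids = [r.ngram_id for r in ordered_filtered_tfidf.limit(n_kept)]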
@@ -44,11 +44,11 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
     # v = d.sum(axis=1) (- itself)
     xs = x.sum(axis=1) - x
     ys = x.sum(axis=0) - x

     # top included or excluded
     #n = ( xs + ys) / (2 * (x.shape[0] - 1))

     # top generic or specific (asc is spec, desc is generic)
     v = ( xs - ys) / ( 2 * (x.shape[0] - 1))
@@ -105,11 +105,14 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
     # print(v)
     pd.options.display.float_format = '${:,.2f}'.format
-    data = WeightedList(
-        zip( v.index.tolist()
-           , v.values.tolist()[0]
-           )
-    )
-    data.save(the_id)
+    if not v.empty:
+        data = WeightedList(
+            zip( v.index.tolist()
+               , v.values.tolist()[0]
+               )
+        )
+        data.save(the_id)
+    else:
+        print("WARNING: had no terms in COOCS => empty SPECIFICITY node")
     return(the_id)
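What v holds above: for each term, (row sum - column sum) of the cooccurrence matrix over 2(n-1). Because the Series-minus-DataFrame subtraction broadcasts over columns, every row of v comes out identical, which is why the code reads v.values.tolist()[0]. A toy check with made-up weights:

    import pandas as pd

    # 3x3 toy cooccurrence matrix (made-up numbers)
    x = pd.DataFrame([[0, 4, 1],
                      [2, 0, 3],
                      [1, 1, 0]],
                     index=list('abc'), columns=list('abc'))
    xs = x.sum(axis=1) - x                  # rowsum[term] - x (broadcast on columns)
    ys = x.sum(axis=0) - x                  # colsum[term] - x (broadcast on columns)
    v = (xs - ys) / (2 * (x.shape[0] - 1))  # per term: (rowsum - colsum) / (2*(n-1))
    print(v.values.tolist()[0])             # [0.5, 0.0, -0.5]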
@@ -88,7 +88,7 @@ def compute_occs(corpus, overwrite_id = None):
     return the_id

-def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
+def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overwrite_id=None):
     """
     # TODO check if cumulated tfs correspond to app's use cases and intention
@@ -96,55 +96,93 @@ def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
     Parameters:
       - the corpus itself
-      - scope: {"local" or "global"}
+      - count_scope: {"local" or "global"}
+         - local  <=> frequencies counted in the current corpus
+         - global <=> frequencies counted in all corpora of this type
+        when the count_scope is global, there is another parameter:
+          - termset_scope: {"local" or "global"}
+             - local  <=> output list of terms limited to the current corpus
+               (SELECT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
+             - global <=> output list of terms from all corpora of this type
+                          !!!! (more terms)
       - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
         (the Node and its previous NodeNodeNgram rows will be replaced)
     """
+    corpus_docids_subquery = (session
+        .query(Node.id)
+        .filter(Node.parent_id == corpus.id)
+        .filter(Node.typename == "DOCUMENT")
+        .subquery()
+    )

     # local <=> within this corpus
-    if scope == "local":
+    if count_scope == "local":
         # All docs of this corpus
-        docids_subquery = (session
-            .query(Node.id)
-            .filter(Node.parent_id == corpus.id)
-            .filter(Node.typename == "DOCUMENT")
-            .subquery()
-        )
+        count_scope_subquery = corpus_docids_subquery
+        termset_scope_subquery = (session
+            .query(NodeNgram.ngram_id)
+            .filter(NodeNgram.node_id.in_(corpus_docids_subquery))
+            .subquery()
+        )
     # global <=> within all corpora of this source
-    elif scope == "global":
+    elif count_scope == "global":
         this_source_type = corpus.resources()[0]['type']

         # all corpora with the same source type
         # (we need raw SQL query for postgres JSON operators) (TODO test speed)
         same_source_corpora_query = (session
             .query(Node.id)
             .from_statement(text(
                 """
                 SELECT id FROM nodes
                 WHERE hyperdata->'resources' @> '[{\"type\"\:%s}]'
                 """ % this_source_type
             ))
         )

         # All docs **in all corpora of the same source**
-        docids_subquery = (session
+        ressource_docids_subquery = (session
             .query(Node.id)
             .filter(Node.parent_id.in_(same_source_corpora_query))
             .filter(Node.typename == "DOCUMENT")
             .subquery()
         )
+        count_scope_subquery = ressource_docids_subquery
+
+        if termset_scope == "global":
+            termset_scope_subquery = (session
+                .query(NodeNgram.ngram_id)
+                .filter(NodeNgram.node_id.in_(ressource_docids_subquery))
+                .subquery()
+            )
+        else:
+            termset_scope_subquery = (session
+                .query(NodeNgram.ngram_id)
+                .filter(NodeNgram.node_id.in_(corpus_docids_subquery))
+                .subquery()
+            )
     # N
-    total_docs = session.query(docids_subquery).count()
+    total_docs = session.query(count_scope_subquery).count()

-    # or perhaps at least do the occurrences right now at the same time
+    # nb: possible to do the occurrences right now at the same time
     tf_nd = (session
         .query(
             NodeNgram.ngram_id,
             func.sum(NodeNgram.weight),    # tf: same as occnode
             func.count(NodeNgram.node_id)  # nd: n docs with term
         )
-        .filter(NodeNgram.node_id.in_(docids_subquery))
+        .filter(NodeNgram.node_id.in_(count_scope_subquery))
+        .filter(NodeNgram.ngram_id.in_(termset_scope_subquery))
         .group_by(NodeNgram.ngram_id)
         .all()
     )
@@ -162,10 +200,10 @@ def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
     else:
         # create the new TFIDF-XXXX node
         tfidf_nd = corpus.add_child()
-        if scope == "local": # TODO discuss use and find new typename
+        if count_scope == "local": # TODO discuss use and find new typename
             tfidf_nd.typename = "TFIDF-CORPUS"
             tfidf_nd.name = "tfidf-cumul-corpus (in:%s)" % corpus.id
-        elif scope == "global":
+        elif count_scope == "global":
             tfidf_nd.typename = "TFIDF-GLOBAL"
             tfidf_nd.name = "tfidf-cumul-global (in type:%s)" % this_source_type
         session.add(tfidf_nd)
...
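On the "(still using IN)" in the commit title: with count_scope="global" and termset_scope="local", the tf/nd aggregation above boils down to SQL of roughly this shape (a paraphrase for readability, not the actual generated query); replacing these IN subqueries with JOINs reads as the intended follow-up:

    # rough shape of the aggregation, as plain SQL in a comment:
    #   SELECT nn.ngram_id, SUM(nn.weight) AS tf, COUNT(nn.node_id) AS nd
    #   FROM nodes_ngrams nn
    #   WHERE nn.node_id IN (<docs of all same-source corpora>)      -- count_scope
    #     AND nn.ngram_id IN (SELECT ngram_id FROM nodes_ngrams
    #                         WHERE node_id IN (<this corpus docs>)) -- termset_scope
    #   GROUP BY nn.ngram_id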