[NGRAMS] workflow fixes.

7a141a02 · delanoe · 91e14e3e · 7a141a02 · 7a141a02
Commit 7a141a02 authored May 09, 2016 by delanoe
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 4 deletions

list_map.py gargantext/util/toolchain/list_map.py +3 -3

metric_tfidf.py gargantext/util/toolchain/metric_tfidf.py +9 -1

No files found.
--- a/gargantext/util/toolchain/list_map.py
+++ b/gargantext/util/toolchain/list_map.py
@@ -52,7 +52,7 @@ def do_maplist(corpus,

    primary_groupterms_subquery = (session
                            # select the grouped non-primary terms (ngram2) so they can be excluded below, leaving only primary terms
-                            .query(NodeNgramNgram.ngram1_id)
+                            .query(NodeNgramNgram.ngram2_id)
                            .filter(NodeNgramNgram.node_id == grouplist_id)
                            .subquery()
                         )
@@ -64,7 +64,7 @@ def do_maplist(corpus,
                .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
                .filter(ScoreSpec.node_id == specificity_id)
                .filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
-                .filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
+                .filter(ScoreSpec.ngram_id.notin_(primary_groupterms_subquery))
            )

    # TODO: move these 2 pools up to mainlist selection
@@ -81,7 +81,7 @@ def do_maplist(corpus,
                .limit(multigrams_limit)
                .all()
               )
-    obtained_mono = len(top_monograms)
+    obtained_mono  = len(top_monograms)
    obtained_multi = len(top_multigrams)
    obtained_total = obtained_mono + obtained_multi
    # print("MAPLIST: top_monograms =", obtained_mono)

--- a/gargantext/util/toolchain/metric_tfidf.py
+++ b/gargantext/util/toolchain/metric_tfidf.py
@@ -8,7 +8,7 @@ FIXME: "having the same source" means we need to select inside hyperdata
       with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ...
 """

-from gargantext.models   import Node, NodeNgram, NodeNodeNgram
+from gargantext.models   import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
 from gargantext.util.db  import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
 from sqlalchemy          import text  # for query from raw SQL statement
 from math                import log
@@ -29,6 +29,13 @@ def compute_occs(corpus, overwrite_id = None):
        - overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
                     (the Node and its previous NodeNodeNgram rows will be replaced)
    """
+    # 0) Get the id of this corpus's GROUPLIST node (first match, or None if absent)
+    group_id = (session.query(Node.id)
+                       .filter(Node.parent_id == corpus.id)
+                       .filter(Node.typename  == "GROUPLIST")
+                       .first()
+                )
+

    # 1) all the doc_ids of our corpus (scope of counts for filter)
    # slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
@@ -45,6 +52,7 @@ def compute_occs(corpus, overwrite_id = None):
                    NodeNgram.ngram_id,
                    func.sum(NodeNgram.weight)
                 )
+                #.join(NodeNgramNgram, NodeNgramNgram.node_id == group_id)
                .filter(NodeNgram.node_id.in_(docids_subquery))
                .group_by(NodeNgram.ngram_id)
                .all()