Merge branch 'refactoring-rom' into refactoring-alex

20e969ed · delanoe · 9ad61799 · 6341dc12 · 20e969ed · 20e969ed
Commit 20e969ed authored Mar 30, 2016 by delanoe
Show whitespace changes
Inline Side-by-side

Showing with 93 additions and 10 deletions

__init__.py gargantext/util/toolchain/__init__.py +5 -5

metric_tfidf.py gargantext/util/toolchain/metric_tfidf.py +88 -5

No files found.
--- a/gargantext/util/toolchain/__init__.py
+++ b/gargantext/util/toolchain/__init__.py
@@ -4,7 +4,7 @@ from .hyperdata_indexing  import index_hyperdata

 # in usual run order
 from .list_stop           import do_stoplist
-from .metric_tfidf        import compute_occs, compute_tfidf
+from .metric_tfidf        import compute_occs, compute_tfidf_local, compute_cumulated_tfidf
 from .list_main           import do_mainlist
 from .ngram_coocs         import compute_coocs
 from .metric_specificity  import compute_specificity
@@ -75,12 +75,12 @@ def parse_extract_indexhyperdata(corpus):
    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))

    # ------------
-    # -> write local tfidf to Node and NodeNodeNgram
-    ltfidf_id = compute_tfidf(corpus, scope="local")
+    # -> write local tfidf similarities to Node and NodeNodeNgram
+    ltfidf_id = compute_tfidf_local(corpus)
    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))

-    # -> write global tfidf to Node and NodeNodeNgram
-    gtfidf_id = compute_tfidf(corpus, scope="global")
+    # -> write global and cumulated tfidf to Node and NodeNodeNgram
+    gtfidf_id = compute_cumulated_tfidf(corpus, scope="global")
    print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))

    # -> mainlist: filter + write (to Node and NodeNgram)

--- a/gargantext/util/toolchain/metric_tfidf.py
+++ b/gargantext/util/toolchain/metric_tfidf.py
@@ -18,6 +18,8 @@ from math                import log

 def compute_occs(corpus, overwrite_id = None):
    """
+    # TODO check if cumulated occs correspond to app's use cases and intention
+
    Calculates sum of occs per ngram within corpus
    (used as info in the ngrams table view)

@@ -78,9 +80,11 @@ def compute_occs(corpus, overwrite_id = None):
    return the_id


-def compute_tfidf(corpus, scope="local", overwrite_id=None):
+def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
    """
-    Calculates tfidf within the current corpus
+    # TODO check if cumulated tfs correspond to app's use cases and intention
+
+    Calculates tfidf ranking (cumulated tfidf) within the given scope

    Parameters:
      - the corpus itself
@@ -150,12 +154,12 @@ def compute_tfidf(corpus, scope="local", overwrite_id=None):
    else:
        # create the new TFIDF-XXXX node
        tfidf_nd = corpus.add_child()
-        if scope == "local":
+        if scope == "local":            # TODO discuss use and find new typename
            tfidf_nd.typename  = "TFIDF-CORPUS"
-            tfidf_nd.name      = "tfidf-c (in:%s)" % corpus.id
+            tfidf_nd.name      = "tfidf-cumul-corpus (in:%s)" % corpus.id
        elif scope == "global":
            tfidf_nd.typename  = "TFIDF-GLOBAL"
-            tfidf_nd.name      = "tfidf-g (in type:%s)" % this_source_type
+            tfidf_nd.name      = "tfidf-cumul-global (in type:%s)" % this_source_type
        session.add(tfidf_nd)
        session.commit()
        the_id = tfidf_nd.id
@@ -169,3 +173,82 @@ def compute_tfidf(corpus, scope="local", overwrite_id=None):
    )

    return the_id
+
+
+
+def compute_tfidf_local(corpus, overwrite_id=None):
+    """
+    Calculates tfidf, as tf * (log(N) - log(nd)), for each (doc, ngram) couple within the current corpus, and returns the id of the TFIDF node the scores are written under
+
+    Parameters:
+      - corpus: the corpus node itself (its id selects the documents; add_child() creates the result node)
+      - overwrite_id: optional id of a pre-existing TFIDF-CORPUS node to reuse instead of creating a new one
+                   (NOTE(review): no previous NodeNodeNgram rows are deleted in this function -- verify upstream)
+    """
+
+    # ids of all DOCUMENT nodes whose parent is this corpus
+    docids_subquery = (session
+                        .query(Node.id)
+                        .filter(Node.parent_id == corpus.id)
+                        .filter(Node.typename == "DOCUMENT")
+                        .subquery()
+                       )
+
+    # N: number of documents in the corpus (NOTE(review): if N is 0, log(total_docs) below raises ValueError)
+    total_docs = session.query(docids_subquery).count()
+
+    # per-ngram count of NodeNgram rows among the corpus docs, one result row per ngram (equals the number of docs with the term only if each (doc, ngram) has a single row -- TODO confirm)
+    n_docswith_ng = (session
+                    .query(
+                        NodeNgram.ngram_id,
+                        func.count(NodeNgram.node_id).label("nd")  # nd: n docs with term
+                     )
+                    .filter(NodeNgram.node_id.in_(docids_subquery))
+                    .group_by(NodeNgram.ngram_id)
+                    .all()
+                   )
+
+    # precomputed lookup { ngram_id => log(nd) }, so the log is taken once per ngram
+    log_nd_lookup = {row.ngram_id : log(row.nd) for row in n_docswith_ng}
+
+    # tf for each (doc, ngram) couple present in NodeNgram (at most N docs x M ngrams rows)
+    tf_doc_ng = (session
+                    .query(
+                        NodeNgram.ngram_id,
+                        NodeNgram.node_id,
+                        func.sum(NodeNgram.weight).label("tf"),    # tf: summed weights
+                     )
+                    .filter(NodeNgram.node_id.in_(docids_subquery))
+                    .group_by(NodeNgram.node_id, NodeNgram.ngram_id)
+                    .all()
+                   )
+
+    # ---------------------------------------------------------
+    tfidfs = {}
+    log_tot_docs = log(total_docs)
+    for (ngram_id, node_id, tf) in tf_doc_ng:
+        log_nd = log_nd_lookup[ngram_id]
+        # tfidfs[node_id, ngram_id] = tf * log(total_docs/nd), written as a difference of logs
+        tfidfs[node_id, ngram_id] = tf * (log_tot_docs-log_nd)
+    # ---------------------------------------------------------
+
+    if overwrite_id:
+        the_id = overwrite_id
+    else:
+        # create the new TFIDF-CORPUS node (committed before its id is read)
+        tfidf_node = corpus.add_child()
+        tfidf_node.typename  = "TFIDF-CORPUS"
+        tfidf_node.name      = "tfidf-sims-corpus (in:%s)" % corpus.id
+        session.add(tfidf_node)
+        session.commit()
+        the_id = tfidf_node.id
+
+    # write one NodeNodeNgram row per score: node1_id = tfidf node, node2_id = doc, ngram_id = ngram
+    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
+    bulk_insert(
+        NodeNodeNgram,
+        ('node1_id', 'node2_id','ngram_id', 'score'),
+        ((the_id,    node_id,    ngram_id,   tfidfs[node_id,ngram_id]) for (node_id, ngram_id) in tfidfs)
+    )
+
+    return the_id