fixed local tfidf to compute (doc, ngram) similarities and NOT cumulated...

fixed local tfidf to compute (doc, ngram) similarities and NOT cumulated scores per ngram (TODO: discuss parallel changes for global tfidf)

fixed local tfidf to compute (doc, ngram) similarities and NOT cumulated...
fixed local tfidf to compute (doc, ngram) similarities and NOT cumulated scores per ngram (TODO: discuss parallel changes for global tfidf)
6341dc12 · Romain Loth · 87a27ff0 · 6341dc12 · 6341dc12
Commit 6341dc12 authored Mar 29, 2016 by Romain Loth
Hide whitespace changes
Inline Side-by-side

Showing with 93 additions and 10 deletions

__init__.py gargantext/util/toolchain/__init__.py +5 -5

metric_tfidf.py gargantext/util/toolchain/metric_tfidf.py +88 -5

No files found.
--- a/gargantext/util/toolchain/__init__.py
+++ b/gargantext/util/toolchain/__init__.py
@@ -3,7 +3,7 @@ from .ngrams_extraction import extract_ngrams

 # in usual run order
 from .list_stop           import do_stoplist
-from .metric_tfidf        import compute_occs, compute_tfidf
+from .metric_tfidf        import compute_occs, compute_tfidf_local, compute_cumulated_tfidf
 from .list_main           import do_mainlist
 from .ngram_coocs         import compute_coocs
 from .metric_specificity  import compute_specificity
@@ -54,12 +54,12 @@ def parse_extract(corpus):
    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))

    # ------------
-    # -> write local tfidf to Node and NodeNodeNgram
-    ltfidf_id = compute_tfidf(corpus, scope="local")
+    # -> write local tfidf similarities to Node and NodeNodeNgram
+    ltfidf_id = compute_tfidf_local(corpus)
    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))

-    # -> write global tfidf to Node and NodeNodeNgram
-    gtfidf_id = compute_tfidf(corpus, scope="global")
+    # -> write global and cumulated tfidf to Node and NodeNodeNgram
+    gtfidf_id = compute_cumulated_tfidf(corpus, scope="global")
    print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))

    # -> mainlist: filter + write (to Node and NodeNgram)

--- a/gargantext/util/toolchain/metric_tfidf.py
+++ b/gargantext/util/toolchain/metric_tfidf.py
@@ -18,6 +18,8 @@ from math                import log

 def compute_occs(corpus, overwrite_id = None):
    """
+    # TODO check if cumulated occs correspond to app's use cases and intention
+
    Calculates sum of occs per ngram within corpus
    (used as info in the ngrams table view)

@@ -78,9 +80,11 @@ def compute_occs(corpus, overwrite_id = None):
    return the_id


-def compute_tfidf(corpus, scope="local", overwrite_id=None):
+def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
    """
-    Calculates tfidf within the current corpus
+    # TODO check if cumulated tfs correspond to app's use cases and intention
+
+    Calculates tfidf ranking (cumulated tfidf) within the given scope

    Parameters:
      - the corpus itself
@@ -150,12 +154,12 @@ def compute_tfidf(corpus, scope="local", overwrite_id=None):
    else:
        # create the new TFIDF-XXXX node
        tfidf_nd = corpus.add_child()
-        if scope == "local":
+        if scope == "local":            # TODO discuss use and find new typename
            tfidf_nd.typename  = "TFIDF-CORPUS"
-            tfidf_nd.name      = "tfidf-c (in:%s)" % corpus.id
+            tfidf_nd.name      = "tfidf-cumul-corpus (in:%s)" % corpus.id
        elif scope == "global":
            tfidf_nd.typename  = "TFIDF-GLOBAL"
-            tfidf_nd.name      = "tfidf-g (in type:%s)" % this_source_type
+            tfidf_nd.name      = "tfidf-cumul-global (in type:%s)" % this_source_type
        session.add(tfidf_nd)
        session.commit()
        the_id = tfidf_nd.id
@@ -169,3 +173,82 @@ def compute_tfidf(corpus, scope="local", overwrite_id=None):
    )

    return the_id
+
+
+
+def compute_tfidf_local(corpus, overwrite_id=None):
+    """
+    Calculates tfidf similarity of each (doc, ngram) couple, within the current corpus
+
+    For each couple the score is   tf * (log(N) - log(nd))   where:
+      - tf: sum of the NodeNgram weights of this ngram in this doc
+      - N : number of DOCUMENT nodes that are children of the corpus
+      - nd: number of NodeNgram rows of these docs carrying the ngram
+            (presumably one row per (doc, ngram), ie "n docs with term")
+
+    Parameters:
+      - the corpus itself
+      - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
+                   (the Node is reused and its previous NodeNodeNgram rows are replaced)
+
+    Returns:
+      - the id of the node under which the scores were written
+        (NodeNodeNgram rows: node1_id = this id, node2_id = doc id, ngram_id, score)
+    """
+
+    # All docs of this corpus
+    docids_subquery = (session
+                        .query(Node.id)
+                        .filter(Node.parent_id == corpus.id)
+                        .filter(Node.typename == "DOCUMENT")
+                        .subquery()
+                       )
+
+    # N
+    total_docs = session.query(docids_subquery).count()
+
+    # number of docs with given term (number of rows = M ngrams)
+    n_docswith_ng = (session
+                    .query(
+                        NodeNgram.ngram_id,
+                        func.count(NodeNgram.node_id).label("nd")  # nd: n docs with term
+                     )
+                    .filter(NodeNgram.node_id.in_(docids_subquery))
+                    .group_by(NodeNgram.ngram_id)
+                    .all()
+                   )
+
+    # { ngram_id => log(nd) }
+    log_nd_lookup = {row.ngram_id : log(row.nd) for row in n_docswith_ng}
+
+    # tf for each couple that actually occurs
+    # (number of rows = at most N docs X M ngrams)
+    tf_doc_ng = (session
+                    .query(
+                        NodeNgram.ngram_id,
+                        NodeNgram.node_id,
+                        func.sum(NodeNgram.weight).label("tf"),    # tf: occurrences
+                     )
+                    .filter(NodeNgram.node_id.in_(docids_subquery))
+                    .group_by(NodeNgram.node_id, NodeNgram.ngram_id)
+                    .all()
+                   )
+
+    # ---------------------------------------------------------
+    tfidfs = {}
+    # log(0) is a math domain error: with an empty corpus there is no couple
+    # to score anyway, so the value used here is never read in the loop
+    log_tot_docs = log(total_docs) if total_docs else 0.0
+    for (ngram_id, node_id, tf) in tf_doc_ng:
+        log_nd = log_nd_lookup[ngram_id]
+        # tfidfs[node_id, ngram_id] = tf * log(total_docs/nd)
+        tfidfs[node_id, ngram_id] = tf * (log_tot_docs-log_nd)
+    # ---------------------------------------------------------
+
+    if overwrite_id:
+        the_id = overwrite_id
+        # remove the scores of the previous run: otherwise the rows inserted
+        # below would add up to (or collide with) the old ones
+        (session
+            .query(NodeNodeNgram)
+            .filter(NodeNodeNgram.node1_id == the_id)
+            .delete(synchronize_session=False)
+        )
+        session.commit()
+    else:
+        # create the new TFIDF-CORPUS node
+        tfidf_node = corpus.add_child()
+        tfidf_node.typename  = "TFIDF-CORPUS"
+        tfidf_node.name      = "tfidf-sims-corpus (in:%s)" % corpus.id
+        session.add(tfidf_node)
+        session.commit()
+        the_id = tfidf_node.id
+
+    # reflect that in NodeNodeNgrams
+    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
+    bulk_insert(
+        NodeNodeNgram,
+        ('node1_id', 'node2_id','ngram_id', 'score'),
+        ((the_id,    node_id,    ngram_id,   score) for ((node_id, ngram_id), score) in tfidfs.items())
+    )
+
+    return the_id