Commit 6c438c85 authored by delanoe's avatar delanoe

Merge branch 'romain-refactoring' into unstable

parents f236759c e52afd97
......@@ -8,18 +8,19 @@ import re
LISTTYPES = {
'DOCUMENT' : WeightedList,
'GROUPLIST' : Translations,
'GROUPLIST' : Translations, # todo remove "LIST" from name
'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList,
'SPECIFICITY' : WeightedList,
'OCCURRENCES' : WeightedContextIndex,
'OCCURRENCES' : WeightedIndex, # todo replace by WeightedList
'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedContextIndex,
'TFIDF-GLOBAL' : WeightedContextIndex,
'TFIDF-CORPUS' : WeightedIndex, # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
'TFIDF-GLOBAL' : WeightedIndex, # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
}
NODETYPES = [
# TODO separate id not array index, read by models.node
None,
# documents hierarchy
'USER', # 1
......@@ -40,6 +41,7 @@ NODETYPES = [
'TFIDF-GLOBAL', # 14
# docs subset
'FAVORITES' # 15
# TODO add ti RANK
]
INDEXED_HYPERDATA = {
......
......@@ -2,7 +2,7 @@
"""
__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedContextIndex']
__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedIndex']
from gargantext.util.db import session, bulk_insert
......@@ -165,15 +165,18 @@ class Translations(_BaseClass):
)
class WeightedContextIndex(_BaseClass):
class WeightedIndex(_BaseClass):
"""
associated model : NodeNodeNgram
associated columns : node1_id | node2_id | ngram_id | score (float)
^^^^
reserved for this
object's id
Tensor representing a contextual index or registry
(matrix of weighted ngrams *per* doc *per* context)
Matrix representing a weighted word index across docs or small context nodes
(matrix of weighted ngrams *per* doc)
Exemple : tfidf by corpus
Exemple : tfidf within a corpus
"""
def __init__(self, source=None):
self.items = defaultdict(float)
......@@ -182,7 +185,7 @@ class WeightedContextIndex(_BaseClass):
# ?TODO rename WeightedWordmatrix
class WeightedMatrix(_BaseClass):
def __init__(self, source=None):
......@@ -294,7 +297,7 @@ class WeightedMatrix(_BaseClass):
result.items[key1, key2] = value / sqrt(other.items[key1] * other.items[key2])
return result
# ?TODO rename Wordlist
class UnweightedList(_BaseClass):
def __init__(self, source=None):
......@@ -399,6 +402,7 @@ class UnweightedList(_BaseClass):
)
# ?TODO rename WeightedWordlist
class WeightedList(_BaseClass):
def __init__(self, source=None):
......
......@@ -111,20 +111,16 @@ def parse_extract_indexhyperdata(corpus):
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occs(corpus)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# ------------
# -> write local tfidf similarities to Node and NodeNodeNgram
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# -> write occurrences to Node and NodeNodeNgram # (todo: NodeNgram)
occ_id = compute_occs(corpus, groupings_id = group_id)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
# -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
tirank_id = compute_ti_ranking(corpus,
count_scope="global",
termset_scope="local")
print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id))
groupings_id = group_id,
count_scope="global")
print('CORPUS #%d: [%s] new ti ranking node #%i' % (corpus.id, t(), tirank_id))
# -> mainlist: filter + write (to Node and NodeNgram)
mainlist_id = do_mainlist(corpus,
......@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus):
stoplist_id = stop_id)
print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
# -> write local tfidf similarities to Node and NodeNodeNgram
# TODO only on mainlist
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# => used for doc <=> ngram association
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus, cooc_id=cooc_id)
spec_id = compute_specificity(corpus, cooc_id=cooc_id
# ,groupings_id = group_id
)
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram)
......
......@@ -65,6 +65,9 @@ def do_mainlist(corpus,
ordered_filtered_tfidf = (session
.query(NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == ranking_scores_id)
# NOT IN but speed theoretically ok here
# see http://sqlperformance.com/2012/12/t-sql-queries/left-anti-semi-join
# but http://stackoverflow.com/questions/2246772/whats-the-difference-between-not-exists-vs-not-in-vs-left-join-where-is-null/2246793#2246793
.filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
.order_by(desc(NodeNodeNgram.score))
)
......
This diff is collapsed.
This diff is collapsed.
......@@ -2042,6 +2042,29 @@ function AfterAjax(sourceUrl) {
// console.log(JSON.stringify(NGrams))
// -------------------------------------------------------------------
// ----------------------------------------- MAPLIST
// keepstateId = 1
keepstateId = System[0]["statesD"]["keep"]
if( Object.keys(NGrams["map"]).length>0 ) {
for(var ngram_id in NGrams["map"]) {
myNgramInfo = NGrams["main"].ngrams[ngram_id]
// initialize state of maplist items
myNgramInfo["state"] = keepstateId ;
}
}
// ----------------------------------------- STOPLIST
// delstateId = 2
delstateId = System[0]["statesD"]["delete"]
if( Object.keys(NGrams["stop"]).length>0 ) {
for(var ngram_id in NGrams["stop"]) {
console.log('stopping ' + ngram_id)
myNgramInfo = NGrams["main"].ngrams[ngram_id]
// initialize state of stoplist items
myNgramInfo["state"] = delstateId ;
}
}
// Deleting subforms from the ngrams-table, clean start baby!
if( Object.keys(NGrams["group"].links).length>0 ) {
......@@ -2056,10 +2079,6 @@ function AfterAjax(sourceUrl) {
}
}
// debug:
// console.log('~~~~~~~~~~~~~> (sub) _forms')
// console.log( _forms )
// ------------------------------------------- MAINLIST
// ngrams_data_ will update NGrams.main.ngrams (with subforms removed)
var ngrams_data_ = {}
......@@ -2093,27 +2112,6 @@ function AfterAjax(sourceUrl) {
// console.log( NGrams["main"] )
// ----------------------------------------- MAPLIST
if( Object.keys(NGrams["map"]).length>0 ) {
for(var ngram_id in NGrams["main"].ngrams) {
myNgram = NGrams["main"].ngrams[ngram_id]
if(NGrams["map"][ngram_id]) {
// keepstateId = 1
keepstateId = System[0]["statesD"]["keep"]
// initialize state of maplist items
myNgram["state"] = keepstateId ;
}
else if (NGrams["stop"][ngram_id]) {
// delstateId = 2
delstateId = System[0]["statesD"]["delete"]
// initialize state of stoplist items
myNgram["state"] = delstateId ;
}
}
}
// Building the Score-Selector //NGrams["scores"]
var FirstScore = NGrams["main"].scores.initial
// TODO scores_div
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment