humanities / gargantext, commit 6c438c85
Authored May 21, 2016 by delanoe

    Merge branch 'romain-refactoring' into unstable

Parents: f236759c, e52afd97
Showing 7 changed files with 363 additions and 198 deletions (+363, -198)
Changed files:

    gargantext/constants.py                               +6    -4
    gargantext/util/lists.py                              +11   -7
    gargantext/util/toolchain/__init__.py                 +17   -13
    gargantext/util/toolchain/list_main.py                +3    -0
    gargantext/util/toolchain/metric_tfidf.py             +142  -52
    gargantext/util/toolchain/ngram_coocs.py              +161  -97
    static/lib/gargantext/NGrams_dyna_chart_and_table.js  +23   -25
gargantext/constants.py

@@ -8,18 +8,19 @@ import re
 LISTTYPES = {
     'DOCUMENT'     : WeightedList,
-    'GROUPLIST'    : Translations,
+    'GROUPLIST'    : Translations,     # todo remove "LIST" from name
     'STOPLIST'     : UnweightedList,
     'MAINLIST'     : UnweightedList,
     'MAPLIST'      : UnweightedList,
     'SPECIFICITY'  : WeightedList,
-    'OCCURRENCES'  : WeightedContextIndex,
+    'OCCURRENCES'  : WeightedIndex,    # todo replace by WeightedList
     'COOCCURRENCES': WeightedMatrix,
-    'TFIDF-CORPUS' : WeightedContextIndex,
-    'TFIDF-GLOBAL' : WeightedContextIndex,
+    'TFIDF-CORPUS' : WeightedIndex,    # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
+    'TFIDF-GLOBAL' : WeightedIndex,    # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
 }

 NODETYPES = [
     None,   # TODO separate id not array index, read by models.node
     # documents hierarchy
     'USER',                # 1
...
@@ -40,6 +41,7 @@ NODETYPES = [
     'TFIDF-GLOBAL',        # 14
     # docs subset
     'FAVORITES',           # 15
+    # TODO add ti RANK
 ]

 INDEXED_HYPERDATA = {
...
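Reading note: LISTTYPES is a typename-to-container dispatch table, so code elsewhere can pick the right list class from a node's typename. A minimal sketch of that dispatch with hypothetical stand-in classes (not the real gargantext.util.lists implementations):

    # hypothetical stand-ins, just to show the dispatch pattern
    class UnweightedList(set): pass
    class WeightedList(dict): pass
    class WeightedIndex(dict): pass

    LISTTYPES = {
        'STOPLIST'    : UnweightedList,
        'SPECIFICITY' : WeightedList,
        'OCCURRENCES' : WeightedIndex,
    }

    def load_list(typename, rows):
        # instantiate the container registered for this node typename
        return LISTTYPES[typename](rows)

    print(type(load_list('STOPLIST', [1, 2, 3])).__name__)    # UnweightedList
    print(load_list('OCCURRENCES', {7: 2.0, 8: 5.0}))         # {7: 2.0, 8: 5.0}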
gargantext/util/lists.py

@@ -2,7 +2,7 @@
 """
-__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedContextIndex']
+__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedIndex']

 from gargantext.util.db import session, bulk_insert
...
@@ -165,15 +165,18 @@ class Translations(_BaseClass):
         )

-class WeightedContextIndex(_BaseClass):
+class WeightedIndex(_BaseClass):
     """
     associated model : NodeNodeNgram
     associated columns : node1_id | node2_id | ngram_id | score (float)
                          ^^^^ reserved for this object's id

-    Tensor representing a contextual index or registry
-    (matrix of weighted ngrams *per* doc *per* context)
+    Matrix representing a weighted word index across docs or small context nodes
+    (matrix of weighted ngrams *per* doc)

-    Example : tfidf by corpus
+    Example : tfidf within a corpus
     """
     def __init__(self, source=None):
         self.items = defaultdict(float)
...
@@ -182,7 +185,7 @@ class WeightedContextIndex(_BaseClass):

-
+# ?TODO rename WeightedWordmatrix
 class WeightedMatrix(_BaseClass):
     def __init__(self, source=None):
...
@@ -294,7 +297,7 @@ class WeightedMatrix(_BaseClass):
             result.items[key1, key2] = value / sqrt(other.items[key1] * other.items[key2])
         return result

-
+# ?TODO rename Wordlist
 class UnweightedList(_BaseClass):
     def __init__(self, source=None):
...
@@ -399,6 +402,7 @@ class UnweightedList(_BaseClass):
         )

+# ?TODO rename WeightedWordlist
 class WeightedList(_BaseClass):
     def __init__(self, source=None):
...
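Reviewer note: the renamed class keeps the NodeNodeNgram row shape its docstring describes. A minimal pure-Python sketch of that shape (a stand-in, not the project's actual class; the rows() helper is hypothetical):

    from collections import defaultdict

    class WeightedIndexSketch:
        """Rows map to NodeNodeNgram columns: node1_id | node2_id | ngram_id | score.

        node1_id is reserved for this object's own node id, so in memory only
        (node2_id, i.e. doc, ngram_id) -> float weight is needed.
        """
        def __init__(self, source=None):
            self.items = defaultdict(float)   # (doc_node_id, ngram_id) -> score
            if source is not None:
                for doc_node_id, ngram_id, score in source:
                    self.items[doc_node_id, ngram_id] = score

        def rows(self, own_node_id):
            # the 4-tuples a bulk_insert into NodeNodeNgram would receive
            return ((own_node_id, doc, ng, w) for (doc, ng), w in self.items.items())

    idx = WeightedIndexSketch([(101, 7, 0.5), (102, 7, 1.25)])
    print(list(idx.rows(own_node_id=999)))
    # [(999, 101, 7, 0.5), (999, 102, 7, 1.25)]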
gargantext/util/toolchain/__init__.py

@@ -111,20 +111,16 @@ def parse_extract_indexhyperdata(corpus):
     group_id = compute_groups(corpus, stoplist_id=None)
     print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))

-    # -> write occurrences to Node and NodeNodeNgram  # possible: factorize with tfidf
-    occ_id = compute_occs(corpus)
-    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
-
-    # ------------
-    # -> write local tfidf similarities to Node and NodeNodeNgram
-    ltfidf_id = compute_tfidf_local(corpus)
-    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
+    # -> write occurrences to Node and NodeNodeNgram  # (todo: NodeNgram)
+    occ_id = compute_occs(corpus, groupings_id=group_id)
+    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))

-    # -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
-    tirank_id = compute_ti_ranking(corpus, count_scope="global", termset_scope="local")
-    print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id))
+    # -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
+    tirank_id = compute_ti_ranking(corpus, groupings_id=group_id, count_scope="global")
+    print('CORPUS #%d: [%s] new ti ranking node #%i' % (corpus.id, t(), tirank_id))

     # -> mainlist: filter + write (to Node and NodeNgram)
     mainlist_id = do_mainlist(corpus,
...
@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus):
                               stoplist_id=stop_id)
     print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))

+    # -> write local tfidf similarities to Node and NodeNodeNgram
+    # TODO only on mainlist
+    ltfidf_id = compute_tfidf_local(corpus)
+    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
+    # => used for doc <=> ngram association

     # ------------
     # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
-    cooc_id = compute_coocs(corpus, mainlist_id=mainlist_id)
+    cooc_id = compute_coocs(corpus, mainlist_id=mainlist_id, groupings_id=group_id)
     print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))

     # -> specificity: compute + write (=> NodeNodeNgram)
-    spec_id = compute_specificity(corpus, cooc_id=cooc_id)
+    spec_id = compute_specificity(corpus, cooc_id=cooc_id
+                                  # ,groupings_id = group_id
+                                  )
     print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))

     # ?? maplist: compute + write (to Node and NodeNgram)
...
gargantext/util/toolchain/list_main.py

@@ -65,6 +65,9 @@ def do_mainlist(corpus,
     ordered_filtered_tfidf = (session
         .query(NodeNodeNgram.ngram_id)
        .filter(NodeNodeNgram.node1_id == ranking_scores_id)
+       # NOT IN but speed theoretically ok here
+       # see http://sqlperformance.com/2012/12/t-sql-queries/left-anti-semi-join
+       # but http://stackoverflow.com/questions/2246772/whats-the-difference-between-not-exists-vs-not-in-vs-left-join-where-is-null/2246793#2246793
        .filter(~NodeNodeNgram.ngram_id.in_(stopterms_subquery))
        .order_by(desc(NodeNodeNgram.score))
     )
...
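The three added comment lines flag a classic anti-join choice. Both NOT IN and LEFT JOIN ... IS NULL keep the ranked ngrams absent from the stop terms; a toy sqlite3 comparison of the two forms (invented tables and values, not gargantext's schema):

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.executescript("""
        CREATE TABLE ranked (ngram_id INT, score REAL);
        CREATE TABLE stop   (ngram_id INT);
        INSERT INTO ranked VALUES (1, 9.0), (2, 5.0), (3, 1.0);
        INSERT INTO stop   VALUES (2);
    """)

    not_in = db.execute(
        "SELECT ngram_id FROM ranked"
        " WHERE ngram_id NOT IN (SELECT ngram_id FROM stop)"
        " ORDER BY score DESC").fetchall()

    anti_join = db.execute(
        "SELECT r.ngram_id FROM ranked r"
        " LEFT JOIN stop s ON s.ngram_id = r.ngram_id"
        " WHERE s.ngram_id IS NULL"
        " ORDER BY r.score DESC").fetchall()

    assert not_in == anti_join == [(1,), (3,)]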
gargantext/util/toolchain/metric_tfidf.py

@@ -9,13 +9,15 @@ FIXME: "having the same source" means we need to select inside hyperdata
 """
 from gargantext.models        import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
 from gargantext.util.db_cache import cache
 from gargantext.util.db       import session, bulk_insert, aliased, \
                                      func # = sqlalchemy.func like sum() or count()
+from sqlalchemy.sql.expression import case  # for choice if ngram has mainform or not
+from sqlalchemy               import distinct  # for list of unique ngram_ids within a corpus
 from math                     import log
+from re                       import match
 # £TODO
-# from gargantext.util.lists import WeightedContextIndex
+# from gargantext.util.lists import WeightedIndex

 def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
...
@@ -32,7 +34,7 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
     Parameters:
        - overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
                        (the Node and its previous NodeNodeNgram rows will be replaced)
-       - groupings_id: optional id of a GROUPLIST node for this corpus
+       - groupings_id: optional id of a GROUPLIST node for these ngrams
                        IF absent the occurrences are the sums for each ngram
                        IF present they're the sums for each ngram's mainform
     """
...
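The IF absent / IF present contrast in this docstring is the core of the commit: with a GROUPLIST, each subform's occurrences are credited to its mainform. A pure-Python sketch of that semantics with invented ids (the real work happens in SQL, as the hunks below show):

    from collections import defaultdict

    # toy index rows: (doc_node_id, ngram_id, weight)
    index = [(1, 7, 2.0), (1, 11, 1.0), (2, 11, 3.0), (2, 12, 1.0)]
    # toy GROUPLIST: subform -> mainform (here 11 and 12 group under 7)
    syno = {11: 7, 12: 7}

    def occs(index, syno=None):
        totals = defaultdict(float)
        for doc_id, ngram_id, weight in index:
            counted = syno.get(ngram_id, ngram_id) if syno else ngram_id
            totals[counted] += weight
        return dict(totals)

    print(occs(index))         # groupings_id absent:  {7: 2.0, 11: 4.0, 12: 1.0}
    print(occs(index, syno))   # groupings_id present: {7: 7.0}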
@@ -115,7 +117,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
     if overwrite_id:
         # overwrite pre-existing id
         the_id = overwrite_id
+        # occnode = cache.Node[overwrite_id]
         session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
         session.commit()
     else:
         # create the new OCCURRENCES node
         occnode = corpus.add_child(
...
@@ -126,8 +129,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
         session.commit()
         the_id = occnode.id

-    # reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
-    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
+    # £TODO make it NodeNgram instead NodeNodeNgram ! and rebase :/
+    #       (idem ti_ranking)
     bulk_insert(
         NodeNodeNgram,
         ('node1_id' , 'node2_id', 'ngram_id', 'score'),
...
@@ -137,14 +140,26 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
     return the_id

-def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overwrite_id=None):
+def compute_ti_ranking(corpus, groupings_id=None,
+                       count_scope="local", termset_scope="local",
+                       overwrite_id=None):
     """
     # TODO check if cumulated tfs correspond to app's use cases and intention

-    Calculates tfidf ranking (cumulated tfidf for each ngram) within given scope
+    Calculates tfidf ranking within given scope
+                   ----------
+                       |
+             via weighting of
+             cumulated tfidf  ---------  Sum{j}(tf_ij) * ln(N / |{docs d_j : ng_i in d_j}|)
+                                              per ngram ng_i
+                                          (or per mainform ng_i' if groups)
+                                              across some docs d_j

     Parameters:
-      - the corpus itself
+      - the corpus itself (or corpus_id)
+      - groupings_id: optional id of a GROUPLIST node for these ngrams
+                      IF absent the ti weights are the sums for each ngram
+                      IF present they're the sums for each ngram's mainform
       - count_scope: {"local" or "global"}
          - local  <=> frequencies counted in the current corpus
          - global <=> frequencies counted in all corpora of this type
...
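A worked instance of this ranking formula, using toy (counted_ngform, tf, nd) rows like those the main query below returns (numbers invented):

    from math import log

    total_docs = 100                        # N
    log_tot_docs = log(total_docs)

    # rows as (counted_ngform, tf, nd): summed term freq and n docs with term
    tf_nd = [(7, 40.0, 10), (8, 40.0, 80), (9, 3.0, 1)]

    tfidfsum = {}
    for ngram_i, tf_i, nd_i in tf_nd:
        # tf_i * log(N / nd_i), written with a precomputed log as in the code
        tfidfsum[ngram_i] = tf_i * (log_tot_docs - log(nd_i))

    for ng, score in sorted(tfidfsum.items(), key=lambda kv: -kv[1]):
        print(ng, round(score, 2))
    # ngram 7 outranks ngram 8 despite equal tf: it is concentrated in fewer docs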
@@ -153,43 +168,94 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
       - termset_scope: {"local" or "global"}
          - local  <=> output list of terms limited to the current corpus
                       (SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
-         - global <=> output list of terms from all corpora of this type
+         - global <=> output list of terms found in global doc scope
                       !!!! (many more terms)
-      - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
+      - overwrite_id: optional id of a pre-existing XXXX node for this corpus
                       (the Node and its previous NodeNodeNgram rows will be replaced)
     """
-    # validate string params
-    if count_scope not in ["local","global"]:
-        raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
-    if termset_scope not in ["local","global"]:
-        raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
-    if count_scope == "local" and termset_scope == "global":
-        raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")
+    # get corpus
+    if type(corpus) == int:
+        corpus_id = corpus
+        corpus = cache.Node[corpus_id]
+    elif type(corpus) == str and match(r'\d+$', corpus):
+        corpus_id = int(corpus)
+        corpus = cache.Node[corpus_id]
+    else:
+        # assuming Node class
+        corpus_id = corpus.id
+
+    # prepare sqla mainform vs ngram selector
+    ngform_i = None
+    if not groupings_id:
+        ngform_i = NodeNgram.ngram_id
+    else:
+        # prepare translations
+        syno = (session.query(NodeNgramNgram.ngram1_id,
+                              NodeNgramNgram.ngram2_id)
+                 .filter(NodeNgramNgram.node_id == groupings_id)
+                 .subquery()
+               )
+        # cf. detailed comment in compute_occs() + todo factorize
+        ngform_i = case([
+                         (syno.c.ngram1_id != None, syno.c.ngram1_id),
+                         (syno.c.ngram1_id == None, NodeNgram.ngram_id)
+                         #  condition                value
+                        ])

     # MAIN QUERY SKELETON
     tf_nd_query = (session
-                    .query(NodeNgram.ngram_id,
+                    .query(
+                        # NodeNgram.ngram_id
+                        # or similar if grouping ngrams under their mainform:
+                        ngform_i.label("counted_ngform"),

                         # the tfidf elements
                         # ------------------
                         func.sum(NodeNgram.weight),    # tf: same as occurrences
                                                        # -----------------------
                         func.count(NodeNgram.node_id)  # nd: n docs with term
                                                        # --------------------
                        )
-                    .group_by(NodeNgram.ngram_id)
+                    .group_by("counted_ngform")

-                   # count_scope to specify in which doc nodes to count
-                   # -----------
+                   # optional *count_scope*: if we'll restrict the doc nodes
+                   # -------------
                    # .join(countdocs_subquery,
                    #       countdocs_subquery.c.id == NodeNgram.node_id)

-                   # termset_scope: if we'll restrict the ngrams
+                   # optional *termset_scope*: if we'll restrict the ngrams
                    # -------------
                    # .join(termset_subquery,
                    #       termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
+
+                   # optional translations to bring the subform's replacement
+                   # ------------
+                   # .outerjoin(syno,
+                   #            syno.c.ngram2_id == NodeNgram.ngram_id)
                   )

+    # validate string params
+    if count_scope not in ["local","global"]:
+        raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
+    if termset_scope not in ["local","global"]:
+        raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
+    if count_scope == "local" and termset_scope == "global":
+        raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")
+
+    # TUNING THE QUERY
+    if groupings_id:
+        tf_nd_query = tf_nd_query.outerjoin(syno,
+                                            syno.c.ngram2_id == NodeNgram.ngram_id)

     # local <=> within this corpus
     if count_scope == "local":
...
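The case([...]) construct above is a mainform fallback over the outer-joined synonym table: when no group matches, syno.c.ngram1_id comes back NULL and the row's own ngram_id is used, i.e. syno.get(ng, ng) in dict terms. A toy emulation of the resulting (counted_ngform, tf, nd) aggregation (invented rows, not the ORM query):

    from collections import defaultdict

    # toy NodeNgram rows: (node_id, i.e. doc, ngram_id, weight)
    rows = [(1, 7, 2.0), (1, 11, 1.0), (2, 11, 3.0), (3, 8, 1.0)]
    syno = {11: 7}                          # ngram2_id (subform) -> ngram1_id (mainform)

    tf = defaultdict(float)                 # func.sum(NodeNgram.weight)
    nd = defaultdict(int)                   # func.count(NodeNgram.node_id)
    for doc, ng, w in rows:
        counted_ngform = syno.get(ng, ng)   # the case(...) fallback
        tf[counted_ngform] += w
        nd[counted_ngform] += 1             # counts index rows, not distinct docs

    print(dict(tf))   # {7: 6.0, 8: 1.0}
    print(dict(nd))   # {7: 3, 8: 1}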
@@ -197,14 +263,14 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
         countdocs_subquery = (session
                     .query(Node.id)
                     .filter(Node.typename == "DOCUMENT")
-                    .filter(Node.parent_id == corpus.id)
+                    .filter(Node.parent_id == corpus_id)
                     .subquery()
                    )

-        # both scopes are the same: no need to independently restrict the ngrams
+        # no need to independently restrict the ngrams
         tf_nd_query = tf_nd_query.join(countdocs_subquery,
                                        countdocs_subquery.c.id == NodeNgram.node_id)
         # ---

     # global <=> within all corpora of this source
     elif count_scope == "global":
...
@@ -220,6 +286,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
                     # join on parent_id with selected corpora nodes
                     .join(CorpusNode, CorpusNode.id == Node.parent_id)
                     .filter(CorpusNode.typename == "CORPUS")
+                    # TODO index corpus_sourcetype in DB
                     .filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(this_source_type))
                     .subquery()
                    )
...
@@ -228,15 +295,19 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
         # both scopes are the same: no need to independently restrict the ngrams
         tf_nd_query = tf_nd_query.join(countdocs_subquery,
                                        countdocs_subquery.c.id == NodeNgram.node_id)
         # ---

     elif termset_scope == "local":
-        # All unique terms in the original corpus
+        # All unique terms...
         termset_subquery = (session
-                    .query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
+                    .query(
+                        distinct(NodeNgram.ngram_id).label("uniq_ngid")
+                     )
+                    # ... in the original corpus
                     .join(Node)
                     .filter(Node.typename == "DOCUMENT")
-                    .filter(Node.parent_id == corpus.id)
+                    .filter(Node.parent_id == corpus_id)
                     .subquery()
                    )
...
@@ -247,42 +318,59 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
                     .join(termset_subquery,
                           termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
                    )
         # ---

-    # N
+    # M
     total_docs = session.query(countdocs_subquery).count()
     log_tot_docs = log(total_docs)

     # result
     tf_nd = tf_nd_query.all()

-    # -------------------------------------------------
-    tfidfs = {}
-    for (ngram_id, tf, nd) in tf_nd:
-        # tfidfs[ngram_id] = tf * log(total_docs/nd)
-        tfidfs[ngram_id] = tf * (log_tot_docs - log(nd))
-    # -------------------------------------------------
+    # -------------- summation over each word i ------------
+    tfidfsum = {}
+    for (ngram_i, tf_i, nd_i) in tf_nd:
+        # tfidfsum[ngram_i] = tf_i * log(total_docs/nd_i)
+        tfidfsum[ngram_i] = tf_i * (log_tot_docs - log(nd_i))
+    # ------------------------------------------------------
+
+    # N for info
+    total_ngramforms = len(tfidfsum)

     if overwrite_id:
         the_id = overwrite_id
         session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
         session.commit()
     else:
-        # create the new TFIDF-XXXX node
-        tfidf_nd = corpus.add_child()
-        if count_scope == "local":
-            # TODO discuss use and find new typename
-            tfidf_nd.typename = "TFIDF-CORPUS"
-            tfidf_nd.name = "tfidf-cumul-corpus (in:%s)" % corpus.id
-        elif count_scope == "global":
-            tfidf_nd.typename = "TFIDF-GLOBAL"
-            tfidf_nd.name = "tfidf-cumul-global (in type:%s)" % this_source_type
-        session.add(tfidf_nd)
+        # create the new TFIDF-XXXX node to get an id
+        tir_nd = corpus.add_child()
+        if count_scope == "local":
+            tir_nd.typename = "TFIDF-CORPUS"
+            tir_nd.name = "ti rank (%i ngforms in corpus:%s)" % (
+                           total_ngramforms, corpus_id)
+        elif count_scope == "global":
+            tir_nd.typename = "TFIDF-GLOBAL"
+            tir_nd.name = "ti rank (%i ngforms %s in corpora of sourcetype:%s)" % (
+                           total_ngramforms,
+                           ("from corpus %i" % corpus_id) if (termset_scope == "local") else "",
+                           this_source_type)
+        session.add(tir_nd)
         session.commit()
-        the_id = tfidf_nd.id
+        the_id = tir_nd.id
+
+    # TODO 1 discuss use and find new typename
+    # TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
+    # TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
+    # TODO 4 requalify this here as a NodeNgram
+    # then TODO 5 use WeightedList.save() !

     # reflect that in NodeNodeNgrams
     # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
     bulk_insert(
         NodeNodeNgram,
         ('node1_id' , 'node2_id', 'ngram_id', 'score'),
-        ((the_id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
+        ((the_id, corpus_id, ng, tfidfsum[ng]) for ng in tfidfsum)
     )

     return the_id
...
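The bulk_insert at the end consumes a generator of tuples matching the listed columns, essentially a DB-API executemany. A self-contained sqlite3 sketch of the pattern (toy table name and values; bulk_insert itself is gargantext's helper and is not reproduced here):

    import sqlite3

    tfidfsum = {7: 92.1, 8: 8.9}          # ngram_id -> cumulated tfidf (toy values)
    the_id, corpus_id = 999, 42

    db = sqlite3.connect(":memory:")
    db.execute("CREATE TABLE nodes_nodes_ngrams"
               " (node1_id INT, node2_id INT, ngram_id INT, score REAL)")
    # same row shape as the bulk_insert call above: one row per scored ngram
    db.executemany("INSERT INTO nodes_nodes_ngrams VALUES (?, ?, ?, ?)",
                   ((the_id, corpus_id, ng, tfidfsum[ng]) for ng in tfidfsum))
    print(db.execute("SELECT * FROM nodes_nodes_ngrams").fetchall())
    # [(999, 42, 7, 92.1), (999, 42, 8, 8.9)]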
@@ -347,6 +435,8 @@ def compute_tfidf_local(corpus, overwrite_id=None):
     if overwrite_id:
         the_id = overwrite_id
+        session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
+        session.commit()
     else:
         # create the new TFIDF-CORPUS node
         tfidf_node = corpus.add_child()
...
@@ -357,7 +447,7 @@ def compute_tfidf_local(corpus, overwrite_id=None):
         the_id = tfidf_node.id

     # reflect that in NodeNodeNgrams
-    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
+    # £TODO replace bulk_insert by something like WeightedIndex.save()
     bulk_insert(
         NodeNodeNgram,
         ('node1_id' , 'node2_id', 'ngram_id', 'score'),
...
gargantext/util/toolchain/ngram_coocs.py

 from gargantext.models         import Node, NodeNgram, NodeNgramNgram, \
-                                      NodeHyperdata
+                                      NodeHyperdata, Ngram
 from gargantext.util.lists     import WeightedMatrix
 from gargantext.util.db        import session, aliased, func
 from gargantext.util.db_cache  import cache
 from gargantext.constants      import DEFAULT_COOC_THRESHOLD
 from datetime                  import datetime
+from sqlalchemy.sql.expression import case  # for choice if ngram has mainform or not

 def compute_coocs( corpus,
                    overwrite_id    = None,
                    threshold       = DEFAULT_COOC_THRESHOLD,
+                   groupings_id    = None,
                    mainlist_id     = None,
                    stoplist_id     = None,
                    start           = None,
...
@@ -41,9 +44,11 @@ def compute_coocs( corpus,
      - overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
                      (all hyperdata and previous NodeNgramNgram rows will be replaced)
      - threshold: on output cooc count (previously called hapax)
+     - groupings_id: optional synonym relations to add all subform counts
+                     with their mainform's counts
      - mainlist_id: mainlist to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
-                    (normally unnecessary if a mainlist is provided)
+                    (normally unnecessary if a mainlist is already provided)
      - start, end: provide one or both temporal limits to filter on doc date
                    NB the expected type of parameter value is datetime.datetime
                       (string is also possible but format must follow
...
@@ -56,25 +61,24 @@ def compute_coocs( corpus,
     basic idea for one doc
     ======================
     each pair of ngrams sharing same doc (node_id)
-        SELECT idx1.ngram_id, idx2.ngram_id
-        FROM nodes_ngrams AS idx1, nodes_ngrams AS idx2
+        SELECT idxa.ngram_id, idxb.ngram_id
+        FROM nodes_ngrams AS idxa, nodes_ngrams AS idxb
         ---------------------------------
-        WHERE idx1.node_id = idx2.node_id      <== that's cooc
+        WHERE idxa.node_id = idxb.node_id      <== that's cooc
         ---------------------------------
-        AND idx1.ngram_id <> idx2.ngram_id
-        AND idx1.node_id = MY_DOC ;
+        AND idxa.ngram_id <> idxb.ngram_id
+        AND idxa.node_id = MY_DOC ;

     on entire corpus
     =================
     coocs for each doc :
       - each given pair like (termA, termB) will likely appear several times
-        => we do GROUP BY (x1.ngram_id, x2.ngram_id)
+        => we do GROUP BY (Xindex.ngram_id, Yindex.ngram_id)
       - we count unique appearances of the pair (cooc)
     """
+    # - TODO add grouped element's values in grouping 'chief ngram'
     # - TODO cvalue_id: allow a metric as additional input filter
     # - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
     # - TODO weighted: if False normal cooc to be saved as result
...
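The docstring's SQL self-join has a direct pure-Python analogue: for each doc, emit every ordered pair of distinct ngrams it contains, then count each pair across docs. A toy sketch (invented doc contents):

    from collections import Counter
    from itertools import permutations

    docs = {                       # doc node_id -> set of ngram_ids
        1: {7, 8, 9},
        2: {7, 8},
        3: {8, 9},
    }

    coocs = Counter()
    for doc_ngrams in docs.values():
        # idxa.ngram_id <> idxb.ngram_id, same node_id: that's cooc
        for a, b in permutations(doc_ngrams, 2):
            coocs[a, b] += 1

    print(coocs[7, 8])   # 2 (docs 1 and 2)
    print(coocs[8, 9])   # 2 (docs 1 and 3)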
@@ -85,130 +89,190 @@ def compute_coocs( corpus,
     # 1,859,408 rows for the simple cooc query
     #    71,134 rows when restricting to ngrams with occ (weight) > 1

-    # docs of our corpus
-    docids_subquery = (session
-                        .query(Node.id)
-                        .filter(Node.parent_id == corpus.id)
-                        .filter(Node.typename == "DOCUMENT")
-                        .subquery()
-                      )
-
     # 2 x the occurrence index table
-    x1 = aliased(NodeNgram)
-    x2 = aliased(NodeNgram)
+    Xindex = aliased(NodeNgram)
+    Yindex = aliased(NodeNgram)
+
+    # for debug (1/4)
+    # Xngram = aliased(Ngram)
+    # Yngram = aliased(Ngram)

-    # cooccurrences columns definition
-    ucooc = func.count(x1.ngram_id).label("ucooc")
-
-    # 1) MAIN DB QUERY
-    coocs_query = (
-        session.query(x1.ngram_id, x2.ngram_id, ucooc)
-        .join(Node, Node.id == x1.node_id)    # <- b/c within corpus
-        .join(x2, x1.node_id == Node.id)      # <- b/c within corpus
-        .filter(x1.node_id == x2.node_id)     # <- by definition of cooc
-        .filter(x1.ngram_id != x2.ngram_id)   # <- b/c not with itself
-        .group_by(x1.ngram_id, x2.ngram_id)
-        .order_by(ucooc)
-    )
+    # 1) prepare definition of counted forms
+    if not groupings_id:
+        # no groupings => the counted forms are the ngrams
+        Xindex_ngform_id = Xindex.ngram_id
+        Yindex_ngform_id = Yindex.ngram_id
+
+    # groupings: cf. detailed comment in compute_occs() + todo factorize
+    else:
+        # prepare translations
+        Xsyno = (session.query(NodeNgramNgram.ngram1_id,
+                               NodeNgramNgram.ngram2_id)
+                  .filter(NodeNgramNgram.node_id == groupings_id)
+                  .subquery()
+                )
+        # further use as anon tables prevents doing Ysyno = Xsyno
+        Ysyno = (session.query(NodeNgramNgram.ngram1_id,
+                               NodeNgramNgram.ngram2_id)
+                  .filter(NodeNgramNgram.node_id == groupings_id)
+                  .subquery()
+                )
+
+        # groupings => define the counted form depending on the existence of a synonym
+        Xindex_ngform_id = case([
+                                 (Xsyno.c.ngram1_id != None, Xsyno.c.ngram1_id),
+                                 (Xsyno.c.ngram1_id == None, Xindex.ngram_id)
+                                 #  condition                 value
+                                ])
+        Yindex_ngform_id = case([
+                                 (Ysyno.c.ngram1_id != None, Ysyno.c.ngram1_id),
+                                 (Ysyno.c.ngram1_id == None, Yindex.ngram_id)
+                                ])
+    # ---
+
+    # 2) BASE DB QUERY
+
+    # cooccurrences columns definition ----------------
+    ucooc = func.count(Xindex_ngform_id).label("ucooc")
+    # NB could be X or Y in this line
+    #    (we're counting grouped rows and just happen to do it on this column)
+
+    base_query = (
+        session.query(
+            Xindex_ngform_id,
+            Yindex_ngform_id,
+            ucooc
+            # for debug (2/4)
+            #, Xngram.terms.label("w_x")
+            #, Yngram.terms.label("w_y")
+        )
+        .join(Yindex, Xindex.node_id == Yindex.node_id)    # <- by definition of cooc
+        .join(Node, Node.id == Xindex.node_id)             # <- b/c within corpus
+        .filter(Node.parent_id == corpus.id)               # <- b/c within corpus
+        .filter(Node.typename == "DOCUMENT")               # <- b/c within corpus
+        .filter(Xindex_ngform_id != Yindex_ngform_id)      # <- b/c not with itself
+    )
+
+    # outerjoin the synonyms if needed
+    if groupings_id:
+        base_query = (base_query
+                       .outerjoin(Xsyno,   # <- synonyms for Xindex.ngrams
+                                  Xsyno.c.ngram2_id == Xindex.ngram_id)
+                       .outerjoin(Ysyno,   # <- synonyms for Yindex.ngrams
+                                  Ysyno.c.ngram2_id == Yindex.ngram_id)
+                     )
+
+    # 3) counting clause in any case
+    coocs_query = (base_query
+                    .group_by(
+                        Xindex_ngform_id, Yindex_ngform_id   # <- what we're counting
+                        # for debug (3/4)
+                        #,"w_x", "w_y"
+                    )
+                    # for debug (4/4)
+                    #.join(Xngram, Xngram.id == Xindex_ngform_id)
+                    #.join(Yngram, Yngram.id == Yindex_ngform_id)
+                  )

-    # 2) INPUT FILTERS (reduce N before O(N²))
+    # 4) INPUT FILTERS (reduce N before O(N²))
     if mainlist_id:
         m1 = aliased(NodeNgram)
         m2 = aliased(NodeNgram)
         coocs_query = (coocs_query
-                        .join(m1, m1.ngram_id == x1.ngram_id)
-                        .join(m2, m2.ngram_id == x2.ngram_id)
+                        .join(m1, m1.ngram_id == Xindex_ngform_id)
+                        .join(m2, m2.ngram_id == Yindex_ngform_id)
                         .filter(m1.node_id == mainlist_id)
                         .filter(m2.node_id == mainlist_id)
                       )

     if stoplist_id:
-        s1 = aliased(NodeNgram)
-        s2 = aliased(NodeNgram)
+        s1 = (session.query(NodeNgram.ngram_id)
+               .filter(NodeNgram.node_id == stoplist_id)
+               .subquery()
+             )
+        # further use as anon tables prevents doing s2 = s1
+        s2 = (session.query(NodeNgram.ngram_id)
+               .filter(NodeNgram.node_id == stoplist_id)
+               .subquery()
+             )
         coocs_query = (coocs_query
-                        .join(s1, s1.ngram_id == x1.ngram_id)
-                        .join(s2, s2.ngram_id == x2.ngram_id)
-                        .filter(s1.node_id == mainlist_id)
-                        .filter(s2.node_id == mainlist_id)
+                        .outerjoin(s1, s1.c.ngram_id == Xindex_ngform_id)
+                        .outerjoin(s2, s2.c.ngram_id == Yindex_ngform_id)
+                        # equivalent to NOT IN stoplist
+                        .filter(s1.c.ngram_id == None)
+                        .filter(s2.c.ngram_id == None)
                       )

-    if start:
-        if isinstance(start, datetime):
-            start_str = start.strftime("%Y-%m-%d %H:%M:%S")
-        else:
-            start_str = str(start)
-        # direct use of str comparison op because there is consistency b/w
-        # sql alpha sort and chrono sort *in this format %Y-%m-%d %H:%M:%S*
-        # doc_ids matching this limit
-        # TODO s/subqueries/inner joins/ && thanks!
-        starttime_subquery = (session
-                               .query(NodeHyperdata.node_id)
-                               .filter(NodeHyperdata.key == "publication_date")
-                               .filter(NodeHyperdata.value_str >= start_str)
-                               .subquery()
-                             )
-        # the filtering by start limit
-        coocs_query = coocs_query.filter(x1.node_id.in_(starttime_subquery))
-
-    if end:
-        if isinstance(end, datetime):
-            end_str = end.strftime("%Y-%m-%d %H:%M:%S")
-        else:
-            end_str = str(end)
-        # TODO s/subqueries/inner joins/ && thanks!
-        endtime_subquery = (session
-                             .query(NodeHyperdata.node_id)
-                             .filter(NodeHyperdata.key == "publication_date")
-                             .filter(NodeHyperdata.value_str <= end_str)
-                             .subquery()
-                           )
-        # the filtering by end limit
-        coocs_query = coocs_query.filter(x1.node_id.in_(endtime_subquery))
+    if start or end:
+        Time = aliased(NodeHyperdata)
+        coocs_query = (coocs_query
+                        .join(Time, Time.node_id == Xindex.node_id)
+                        .filter(Time.key == "publication_date")
+                      )
+
+    if start:
+        if not isinstance(start, datetime):
+            try:
+                start = datetime.strptime(start, '%Y-%m-%d')
+            except:
+                raise TypeError("'start' param expects datetime object or %%Y-%%m-%%d string")
+        # the filtering by start limit
+        coocs_query = coocs_query.filter(Time.value_utc >= start)
+
+    if end:
+        if not isinstance(end, datetime):
+            try:
+                end = datetime.strptime(end, '%Y-%m-%d')
+            except:
+                raise TypeError("'end' param expects datetime object or %%Y-%%m-%%d string")
+        # the filtering by end limit
+        coocs_query = coocs_query.filter(Time.value_utc <= end)

     if symmetry_filter:
         # 1 filter taking the symmetry into account
         # -> halves the work!!
         # -> but will prevent direct access to x2's cooccurrences
         # -> they will be scattered: recorded under the x1's that preceded x2
         # -> but retrieval will be more expensive, via OR queries like:
         #      WHERE ngram1 = my_ngram OR ngram2 = my_ngram
-        coocs_query = coocs_query.filter(x1.ngram_id < x2.ngram_id)
+        coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)

     # ------------
     # 2 possible upstream filters to reduce the combinatorics
     #  - e.g. 929k rows => 35k rows
     #  - here on weight, but it degrades the results
     #    => conceivable on another metric (cvalue or tfidf?)
     # coocs_query = coocs_query.filter(x1.weight > 1)
     # coocs_query = coocs_query.filter(x2.weight > 1)
     # ------------

-    # 3) OUTPUT FILTERS
+    # 5) OUTPUT FILTERS
     # ------------------
     # threshold
     # £TODO adjust COOC_THRESHOLD a posteriori:
     # ex: sometimes 2 sometimes 4 depending on sparsity
     coocs_query = coocs_query.having(ucooc >= threshold)

-    # 4) EXECUTE QUERY
+    # 6) EXECUTE QUERY
     # ----------------
     #  => storage in our matrix structure
     matrix = WeightedMatrix(coocs_query.all())
     # -------------------

     # fyi
     shape_0 = len({pair[0] for pair in matrix.items})
...
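The symmetry_filter trade-off above (store each unordered pair once) is easy to see in miniature; a sketch reusing the toy cooc counter from the earlier example:

    from collections import Counter
    from itertools import combinations

    docs = {1: {7, 8, 9}, 2: {7, 8}}

    coocs = Counter()
    for doc_ngrams in docs.values():
        # a < b, so (7, 8) is stored but (8, 7) never is: half the rows,
        # but reading ngram 8's coocs now needs both key orders checked
        for a, b in combinations(sorted(doc_ngrams), 2):
            coocs[a, b] += 1

    print(coocs)   # Counter({(7, 8): 2, (7, 9): 1, (8, 9): 1})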
static/lib/gargantext/NGrams_dyna_chart_and_table.js

@@ -2042,6 +2042,29 @@ function AfterAjax(sourceUrl) {
     // console.log(JSON.stringify(NGrams))
     // -------------------------------------------------------------------

+    // ----------------------------------------- MAPLIST
+    // keepstateId = 1
+    keepstateId = System[0]["statesD"]["keep"]
+    if (Object.keys(NGrams["map"]).length > 0) {
+        for (var ngram_id in NGrams["map"]) {
+            myNgramInfo = NGrams["main"].ngrams[ngram_id]
+            // initialize state of maplist items
+            myNgramInfo["state"] = keepstateId ;
+        }
+    }
+
+    // ----------------------------------------- STOPLIST
+    // delstateId = 2
+    delstateId = System[0]["statesD"]["delete"]
+    if (Object.keys(NGrams["stop"]).length > 0) {
+        for (var ngram_id in NGrams["stop"]) {
+            console.log('stopping ' + ngram_id)
+            myNgramInfo = NGrams["main"].ngrams[ngram_id]
+            // initialize state of stoplist items
+            myNgramInfo["state"] = delstateId ;
+        }
+    }
+
     // Deleting subforms from the ngrams-table, clean start baby!
     if (Object.keys(NGrams["group"].links).length > 0) {
...
@@ -2056,10 +2079,6 @@ function AfterAjax(sourceUrl) {
         }
     }

-    // debug:
-    // console.log('~~~~~~~~~~~~~> (sub) _forms')
-    // console.log( _forms )
-
     // ------------------------------------------- MAINLIST
     // ngrams_data_ will update NGrams.main.ngrams (with subforms removed)
     var ngrams_data_ = {}
...
@@ -2093,27 +2112,6 @@ function AfterAjax(sourceUrl) {
     // console.log( NGrams["main"] )

-    // ----------------------------------------- MAPLIST
-    if (Object.keys(NGrams["map"]).length > 0) {
-        for (var ngram_id in NGrams["main"].ngrams) {
-            myNgram = NGrams["main"].ngrams[ngram_id]
-            if (NGrams["map"][ngram_id]) {
-                // keepstateId = 1
-                keepstateId = System[0]["statesD"]["keep"]
-                // initialize state of maplist items
-                myNgram["state"] = keepstateId ;
-            }
-            else if (NGrams["stop"][ngram_id]) {
-                // delstateId = 2
-                delstateId = System[0]["statesD"]["delete"]
-                // initialize state of stoplist items
-                myNgram["state"] = delstateId ;
-            }
-        }
-    }
-
     // Building the Score-Selector //NGrams["scores"]
     var FirstScore = NGrams["main"].scores.initial
     // TODO scores_div
...