[WIP] old specificity metric transformed into 2 better metrics: specclusion...

[WIP] old specificity metric transformed into 2 better metrics: specclusion and genclusion based on conditional probability

[WIP] old specificity metric transformed into 2 better metrics: specclusion...
[WIP] old specificity metric transformed into 2 better metrics: specclusion and genclusion based on conditional probability
49b9c2f0 · Romain Loth · 4f676883 · 49b9c2f0 · 49b9c2f0 · 49b9c2f0
Commit 49b9c2f0 authored Jul 04, 2016 by Romain Loth
6 changed files
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -12,7 +12,8 @@ LISTTYPES = {
    'STOPLIST'     : UnweightedList,
    'MAINLIST'     : UnweightedList,
    'MAPLIST'      : UnweightedList,
-    'SPECIFICITY'  : WeightedList,
+    'SPECCLUSION'  : WeightedList,
+    'GENCLUSION'   : WeightedList,
    'OCCURRENCES'  : WeightedIndex,   # could be WeightedList
    'COOCCURRENCES': WeightedMatrix,
    'TFIDF-CORPUS' : WeightedIndex,
@@ -47,6 +48,7 @@ NODETYPES = [
    # more scores (sorry!)
    'TIRANK-LOCAL',          # 16
    'TIRANK-GLOBAL',         # 17
+    'GENCLUSION',            # 18
 ]
 INDEXED_HYPERDATA = {

--- a/gargantext/util/toolchain/list_map.py
+++ b/gargantext/util/toolchain/list_map.py
@@ -9,22 +9,26 @@ from gargantext.util.db_cache import cache
 from gargantext.util.lists    import UnweightedList
 from sqlalchemy               import desc, asc
 from gargantext.constants     import DEFAULT_MAPLIST_MAX,\
+                                     DEFAULT_MAPLIST_GENCLUSION_RATIO,\
                                     DEFAULT_MAPLIST_MONOGRAMS_RATIO
 def do_maplist(corpus,
               overwrite_id = None,
               mainlist_id  = None,
-               specificity_id = None,
+               specclusion_id = None,
+               genclusion_id = None,
               grouplist_id = None,
               limit=DEFAULT_MAPLIST_MAX,
+               genclusion_part=DEFAULT_MAPLIST_GENCLUSION_RATIO,
               monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO
               ):
    '''
-    According to Specificities and mainlist
+    According to Genericity/Specificity and mainlist
    Parameters:
      - mainlist_id (starting point, already cleaned of stoplist terms)
-      - specificity_id (ranking factor)
+      - specclusion_id (inclusion by cooc specificity -- ranking factor)
+      - genclusion_id (inclusion by cooc genericity -- ranking factor)
      - grouplist_id (filtering grouped ones)
      - overwrite_id: optional if preexisting MAPLIST node to overwrite
@@ -33,8 +37,8 @@ def do_maplist(corpus,
        - monograms_part: a ratio of terms with only one lexical unit to keep
    '''
-    if not (mainlist_id and specificity_id and grouplist_id):
+    if not (mainlist_id and specclusion_id and genclusion_id and grouplist_id):
-        raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id")
+        raise ValueError("Please provide mainlist_id, specclusion_id, genclusion_id and grouplist_id")
    monograms_limit = round(limit * monograms_part)
    multigrams_limit = limit - monograms_limit
@@ -58,7 +62,7 @@ def do_maplist(corpus,
    # specificity-ranked
    query = (session.query(ScoreSpec.ngram_id)
                .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
-                .filter(ScoreSpec.node_id == specificity_id)
+                .filter(ScoreSpec.node_id == specclusion_id)
                # we want only terms within mainlist
                .join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
@@ -73,7 +77,7 @@ def do_maplist(corpus,
    # TODO: move these 2 pools up to mainlist selection
    top_monograms = (query
                .filter(Ngram.n == 1)
-                .order_by(asc(ScoreSpec.weight))
+                .order_by(desc(ScoreSpec.weight))
                .limit(monograms_limit)
                .all()
               )

--- a/gargantext/util/toolchain/main.py
+++ b/gargantext/util/toolchain/main.py
@@ -10,8 +10,8 @@ from .ngram_groups        import compute_groups
 from .metric_tfidf        import compute_occs, compute_tfidf_local, compute_ti_ranking
 from .list_main           import do_mainlist
 from .ngram_coocs         import compute_coocs
-from .metric_specificity  import compute_specificity
+from .metric_specgen      import compute_specgen
-from .list_map            import do_maplist     # TEST
+from .list_map            import do_maplist
 from .mail_notification   import notify_owner
 from gargantext.util.db   import session
 from gargantext.models    import Node
@@ -136,22 +136,26 @@ def parse_extract_indexhyperdata(corpus):
    # => used for doc <=> ngram association
    # ------------
-    # -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
+    # -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)*
    coocs = compute_coocs(corpus,
                            on_list_id = mainlist_id,
                            groupings_id = group_id,
-                            just_pass_result = True)
+                            just_pass_result = True,
+                            diagonal_filter = False) # preserving the diagonal
+                                                     # (useful for spec/gen)
    print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))
-    # -> specificity: compute + write (=> NodeNodeNgram)
+    # -> specclusion/genclusion: compute + write (2 Nodes + 2 lists in NodeNgram)
-    spec_id = compute_specificity(corpus,cooc_matrix = coocs)
+    (spec_id, gen_id) = compute_specgen(corpus,cooc_matrix = coocs)
    # no need here for subforms because cooc already counted them in mainform
-    print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
+    print('CORPUS #%d: [%s] new spec-clusion node #%i' % (corpus.id, t(), spec_id))
+    print('CORPUS #%d: [%s] new gen-clusion node #%i' % (corpus.id, t(), gen_id))
    # maplist: compute + write (to Node and NodeNgram)
    map_id = do_maplist(corpus,
                        mainlist_id = mainlist_id,
-                        specificity_id=spec_id,
+                        specclusion_id=spec_id,
+                        genclusion_id=gen_id,
                        grouplist_id=group_id
                        )
    print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
@@ -187,7 +191,7 @@ def recount(corpus):
         - ndocs
         - ti_rank
         - coocs
-         - specificity
+         - specclusion/genclusion
         - tfidf
    NB: no new extraction, no list change, just the metrics
@@ -208,10 +212,15 @@ def recount(corpus):
        old_tirank_id = None
    try:
-        old_spec_id   = corpus.children("SPECIFICITY").first().id
+        old_spec_id   = corpus.children("SPECCLUSION").first().id
    except:
        old_spec_id   = None
+    try:
+        old_gen_id   = corpus.children("GENCLUSION").first().id
+    except:
+        old_gen_id   = None
    try:
        old_ltfidf_id = corpus.children("TFIDF-CORPUS").first().id
    except:
@@ -254,11 +263,13 @@ def recount(corpus):
                            just_pass_result = True)
    print('RECOUNT #%d: [%s] updated mainlist coocs for specif rank' % (corpus.id, t()))
-    # -> specificity: compute + write (=> NodeNgram)
-    spec_id = compute_specificity(corpus,cooc_matrix = coocs, overwrite_id = old_spec_id)
+    # -> specclusion/genclusion: compute + write (=> NodeNodeNgram)
+    (spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs,
+                            spec_overwrite_id = spec_id, gen_overwrite_id = gen_id)
-    print('RECOUNT #%d: [%s] updated specificity node #%i' % (corpus.id, t(), spec_id))
+    print('RECOUNT #%d: [%s] updated spec-clusion node #%i' % (corpus.id, t(), spec_id))
+    print('RECOUNT #%d: [%s] updated gen-clusion node #%i' % (corpus.id, t(), gen_id))
    print('RECOUNT #%d: [%s] FINISHED metric recounts' % (corpus.id, t()))

--- a/gargantext/util/toolchain/metric_specgen.py
+++ b/gargantext/util/toolchain/metric_specgen.py
+"""
+Computes two metrics (specclusion and genclusion) from the ngram cooccurrence matrix.
+ + SAVE => WeightedList => NodeNgram
+"""
+from gargantext.models        import Node, Ngram, NodeNgram, NodeNgramNgram
+from gargantext.util.db       import session, aliased, func, bulk_insert
+from gargantext.util.lists    import WeightedList
+from collections              import defaultdict
+from pandas                   import DataFrame
+from numpy                    import diag
+def round3(floating_number):
+    """
+    Rounds a floating number to 3 decimals
+    Good when we don't need so much details in the DB writen data
+    """
+    return float("%.3f" % floating_number)
+def compute_specgen(corpus, cooc_id=None, cooc_matrix=None,
+                    spec_overwrite_id = None, gen_overwrite_id = None):
+    '''
+    Compute genericity/specificity:
+        P(j|i) = N(ij) / N(ii)
+        P(i|j) = N(ij) / N(jj)
+        Gen(i) = Sum{j} P(j_k|i)
+        Spec(i)  = Sum{j} P(i|j_k)
+        Gen-clusion(i) = (Spec(i) + Gen(i)) / 2
+        Spec-clusion(i) = (Spec(i) - Gen(i)) / 2
+    Parameters:
+        - cooc_id: mandatory id of a cooccurrences node to use as base
+        - spec_overwrite_id: optional preexisting specificity node to overwrite
+        - gen_overwrite_id: optional preexisting genericity node to overwrite
+    '''
+    matrix = defaultdict(lambda : defaultdict(float))
+    if cooc_id == None and cooc_matrix == None:
+        raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")
+    elif cooc_id:
+        cooccurrences = (session.query(NodeNgramNgram)
+                        .filter(NodeNgramNgram.node_id==cooc_id)
+                        )
+        # no filtering: cooc already filtered on mainlist_id at creation
+        for cooccurrence in cooccurrences:
+            matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
+            # matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
+    elif cooc_matrix:
+        # copy WeightedMatrix into local matrix structure
+        for (ngram1_id, ngram2_id) in cooc_matrix.items:
+            w = cooc_matrix.items[(ngram1_id, ngram2_id)]
+            # ------- 8< --------------------------------------------
+            # tempo hack to ignore lines/columns where diagonal == 0
+            # £TODO find why they exist and then remove this snippet
+            if (((ngram1_id,ngram1_id) not in cooc_matrix.items) or
+                ((ngram2_id,ngram2_id) not in cooc_matrix.items)):
+                continue
+            # ------- 8< --------------------------------------------
+            matrix[ngram1_id][ngram2_id] = w
+    nb_ngrams = len(matrix)
+    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
+    # example corpus (7 docs, 8 nouns)
+    # --------------------------------
+    # "The report says that humans are animals."
+    # "The report says that rivers are full of water."
+    # "The report says that humans like to make war."
+    # "The report says that animals must eat food."
+    # "The report says that animals drink water."
+    # "The report says that humans like food and water."
+    # "The report says that grass is food for some animals."
+    #===========================================================================
+    cooc_counts = DataFrame(matrix).fillna(0)
+    # cooc_counts matrix
+    # ------------------
+    #           animals  food  grass  humans  report  rivers  war  water
+    # animals         4     2      1       1       4       0    0      1
+    # food            2     3      1       1       3       0    0      1
+    # grass           1     1      1       0       1       0    0      0
+    # humans          1     1      0       3       3       0    1      1
+    # report          4     3      1       3       7       1    1      3
+    # rivers          0     0      0       0       1       1    0      1
+    # war             0     0      0       1       1       0    1      0
+    # water           1     1      0       1       3       1    0      3
+    #===========================================================================
+    # conditional p(col|line)
+    diagonal = list(diag(cooc_counts))
+    # debug
+    # print("WARN diag: ", diagonal)
+    # print("WARN diag: =================== 0 in diagonal ?\n",
+    #         0 in diagonal ? "what ??? zeros in the diagonal :/" : "ok no zeros",
+    #         "\n===================")
+    p_col_given_line = cooc_counts / list(diag(cooc_counts))
+    # p_col_given_line
+    # ----------------
+    #          animals  food  grass  humans  report rivers   war  water
+    # animals      1.0   0.7    1.0     0.3     0.6    0.0   0.0    0.3
+    # food         0.5   1.0    1.0     0.3     0.4    0.0   0.0    0.3
+    # grass        0.2   0.3    1.0     0.0     0.1    0.0   0.0    0.0
+    # humans       0.2   0.3    0.0     1.0     0.4    0.0   1.0    0.3
+    # report       1.0   1.0    1.0     1.0     1.0    1.0   1.0    1.0
+    # rivers       0.0   0.0    0.0     0.0     0.1    1.0   0.0    0.3
+    # war          0.0   0.0    0.0     0.3     0.1    0.0   1.0    0.0
+    # water        0.2   0.3    0.0     0.3     0.4    1.0   0.0    1.0
+    #===========================================================================
+    # total per lines (<=> genericity)
+    Gen = p_col_given_line.sum(axis=1)
+    # Gen.sort_values(ascending=False)
+    # ---
+    # report    8.0
+    # animals   3.9
+    # food      3.6
+    # water     3.3
+    # humans    3.3
+    # grass     1.7
+    # war       1.5
+    # rivers    1.5
+    #===========================================================================
+    # total columnwise (<=> specificity)
+    Spec = p_col_given_line.sum(axis=0)
+    # Spec.sort_values(ascending=False)
+    # ----
+    # grass     4.0
+    # food      3.7
+    # water     3.3
+    # humans    3.3
+    # report    3.3
+    # animals   3.2
+    # war       3.0
+    # rivers    3.0
+    #===========================================================================
+    # our "inclusion by specificity" metric
+    Specclusion = Spec-Gen
+    # Specclusion.sort_values(ascending=False)
+    # -----------
+    # grass      1.1
+    # war        0.8
+    # rivers     0.8
+    # food       0.0
+    # humans    -0.0
+    # water     -0.0
+    # animals   -0.3
+    # report    -2.4
+    #===========================================================================
+    # our "inclusion by genericity" metric
+    Genclusion = Spec+Gen
+    # Genclusion.sort_values(ascending=False)
+    # -----------
+    # report     11.3
+    # food        7.3
+    # animals     7.2
+    # water       6.7
+    # humans      6.7
+    # grass       5.7
+    # war         4.5
+    # rivers      4.5
+    #===========================================================================
+    # specificity node
+    if spec_overwrite_id:
+        # overwrite pre-existing id
+        the_spec_id = spec_overwrite_id
+        session.query(NodeNgram).filter(NodeNgram.node_id==the_spec_id).delete()
+        session.commit()
+    else:
+        specnode = corpus.add_child(
+            typename  = "SPECCLUSION",
+            name = "Specclusion (in:%s)" % corpus.id
+        )
+        session.add(specnode)
+        session.commit()
+        the_spec_id = specnode.id
+    # debug:
+    options.display.float_format = '${:,.3f}'.format
+    # print(Specclusion)
+    if not Specclusion.empty:
+        data = WeightedList(
+                zip(  Specclusion.index.tolist()
+                    , [v for v  in map(round3, Specclusion.values.tolist())]
+                 )
+               )
+        data.save(the_spec_id)
+    else:
+        print("WARNING: had no terms in COOCS => empty SPECCLUSION node")
+    #===========================================================================
+    # genclusion node
+    if gen_overwrite_id:
+        the_gen_id = gen_overwrite_id
+        session.query(NodeNgram).filter(NodeNgram.node_id==the_gen_id).delete()
+        session.commit()
+    else:
+        gennode = corpus.add_child(
+            typename  = "GENCLUSION",
+            name = "Genclusion (in:%s)" % corpus.id
+        )
+        session.add(gennode)
+        session.commit()
+        the_gen_id = gennode.id
+    if not Genclusion.empty:
+        data = WeightedList(
+                zip(  Genclusion.index.tolist()
+                    , [v for v  in map(round3, Genclusion.values.tolist())]
+                 )
+               )
+        data.save(the_gen_id)
+    else:
+        print("WARNING: had no terms in COOCS => empty GENCLUSION node")
+    #===========================================================================
+    return(the_spec_id, the_gen_id)
--- a/gargantext/util/toolchain/metric_specificity.py
+++ b/gargantext/util/toolchain/metric_specificity.py
-"""
-Computes a specificity metric from the ngram cooccurrence matrix.
- + SAVE => WeightedList => NodeNgram
-"""
-from gargantext.models        import Node, Ngram, NodeNgram, NodeNgramNgram
-from gargantext.util.db       import session, aliased, func, bulk_insert
-from gargantext.util.lists    import WeightedList
-from collections              import defaultdict
-from pandas                   import DataFrame
-import pandas as pd
-def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id = None):
-    '''
-    Compute the specificity, simple calculus.
-    Parameters:
-        - cooc_id: mandatory id of a cooccurrences node to use as base
-        - overwrite_id: optional preexisting specificity node to overwrite
-    '''
-    matrix = defaultdict(lambda : defaultdict(float))
-    if cooc_id == None and cooc_matrix == None:
-        raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")
-    elif cooc_id:
-        cooccurrences = (session.query(NodeNgramNgram)
-                        .filter(NodeNgramNgram.node_id==cooc_id)
-                        )
-        # no filtering: cooc already filtered on mainlist_id at creation
-        for cooccurrence in cooccurrences:
-            matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
-            matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
-    elif cooc_matrix:
-        # copy WeightedMatrix into local matrix structure
-        for (ngram1_id, ngram2_id) in cooc_matrix.items:
-            w = cooc_matrix.items[(ngram1_id, ngram2_id)]
-            matrix[ngram1_id][ngram2_id] = w
-    nb_ngrams = len(matrix)
-    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
-    x = DataFrame(matrix).fillna(0)
-    # proba (x/y) ( <= on divise chaque ligne par son total)
-    x = x / x.sum(axis=1)
-    # vectorisation
-    # d:Matrix => v: Vector (len = nb_ngrams)
-    # v = d.sum(axis=1) (- lui-même)
-    xs = x.sum(axis=1) - x
-    ys = x.sum(axis=0) - x
-    # top inclus ou exclus
-    #n = ( xs + ys) / (2 * (x.shape[0] - 1))
-    # top generic or specific (asc is spec, desc is generic)
-    v = ( xs - ys) / ( 2 * (x.shape[0] - 1))
-    ## d ##
-    #######
-    #               Grenelle  biodiversité  kilomètres  site  élus  île
-    # Grenelle             0             0           4     0     0    0
-    # biodiversité         0             0           0     0     4    0
-    # kilomètres           4             0           0     0     4    0
-    # site                 0             0           0     0     4    6
-    # élus                 0             4           4     4     0    0
-    # île                  0             0           0     6     0    0
-    ## d.sum(axis=1) ##
-    ###################
-    # Grenelle         4
-    # biodiversité     4
-    # kilomètres       8
-    # site            10
-    # élus            12
-    # île              6
-    # résultat temporaire
-    # -------------------
-    # pour l'instant on va utiliser les sommes en ligne comme ranking de spécificité
-    # (**même** ordre qu'avec la formule d'avant le refactoring mais calcul + simple)
-    # TODO analyser la cohérence math ET sem de cet indicateur
-    #v.sort_values(inplace=True)
-    # [ ('biodiversité' , 0.333 ),
-    #   ('Grenelle'     , 0.5   ),
-    #   ('île'          , 0.599 ),
-    #   ('kilomètres'   , 1.333 ),
-    #   ('site'         , 1.333 ),
-    #   ('élus'         , 1.899 ) ]
-    # ----------------
-    # specificity node
-    if overwrite_id:
-        # overwrite pre-existing id
-        the_id = overwrite_id
-        session.query(NodeNgram).filter(NodeNgram.node_id==the_id).delete()
-        session.commit()
-    else:
-        specnode = corpus.add_child(
-            typename  = "SPECIFICITY",
-            name = "Specif (in:%s)" % corpus.id
-        )
-        session.add(specnode)
-        session.commit()
-        the_id = specnode.id
-    # print(v)
-    pd.options.display.float_format = '${:,.2f}'.format
-    if not v.empty:
-        data = WeightedList(
-                zip(  v.index.tolist()
-                    , v.values.tolist()[0]
-                 )
-               )
-        data.save(the_id)
-    else:
-        print("WARNING: had no terms in COOCS => empty SPECIFICITY node")
-    return(the_id)
--- a/gargantext/util/toolchain/ngram_coocs.py
+++ b/gargantext/util/toolchain/ngram_coocs.py
@@ -18,7 +18,8 @@ def compute_coocs(  corpus,
                    stoplist_id     = None,
                    start           = None,
                    end             = None,
-                    symmetry_filter = False):
+                    symmetry_filter = False,
+                    diagonal_filter = True):
    """
    Count how often some extracted terms appear
    together in a small context (document)
@@ -55,6 +56,9 @@ def compute_coocs(  corpus,
                    NB the expected type of parameter value is datetime.datetime
                        (string is also possible but format must follow
                          this convention: "2001-01-01" aka "%Y-%m-%d")
+      - symmetry_filter: prevent calculating where ngram1_id  > ngram2_id
+      - diagonal_filter: prevent calculating where ngram1_id == ngram2_id
     (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
@@ -69,7 +73,7 @@ def compute_coocs(  corpus,
        JOIN nodes_ngrams AS idxb
        ON idxa.node_id = idxb.node_id      <== that's cooc
        ---------------------------------
-        AND idxa.ngram_id <> idxb.ngram_id
+        AND idxa.ngram_id <> idxb.ngram_id   (diagonal_filter)
        AND idxa.node_id = MY_DOC ;
    on entire corpus
@@ -152,16 +156,14 @@ def compute_coocs(  corpus,
                    ucooc
                    # for debug (2/4)
-                    #, Xngram.terms.label("w_x")
+                    # , Xngram.terms.label("w_x")
-                    #, Yngram.terms.label("w_y")
+                    # , Yngram.terms.label("w_y")
                    )
               .join(Yindex, Xindex.node_id == Yindex.node_id )   # <- by definition of cooc
               .join(Node, Node.id == Xindex.node_id) # <- b/c within corpus
               .filter(Node.parent_id == corpus.id)   # <- b/c within corpus
               .filter(Node.typename == "DOCUMENT")   # <- b/c within corpus
-               .filter(Xindex_ngform_id != Yindex_ngform_id) # <- b/c not with itself
        )
    # outerjoin the synonyms if needed
@@ -179,12 +181,12 @@ def compute_coocs(  corpus,
               .group_by(
                    Xindex_ngform_id, Yindex_ngform_id # <- what we're counting
                    # for debug (3/4)
-                    #,"w_x", "w_y"
+                    # ,"w_x", "w_y"
                    )
            # for debug (4/4)
-            #.join(Xngram, Xngram.id == Xindex_ngform_id)
+            # .join(Xngram, Xngram.id == Xindex_ngform_id)
-            #.join(Yngram, Yngram.id == Yindex_ngform_id)
+            # .join(Yngram, Yngram.id == Yindex_ngform_id)
            .order_by(ucooc)
           )
@@ -192,6 +194,9 @@ def compute_coocs(  corpus,
    # 4) INPUT FILTERS (reduce N before O(N²))
    if on_list_id:
+        # £TODO use different lists, or else one list for x and all ngrams for y,
+        #       because this would allow expanding a list to its nearest neighbours (MacLachlan)
+        #       (with a rectangular matrix)
        m1 = aliased(NodeNgram)
        m2 = aliased(NodeNgram)
@@ -226,6 +231,10 @@ def compute_coocs(  corpus,
        )
+    if diagonal_filter:
+        # don't compute ngram with itself
+        coocs_query = coocs_query.filter(Xindex_ngform_id != Yindex_ngform_id)
    if start or end:
        Time = aliased(NodeHyperdata)
@@ -268,6 +277,7 @@ def compute_coocs(  corpus,
    # threshold
    # £TODO adjust COOC_THRESHOLD a posteriori:
    # ex: sometimes 2 sometimes 4 depending on sparsity
+    print("COOCS: filtering pairs under threshold:", threshold)
    coocs_query = coocs_query.having(ucooc >= threshold)