humanities / gargantext / Commits / 4bfc0b6c

Commit 4bfc0b6c, authored Jul 08, 2016 by delanoe
Merge branch 'romain-goodies' into unstable

Parents: 1925c104, f542b69e
Showing 11 changed files with 421 additions and 205 deletions (+421 / -205).
    gargantext/constants.py                            +14    -7
    gargantext/util/ngramlists_tools.py                 +1    -1
    gargantext/util/ngramsextractors.py                 +1    -1
    gargantext/util/toolchain/list_main.py              +2    -2
    gargantext/util/toolchain/list_map.py             +119   -38
    gargantext/util/toolchain/main.py                  +24   -13
    gargantext/util/toolchain/metric_specgen.py       +233    -0
    gargantext/util/toolchain/metric_specificity.py     +0  -126
    gargantext/util/toolchain/ngram_coocs.py           +19    -9
    gargantext/util/toolchain/ngrams_extraction.py      +7    -7
    gargantext/views/api/ngramlists.py                  +1    -1
gargantext/constants.py
-----------------------

@@ -12,14 +12,16 @@ LISTTYPES = {
     'STOPLIST'      : UnweightedList,
     'MAINLIST'      : UnweightedList,
     'MAPLIST'       : UnweightedList,
-    'SPECIFICITY'   : WeightedList,
+    'SPECCLUSION'   : WeightedList,
+    'GENCLUSION'    : WeightedList,
     'OCCURRENCES'   : WeightedIndex,     # could be WeightedList
     'COOCCURRENCES' : WeightedMatrix,
     'TFIDF-CORPUS'  : WeightedIndex,
     'TFIDF-GLOBAL'  : WeightedIndex,
     'TIRANK-LOCAL'  : WeightedIndex,     # could be WeightedList
-    'TIRANK-GLOBAL' : WeightedIndex      # could be WeightedList
+    'TIRANK-GLOBAL' : WeightedIndex,     # could be WeightedList
 }
 # 'OWNLIST'       : UnweightedList,     # £TODO use this for any term-level tags

 NODETYPES = [    # TODO separate id not array index, read by models.node

@@ -37,7 +39,7 @@ NODETYPES = [
     'COOCCURRENCES',    # 9
     # scores
     'OCCURRENCES',      # 10
-    'SPECIFICITY',      # 11
+    'SPECCLUSION',      # 11
     'CVALUE',           # 12
     'TFIDF-CORPUS',     # 13
     'TFIDF-GLOBAL',     # 14

@@ -47,6 +49,7 @@ NODETYPES = [
     # more scores (sorry!)
     'TIRANK-LOCAL',     # 16
     'TIRANK-GLOBAL',    # 17
+    'GENCLUSION',       # 18
 ]

 INDEXED_HYPERDATA = {

@@ -222,12 +225,16 @@ DEFAULT_RANK_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
 DEFAULT_RANK_HARD_LIMIT = 5000          # MAINLIST maximum terms abs
                                         # (makes COOCS larger ~ O(N²) /!\)

-DEFAULT_COOC_THRESHOLD = 2              # inclusive minimum for COOCS coefs
+DEFAULT_COOC_THRESHOLD = 3              # inclusive minimum for COOCS coefs
                                         # (makes COOCS more sparse)

 DEFAULT_MAPLIST_MAX = 350               # MAPLIST maximum terms

-DEFAULT_MAPLIST_MONOGRAMS_RATIO = .15   # part of monograms in MAPLIST
+DEFAULT_MAPLIST_MONOGRAMS_RATIO = .2    # quota of monograms in MAPLIST
                                         # (vs multigrams = 1-mono)

+DEFAULT_MAPLIST_GENCLUSION_RATIO = .6   # quota of top genclusion in MAPLIST
+                                        # (vs top specclusion = 1-gen)

 DEFAULT_MAX_NGRAM_LEN = 7               # limit used after POStagging rule
                                         # (initial ngrams number is a power law of this /!\)

@@ -272,7 +279,7 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
 # about batch processing...
 BATCH_PARSING_SIZE = 256
-BATCH_NGRAMSEXTRACTION_SIZE = 1024
+BATCH_NGRAMSEXTRACTION_SIZE = 3000      # how many distinct ngrams before INTEGRATE

 # Scrapers config

@@ -282,7 +289,7 @@ QUERY_SIZE_N_DEFAULT = 1000
 # Grammar rules for chunking
 RULE_JJNN   = "{<JJ.*>*<NN.*|>+<JJ.*>*}"
-RULE_JJDTNN = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
+RULE_NPN    = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
 RULE_TINA = "^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)\+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?\,){0,2}?(N.?.?,|\?,)+?)+?)*?$"
gargantext/util/ngramlists_tools.py
-----------------------------------

@@ -19,7 +19,7 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO
 # import will implement the same text cleaning procedures as toolchain
 from gargantext.util.toolchain.parsing           import normalize_chars
-from gargantext.util.toolchain.ngrams_extraction import normalize_terms
+from gargantext.util.toolchain.ngrams_extraction import normalize_forms
 from sqlalchemy.sql import exists
 from os import path
gargantext/util/ngramsextractors.py
-----------------------------------

-from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_JJDTNN
+from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_NPN

 import nltk
 import re
gargantext/util/toolchain/list_main.py
--------------------------------------

@@ -39,11 +39,11 @@ def do_mainlist(corpus,
     # retrieve helper nodes if not provided
     if not ranking_scores_id:
         ranking_scores_id = session.query(Node.id).filter(
-                                Node.typename == "TFIDF-GLOBAL",
+                                Node.typename == "TIRANK-GLOBAL",
                                 Node.parent_id == corpus.id
                             ).first()
         if not ranking_scores_id:
-            raise ValueError("MAINLIST: TFIDF node needed for mainlist creation")
+            raise ValueError("MAINLIST: TIRANK node needed for mainlist creation")

     if not stoplist_id:
         stoplist_id = session.query(Node.id).filter(
gargantext/util/toolchain/list_map.py
-------------------------------------

@@ -9,37 +9,49 @@ from gargantext.util.db_cache import cache
 from gargantext.util.lists import UnweightedList
 from sqlalchemy import desc, asc
 from gargantext.constants import DEFAULT_MAPLIST_MAX,\
+                                 DEFAULT_MAPLIST_GENCLUSION_RATIO,\
                                  DEFAULT_MAPLIST_MONOGRAMS_RATIO

 def do_maplist(corpus,
                overwrite_id    = None,
                mainlist_id     = None,
-               specificity_id  = None,
+               specclusion_id  = None,
+               genclusion_id   = None,
                grouplist_id    = None,
                limit           = DEFAULT_MAPLIST_MAX,
+               genclusion_part = DEFAULT_MAPLIST_GENCLUSION_RATIO,
                monograms_part  = DEFAULT_MAPLIST_MONOGRAMS_RATIO
               ):
     '''
-    According to Specificities and mainlist
+    According to Genericity/Specificity and mainlist

     Parameters:
       - mainlist_id (starting point, already cleaned of stoplist terms)
-      - specificity_id (ranking factor)
+      - specclusion_id (ngram inclusion by cooc specificity -- ranking factor)
+      - genclusion_id  (ngram inclusion by cooc genericity  -- ranking factor)
       - grouplist_id (filtering grouped ones)
       - overwrite_id: optional if preexisting MAPLIST node to overwrite

-      + 2 constants to modulate the terms choice
+      + 3 params to modulate the terms choice
         - limit for the amount of picked terms
         - monograms_part: a ratio of terms with only one lexical unit to keep
              (multigrams quota = limit * (1-monograms_part))
+        - genclusion_part: a ratio of terms picked by genericity ranking
+             (speclusion quota = limit * (1-genclusion_part))
     '''
-    if not (mainlist_id and specificity_id and grouplist_id):
-        raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id")
+    if not (mainlist_id and specclusion_id and genclusion_id and grouplist_id):
+        raise ValueError("Please provide mainlist_id, specclusion_id, genclusion_id and grouplist_id")

-    monograms_limit = round(limit * monograms_part)
-    multigrams_limit = limit - monograms_limit
-    print("MAPLIST: monograms_limit =", monograms_limit)
-    print("MAPLIST: multigrams_limit = ", multigrams_limit)
+    quotas = {'topgen':{}, 'topspec':{}}
+    genclusion_limit = round(limit * genclusion_part)
+    speclusion_limit = limit - genclusion_limit
+    quotas['topgen']['monograms']   = round(genclusion_limit * monograms_part)
+    quotas['topgen']['multigrams']  = genclusion_limit - quotas['topgen']['monograms']
+    quotas['topspec']['monograms']  = round(speclusion_limit * monograms_part)
+    quotas['topspec']['multigrams'] = speclusion_limit - quotas['topspec']['monograms']
+    print("MAPLIST quotas:", quotas)

     #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

@@ -54,11 +66,19 @@ def do_maplist(corpus,
     )

     ScoreSpec = aliased(NodeNgram)
+    ScoreGen  = aliased(NodeNgram)

-    # specificity-ranked
-    query = (session.query(ScoreSpec.ngram_id)
+    # ngram with both ranking factors spec and gen
+    query = (session.query(
+                ScoreSpec.ngram_id,
+                ScoreSpec.weight,
+                ScoreGen.weight,
+                Ngram.n
+             )
             .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
-            .filter(ScoreSpec.node_id == specificity_id)
+            .filter(ScoreSpec.node_id == specclusion_id)
+            .join(ScoreGen, ScoreGen.ngram_id == ScoreSpec.ngram_id)
+            .filter(ScoreGen.node_id == genclusion_id)

             # we want only terms within mainlist
             .join(MainlistTable, Ngram.id == MainlistTable.ngram_id)

@@ -68,36 +88,99 @@ def do_maplist(corpus,
             .outerjoin(IsSubform,
                        IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
             .filter(IsSubform.c.ngram2_id == None)
            )

-    # TODO: move these 2 pools up to mainlist selection
-    top_monograms = (query
-        .filter(Ngram.n == 1)
-        .order_by(asc(ScoreSpec.weight))
-        .limit(monograms_limit)
-        .all()
-    )
-    top_multigrams = (query
-        .filter(Ngram.n >= 2)
-        # specificity-ranked
-        .order_by(desc(ScoreSpec.weight))
-        .limit(multigrams_limit)
-        .all()
-    )
-    obtained_mono  = len(top_monograms)
-    obtained_multi = len(top_multigrams)
-    obtained_total = obtained_mono + obtained_multi
-    # print("MAPLIST: top_monograms =", obtained_mono)
-    # print("MAPLIST: top_multigrams = ", obtained_multi)
+    # format in scored_ngrams array:
+    # -------------------------------
+    #  [(37723,  8.428, 14.239,   3   ), etc]
+    #   ngramid  wspec   wgen   nwords
+    scored_ngrams = query.all()
+    n_ngrams = len(scored_ngrams)
+    if n_ngrams == 0:
+        raise ValueError("No ngrams in cooc table ?")
+
+    # results, with same structure as quotas
+    chosen_ngrams = {'topgen':  {'monograms':[], 'multigrams':[]},
+                     'topspec': {'monograms':[], 'multigrams':[]}}
+
+    # specificity and genericity are rather reverse-correlated
+    # but occasionally they can have common ngrams (same ngram well ranked in both)
+    # => we'll use a lookup table to check if we didn't already get it
+    already_gotten_ngramids = {}
+
+    # 2 loops to fill spec-clusion then gen-clusion quotas
+    # (1st loop uses order from DB, 2nd loop uses our own sort at end of 1st)
+    for rkr in ['topspec', 'topgen']:
+        got_enough_mono  = False
+        got_enough_multi = False
+        all_done = False
+        i = -1
+        while ((not all_done) and (not (got_enough_mono and got_enough_multi))):
+            # retrieve sorted ngram n° i
+            i += 1
+            (ng_id, wspec, wgen, nwords) = scored_ngrams[i]
+            # before any continue case, we check the next i for max reached
+            all_done = (i+1 >= n_ngrams)
+            if ng_id in already_gotten_ngramids:
+                continue
+            # NB: nwords could be replaced by a simple search on r' '
+            if nwords == 1:
+                if got_enough_mono:
+                    continue
+                else:
+                    # add ngram to results and lookup
+                    chosen_ngrams[rkr]['monograms'].append(ng_id)
+                    already_gotten_ngramids[ng_id] = True
+            # multi
+            else:
+                if got_enough_multi:
+                    continue
+                else:
+                    # add ngram to results and lookup
+                    chosen_ngrams[rkr]['multigrams'].append(ng_id)
+                    already_gotten_ngramids[ng_id] = True
+            got_enough_mono  = (len(chosen_ngrams[rkr]['monograms'])  >= quotas[rkr]['monograms'])
+            got_enough_multi = (len(chosen_ngrams[rkr]['multigrams']) >= quotas[rkr]['multigrams'])
+        # at the end of the first loop we just need to sort all by the second ranker (gen)
+        scored_ngrams = sorted(scored_ngrams, key=lambda ng_infos: ng_infos[2], reverse=True)
+
+    obtained_spec_mono  = len(chosen_ngrams['topspec']['monograms'])
+    obtained_spec_multi = len(chosen_ngrams['topspec']['multigrams'])
+    obtained_gen_mono   = len(chosen_ngrams['topgen']['monograms'])
+    obtained_gen_multi  = len(chosen_ngrams['topgen']['multigrams'])
+    obtained_total = obtained_spec_mono   \
+                   + obtained_spec_multi  \
+                   + obtained_gen_mono    \
+                   + obtained_gen_multi
+    print("MAPLIST: top_spec_monograms =",  obtained_spec_mono)
+    print("MAPLIST: top_spec_multigrams =", obtained_spec_multi)
+    print("MAPLIST: top_gen_monograms =",   obtained_gen_mono)
+    print("MAPLIST: top_gen_multigrams =",  obtained_gen_multi)
+    print("MAPLIST: kept %i ngrams in total " % obtained_total)
+
+    obtained_data = chosen_ngrams['topspec']['monograms']   \
+                  + chosen_ngrams['topspec']['multigrams']  \
+                  + chosen_ngrams['topgen']['monograms']    \
+                  + chosen_ngrams['topgen']['multigrams']

     # NEW MAPLIST NODE
     # -----------------
     # saving the parameters of the analysis in the Node JSON
     new_hyperdata = { 'corpus': corpus.id,
                       'limit' : limit,
                       'monograms_part' : monograms_part,
-                      'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else 0
+                      'genclusion_part' : genclusion_part,
                     }
     if overwrite_id:
         # overwrite pre-existing node

@@ -118,9 +201,7 @@ def do_maplist(corpus,
         the_id = the_maplist.id

     # create UnweightedList object and save (=> new NodeNgram rows)
-    datalist = UnweightedList(
-            [res.ngram_id for res in top_monograms + top_multigrams]
-        )
+    datalist = UnweightedList(obtained_data)

     # save
     datalist.save(the_id)
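
To see the quota arithmetic of the new do_maplist concretely, here is a standalone sketch (ours, not part of the commit) using the new default ratios from constants.py:

# budget split: first genclusion vs specclusion, then monograms vs multigrams
limit, genclusion_part, monograms_part = 350, .6, .2

genclusion_limit = round(limit * genclusion_part)   # 210 terms picked by genericity
speclusion_limit = limit - genclusion_limit         # 140 terms picked by specificity

quotas = {'topgen': {}, 'topspec': {}}
quotas['topgen']['monograms']   = round(genclusion_limit * monograms_part)           # 42
quotas['topgen']['multigrams']  = genclusion_limit - quotas['topgen']['monograms']   # 168
quotas['topspec']['monograms']  = round(speclusion_limit * monograms_part)           # 28
quotas['topspec']['multigrams'] = speclusion_limit - quotas['topspec']['monograms']  # 112

print(quotas)
# {'topgen': {'monograms': 42, 'multigrams': 168},
#  'topspec': {'monograms': 28, 'multigrams': 112}}    -> 350 terms in total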
gargantext/util/toolchain/main.py
---------------------------------

@@ -10,8 +10,8 @@ from .ngram_groups import compute_groups
 from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
 from .list_main import do_mainlist
 from .ngram_coocs import compute_coocs
-from .metric_specificity import compute_specificity
-from .list_map import do_maplist
+# TEST
+from .metric_specgen import compute_specgen
+from .list_map import do_maplist
 from .mail_notification import notify_owner
 from gargantext.util.db import session
 from gargantext.models import Node

@@ -136,22 +136,26 @@ def parse_extract_indexhyperdata(corpus):
     # => used for doc <=> ngram association
     # ------------

     # -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
     coocs = compute_coocs(corpus,
                           on_list_id = mainlist_id,
                           groupings_id = group_id,
-                          just_pass_result = True)
+                          just_pass_result = True,
+                          diagonal_filter = False)    # preserving the diagonal
+                                                      # (useful for spec/gen)
     print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))

-    # -> specificity: compute + write (=> NodeNgram)
-    spec_id = compute_specificity(corpus, cooc_matrix = coocs)
+    # -> specclusion/genclusion: compute + write (2 Nodes + 2 lists in NodeNgram)
+    (spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs)
     # no need here for subforms because cooc already counted them in mainform

-    print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
+    print('CORPUS #%d: [%s] new spec-clusion node #%i' % (corpus.id, t(), spec_id))
+    print('CORPUS #%d: [%s] new gen-clusion node #%i'  % (corpus.id, t(), gen_id))

     # maplist: compute + write (to Node and NodeNgram)
     map_id = do_maplist(corpus,
                         mainlist_id = mainlist_id,
-                        specificity_id = spec_id,
+                        specclusion_id = spec_id,
+                        genclusion_id  = gen_id,
                         grouplist_id = group_id
                         )
     print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))

@@ -187,7 +191,7 @@ def recount(corpus):
        - ndocs
        - ti_rank
        - coocs
-       - specificity
+       - specclusion/genclusion
       - tfidf

     NB: no new extraction, no list change, just the metrics

@@ -208,10 +212,15 @@ def recount(corpus):
         old_tirank_id = None

     try:
-        old_spec_id = corpus.children("SPECIFICITY").first().id
+        old_spec_id = corpus.children("SPECCLUSION").first().id
     except:
         old_spec_id = None

+    try:
+        old_gen_id = corpus.children("GENCLUSION").first().id
+    except:
+        old_gen_id = None

     try:
         old_ltfidf_id = corpus.children("TFIDF-CORPUS").first().id
     except:

@@ -254,11 +263,13 @@ def recount(corpus):
                           just_pass_result = True)
     print('RECOUNT #%d: [%s] updated mainlist coocs for specif rank' % (corpus.id, t()))

-    # -> specificity: compute + write (=> NodeNgram)
-    spec_id = compute_specificity(corpus, cooc_matrix = coocs, overwrite_id = old_spec_id)
+    # -> specclusion/genclusion: compute + write (=> NodeNodeNgram)
+    (spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs,
+                                        spec_overwrite_id = spec_id,
+                                        gen_overwrite_id  = gen_id)

-    print('RECOUNT #%d: [%s] updated specificity node #%i' % (corpus.id, t(), spec_id))
+    print('RECOUNT #%d: [%s] updated spec-clusion node #%i' % (corpus.id, t(), spec_id))
+    print('RECOUNT #%d: [%s] updated gen-clusion node #%i'  % (corpus.id, t(), gen_id))

     print('RECOUNT #%d: [%s] FINISHED metric recounts' % (corpus.id, t()))
gargantext/util/toolchain/metric_specgen.py (new file, 0 → 100644)
-------------------------------------------------------------------

"""
Computes a specificity metric from the ngram cooccurrence matrix.
  + SAVE => WeightedList => NodeNgram
"""

from gargantext.models     import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db    import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections           import defaultdict
from pandas                import DataFrame
from numpy                 import diag


def round3(floating_number):
    """
    Rounds a floating number to 3 decimals
    Good when we don't need so much details in the DB written data
    """
    return float("%.3f" % floating_number)


def compute_specgen(corpus, cooc_id=None, cooc_matrix=None,
                    spec_overwrite_id=None, gen_overwrite_id=None):
    '''
    Compute genericity/specificity:
      P(j|i) = N(ij) / N(ii)
      P(i|j) = N(ij) / N(jj)

      Gen(i)  = Sum{j} P(j_k|i)
      Spec(i) = Sum{j} P(i|j_k)

      Gen-clusion(i)  = (Spec(i) + Gen(i)) / 2
      Spec-clusion(i) = (Spec(i) - Gen(i)) / 2

    Parameters:
      - cooc_id: mandatory id of a cooccurrences node to use as base
      - spec_overwrite_id: optional preexisting specificity node to overwrite
      - gen_overwrite_id: optional preexisting genericity node to overwrite
    '''
    matrix = defaultdict(lambda: defaultdict(float))

    if cooc_id == None and cooc_matrix == None:
        raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")

    elif cooc_id:
        cooccurrences = (session.query(NodeNgramNgram)
                         .filter(NodeNgramNgram.node_id == cooc_id))
        # no filtering: cooc already filtered on mainlist_id at creation
        for cooccurrence in cooccurrences:
            matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
            # matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight

    elif cooc_matrix:
        # copy WeightedMatrix into local matrix structure
        for (ngram1_id, ngram2_id) in cooc_matrix.items:
            w = cooc_matrix.items[(ngram1_id, ngram2_id)]
            # ------- 8< --------------------------------------------
            # tempo hack to ignore lines/columns where diagonal == 0
            # £TODO find why they exist and then remove this snippet
            if (((ngram1_id, ngram1_id) not in cooc_matrix.items)
                or ((ngram2_id, ngram2_id) not in cooc_matrix.items)):
                continue
            # ------- 8< --------------------------------------------
            matrix[ngram1_id][ngram2_id] = w

    nb_ngrams = len(matrix)
    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)

    # example corpus (7 docs, 8 nouns)
    # --------------------------------
    # "The report says that humans are animals."
    # "The report says that rivers are full of water."
    # "The report says that humans like to make war."
    # "The report says that animals must eat food."
    # "The report says that animals drink water."
    # "The report says that humans like food and water."
    # "The report says that grass is food for some animals."

    #===========================================================================
    cooc_counts = DataFrame(matrix).fillna(0)

    # cooc_counts matrix
    # ------------------
    #          animals  food  grass  humans  report  rivers  war  water
    # animals        4     2      1       1       4       0    0      1
    # food           2     3      1       1       3       0    0      1
    # grass          1     1      1       0       1       0    0      0
    # humans         1     1      0       3       3       0    1      1
    # report         4     3      1       3       7       1    1      3
    # rivers         0     0      0       0       1       1    0      1
    # war            0     0      0       1       1       0    1      0
    # water          1     1      0       1       3       1    0      3

    #===========================================================================
    # conditional p(col|line)
    diagonal = list(diag(cooc_counts))
    # debug
    # print("WARN diag: ", diagonal)
    # print("WARN diag: =================== 0 in diagonal ?\n",
    #        0 in diagonal ? "what ??? zeros in the diagonal :/" : "ok no zeros",
    #        "\n===================")
    p_col_given_line = cooc_counts / list(diag(cooc_counts))

    # p_col_given_line
    # ----------------
    #          animals  food  grass  humans  report  rivers  war  water
    # animals      1.0   0.7    1.0     0.3     0.6     0.0  0.0    0.3
    # food         0.5   1.0    1.0     0.3     0.4     0.0  0.0    0.3
    # grass        0.2   0.3    1.0     0.0     0.1     0.0  0.0    0.0
    # humans       0.2   0.3    0.0     1.0     0.4     0.0  1.0    0.3
    # report       1.0   1.0    1.0     1.0     1.0     1.0  1.0    1.0
    # rivers       0.0   0.0    0.0     0.0     0.1     1.0  0.0    0.3
    # war          0.0   0.0    0.0     0.3     0.1     0.0  1.0    0.0
    # water        0.2   0.3    0.0     0.3     0.4     1.0  0.0    1.0

    #===========================================================================
    # total per lines (<=> genericity)
    Gen = p_col_given_line.sum(axis=1)

    # Gen.sort_values(ascending=False)
    # ---
    # report     8.0
    # animals    3.9
    # food       3.6
    # water      3.3
    # humans     3.3
    # grass      1.7
    # war        1.5
    # rivers     1.5

    #===========================================================================
    # total columnwise (<=> specificity)
    Spec = p_col_given_line.sum(axis=0)

    # Spec.sort_values(ascending=False)
    # ----
    # grass      4.0
    # food       3.7
    # water      3.3
    # humans     3.3
    # report     3.3
    # animals    3.2
    # war        3.0
    # rivers     3.0

    #===========================================================================
    # our "inclusion by specificity" metric
    Specclusion = Spec - Gen

    # Specclusion.sort_values(ascending=False)
    # -----------
    # grass      1.1
    # war        0.8
    # rivers     0.8
    # food       0.0
    # humans    -0.0
    # water     -0.0
    # animals   -0.3
    # report    -2.4

    #===========================================================================
    # our "inclusion by genericity" metric
    Genclusion = Spec + Gen

    # Genclusion.sort_values(ascending=False)
    # -----------
    # report    11.3
    # food       7.3
    # animals    7.2
    # water      6.7
    # humans     6.7
    # grass      5.7
    # war        4.5
    # rivers     4.5

    #===========================================================================
    # specificity node
    if spec_overwrite_id:
        # overwrite pre-existing id
        the_spec_id = spec_overwrite_id
        session.query(NodeNgram).filter(NodeNgram.node_id == the_spec_id).delete()
        session.commit()
    else:
        specnode = corpus.add_child(
            typename = "SPECCLUSION",
            name     = "Specclusion (in:%s)" % corpus.id
        )
        session.add(specnode)
        session.commit()
        the_spec_id = specnode.id

    if not Specclusion.empty:
        data = WeightedList(
                zip(Specclusion.index.tolist(),
                    [v for v in map(round3, Specclusion.values.tolist())])
               )
        data.save(the_spec_id)
    else:
        print("WARNING: had no terms in COOCS => empty SPECCLUSION node")

    #===========================================================================
    # genclusion node
    if gen_overwrite_id:
        the_gen_id = gen_overwrite_id
        session.query(NodeNgram).filter(NodeNgram.node_id == the_gen_id).delete()
        session.commit()
    else:
        gennode = corpus.add_child(
            typename = "GENCLUSION",
            name     = "Genclusion (in:%s)" % corpus.id
        )
        session.add(gennode)
        session.commit()
        the_gen_id = gennode.id

    if not Genclusion.empty:
        data = WeightedList(
                zip(Genclusion.index.tolist(),
                    [v for v in map(round3, Genclusion.values.tolist())])
               )
        data.save(the_gen_id)
    else:
        print("WARNING: had no terms in COOCS => empty GENCLUSION node")

    #===========================================================================
    return (the_spec_id, the_gen_id)
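
The toy matrix in the file's comments can be re-checked directly with pandas; a self-contained sketch (our illustration, not part of the commit):

from pandas import DataFrame
from numpy import diag

labels = ["animals", "food", "grass", "humans", "report", "rivers", "war", "water"]
counts = [[4,2,1,1,4,0,0,1],
          [2,3,1,1,3,0,0,1],
          [1,1,1,0,1,0,0,0],
          [1,1,0,3,3,0,1,1],
          [4,3,1,3,7,1,1,3],
          [0,0,0,0,1,1,0,1],
          [0,0,0,1,1,0,1,0],
          [1,1,0,1,3,1,0,3]]
cooc_counts = DataFrame(counts, index=labels, columns=labels)

# dividing by the diagonal broadcasts over columns: entry (i,j) becomes N(ij)/N(jj)
p = cooc_counts / list(diag(cooc_counts))

Gen  = p.sum(axis=1)        # row sums    -> 'report' tops at 8.0
Spec = p.sum(axis=0)        # column sums -> 'grass' tops at 4.0

print((Spec - Gen).sort_values(ascending=False).round(1))   # spec-clusion ranking
print((Spec + Gen).sort_values(ascending=False).round(1))   # gen-clusion ranking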
gargantext/util/toolchain/metric_specificity.py (deleted, 100644 → 0)
----------------------------------------------------------------------

"""
Computes a specificity metric from the ngram cooccurrence matrix.
  + SAVE => WeightedList => NodeNgram
"""

from gargantext.models     import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db    import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections           import defaultdict
from pandas                import DataFrame
import pandas as pd


def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id=None):
    '''
    Compute the specificity, simple calculus.

    Parameters:
      - cooc_id: mandatory id of a cooccurrences node to use as base
      - overwrite_id: optional preexisting specificity node to overwrite
    '''
    matrix = defaultdict(lambda: defaultdict(float))

    if cooc_id == None and cooc_matrix == None:
        raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")

    elif cooc_id:
        cooccurrences = (session.query(NodeNgramNgram)
                         .filter(NodeNgramNgram.node_id == cooc_id))
        # no filtering: cooc already filtered on mainlist_id at creation
        for cooccurrence in cooccurrences:
            matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
            matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight

    elif cooc_matrix:
        # copy WeightedMatrix into local matrix structure
        for (ngram1_id, ngram2_id) in cooc_matrix.items:
            w = cooc_matrix.items[(ngram1_id, ngram2_id)]
            matrix[ngram1_id][ngram2_id] = w

    nb_ngrams = len(matrix)
    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)

    x = DataFrame(matrix).fillna(0)

    # proba (x/y) ( <= we divide each line by its total)
    x = x / x.sum(axis=1)

    # vectorisation
    # d:Matrix => v: Vector (len = nb_ngrams)
    # v = d.sum(axis=1) (minus itself)
    xs = x.sum(axis=1) - x
    ys = x.sum(axis=0) - x

    # top included or excluded
    #n = ( xs + ys) / (2 * (x.shape[0] - 1))

    # top generic or specific (asc is spec, desc is generic)
    v = (xs - ys) / (2 * (x.shape[0] - 1))

    ## d ##
    #######
    #               Grenelle  biodiversité  kilomètres  site  élus  île
    # Grenelle             0             0           4     0     0    0
    # biodiversité         0             0           0     0     4    0
    # kilomètres           4             0           0     0     4    0
    # site                 0             0           0     0     4    6
    # élus                 0             4           4     4     0    0
    # île                  0             0           0     6     0    0

    ## d.sum(axis=1) ##
    ###################
    # Grenelle       4
    # biodiversité   4
    # kilomètres     8
    # site          10
    # élus          12
    # île            6

    # temporary result
    # -------------------
    # for now we use the line sums as the specificity ranking
    # (**same** order as with the pre-refactoring formula, but simpler to compute)
    # TODO check the mathematical AND semantic coherence of this indicator

    #v.sort_values(inplace=True)
    # [ ('biodiversité' , 0.333 ),
    #   ('Grenelle'     , 0.5   ),
    #   ('île'          , 0.599 ),
    #   ('kilomètres'   , 1.333 ),
    #   ('site'         , 1.333 ),
    #   ('élus'         , 1.899 ) ]

    # ----------------
    # specificity node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        session.query(NodeNgram).filter(NodeNgram.node_id == the_id).delete()
        session.commit()
    else:
        specnode = corpus.add_child(
            typename = "SPECIFICITY",
            name     = "Specif (in:%s)" % corpus.id
        )
        session.add(specnode)
        session.commit()
        the_id = specnode.id

    # print(v)
    pd.options.display.float_format = '${:,.2f}'.format

    if not v.empty:
        data = WeightedList(
                zip(v.index.tolist(), v.values.tolist()[0])
               )
        data.save(the_id)
    else:
        print("WARNING: had no terms in COOCS => empty SPECIFICITY node")

    return(the_id)
gargantext/util/toolchain/ngram_coocs.py
----------------------------------------

@@ -18,7 +18,8 @@ def compute_coocs( corpus,
                    stoplist_id     = None,
                    start           = None,
                    end             = None,
-                   symmetry_filter = False):
+                   symmetry_filter = False,
+                   diagonal_filter = True):
     """
     Count how often some extracted terms appear
     together in a small context (document)

@@ -55,6 +56,9 @@ def compute_coocs( corpus,
               NB the expected type of parameter value is datetime.datetime
                  (string is also possible but format must follow
                   this convention: "2001-01-01" aka "%Y-%m-%d")
      - symmetry_filter: prevent calculating where ngram1_id > ngram2_id
+     - diagonal_filter: prevent calculating where ngram1_id == ngram2_id

      (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present

@@ -69,7 +73,7 @@ def compute_coocs( corpus,
             JOIN nodes_ngrams AS idxb
                 ON idxa.node_id = idxb.node_id          <== that's cooc
             ---------------------------------
-            AND idxa.ngram_id <> idxb.ngram_id
+            AND idxa.ngram_id <> idxb.ngram_id          (diagonal_filter)
             AND idxa.node_id = MY_DOC ;

         on entire corpus

@@ -152,16 +156,14 @@ def compute_coocs( corpus,
             ucooc
             # for debug (2/4)
-            #, Xngram.terms.label("w_x")
-            #, Yngram.terms.label("w_y")
+            # , Xngram.terms.label("w_x")
+            # , Yngram.terms.label("w_y")
            )
            .join(Yindex, Xindex.node_id == Yindex.node_id)   # <- by definition of cooc
            .join(Node, Node.id == Xindex.node_id)            # <- b/c within corpus
            .filter(Node.parent_id == corpus.id)              # <- b/c within corpus
            .filter(Node.typename == "DOCUMENT")              # <- b/c within corpus
-           .filter(Xindex_ngform_id != Yindex_ngform_id)     # <- b/c not with itself
           )

     # outerjoin the synonyms if needed

@@ -179,12 +181,12 @@ def compute_coocs( corpus,
           .group_by(
              Xindex_ngform_id, Yindex_ngform_id   # <- what we're counting
              # for debug (3/4)
-             #,"w_x", "w_y"
+             # ,"w_x", "w_y"
           )
           # for debug (4/4)
-          #.join(Xngram, Xngram.id == Xindex_ngform_id)
-          #.join(Yngram, Yngram.id == Yindex_ngform_id)
+          # .join(Xngram, Xngram.id == Xindex_ngform_id)
+          # .join(Yngram, Yngram.id == Yindex_ngform_id)
          .order_by(ucooc)
         )

@@ -192,6 +194,9 @@ def compute_coocs( corpus,
     # 4) INPUT FILTERS (reduce N before O(N²))
     if on_list_id:
+        # £TODO separate lists, or one list for x and all ngrams for y,
+        #       which would allow expanding the list to nearest neighbours
+        #       (MacLachlan) (with a rectangular matrix)
         m1 = aliased(NodeNgram)
         m2 = aliased(NodeNgram)

@@ -226,6 +231,10 @@ def compute_coocs( corpus,
         )

+    if diagonal_filter:
+        # don't compute ngram with itself
+        coocs_query = coocs_query.filter(Xindex_ngform_id != Yindex_ngform_id)

     if start or end:
         Time = aliased(NodeHyperdata)

@@ -268,6 +277,7 @@ def compute_coocs( corpus,
     # threshold
     # £TODO adjust COOC_THRESHOLD a posteriori:
     #       ex: sometimes 2 sometimes 4 depending on sparsity
+    print("COOCS: filtering pairs under threshold:", threshold)
     coocs_query = coocs_query.having(ucooc >= threshold)
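
What diagonal_filter toggles is easiest to see on a toy count: with the filter on, a term is never paired with itself, so the N(ii) diagonal cells that compute_specgen divides by would all be missing. A small pure-Python sketch (ours, not repo code):

from collections import Counter
from itertools import product

docs = [["water", "river"], ["water", "war"]]

def coocs(docs, diagonal_filter=True):
    # count, per document, every ordered pair of its terms
    counts = Counter()
    for doc in docs:
        for x, y in product(doc, doc):
            if diagonal_filter and x == y:
                continue        # skip the (t, t) diagonal pairs
            counts[(x, y)] += 1
    return counts

print(coocs(docs)[("water", "water")])                         # 0
print(coocs(docs, diagonal_filter=False)[("water", "water")])  # 2  <- the N(ii) used for spec/gen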
gargantext/util/toolchain/ngrams_extraction.py
----------------------------------------------

@@ -77,7 +77,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
                     continue
                 # get ngrams
                 for ngram in ngramsextractor.extract(value):
-                    tokens = tuple(token[0] for token in ngram)
+                    tokens = tuple(normalize_forms(token[0]) for token in ngram)
                     if do_subngrams:
                         # ex tokens = ["very", "cool", "exemple"]

@@ -90,7 +90,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
                         subterms = [tokens]
                     for seqterm in subterms:
-                        ngram = normalize_terms(' '.join(seqterm))
+                        ngram = ' '.join(seqterm)
                         if len(ngram) > 1:
                             # doc <=> ngram index
                             nodes_ngrams_count[(document.id, ngram)] += 1

@@ -118,7 +118,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
         raise error

-def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
+def normalize_forms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
     """
     Removes unwanted trailing punctuation
     AND optionally puts everything to lowercase

@@ -127,14 +127,14 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
     (benefits from normalize_chars upstream so there's less cases to consider)
     """
-    # print('normalize_terms IN: "%s"' % term_str)
-    term_str = sub(r'^[-",;/%(){}\\\[\]\.\']+', '', term_str)
-    term_str = sub(r'[-",;/%(){}\\\[\]\.\']+$', '', term_str)
+    # print('normalize_forms IN: "%s"' % term_str)
+    term_str = sub(r'^[-\'",;/%(){}\\\[\]\. ©]+', '', term_str)
+    term_str = sub(r'[-\'",;/%(){}\\\[\]\. ©]+$', '', term_str)

     if do_lowercase:
         term_str = term_str.lower()

-    # print('normalize_terms OUT: "%s"' % term_str)
+    # print('normalize_forms OUT: "%s"' % term_str)
     return term_str
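
A quick check (ours, not from the commit) of what the widened character class in normalize_forms now strips: it adds apostrophes, spaces and '©' to the leading/trailing punctuation that was already removed before:

from re import sub

def normalize_forms_demo(term_str, do_lowercase=True):
    # the same two substitutions as the patched normalize_forms
    term_str = sub(r'^[-\'",;/%(){}\\\[\]\. ©]+', '', term_str)
    term_str = sub(r'[-\'",;/%(){}\\\[\]\. ©]+$', '', term_str)
    return term_str.lower() if do_lowercase else term_str

print(normalize_forms_demo("'water cycle',"))   # -> water cycle
print(normalize_forms_demo("© Elsevier"))       # -> elsevier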
gargantext/views/api/ngramlists.py
----------------------------------

@@ -57,7 +57,7 @@ class CSVLists(APIView):
         params in request.GET:
             onto_corpus: the corpus whose lists are getting patched

-        params in request.FILES:
+        params in request.data:
             csvfile: the csv file

         /!\ We assume we checked the file size client-side before upload