maplist creation from spec/gen metrics

b548874f · Romain Loth · d9b1cf7b · b548874f
Commit b548874f authored Jul 05, 2016 by Romain Loth
Hide whitespace changes
Inline Side-by-side

Showing with 107 additions and 33 deletions

list_map.py gargantext/util/toolchain/list_map.py +107 -33

No files found.
--- a/gargantext/util/toolchain/list_map.py
+++ b/gargantext/util/toolchain/list_map.py
@@ -27,23 +27,31 @@ def do_maplist(corpus,

    Parameters:
      - mainlist_id (starting point, already cleaned of stoplist terms)
-      - specclusion_id (inclusion by cooc specificity -- ranking factor)
-      - genclusion_id (inclusion by cooc genericity -- ranking factor)
+      - specclusion_id (ngram inclusion by cooc specificity -- ranking factor)
+      - genclusion_id (ngram inclusion by cooc genericity -- ranking factor)
      - grouplist_id (filtering grouped ones)
      - overwrite_id: optional if preexisting MAPLIST node to overwrite

-      + 2 constants to modulate the terms choice
+      + 3 params to modulate the terms choice
        - limit for the amount of picked terms
        - monograms_part: a ratio of terms with only one lexical unit to keep
+                          (multigrams quota = limit * (1-monograms_part))
+        - genclusion_part: a ratio of terms to pick by genericity ranking
+                           (speclusion quota = limit * (1-genclusion_part))
    '''

    if not (mainlist_id and specclusion_id and genclusion_id and grouplist_id):
        raise ValueError("Please provide mainlist_id, specclusion_id, genclusion_id and grouplist_id")

-    monograms_limit = round(limit * monograms_part)
-    multigrams_limit = limit - monograms_limit
-    print("MAPLIST: monograms_limit =", monograms_limit)
-    print("MAPLIST: multigrams_limit = ", multigrams_limit)
+    quotas = {'topgen':{}, 'topspec':{}}
+    genclusion_limit = round(limit * genclusion_part)
+    speclusion_limit = limit - genclusion_limit
+    quotas['topgen']['monograms'] = round(genclusion_limit * monograms_part)
+    quotas['topgen']['multigrams'] = genclusion_limit - quotas['topgen']['monograms']
+    quotas['topspec']['monograms'] = round(speclusion_limit * monograms_part)
+    quotas['topspec']['multigrams'] = speclusion_limit - quotas['topspec']['monograms']
+
+    print("MAPLIST quotas:", quotas)

    #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

@@ -58,11 +66,19 @@ def do_maplist(corpus,
                         )

    ScoreSpec=aliased(NodeNgram)
-
-    # specificity-ranked
-    query = (session.query(ScoreSpec.ngram_id)
+    ScoreGen=aliased(NodeNgram)
+
+    # ngram with both ranking factors spec and gen
+    query = (session.query(
+                        ScoreSpec.ngram_id,
+                        ScoreSpec.weight,
+                        ScoreGen.weight,
+                        Ngram.n
+                        )
                .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
+                .join(ScoreGen, ScoreGen.ngram_id == ScoreSpec.ngram_id)
                .filter(ScoreSpec.node_id == specclusion_id)
+                .filter(ScoreGen.node_id == genclusion_id)

                # we want only terms within mainlist
                .join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
@@ -72,36 +88,96 @@ def do_maplist(corpus,
                .outerjoin(IsSubform,
                           IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
                .filter(IsSubform.c.ngram2_id == None)
-            )

-    # TODO: move these 2 pools up to mainlist selection
-    top_monograms = (query
-                .filter(Ngram.n == 1)
+                # specificity-ranked
                .order_by(desc(ScoreSpec.weight))
-                .limit(monograms_limit)
-                .all()
-               )
+            )

-    top_multigrams = (query
-                .filter(Ngram.n >= 2)
-                .order_by(desc(ScoreSpec.weight))
-                .limit(multigrams_limit)
-                .all()
-               )
-    obtained_mono  = len(top_monograms)
-    obtained_multi = len(top_multigrams)
-    obtained_total = obtained_mono + obtained_multi
-    # print("MAPLIST: top_monograms =", obtained_mono)
-    # print("MAPLIST: top_multigrams = ", obtained_multi)
+    # format in scored_ngrams array:
+    # -------------------------------
+    # [(37723,    8.428, 14.239,   3    ),   etc]
+    #   ngramid   wspec   wgen    nwords
+    scored_ngrams = query.all()
+    n_ngrams = len(scored_ngrams)
+
+    # results, with same structure as quotas
+    chosen_ngrams = {
+                     'topgen':{'monograms':[], 'multigrams':[]},
+                     'topspec':{'monograms':[], 'multigrams':[]}
+                     }
+
+    # specificity and genericity tend to be inversely correlated,
+    # but occasionally the same ngram can be well ranked in both
+    # => we use a lookup table to check whether we already picked it
+    already_gotten_ngramids = {}
+
+    # 2 loops to fill spec-clusion then gen-clusion quotas
+    #   (1st loop uses order from DB, 2nd loop uses our own sort at end of 1st)
+    for rkr in ['topspec', 'topgen']:
+        got_enough_mono = False
+        got_enough_multi = False
+        all_done = False
+        i = -1
+        while((not all_done) and (not (got_enough_mono and got_enough_multi))):
+            # retrieve sorted ngram n° i
+            i += 1
+            (ng_id, wspec, wgen, nwords) = scored_ngrams[i]
+
+            # before any `continue` branch, check whether the next i would be past the end
+            all_done = (i+1 >= n_ngrams)
+
+            if ng_id in already_gotten_ngramids:
+                continue
+
+            # NB: nwords could be replaced by a simple search on r' '
+            if nwords == 1:
+                if got_enough_mono:
+                    continue
+                else:
+                    # add ngram to results and lookup
+                    chosen_ngrams[rkr]['monograms'].append(ng_id)
+                    already_gotten_ngramids[ng_id] = True
+            # multi
+            else:
+                if got_enough_multi:
+                    continue
+                else:
+                    # add ngram to results and lookup
+                    chosen_ngrams[rkr]['multigrams'].append(ng_id)
+                    already_gotten_ngramids[ng_id] = True
+
+            got_enough_mono = (len(chosen_ngrams[rkr]['monograms']) >= quotas[rkr]['monograms'])
+            got_enough_multi = (len(chosen_ngrams[rkr]['multigrams']) >= quotas[rkr]['multigrams'])
+
+        # at the end of the first loop we just need to sort all by the second ranker (gen)
+        scored_ngrams = sorted(scored_ngrams, key=lambda ng_infos: ng_infos[2], reverse=True)
+
+    obtained_spec_mono = len(chosen_ngrams['topspec']['monograms'])
+    obtained_spec_multi = len(chosen_ngrams['topspec']['multigrams'])
+    obtained_gen_mono = len(chosen_ngrams['topgen']['monograms'])
+    obtained_gen_multi = len(chosen_ngrams['topgen']['multigrams'])
+    obtained_total = obtained_spec_mono   \
+                    + obtained_spec_multi \
+                    + obtained_gen_mono   \
+                    + obtained_gen_multi
+    print("MAPLIST: top_spec_monograms =",  obtained_spec_mono)
+    print("MAPLIST: top_spec_multigrams =", obtained_spec_multi)
+    print("MAPLIST: top_gen_monograms =",   obtained_gen_mono)
+    print("MAPLIST: top_gen_multigrams =",  obtained_gen_multi)
    print("MAPLIST: kept %i ngrams in total " % obtained_total)

+    obtained_data = chosen_ngrams['topspec']['monograms']      \
+                    + chosen_ngrams['topspec']['multigrams']   \
+                    + chosen_ngrams['topgen']['monograms']     \
+                    + chosen_ngrams['topgen']['multigrams']
+
    # NEW MAPLIST NODE
    # -----------------
    # saving the parameters of the analysis in the Node JSON
    new_hyperdata = { 'corpus': corpus.id,
                      'limit' : limit,
-                      'monograms_part' : monograms_part,
-                     'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else 0
+                      'monograms_part' :  monograms_part,
+                      'genclusion_part' : genclusion_part,
                    }
    if overwrite_id:
        # overwrite pre-existing node
@@ -122,9 +198,7 @@ def do_maplist(corpus,
        the_id = the_maplist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
-    datalist = UnweightedList(
-                   [res.ngram_id for res in top_monograms + top_multigrams]
-               )
+    datalist = UnweightedList(obtained_data)

    # save
    datalist.save(the_id)