Commit 58aa990d authored by Romain Loth

maplist generation and better estimates for constants (thresholds)

parent 744ec7f1
@@ -9,9 +9,11 @@ LISTTYPES = {
    'STOPLIST'     : UnweightedList,
    'MAINLIST'     : UnweightedList,
    'MAPLIST'      : UnweightedList,
    'SPECIFICITY'  : WeightedList,
    'OCCURRENCES'  : WeightedContextIndex,
    'COOCCURRENCES': WeightedMatrix,
    'TFIDF-CORPUS' : WeightedContextIndex,
    'TFIDF-GLOBAL' : WeightedContextIndex,
}

NODETYPES = [
@@ -92,10 +94,21 @@ RESOURCETYPES = [
    # },
]
# linguistic extraction parameters ---------------------------------------------
DEFAULT_TFIDF_CUTOFF_RATIO      = .45   # MAINLIST maximum terms in %

DEFAULT_TFIDF_HARD_LIMIT        = 750   # MAINLIST maximum terms abs
                                        # (makes COOCS larger ~ O(N²) /!\)

DEFAULT_COOC_THRESHOLD          = 5     # inclusive minimum for COOCS coefs
                                        # (makes COOCS more sparse)

DEFAULT_MAPLIST_MAX             = 300   # MAPLIST maximum terms
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5    # part of monograms in MAPLIST
                                        # (NB: used to be 0.005 !!)
# ------------------------------------------------------------------------------
# other parameters
# default number of docs POSTed to scrappers.views.py
...
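For instance, with the defaults above the MAPLIST is capped at 300 terms, half of which may be monograms. A quick sketch of the arithmetic (illustration only, not code from the commit):

limit = 300                        # DEFAULT_MAPLIST_MAX
monograms_part = .5                # DEFAULT_MAPLIST_MONOGRAMS_RATIO
monograms_limit  = round(limit * monograms_part)    # = 150
multigrams_limit = limit - monograms_limit          # = 150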
from .parsing import parse
from .ngrams_extraction import extract_ngrams
# in usual run order
from .list_stop import do_stoplist
from .ngram_scores import compute_occurrences_local, compute_tfidf
from .list_main import do_mainlist
from .ngram_coocs_tempo import compute_coocs
from .score_specificity import compute_specificity
from .list_map import do_maplist # TEST
from .ngram_groups import compute_groups

from gargantext.util.db import session
@@ -40,10 +41,19 @@ def parse_extract(corpus):
    # -------------------------------
    print('CORPUS #%d: [%s] starting ngram lists computation' % (corpus.id, t()))

    # -> stoplist: filter + write (to Node and NodeNgram)
    stop_id = do_stoplist(corpus)
    print('CORPUS #%d: [%s] new stoplist node #%i' % (corpus.id, t(), stop_id))

    # -> write groups to Node and NodeNgramNgram
    group_id = compute_groups(corpus, stoplist_id = None)
    print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))

    # -> write occurrences to Node and NodeNodeNgram  # possible: factorize with tfidf
    occ_id = compute_occurrences_local(corpus)
    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))

    # ------------
    # -> write local tfidf to Node and NodeNodeNgram
    ltfidf_id = compute_tfidf(corpus, scope="local")
    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
@@ -52,31 +62,27 @@ def parse_extract(corpus):
    gtfidf_id = compute_tfidf(corpus, scope="global")
    print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))

    # -> mainlist: filter + write (to Node and NodeNgram)
    mainlist_id = do_mainlist(corpus,
                              tfidf_id = gtfidf_id,
                              stoplist_id = stop_id)
    print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))

    # ------------
    # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
    cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id)
    print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))

    # -> specificity: compute + write (=> NodeNodeNgram)
    spec_id = compute_specificity(corpus, cooc_id=cooc_id)
    print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))

    # ?? maplist: compute + write (to Node and NodeNgram)
    map_id = do_maplist(corpus,
                        mainlist_id = mainlist_id,
                        specificity_id=spec_id,
                        grouplist_id=group_id)
    print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))

def t():
...
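The overwrite_id parameters introduced in the list functions below (do_mainlist, do_maplist, compute_coocs, do_stoplist, ...) are there so a list step can be re-run without piling up nodes. A hypothetical re-run, assuming the node ids from a previous pass are at hand (sketch only, not part of the diff):

# sketch: recompute the maplist in place, reusing the previous MAPLIST node
map_id = do_maplist(corpus,
                    mainlist_id    = mainlist_id,
                    specificity_id = spec_id,
                    grouplist_id   = group_id,
                    overwrite_id   = map_id)   # previous maplist node gets overwritten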
@@ -2,26 +2,38 @@ from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from gargantext.constants import DEFAULT_TFIDF_CUTOFF_RATIO, \
                                 DEFAULT_TFIDF_HARD_LIMIT

def do_mainlist(corpus,
                overwrite_id = None,
                tfidf_id=None, stoplist_id=None,
                hard_limit=DEFAULT_TFIDF_HARD_LIMIT,
                ratio_limit=DEFAULT_TFIDF_CUTOFF_RATIO
                ):
""" """
Select terms for the mainlist according to a global tfidf and stoplist. Select top n terms according to a global tfidf ranking and stoplist filter.
The number of selected terms will be: The number of selected terms will be:
min(hard_limit, number_of_terms * ratio_limit) min(hard_limit, number_of_terms * ratio_limit)
NB : We use a global tfidf node where the values are global but the ngrams NB : We use a global tfidf node where the values are global but the ngrams
are already selected (== only within this corpus documents). are already selected (== only within this corpus documents).
TO DISCUSS: allow influence of the local tfidf scores too
Parameters: Parameters:
2 limits are useful to set a maximum amount of picked terms - the corpus itself
- ratio_limit: relative to the number of distinct ngrams [0,1] - a tfidf score for ranking the ngrams
- hard_limit: absolute value [default: 1000] - a stoplist for filtering some ngrams
- overwrite_id: optional id of a pre-existing MAINLIST node for this corpus
(the Node and its previous NodeNgram rows will be replaced)
+ 2 limits to set the amount of picked terms:
- ratio_limit ∈ [0,1]: a ratio relative to the number of distinct ngrams
(default: 0.55)
- hard_limit: an absolute max value
(default: 1000)
""" """
    # retrieve helper nodes if not provided
@@ -61,20 +73,26 @@ def do_mainlist(corpus, tfidf_id=None, stoplist_id=None,
    nb_ngrams = ordered_filtered_tfidf.count()

    # apply ratio to find smallest limit
    our_limit = min(hard_limit, round(nb_ngrams * ratio_limit))

    print("MAINLIST: keeping %i ngrams out of %i" % (our_limit, nb_ngrams))

    # DB retrieve up to limit => MAINLIST
    top_ngrams_ids = ordered_filtered_tfidf.limit(our_limit).all()

    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        # mainlist = cache.Node[overwrite_id]
    else:
        # now create the new MAINLIST node
        mainlist = corpus.add_child(
            typename = "MAINLIST",
            name = "Mainlist (in:%s)" % corpus.id
        )
        session.add(mainlist)
        session.commit()
        the_id = mainlist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    UnweightedList(top_ngrams_ids).save(the_id)
...
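A worked instance of the cutoff rule above, with a hypothetical ngram count (illustration only):

# hypothetical: 1200 distinct tfidf-ranked ngrams after stoplist filtering
nb_ngrams = 1200
our_limit = min(750, round(1200 * .45))   # round(540.0) = 540 -> the ratio wins
# with 2000 ngrams the hard limit would win instead: min(750, 900) = 750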
"""
Selects a subset of corpus ngrams to use in the graph map.
"""
from gargantext.models.ngrams import Node, Ngram, NodeNgram, \
                                     NodeNgramNgram, NodeNodeNgram
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from gargantext.constants import DEFAULT_MAPLIST_MAX,\
                                 DEFAULT_MAPLIST_MONOGRAMS_RATIO

def do_maplist(corpus,
               overwrite_id = None,
               mainlist_id = None,
               specificity_id = None,
               grouplist_id = None,
               limit=DEFAULT_MAPLIST_MAX,
               monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO
               ):
    '''
    According to Specificities and mainlist

    Parameters:
      - mainlist_id (starting point, already cleaned of stoplist terms)
      - specificity_id (ranking factor)
      - grouplist_id (filtering grouped ones)
      - overwrite_id: optional if preexisting MAPLIST node to overwrite

      + 2 constants to modulate the terms choice
        - limit for the amount of picked terms
        - monograms_part: a ratio of terms with only one lexical unit to keep
    '''
    if not (mainlist_id and specificity_id and grouplist_id):
        raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id")

    monograms_limit = round(limit * monograms_part)
    multigrams_limit = limit - monograms_limit
    print("MAPLIST: monograms_limit =", monograms_limit)
    print("MAPLIST: multigrams_limit = ", multigrams_limit)

    #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

    mainterms_subquery = (session
                            # we want only terms within mainlist
                            .query(NodeNgram.ngram_id)
                            .filter(NodeNgram.node_id == mainlist_id)
                            .subquery()
                         )

    primary_groupterms_subquery = (session
                            # we want only primary terms (ngram1)
                            .query(NodeNgramNgram.ngram1_id)
                            .filter(NodeNgramNgram.node_id == grouplist_id)
                            .subquery()
                         )

    ScoreSpec=aliased(NodeNgram)

    # specificity-ranked
    query = (session.query(ScoreSpec.ngram_id)
                .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
                .filter(ScoreSpec.node_id == specificity_id)
                .filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
                .filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
            )

    top_monograms = (query
                        .filter(Ngram.n == 1)
                        .order_by(desc(ScoreSpec.weight))
                        .limit(monograms_limit)
                        .all()
                    )

    top_multigrams = (query
                        .filter(Ngram.n >= 2)
                        .order_by(desc(ScoreSpec.weight))
                        .limit(multigrams_limit)
                        .all()
                     )

    print("MAPLIST: top_monograms =", len(top_monograms))
    print("MAPLIST: top_multigrams = ", len(top_multigrams))

    # NEW MAPLIST NODE
    # -----------------
    # saving the parameters of the analysis in the Node JSON
    new_hyperdata = { 'corpus': corpus.id,
                      'limit' : limit,
                      'monograms_part' : monograms_part
                    }
    if overwrite_id:
        # overwrite pre-existing node
        the_maplist = cache.Node[overwrite_id]
        the_maplist.hyperdata = new_hyperdata
        the_maplist.save_hyperdata()
        session.commit()
        the_id = overwrite_id
    else:
        # create a new maplist node
        the_maplist = corpus.add_child(
            name="Maplist (in %i)" % corpus.id,
            typename="MAPLIST",
            hyperdata = new_hyperdata
        )
        session.add(the_maplist)
        session.commit()
        the_id = the_maplist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    datalist = UnweightedList(
        [res.ngram_id for res in top_monograms + top_multigrams]
    )

    # save
    datalist.save(the_id)

    # dbg.show('MapList computed')

    return the_id
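The selection implemented by the SQLAlchemy query above can be read as a plain ranking: keep only terms that are both in the mainlist and primary forms in the grouplist, sort them by specificity, then take the monogram and multigram tops separately. A rough Python paraphrase with hypothetical in-memory data (not the actual DB code):

# hypothetical in-memory stand-ins for the DB rows
spec_scores = {101: 3.2, 102: 2.9, 103: 1.1}      # ngram_id -> specificity weight
mainlist_ids = {101, 102, 103}
primary_group_ids = {101, 102}                    # ngram1_id side of the grouplist
ngram_len = {101: 1, 102: 2, 103: 2}              # Ngram.n
monograms_limit, multigrams_limit = 150, 150

candidates = [ng for ng in spec_scores
              if ng in mainlist_ids and ng in primary_group_ids]
ranked = sorted(candidates, key=lambda ng: spec_scores[ng], reverse=True)
top_monograms  = [ng for ng in ranked if ngram_len[ng] == 1][:monograms_limit]
top_multigrams = [ng for ng in ranked if ngram_len[ng] >= 2][:multigrams_limit]
# -> top_monograms == [101], top_multigrams == [102]; 103 is dropped (not a primary group form)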
"""
Creates a filtering list for corpus ngrams.
(implementation: regexp + "master" stoplist)
"""
from gargantext.util.db import session, func
from gargantext.util.lists import WeightedMatrix
from gargantext.models import User, Node, Ngram, NodeNgram
from gargantext.constants import LISTTYPES
from re import compile
from sqlalchemy import desc

def is_stop_word(ngram, stop_words=None):
    '''
    ngram :: (Int, String) => (ngram_id, ngram_terms)
    stop_words :: Set of String
    (to avoid SQL query each time is_stop_word is invoked, get in as parameter)
    '''
    word = ngram[1]
@@ -41,7 +39,7 @@ def isStopWord(ngram, stop_words=None):
            , "(.*)(travers)(.*)"
            , "(.*)(:|\|)(.*)"
            ] :
        compiled_regexes.append(compile(regex))

    for format_regex in compiled_regexes:
        if format_regex.match(word):
@@ -61,32 +59,27 @@ def create_gargantua_resources():
    session.add(stopList)
    session.commit()

def do_stoplist(corpus, overwrite_id=None):
    '''
    Create list of stop words.
    TODO do a function to get all stop words with social scores

    Parameters:
        - overwrite_id: optional preexisting STOPLIST node to overwrite
    '''
    # Get preexisting StopList if provided in overwrite_id param
    if overwrite_id:
        stoplist_id = overwrite_id
    # At this step of development, a new StopList should be created
    else:
        stoplist = corpus.add_child(
            name="Stoplist (in:%s)" % corpus.id,
            typename="STOPLIST"
        )
        session.add(stoplist)
        session.commit()
        stoplist_id = stoplist.id

    # Get common resources, all common StopWords on the platform
    ## First get the id of the StopList of Gargantua super user
@@ -107,23 +100,23 @@ def compute_stop(corpus, stopList_id=None, debug=False):
    ## Get the ngrams
    ## ngrams :: [(Int, String, Int)]
    ngrams = (session.query( Ngram.id, Ngram.terms)
                .join( NodeNgram, NodeNgram.ngram_id == Ngram.id )
                .join( Node, Node.id == NodeNgram.node_id )
                .filter( Node.parent_id == corpus.id,
                         Node.typename == "DOCUMENT")
                .group_by( Ngram.id )
                #.limit(limit)
                .all()
              )

    ngrams_to_stop = filter(
        lambda x: is_stop_word(x, stop_words=stop_words), ngrams
    )

    # print([n for n in ngrams_to_stop])
    stop = LISTTYPES["STOPLIST"]({ n[0] : -1 for n in ngrams_to_stop})
    # stop = LISTTYPES["STOPLIST"]([n[0] for n in ngrams_to_stop])

    stop.save(stoplist_id)
    return stoplist_id
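One detail worth keeping in mind with the new filter(...) form: in Python 3, filter() returns a lazy iterator, so ngrams_to_stop can only be traversed once (here by the dict comprehension that builds the STOPLIST). An eager equivalent, with hypothetical ngram tuples (sketch only):

# hypothetical (id, terms) tuples and master stop_words set
ngrams = [(487, "of"), (294, "copper engraving"), (101, "fig. 1: overview")]
stop_words = {"of"}

ngrams_to_stop = [ng for ng in ngrams
                  if is_stop_word(ng, stop_words=stop_words)]
stop = LISTTYPES["STOPLIST"]({ng[0]: -1 for ng in ngrams_to_stop})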
from gargantext.models import Node, NodeNgram, NodeNgramNgram
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD

def compute_coocs(corpus,
                  overwrite_id = None,
                  threshold = DEFAULT_COOC_THRESHOLD,
                  mainlist_id = None,
                  stoplist_id = None,
                  symmetry_filter = True):
    """
    Count how often some extracted terms appear
    together in a small context (document)
    throughout a larger context (corpus).

    [NodeNgram]                    [NodeNgramNgram]

    node_id | ngram_id | weight    ngram1_id | ngram2_id | score |
    --------+----------+--------   ----------+-----------+-------+
     MYDOCA |   487    |    1   =>    487    |    294    |   2   |
     MYDOCA |   294    |    3
     MYDOCB |   487    |    1
     MYDOCB |   294    |    4

    Fill that info in DB:
      - a *new* COOCCURRENCES node
@@ -25,14 +30,16 @@ def compute_coocs(corpus,
    worst case complexity ~ O(N²/2) with N = number of ngrams

    If a mainlist is provided, we filter doc ngrams to those also in the list.

    Parameters:
      - the corpus node
      - overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
                      (all hyperdata and previous NodeNgramNgram rows will be replaced)
      - threshold: on output cooc count (previously called hapax)
      - mainlist_id: mainlist to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
                     (normally unnecessary if a mainlist is provided)

    (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
@@ -54,14 +61,17 @@ def compute_coocs(corpus,
    coocs for each doc :
      - each given pair like (termA, termB) will likely appear several times
        => we do GROUP BY (x1.ngram_id, x2.ngram_id)
      - we count unique appearances of the pair (cooc)
    """
    # - TODO cvalue_id: allow a metric as additional input filter
    # - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
    # - TODO start, end : filter on document date
    # - TODO weighted: if False normal cooc to be saved as result
    #                  if True weighted cooc (experimental)

    # /!\ big combinatorial complexity /!\
    # for 8439 rows in the nodes_ngrams index, 1442 of which have occ > 1
    # 1,859,408 rows for the simple cooc query
@@ -94,10 +104,22 @@ def compute_coocs(corpus,
    # 2) INPUT FILTERS (reduce N before O(N²))
    if mainlist_id:
        main_subquery = (
            session.query(NodeNgram.ngram_id)
            .filter(NodeNgram.node_id == mainlist_id)
            .subquery()
        )

        coocs_query = ( coocs_query
            .filter( x1.ngram_id.in_(main_subquery) )
            .filter( x2.ngram_id.in_(main_subquery) )
        )

    if stoplist_id:
        stop_subquery = (
            session.query(NodeNgram.ngram_id)
            .filter(NodeNgram.node_id == stoplist_id)
            .subquery()
        )
@@ -128,30 +150,36 @@ def compute_coocs(corpus,
    # 3) OUTPUT FILTERS
    # ------------------
    # threshold
    # coocs_query = coocs_query.having(ucooc >= threshold)
    coocs_query = coocs_query.having(ucooc > threshold)

    # 4) EXECUTE QUERY
    # ----------------
    #  => storage in our matrix structure
    matrix = WeightedMatrix(coocs_query.all())

    # fyi
    # shape_0 = len({pair[0] for pair in matrix.items})
    # shape_1 = len({pair[1] for pair in matrix.items})
    # print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

    # 5) SAVE
    # --------
    # saving the parameters of the analysis in the Node JSON
    new_hyperdata = { 'corpus': corpus.id,
                      'threshold': threshold }
    if overwrite_id:
        # overwrite pre-existing id
        the_cooc = cache.Node[overwrite_id]
        the_cooc.hyperdata = new_hyperdata
        the_cooc.save_hyperdata()
        session.commit()
        the_id = overwrite_id
    else:
        # create the new cooc node
        the_cooc = corpus.add_child(
            typename  = "COOCCURRENCES",
            name      = "Coocs (in:%s)" % corpus.name[0:10],
            hyperdata = new_hyperdata,
        )
        session.add(the_cooc)
        session.commit()
...
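The docstring table above can be read as a counting rule: within each document, every unordered pair of retained ngrams counts once, and the counts are summed over documents. Note also that the output filter kept in the code uses a strict `> threshold` while the constant's comment calls it an inclusive minimum. A rough Python stand-in for the counting rule, with hypothetical doc contents (not the SQL implementation):

from itertools import combinations
from collections import Counter

# hypothetical doc -> retained ngram_ids (already restricted to the mainlist)
docs = {"MYDOCA": {487, 294}, "MYDOCB": {487, 294}}

coocs = Counter()
for ngram_ids in docs.values():
    for pair in combinations(sorted(ngram_ids), 2):
        coocs[pair] += 1                 # one appearance of the pair per doc

# keep only pairs strictly above the threshold, as in coocs_query.having(ucooc > threshold)
threshold = 1
kept = {pair: n for pair, n in coocs.items() if n > threshold}
# -> {(294, 487): 2}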
"""
For initial ngram groups via stemming
Example:
  - groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
  - groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
"""
from gargantext.models import Node, NodeNgramNgram
from gargantext.util.db import session
from gargantext.util.lists import Translations

# to convert fr => french :/
from gargantext.util.languages import languages

from re import split as resplit
from collections import defaultdict, Counter
from nltk.stem.snowball import SnowballStemmer

def prepare_stemmers(corpus):
    """
@@ -22,7 +29,7 @@ def prepare_stemmers(corpus):
        stemmers_by_lg[lgiso2] = SnowballStemmer(lgname)
    return stemmers_by_lg

def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
    """
    1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
    2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
@@ -98,17 +105,21 @@ def compute_groups(corpus, stoplist_id = None):
    del my_groups

    # 2) the list node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
    # or create the new id
    else:
        the_group = corpus.add_child(
            typename = "GROUPLIST",
            name = "Group (src:%s)" % corpus.name[0:10]
        )

        # and save the node
        session.add(the_group)
        session.commit()
        the_id = the_group.id

    # 3) Save each grouping couple to DB thanks to Translations.save() table
    ndngng_list = Translations(
...
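The grouping key in the module docstring ('copper engrav') comes from stemming each token of a term. A short sketch with NLTK's SnowballStemmer; the stem_key helper is hypothetical, only the stemmer and resplit usage follow the imports above:

from re import split as resplit
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

def stem_key(term):
    # hypothetical helper: one key per stemmed form of the term
    return " ".join(stemmer.stem(tok) for tok in resplit(r"\W+", term.lower()) if tok)

stem_key("copper engraving")   # 'copper engrav'
stem_key("coppers engraver")   # 'copper engrav'  -> same group as above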
"""
Computes ngram scores with 3 ranking functions:
- the simple sum of occurrences inside the corpus
- the tfidf inside the corpus
- the global tfidf for all corpora having same source
FIXME: "having the same source" means we need to select inside hyperdata
with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ...
"""
from gargantext.models import Node, NodeNgram, NodeNodeNgram from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session, bulk_insert from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
from sqlalchemy import text from sqlalchemy import text # for query from raw SQL statement
from math import log
# £TODO # £TODO
# from gargantext.util.lists import WeightedContextIndex # from gargantext.util.lists import WeightedContextIndex
from gargantext.util.db import func # = sqlalchemy.func like sum() or count()
from math import log
def compute_occurrences_local(corpus): def compute_occurrences_local(corpus, overwrite_id = None):
""" """
Calculates sum of occs per ngram within corpus Calculates sum of occs per ngram within corpus
(used as info in the ngrams table view)
? optimize ? OCCS here could be calculated simultaneously within TFIDF-CORPUS loop
Parameters:
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
""" """
# 1) all the doc_ids of our corpus (scope of counts for filter) # 1) all the doc_ids of our corpus (scope of counts for filter)
...@@ -37,32 +52,41 @@ def compute_occurrences_local(corpus): ...@@ -37,32 +52,41 @@ def compute_occurrences_local(corpus):
# ^^^^ ^^^ # ^^^^ ^^^
# ngram_id sum_wei # ngram_id sum_wei
# create the new OCCURRENCES node
occnode = Node() if overwrite_id:
occnode.typename = "OCCURRENCES" # overwrite pre-existing id
occnode.name = "occ_sums (in:%s)" % corpus.id the_id = overwrite_id
occnode.parent_id = corpus.id # occnode = cache.Node[overwrite_id]
occnode.user_id = corpus.user_id else:
session.add(occnode) # create the new OCCURRENCES node
session.commit() occnode = corpus.add_child(
typename = "OCCURRENCES",
name = "occ_sums (in:%s)" % corpus.id
)
session.add(occnode)
session.commit()
the_id = occnode.id
# reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf) # reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
# £TODO replace bulk_insert by something like WeightedContextMatrix.save() # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
bulk_insert( bulk_insert(
NodeNodeNgram, NodeNodeNgram,
('node1_id' , 'node2_id', 'ngram_id', 'score'), ('node1_id' , 'node2_id', 'ngram_id', 'score'),
((occnode.id, corpus.id, res[0], res[1]) for res in occ_sums) ((the_id, corpus.id, res[0], res[1]) for res in occ_sums)
) )
return occnode.id return the_id
def compute_tfidf(corpus, scope="local"): def compute_tfidf(corpus, scope="local", overwrite_id=None):
""" """
Calculates tfidf within the current corpus Calculates tfidf within the current corpus
Parameter: Parameters:
- the corpus itself
- scope: {"local" or "global"} - scope: {"local" or "global"}
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
""" """
# local <=> within this corpus # local <=> within this corpus
...@@ -121,23 +145,27 @@ def compute_tfidf(corpus, scope="local"): ...@@ -121,23 +145,27 @@ def compute_tfidf(corpus, scope="local"):
tfidfs[ngram_id] = tf * (log_tot_docs-log(nd)) tfidfs[ngram_id] = tf * (log_tot_docs-log(nd))
# ------------------------------------------------- # -------------------------------------------------
# create the new TFIDF-CORPUS node if overwrite_id:
tfidf_nd = Node(parent_id = corpus.id, user_id = corpus.user_id) the_id = overwrite_id
if scope == "local": else:
tfidf_nd.typename = "TFIDF-CORPUS" # create the new TFIDF-XXXX node
tfidf_nd.name = "tfidf-c (in:%s)" % corpus.id tfidf_nd = corpus.add_child()
elif scope == "global": if scope == "local":
tfidf_nd.typename = "TFIDF-GLOBAL" tfidf_nd.typename = "TFIDF-CORPUS"
tfidf_nd.name = "tfidf-g (in type:%s)" % this_source_type tfidf_nd.name = "tfidf-c (in:%s)" % corpus.id
session.add(tfidf_nd) elif scope == "global":
session.commit() tfidf_nd.typename = "TFIDF-GLOBAL"
tfidf_nd.name = "tfidf-g (in type:%s)" % this_source_type
session.add(tfidf_nd)
session.commit()
the_id = tfidf_nd.id
# reflect that in NodeNodeNgrams # reflect that in NodeNodeNgrams
# £TODO replace bulk_insert by something like WeightedContextMatrix.save() # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
bulk_insert( bulk_insert(
NodeNodeNgram, NodeNodeNgram,
('node1_id' , 'node2_id', 'ngram_id', 'score'), ('node1_id', 'node2_id','ngram_id', 'score'),
((tfidf_nd.id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs) ((the_id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
) )
return tfidf_nd.id return the_id
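The tfidf line kept above uses a natural-log idf: tf * (ln(total_docs) - ln(nd)), i.e. tf * ln(total_docs / nd). A small numeric check with hypothetical counts (sketch only):

from math import log

total_docs = 1000          # hypothetical corpus (or source-wide) document count
log_tot_docs = log(total_docs)

tf = 7                     # occurrences of the ngram in the tf scope
nd = 50                    # number of documents containing the ngram

tfidf = tf * (log_tot_docs - log(nd))   # = 7 * ln(1000/50) = 7 * ln(20) ≈ 20.97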
"""
Computes a specificity metric from the ngram cooccurrence matrix.
 + SAVE => WeightedList => NodeNgram
"""
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame

def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
    '''
    Compute the specificity, simple calculus.

    Parameters:
      - cooc_id: mandatory id of a cooccurrences node to use as base
      - overwrite_id: optional preexisting specificity node to overwrite
    '''

    cooccurrences = (session.query(NodeNgramNgram)
                    .filter(NodeNgramNgram.node_id==cooc_id)
                    )
    # no filtering: new choice cooc already filtered on tfidf before creation

    matrix = defaultdict(lambda : defaultdict(float))
@@ -30,7 +31,9 @@ def compute_specificity(corpus, cooc_id, limit=100):
    nb_ngrams = len(matrix)

    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)

    d = DataFrame(matrix).fillna(0)

    # proba (x/y) ( <= each column is divided by its total)
    d = d / d.sum(axis=0)
@@ -74,28 +77,27 @@ def compute_specificity(corpus, cooc_id, limit=100):
    # ----------------
    # specificity node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id==the_id).delete()
        session.commit()
    else:
        specnode = corpus.add_child(
            typename = "SPECIFICITY",
            name = "Specif (in:%s)" % corpus.id
        )
        session.add(specnode)
        session.commit()
        the_id = specnode.id

    # print(v)

    data = WeightedList(
                zip(  v.index.tolist()
                    , v.values.tolist()
                   )
           )
    data.save(the_id)

    return(the_id)
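The normalization step above turns each cooccurrence column into a conditional distribution: d / d.sum(axis=0) divides every column by its own total. A tiny check with pandas and hypothetical counts:

from pandas import DataFrame

# hypothetical cooc counts: each column is the "given" term
d = DataFrame({'graph': {'graph': 0.0, 'node': 4.0, 'edge': 6.0},
               'node':  {'graph': 4.0, 'node': 0.0, 'edge': 2.0}})

p = d / d.sum(axis=0)     # each column now sums to 1
# p['graph'] -> graph: 0.0, node: 0.4, edge: 0.6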
@@ -94,6 +94,7 @@ def project(request, project_id):
        )
        session.add(corpus)
        session.commit()
        # parse_extract: fileparsing -> ngram extraction -> lists
        scheduled(parse_extract)(corpus.id)

    # corpora within this project
@@ -101,16 +102,26 @@ def project(request, project_id):
    sourcename2corpora = defaultdict(list)
    for corpus in corpora:
        # we only consider the first resource of the corpus to determine its type
        resources = corpus.resources()
        if len(resources):
            resource = resources[0]
            resource_type_name = RESOURCETYPES[resource['type']]['name']
        else:
            print("(WARNING) PROJECT view: no listed resource")

        # add some data for the viewer
        corpus.count = corpus.children('DOCUMENT').count()
        status = corpus.status()
        if status is not None and not status['complete']:
            if not status['error']:
                corpus.status_message = '(in progress: %s, %d complete)' % (
                    status['action'].replace('_', ' '),
                    status['progress'],
                )
            else:
                corpus.status_message = '(aborted: "%s" after %i docs)' % (
                    status['error'][-1],
                    status['progress']
                )
        else:
            corpus.status_message = ''
        # add
...