humanities / gargantext — Commits

Commit f7d58faf, authored Mar 15, 2016 by delanoe

    [MERGE] merge of Romain and Mathieu branches.

Parents: 309e6c69, eec89097

Changes: 15 changed files, with 1340 additions and 34 deletions (+1340 −34)
Files changed:

    doc/ngram_parsing_flow.png                        +0    −0
    gargantext/constants.py                           +33   −15
    gargantext/models/ngrams.py                       +10   −2
    gargantext/util/analysis/cooccurrences.py         +207  −0
    gargantext/util/lists.py                          +49   −9
    gargantext/util/toolchain/__init__.py             +69   −2
    gargantext/util/toolchain/list_main.py            +100  −0
    gargantext/util/toolchain/list_map.py             +123  −0
    gargantext/util/toolchain/list_stop.py            +122  −0
    gargantext/util/toolchain/metric_specificity.py   +103  −0
    gargantext/util/toolchain/metric_tfidf.py         +171  −0
    gargantext/util/toolchain/ngram_coocs.py          +193  −0
    gargantext/util/toolchain/ngram_groups.py         +133  −0
    gargantext/util/toolchain/parsing.py              +8    −0
    gargantext/views/pages/projects.py                +19   −6
doc/ngram_parsing_flow.png (new file, mode 100644, 52.5 KB — binary image not shown)
gargantext/constants.py

@@ -9,29 +9,32 @@ LISTTYPES = {
     'STOPLIST'      : UnweightedList,
     'MAINLIST'      : UnweightedList,
     'MAPLIST'       : UnweightedList,
-    'OCCURRENCES'   : WeightedList,
     'SPECIFICITY'   : WeightedList,
+    'OCCURRENCES'   : WeightedContextIndex,
     'COOCCURRENCES' : WeightedMatrix,
+    'TFIDF-CORPUS'  : WeightedContextIndex,
+    'TFIDF-GLOBAL'  : WeightedContextIndex,
 }

 NODETYPES = [
     None,
     # documents hierarchy
-    'USER',
-    'PROJECT',
-    'CORPUS',
-    'DOCUMENT',
+    'USER',             # 1
+    'PROJECT',          # 2
+    'CORPUS',           # 3
+    'DOCUMENT',         # 4
     # lists
-    'STOPLIST',
-    'GROUPLIST',
-    'MAINLIST',
-    'MAPLIST',
-    'COOCCURRENCES',
+    'STOPLIST',         # 5
+    'GROUPLIST',        # 6
+    'MAINLIST',         # 7
+    'MAPLIST',          # 8
+    'COOCCURRENCES',    # 9
     # scores
-    'OCCURRENCES',
-    'SPECIFICITY',
-    'CVALUE',
-    'TFIDF-CORPUS',
-    'TFIDF-GLOBAL',
+    'OCCURRENCES',      # 10
+    'SPECIFICITY',      # 11
+    'CVALUE',           # 12
+    'TFIDF-CORPUS',     # 13
+    'TFIDF-GLOBAL',     # 14
 ]

 import datetime

@@ -108,6 +111,21 @@ RESOURCETYPES = [
     # },
 ]

+# linguistic extraction parameters ---------------------------------------------
+DEFAULT_TFIDF_CUTOFF_RATIO      = .45   # MAINLIST maximum terms in %
+
+DEFAULT_TFIDF_HARD_LIMIT        = 750   # MAINLIST maximum terms abs
+                                        # (makes COOCS larger ~ O(N²) /!\)
+
+DEFAULT_COOC_THRESHOLD          = 5     # inclusive minimum for COOCS coefs
+                                        # (makes COOCS more sparse)
+
+DEFAULT_MAPLIST_MAX             = 300   # MAPLIST maximum terms
+
+DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5    # part of monograms in MAPLIST
+                                        # (NB: used to be 0.005 !!)
+# ------------------------------------------------------------------------------
+
 # other parameters
 # default number of docs POSTed to scrappers.views.py
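
The two MAINLIST constants interact in do_mainlist (gargantext/util/toolchain/list_main.py, added below): the ratio applies to the number of distinct ngrams first, then the absolute cap wins. A standalone sketch with invented corpus sizes:

    # Standalone sketch: effective MAINLIST size for a few corpus sizes.
    DEFAULT_TFIDF_CUTOFF_RATIO = .45
    DEFAULT_TFIDF_HARD_LIMIT   = 750

    for nb_ngrams in (1000, 2000, 5000):
        kept = min(DEFAULT_TFIDF_HARD_LIMIT,
                   round(nb_ngrams * DEFAULT_TFIDF_CUTOFF_RATIO))
        print(nb_ngrams, "->", kept)    # 1000 -> 450, 2000 -> 750, 5000 -> 750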
gargantext/models/ngrams.py

@@ -19,7 +19,7 @@ class NodeNgram(Base):
     weight = Column(Float)

 class NodeNodeNgram(Base):
-    """ for instance for tfidf:
+    """ for instance for TFIDF
         (doc ::Node, corpus ::Node, ...

@@ -37,8 +37,16 @@ class NodeNodeNgram(Base):
     # (cf. www.postgresql.org/docs/9.4/static/datatype-numeric.html#DATATYPE-FLOAT)

 class NodeNgramNgram(Base):
+    """ for instance for COOCCURRENCES and GROUPLIST
+        (cooc_node/group_node ::Node,
+         term_A ::Ngram,
+         term_B ::Ngram,
+         weight ::Float (real)
+        )
+    """
     __tablename__ = 'nodes_ngrams_ngrams'

     node_id   = Column(Integer, ForeignKey(Node.id, ondelete='CASCADE'), primary_key=True)
     ngram1_id = Column(Integer, ForeignKey(Ngram.id, ondelete='CASCADE'), primary_key=True)
     ngram2_id = Column(Integer, ForeignKey(Ngram.id, ondelete='CASCADE'), primary_key=True)
-    weight    = Column(Float)
+    weight    = Column(Float(precision=24))    # see comment for NodeNodeNgram.score
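
Given this schema, reading the stored pairs back is a single keyed query. A minimal sketch, assuming an open SQLAlchemy session and a known cooccurrences node id (both hypothetical, not part of this commit):

    pairs = (session.query(NodeNgramNgram.ngram1_id,
                           NodeNgramNgram.ngram2_id,
                           NodeNgramNgram.weight)
                    .filter(NodeNgramNgram.node_id == cooc_node_id)  # hypothetical id
                    .all())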
gargantext/util/analysis/cooccurrences.py (new file, mode 100644)

from gargantext.util.db       import *
from gargantext.util.db_cache import *
from gargantext.constants     import *

from gargantext.models.nodes  import Node
from gargantext.models.ngrams import Ngram, NodeNgram, NodeNgramNgram, \
                                     NodeHyperdataNgram, NodeHyperdata, Hyperdata

from sqlalchemy     import desc, asc, or_, and_, Date, cast, select
from sqlalchemy     import literal_column
from sqlalchemy.orm import aliased
from sqlalchemy.sql import func

import datetime
import inspect

def do_cooc(corpus=None,
            field1='ngrams', field2='ngrams',
            main_id=None, stop_id=None, group_id=None,
            cvalue_id=None,
            n_min=1, n_max=None,
            start=None, end=None,
            limit=1000,
            isMonopartite=True,
            hapax=3,
            session=None):
    '''
    Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id
    For the moment, lists as parameters are not supported because lists need to
    be merged beforehand.
    corpus :: Corpus

    cvalue_id :: Int
    main_id   :: Int
    stop_id   :: Int
    group_id  :: Int

    For the moment, start and end are simple; only the year is implemented yet
    start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
    end   :: TimeStamp
    limit :: Int
    '''
    # TODO : add hyperdata here

    # Security test
    field1, field2 = str(field1), str(field2)

    # Get node
    node_cooc = session.query(Node).filter(
                    Node.parent_id == corpus.id,
                    Node.typename == "COOCCURRENCES"
                ).first()

    if node_cooc is None:
        node_cooc = Node(name="Coccurrences node",
                         parent_id=corpus.id,
                         user_id=corpus.user_id,
                         typename="COOCCURRENCES")
        session.add(node_cooc)
        session.commit()

    # BEGIN
    # Saving the parameters of the analysis in the Node JSONB hyperdata field
    args, _, _, parameters = inspect.getargvalues(inspect.currentframe())
    # hyperdata = dict()
    #
    # for parameter in parameters.keys():
    #     if parameter != 'corpus' and parameter != 'node_cooc':
    #         hyperdata[parameter] = parameters[parameter]
    #
    # node_cooc.hyperdata = hyperdata
    #
    # session.add(node_cooc)
    # session.commit()
    # END

    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_cooc.id).delete()
    session.commit()

    NodeNgramX = aliased(NodeNgram)

    cooc_score = func.count(NodeNgramX.node_id).label('cooc_score')
    #cooc_score = func.sqrt(func.sum(NodeNgramX.weight * NodeNgramY.weight)).label('cooc_score')

    #print([n for n in test_query])

    if isMonopartite:
        NodeNgramY = aliased(NodeNgram)

        cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
                      .join(Node, Node.id == NodeNgramX.node_id)
                      .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                      .filter(Node.parent_id == corpus.id, Node.typename == "DOCUMENT")
                      )
    else:
        NodeNgramY = aliased(NodeNgram)

        cooc_query = (session.query(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id, cooc_score)
                      .join(Node, Node.id == NodeHyperdataNgram.node_id)
                      .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                      .join(Hyperdata, Hyperdata.id == NodeHyperdataNgram.hyperdata_id)
                      .filter(Node.parent_id == corpus.id, Node.typename == "DOCUMENT")
                      .filter(Hyperdata.name == field1)
                      )

    #print(cooc_query)

    # Size of the ngrams between n_min and n_max
    if n_min is not None or n_max is not None:
        if isMonopartite:
            NgramX = aliased(Ngram)
            cooc_query = cooc_query.join(NgramX, NgramX.id == NodeNgramX.ngram_id)

        NgramY = aliased(Ngram)
        cooc_query = (cooc_query.join(NgramY, NgramY.id == NodeNgramY.ngram_id))

    if n_min is not None:
        cooc_query = (cooc_query.filter(NgramY.n >= n_min))
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n >= n_min)

    if n_max is not None:
        cooc_query = (cooc_query.filter(NgramY.n <= n_max))
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n <= n_max)

    # Cooc between the dates start and end
    if start is not None:
        #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        # TODO : more complex date format here.
        date_start = datetime.datetime.strptime(str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start = aliased(NodeHyperdata)
        StartFormat = aliased(Hyperdata)
        cooc_query = (cooc_query.join(Start, Start.node_id == Node.id)
                                .join(StartFormat, StartFormat.id == Start.hyperdata_id)
                                .filter(StartFormat.name == 'publication_date')
                                .filter(Start.value_datetime >= date_start_utc)
                      )

    if end is not None:
        # TODO : more complex date format here.
        date_end = datetime.datetime.strptime(str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End = aliased(NodeHyperdata)
        EndFormat = aliased(Hyperdata)
        cooc_query = (cooc_query.join(End, End.node_id == Node.id)
                                .join(EndFormat, EndFormat.id == End.hyperdata_id)
                                .filter(EndFormat.name == 'publication_date')
                                .filter(End.value_datetime <= date_end_utc)
                      )

    if isMonopartite:
        # Cooc is symmetric, take only the main cooccurrences and cut at the limit
        cooc_query = cooc_query.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)

    cooc_query = cooc_query.having(cooc_score > hapax)

    if isMonopartite:
        cooc_query = cooc_query.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
    else:
        cooc_query = cooc_query.group_by(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id)

    cooc_query = cooc_query.order_by(desc('cooc_score'))
    # END of the query

    matrix = LISTTYPES["COOCCURRENCES"](cooc_query)
    #print(matrix)

    if isMonopartite:
        if main_id is not None:
            main_list = LISTTYPES["MAINLIST"](main_id)
        if stop_id is not None:
            stop_list = LISTTYPES["STOPLIST"](stop_id)
        if group_id is not None:
            group_list = LISTTYPES["GROUPLIST"](group_id)

        if main_id is not None and stop_id is None and group_id is None:
            cooc = matrix & main_list
        elif main_id is not None and stop_id is not None and group_id is None:
            cooc = matrix & (main_list - stop_list)
        elif main_id is not None and stop_id is not None and group_id is not None:
            print("main_id is not None and stop_id is not None and group_id is not None")
            cooc = matrix & (main_list * group_list - stop_list)
            #cooc = matrix & (main_list - stop_list)
        elif main_id is not None and stop_id is None and group_id is not None:
            cooc = matrix & (main_list * group_list)
        else:
            cooc = matrix
    else:
        cooc = matrix

    cooc.save(node_cooc.id)
    return(node_cooc.id)
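
A hedged usage sketch (the corpus object, the list node ids and the session are all assumptions, not values from this commit):

    # Build and store the cooc matrix for one corpus, intersected with a mainlist
    # and a grouplist, minus a stoplist; keep only pairs seen more than 3 times.
    cooc_node_id = do_cooc(corpus=my_corpus,
                           main_id=mainlist_node_id,
                           stop_id=stoplist_node_id,
                           group_id=grouplist_node_id,
                           start='2010-01-01', end='2015-12-31',
                           hapax=3,
                           session=session)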
gargantext/util/lists.py

@@ -2,7 +2,7 @@
 """
-__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList']
+__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedContextIndex']

 from gargantext.util.db import session, bulk_insert

@@ -70,8 +70,10 @@ class _BaseClass:

 class Translations(_BaseClass):
-    def __init__(self, source=None):
+    def __init__(self, source=None, just_items=False):
         self.items = defaultdict(int)
+        # TODO lazyinit for groups
+        #      (not necessary for save)
         self.groups = defaultdict(set)
         if source is None:
             return

@@ -83,15 +85,35 @@ class Translations(_BaseClass):
                 .filter(NodeNgramNgram.node_id == source)
             )
             self.items.update(query)
-            for key, value in self.items.items():
-                self.groups[value].add(key)
+            if not just_items:
+                for key, value in self.items.items():
+                    self.groups[value].add(key)
         elif isinstance(source, Translations):
             self.items.update(source.items)
-            self.groups.update(source.groups)
+            if not just_items:
+                self.groups.update(source.groups)
         elif hasattr(source, '__iter__'):
+            # not very intuitive with update here:
+            # /!\ source must be "reversed" (like self.items)
+            #     bad example
+            #     In > couples = [(1, 2), (1, 3)]
+            #     In > tlko = Translations(couples)
+            #     Out> Translations {1: 3}
+            #     In > tlko.save()
+            #     DB-- 3 -> 1
+            #     good example
+            #     In > reversed_couples = [(2, 1), (3, 1)]
+            #     In > tlok = Translations(reversed_couples)
+            #     Out> Translations {2: 1, 3: 1}
+            #     In > tlok.save()
+            #     DB-- 1 -> 2
+            #     DB-- 1 -> 3
             self.items.update(source)
-            for key, value in self.items.items():
-                self.groups[value].add(key)
+            if not just_items:
+                for key, value in self.items.items():
+                    self.groups[value].add(key)
         else:
             raise TypeError

@@ -138,11 +160,29 @@ class Translations(_BaseClass):
         # insert new data
         bulk_insert(
             NodeNgramNgram,
-            ('node_id', 'ngram2_id', 'ngram1_id', 'score'),
+            ('node_id', 'ngram2_id', 'ngram1_id', 'weight'),
             ((node_id, key, value, 1.0) for key, value in self.items.items())
         )

+
+class WeightedContextIndex(_BaseClass):
+    """
+    associated model   : NodeNodeNgram
+    associated columns : node1_id | node2_id | ngram_id | score (float)
+
+    Tensor representing a contextual index or registry
+    (matrix of weighted ngrams *per* doc *per* context)
+
+    Example: tfidf by corpus
+    """
+    def __init__(self, source=None):
+        self.items = defaultdict(float)
+        # £TODO
+

 class WeightedMatrix(_BaseClass):
     def __init__(self, source=None):

@@ -184,7 +224,7 @@ class WeightedMatrix(_BaseClass):
         # insert new data
         bulk_insert(
             NodeNgramNgram,
-            ('node_id', 'ngram1_id', 'ngram2_id', 'score'),
+            ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
             ((node_id, key1, key2, value) for key1, key2, value in self)
         )
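
The reversed-couples convention documented above can be checked without the DB. A standalone sketch that mimics what Translations.__init__ does with an iterable source (plain dicts, no gargantext imports):

    from collections import defaultdict

    items  = defaultdict(int)    # secondary form -> primary form (like self.items)
    groups = defaultdict(set)    # primary form -> set of secondary forms

    reversed_couples = [(2, 1), (3, 1)]          # (secondary, primary)
    items.update(reversed_couples)
    for key, value in items.items():
        groups[value].add(key)

    print(dict(items))     # {2: 1, 3: 1}
    print(dict(groups))    # {1: {2, 3}}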
gargantext/util/toolchain/__init__.py

 from .parsing import parse
 from .ngrams_extraction import extract_ngrams
 from .hyperdata_indexing import index_hyperdata
+
+# in usual run order
+from .list_stop          import do_stoplist
+from .metric_tfidf       import compute_occs, compute_tfidf
+from .list_main          import do_mainlist
+from .ngram_coocs        import compute_coocs
+from .metric_specificity import compute_specificity
+from .list_map           import do_maplist
+# TEST
+from .ngram_groups       import compute_groups

 from gargantext.util.db import session
 from gargantext.models import Node
+from datetime import datetime


 def parse_extract(corpus):
     # retrieve corpus from database from id

@@ -18,6 +27,12 @@ def parse_extract(corpus):
     # apply actions
     print('CORPUS #%d' % (corpus.id))
     parse(corpus)
+    # was there an error in the process ?
+    if corpus.status()['error']:
+        print("ERROR: aborting parse_extract for corpus #%i" % corpus.id)
+        return None
     print('CORPUS #%d: parsed' % (corpus.id))
     extract_ngrams(corpus)
     print('CORPUS #%d: extracted ngrams' % (corpus.id))

@@ -38,3 +53,55 @@ def parse_extract_indexhyperdata(corpus):
     print('CORPUS #%d: extracted ngrams' % (corpus.id))
     index_hyperdata(corpus)
     print('CORPUS #%d: indexed hyperdata' % (corpus.id))
+
+    # -------------------------------
+    # temporary ngram lists workflow
+    # -------------------------------
+
+    print('CORPUS #%d: [%s] starting ngram lists computation' % (corpus.id, t()))
+
+    # -> stoplist: filter + write (to Node and NodeNgram)
+    stop_id = do_stoplist(corpus)
+    print('CORPUS #%d: [%s] new stoplist node #%i' % (corpus.id, t(), stop_id))
+
+    # -> write groups to Node and NodeNgramNgram
+    group_id = compute_groups(corpus, stoplist_id=None)
+    print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
+
+    # -> write occurrences to Node and NodeNodeNgram  # possible: factorize with tfidf
+    occ_id = compute_occs(corpus)
+    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
+
+    # ------------
+    # -> write local tfidf to Node and NodeNodeNgram
+    ltfidf_id = compute_tfidf(corpus, scope="local")
+    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
+
+    # -> write global tfidf to Node and NodeNodeNgram
+    gtfidf_id = compute_tfidf(corpus, scope="global")
+    print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))
+
+    # -> mainlist: filter + write (to Node and NodeNgram)
+    mainlist_id = do_mainlist(corpus, tfidf_id=gtfidf_id, stoplist_id=stop_id)
+    print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
+
+    # ------------
+    # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
+    cooc_id = compute_coocs(corpus, mainlist_id=mainlist_id)
+    print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
+
+    # -> specificity: compute + write (=> NodeNodeNgram)
+    spec_id = compute_specificity(corpus, cooc_id=cooc_id)
+    print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
+
+    # ?? maplist: compute + write (to Node and NodeNgram)
+    map_id = do_maplist(corpus,
+                        mainlist_id=mainlist_id,
+                        specificity_id=spec_id,
+                        grouplist_id=group_id)
+    print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
+
+
+def t():
+    return datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
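
For reference, a sketch of the console trace this workflow prints (corpus and node ids invented, timestamps as produced by t()):

    CORPUS #42: parsed
    CORPUS #42: extracted ngrams
    CORPUS #42: indexed hyperdata
    CORPUS #42: [2016-03-15_10:00:01] starting ngram lists computation
    CORPUS #42: [2016-03-15_10:00:02] new stoplist node #101
    CORPUS #42: [2016-03-15_10:00:04] new grouplist node #102
    CORPUS #42: [2016-03-15_10:00:05] new occs node #103
    CORPUS #42: [2016-03-15_10:00:06] new localtfidf node #104
    CORPUS #42: [2016-03-15_10:00:08] new globaltfidf node #105
    CORPUS #42: [2016-03-15_10:00:09] new mainlist node #106
    CORPUS #42: [2016-03-15_10:00:15] new coocs node #107
    CORPUS #42: [2016-03-15_10:00:17] new specificity node #108
    CORPUS #42: [2016-03-15_10:00:18] new maplist node #109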
gargantext/util/toolchain/list_main.py (new file, mode 100644)

from gargantext.models     import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db    import session
from gargantext.util.lists import UnweightedList
from sqlalchemy            import desc
from gargantext.constants  import DEFAULT_TFIDF_CUTOFF_RATIO, \
                                  DEFAULT_TFIDF_HARD_LIMIT

def do_mainlist(corpus,
                overwrite_id = None,
                tfidf_id     = None,
                stoplist_id  = None,
                hard_limit   = DEFAULT_TFIDF_HARD_LIMIT,
                ratio_limit  = DEFAULT_TFIDF_CUTOFF_RATIO
                ):
    """
    Select top n terms according to a global tfidf ranking and stoplist filter.
    The number of selected terms will be:
        min(hard_limit, number_of_terms * ratio_limit)

    NB: We use a global tfidf node where the values are global but the ngrams
        are already selected (== only within this corpus' documents).
    TO DISCUSS: allow influence of the local tfidf scores too

    Parameters:
      - the corpus itself
      - a tfidf score for ranking the ngrams
      - a stoplist for filtering some ngrams
      - overwrite_id: optional id of a pre-existing MAINLIST node for this corpus
                      (the Node and its previous NodeNgram rows will be replaced)
      + 2 limits to set the amount of picked terms:
        - ratio_limit ∈ [0,1]: a ratio relative to the number of distinct ngrams
                               (default: 0.45)
        - hard_limit: an absolute max value
                      (default: 750)
    """
    # retrieve helper nodes if not provided
    if not tfidf_id:
        tfidf_id = session.query(Node.id).filter(
                       Node.typename == "TFIDF-GLOBAL",
                       Node.parent_id == corpus.id
                   ).first()
        if not tfidf_id:
            raise ValueError("MAINLIST: TFIDF node needed for mainlist creation")

    if not stoplist_id:
        stoplist_id = session.query(Node.id).filter(
                          Node.typename == "STOPLIST",
                          Node.parent_id == corpus.id
                      ).first()
        if not stoplist_id:
            raise ValueError("MAINLIST: STOPLIST node needed for mainlist creation")

    # the ngrams we don't want
    # NOTE: make sure we do this only once during the ngram initial workflow
    stopterms_subquery = (session
                            .query(NodeNgram.ngram_id)
                            .filter(NodeNgram.node_id == stoplist_id)
                            .subquery()
                          )

    # tfidf-ranked query
    ordered_filtered_tfidf = (session
        .query(NodeNodeNgram.ngram_id)
        .filter(NodeNodeNgram.node1_id == tfidf_id)
        .filter(~NodeNodeNgram.ngram_id.in_(stopterms_subquery))
        .order_by(desc(NodeNodeNgram.score))
    )

    # total count
    nb_ngrams = ordered_filtered_tfidf.count()

    # apply ratio to find smallest limit
    our_limit = min(hard_limit, round(nb_ngrams * ratio_limit))
    print("MAINLIST: keeping %i ngrams out of %i" % (our_limit, nb_ngrams))

    # DB retrieve up to limit => MAINLIST
    top_ngrams_ids = ordered_filtered_tfidf.limit(our_limit).all()

    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        # mainlist = cache.Node[overwrite_id]
    else:
        # now create the new MAINLIST node
        mainlist = corpus.add_child(
            typename = "MAINLIST",
            name     = "Mainlist (in:%s)" % corpus.id
        )
        session.add(mainlist)
        session.commit()
        the_id = mainlist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    UnweightedList(top_ngrams_ids).save(the_id)

    return the_id
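
The same selection logic in miniature, on plain Python data (terms and scores invented for illustration):

    tfidf = {'cell': 9.1, 'protein': 7.2, 'assay': 3.3, 'membrane': 2.1, 'value': 1.0}
    stop  = {'value'}

    ranked = sorted((t for t in tfidf if t not in stop), key=tfidf.get, reverse=True)
    our_limit = min(750, round(len(ranked) * .45))    # 4 candidates -> keep 2
    print(ranked[:our_limit])                         # ['cell', 'protein']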
gargantext/util/toolchain/list_map.py (new file, mode 100644)

"""
Selects a subset of corpus ngrams to use in the graph map.
"""
from gargantext.models.ngrams import Node, Ngram, NodeNgram, \
                                     NodeNgramNgram, NodeNodeNgram
from gargantext.util.db       import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.util.lists    import UnweightedList
from sqlalchemy               import desc
from gargantext.constants     import DEFAULT_MAPLIST_MAX, \
                                     DEFAULT_MAPLIST_MONOGRAMS_RATIO

def do_maplist(corpus,
               overwrite_id   = None,
               mainlist_id    = None,
               specificity_id = None,
               grouplist_id   = None,
               limit          = DEFAULT_MAPLIST_MAX,
               monograms_part = DEFAULT_MAPLIST_MONOGRAMS_RATIO
               ):
    '''
    Selects the map list according to specificities and the mainlist.

    Parameters:
      - mainlist_id (starting point, already cleaned of stoplist terms)
      - specificity_id (ranking factor)
      - grouplist_id (filtering grouped ones)
      - overwrite_id: optional id of a preexisting MAPLIST node to overwrite
      + 2 constants to modulate the terms choice:
        - limit for the amount of picked terms
        - monograms_part: a ratio of terms with only one lexical unit to keep
    '''
    if not (mainlist_id and specificity_id and grouplist_id):
        raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id")

    monograms_limit = round(limit * monograms_part)
    multigrams_limit = limit - monograms_limit
    print("MAPLIST: monograms_limit =", monograms_limit)
    print("MAPLIST: multigrams_limit = ", multigrams_limit)

    #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

    mainterms_subquery = (session
                            # we want only terms within mainlist
                            .query(NodeNgram.ngram_id)
                            .filter(NodeNgram.node_id == mainlist_id)
                            .subquery()
                          )

    primary_groupterms_subquery = (session
                                     # we want only primary terms (ngram1)
                                     .query(NodeNgramNgram.ngram1_id)
                                     .filter(NodeNgramNgram.node_id == grouplist_id)
                                     .subquery()
                                   )

    ScoreSpec = aliased(NodeNgram)

    # specificity-ranked
    query = (session.query(ScoreSpec.ngram_id)
             .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
             .filter(ScoreSpec.node_id == specificity_id)
             .filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
             .filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
             )

    # TODO: move these 2 pools up to mainlist selection
    top_monograms = (query
                     .filter(Ngram.n == 1)
                     .order_by(desc(ScoreSpec.weight))
                     .limit(monograms_limit)
                     .all()
                     )

    top_multigrams = (query
                      .filter(Ngram.n >= 2)
                      .order_by(desc(ScoreSpec.weight))
                      .limit(multigrams_limit)
                      .all()
                      )

    print("MAPLIST: top_monograms =", len(top_monograms))
    print("MAPLIST: top_multigrams = ", len(top_multigrams))

    # NEW MAPLIST NODE
    # -----------------
    # saving the parameters of the analysis in the Node JSON
    new_hyperdata = {'corpus': corpus.id,
                     'limit': limit,
                     'monograms_part': monograms_part
                     }
    if overwrite_id:
        # overwrite pre-existing node
        the_maplist = cache.Node[overwrite_id]
        the_maplist.hyperdata = new_hyperdata
        the_maplist.save_hyperdata()
        session.commit()
        the_id = overwrite_id
    else:
        # create a new maplist node
        the_maplist = corpus.add_child(
            name      = "Maplist (in %i)" % corpus.id,
            typename  = "MAPLIST",
            hyperdata = new_hyperdata
        )
        session.add(the_maplist)
        session.commit()
        the_id = the_maplist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    datalist = UnweightedList(
        [res.ngram_id for res in top_monograms + top_multigrams]
    )

    # save
    datalist.save(the_id)

    # dbg.show('MapList computed')
    return the_id
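
The monogram/multigram budget is plain arithmetic; a quick check with the shipped defaults:

    limit, monograms_part = 300, .5    # DEFAULT_MAPLIST_MAX, DEFAULT_MAPLIST_MONOGRAMS_RATIO
    monograms_limit  = round(limit * monograms_part)
    multigrams_limit = limit - monograms_limit
    print(monograms_limit, multigrams_limit)    # 150 150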
gargantext/util/toolchain/list_stop.py (new file, mode 100644)

"""
Creates a filtering list for corpus ngrams.
(implementation: regexp + "master" stoplist)
"""
from gargantext.models    import User, Node, Ngram, NodeNgram
from gargantext.util.db   import session, func
from gargantext.constants import LISTTYPES
from re                   import compile
from sqlalchemy           import desc

def is_stop_word(ngram, stop_words=None):
    '''
    ngram :: (Int, String) => (ngram_id, ngram_terms)
    stop_words :: Set of String
    (to avoid an SQL query each time is_stop_word is invoked, pass it in as parameter)
    '''
    word = ngram[1]

    if word in stop_words:
        return True

    compiled_regexes = []   # to compile them only once
    for regex in [
              r"^.{1,2}$"
            , r"(.*)\d(.*)"
            # , r"(.*)(\.)(.*)"       too strong (removes acronyms!)
            , r"(.*)(\,)(.*)"
            , r"(.*)(< ?/?p ?>)(.*)"          # paragraph markers
            , r"(.*)(study)(.*)"
            , r"(.*)\b(xx|xi|xv)\b(.*)"
            , r"(.*)(result)(.*)"
            , r"(.*)(année|nombre|moitié)(.*)"
            , r"(.*)(temps)(.*)"
            , r"(.*)(%)(.*)"
            , r"(.*)(\{)(.*)"
            , r"(.*)(terme)(.*)"
            , r"(.*)(différent)(.*)"
            , r"(.*)(travers)(.*)"
            , r"(.*)(:|\|)(.*)"
            ]:
        compiled_regexes.append(compile(regex))

    for format_regex in compiled_regexes:
        if format_regex.match(word):
            # print("STOPLIST += '%s' (regex: %s)" % (word, format_regex.pattern))
            return True

    return False

def create_gargantua_resources():
    gargantua_id = session.query(User.id).filter(User.username == "gargantua").first()
    project = Node(
        name     = "Resources",
        user_id  = gargantua_id,
        typename = "PROJECT")
    stopList = Node(
        name      = "STOPLIST",
        parent_id = project.id,
        user_id   = gargantua_id,
        typename  = "STOPLIST")
    session.add(project)
    session.add(stopList)
    session.commit()

def do_stoplist(corpus, overwrite_id=None):
    '''
    Create list of stop words.
    TODO do a function to get all stop words with social scores

    Parameters:
      - overwrite_id: optional id of a preexisting STOPLIST node to overwrite
    '''
    # Get preexisting StopList if provided in overwrite_id param
    if overwrite_id:
        stoplist_id = overwrite_id
    # At this step of development, a new StopList should be created
    else:
        stoplist = corpus.add_child(
            name     = "Stoplist (in:%s)" % corpus.id,
            typename = "STOPLIST"
        )
        session.add(stoplist)
        session.commit()
        stoplist_id = stoplist.id

    # Get common resources, all common StopWords on the platform
    ## First get the id of the StopList of the Gargantua super user
    gargantua_id = session.query(User.id).filter(User.username == "gargantua").first()
    rootStopList_id = session.query(Node.id).filter(
        Node.user_id  == gargantua_id,
        Node.typename == "STOPLIST"
    ).first()

    ## Then get all the stop words
    ## stop_words :: [String]
    stop_words = (session.query(Ngram.terms)
                  .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                  .filter(NodeNgram.node_id == rootStopList_id)
                  .all()
                  )
    # print([n for n in stop_words])

    ## Get the ngrams
    ## ngrams :: [(Int, String, Int)]
    ngrams = (session.query(Ngram.id, Ngram.terms)
              .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
              .join(Node, Node.id == NodeNgram.node_id)
              .filter(Node.parent_id == corpus.id,
                      Node.typename == "DOCUMENT")
              .group_by(Ngram.id)
              #.limit(limit)
              .all()
              )

    ngrams_to_stop = filter(
        lambda x: is_stop_word(x, stop_words=stop_words), ngrams
    )
    # print([n for n in ngrams_to_stop])

    stop = LISTTYPES["STOPLIST"]({n[0]: -1 for n in ngrams_to_stop})
    # stop = LISTTYPES["STOPLIST"]([n[0] for n in ngrams_to_stop])

    stop.save(stoplist_id)
    return stoplist_id
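
A quick standalone check of two of the stop patterns above (pure Python, no DB):

    from re import compile

    short_or_digit = [compile(r"^.{1,2}$"), compile(r"(.*)\d(.*)")]
    for w in ("of", "b2b", "biodiversity"):
        print(w, any(rx.match(w) for rx in short_or_digit))
    # of True / b2b True / biodiversity False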
gargantext/util/toolchain/metric_specificity.py (new file, mode 100644)

"""
Computes a specificity metric from the ngram cooccurrence matrix.
 + SAVE => WeightedList => NodeNgram
"""
from gargantext.models     import Node, Ngram, NodeNgram, NodeNgramNgram, \
                                  NodeNodeNgram   # needed for the overwrite cleanup below
from gargantext.util.db    import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections           import defaultdict
from pandas                import DataFrame

def compute_specificity(corpus, cooc_id=None, overwrite_id=None):
    '''
    Compute the specificity, simple calculus.

    Parameters:
      - cooc_id: mandatory id of a cooccurrences node to use as base
      - overwrite_id: optional id of a preexisting specificity node to overwrite
    '''
    cooccurrences = (session.query(NodeNgramNgram)
                     .filter(NodeNgramNgram.node_id == cooc_id)
                     )
    # no filtering: new choice, cooc already filtered on tfidf before creation

    matrix = defaultdict(lambda: defaultdict(float))

    # £TODO re-rename weight => score
    for cooccurrence in cooccurrences:
        matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
        matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight

    nb_ngrams = len(matrix)
    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)

    d = DataFrame(matrix).fillna(0)

    # proba (x|y) ( <= each column is divided by its total)
    d = d / d.sum(axis=0)

    # d:Matrix => v: Vector (len = nb_ngrams)
    v = d.sum(axis=1)

    ## d ##
    #######
    #                Grenelle  biodiversité  kilomètres  site  élus  île
    # Grenelle          0            0           4         0     0    0
    # biodiversité      0            0           0         0     4    0
    # kilomètres        4            0           0         0     4    0
    # site              0            0           0         0     4    6
    # élus              0            4           4         4     0    0
    # île               0            0           0         6     0    0

    ## d.sum(axis=1) ##
    ###################
    # Grenelle       4
    # biodiversité   4
    # kilomètres     8
    # site          10
    # élus          12
    # île            6

    # temporary result
    # ----------------
    # for now we use the row sums as the specificity ranking
    # (**same** order as with the pre-refactoring formula, but simpler to compute)
    # TODO check the mathematical AND semantic soundness of this indicator
    v.sort_values(inplace=True)

    # [ ('biodiversité' , 0.333 ),
    #   ('Grenelle'     , 0.5   ),
    #   ('île'          , 0.599 ),
    #   ('kilomètres'   , 1.333 ),
    #   ('site'         , 1.333 ),
    #   ('élus'         , 1.899 ) ]

    # ----------------
    # specificity node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        specnode = corpus.add_child(
            typename = "SPECIFICITY",
            name     = "Specif (in:%s)" % corpus.id
        )
        session.add(specnode)
        session.commit()
        the_id = specnode.id

    # print(v)
    data = WeightedList(
        zip(v.index.tolist(), v.values.tolist())
    )
    data.save(the_id)

    return(the_id)
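
The worked example in the comments can be reproduced verbatim with pandas (same numbers, same normalization and row sums):

    from collections import defaultdict
    from pandas import DataFrame

    coocs = {('Grenelle', 'kilomètres'): 4, ('biodiversité', 'élus'): 4,
             ('kilomètres', 'élus'): 4, ('site', 'élus'): 4, ('site', 'île'): 6}

    matrix = defaultdict(lambda: defaultdict(float))
    for (a, b), w in coocs.items():          # symmetric fill, as in the loop above
        matrix[a][b] = w
        matrix[b][a] = w

    d = DataFrame(matrix).fillna(0)
    v = (d / d.sum(axis=0)).sum(axis=1)
    print(v.sort_values())
    # matches the commented expectation: biodiversité 0.333, Grenelle 0.5,
    # île 0.6, kilomètres 1.333, site 1.333, élus 1.9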
gargantext/util/toolchain/metric_tfidf.py (new file, mode 100644)

"""
Computes ngram scores with 3 ranking functions:
   - the simple sum of occurrences inside the corpus
   - the tfidf inside the corpus
   - the global tfidf for all corpora having same source

FIXME: "having the same source" means we need to select inside hyperdata
       with a (perhaps costly) JSON query:
       WHERE hyperdata->'resources' @> ...
"""
from gargantext.models  import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session, bulk_insert, func   # = sqlalchemy.func like sum() or count()
from sqlalchemy         import text   # for query from raw SQL statement
from math               import log
# £TODO
# from gargantext.util.lists import WeightedContextIndex


def compute_occs(corpus, overwrite_id=None):
    """
    Calculates sum of occs per ngram within corpus
               (used as info in the ngrams table view)

    ? optimize ? OCCS here could be calculated simultaneously within the TFIDF-CORPUS loop

    Parameters:
        - overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
                        (the Node and its previous NodeNodeNgram rows will be replaced)
    """
    # 1) all the doc_ids of our corpus (scope of counts for filter)
    #    slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
    docids_subquery = (session
                        .query(Node.id)
                        .filter(Node.parent_id == corpus.id)
                        .filter(Node.typename == "DOCUMENT")
                        .subquery()
                       )

    # 2) our sums per ngram_id
    occ_sums = (session
                .query(
                    NodeNgram.ngram_id,
                    func.sum(NodeNgram.weight)
                 )
                .filter(NodeNgram.node_id.in_(docids_subquery))
                .group_by(NodeNgram.ngram_id)
                .all()
               )
    # example result = [(1970, 1.0), (2024, 2.0), (259, 2.0), (302, 1.0), ... ]
    #                      ^^^^  ^^^
    #                  ngram_id   sum_wei

    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        # occnode = cache.Node[overwrite_id]
    else:
        # create the new OCCURRENCES node
        occnode = corpus.add_child(
            typename = "OCCURRENCES",
            name     = "occ_sums (in:%s)" % corpus.id
        )
        session.add(occnode)
        session.commit()
        the_id = occnode.id

    # reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
    bulk_insert(
        NodeNodeNgram,
        ('node1_id', 'node2_id', 'ngram_id', 'score'),
        ((the_id, corpus.id, res[0], res[1]) for res in occ_sums)
    )

    return the_id


def compute_tfidf(corpus, scope="local", overwrite_id=None):
    """
    Calculates tfidf within the current corpus

    Parameters:
      - the corpus itself
      - scope: {"local" or "global"}
      - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
                      (the Node and its previous NodeNodeNgram rows will be replaced)
    """
    # local <=> within this corpus
    if scope == "local":
        # All docs of this corpus
        docids_subquery = (session
                            .query(Node.id)
                            .filter(Node.parent_id == corpus.id)
                            .filter(Node.typename == "DOCUMENT")
                            .subquery()
                           )

    # global <=> within all corpora of this source
    elif scope == "global":
        this_source_type = corpus.resources()[0]['type']

        # all corpora with the same source type
        # (we need a raw SQL query for postgres JSON operators) (TODO test speed)
        same_source_corpora_query = (session
                                      .query(Node.id)
                                      .from_statement(text(
                                          """
                                          SELECT id FROM nodes
                                          WHERE hyperdata->'resources' @> '[{\"type\"\:%s}]'
                                          """ % this_source_type
                                      ))
                                     )

        # All docs **in all corpora of the same source**
        docids_subquery = (session
                            .query(Node.id)
                            .filter(Node.parent_id.in_(same_source_corpora_query))
                            .filter(Node.typename == "DOCUMENT")
                            .subquery()
                           )

    # N
    total_docs = session.query(docids_subquery).count()

    # or perhaps at least do the occurrences right now at the same time
    tf_nd = (session
             .query(
                 NodeNgram.ngram_id,
                 func.sum(NodeNgram.weight),     # tf: same as occnode
                 func.count(NodeNgram.node_id)   # nd: n docs with term
              )
             .filter(NodeNgram.node_id.in_(docids_subquery))
             .group_by(NodeNgram.ngram_id)
             .all()
            )

    # -------------------------------------------------
    tfidfs = {}
    log_tot_docs = log(total_docs)
    for (ngram_id, tf, nd) in tf_nd:
        # tfidfs[ngram_id] = tf * log(total_docs/nd)
        tfidfs[ngram_id] = tf * (log_tot_docs - log(nd))
    # -------------------------------------------------

    if overwrite_id:
        the_id = overwrite_id
    else:
        # create the new TFIDF-XXXX node
        tfidf_nd = corpus.add_child()
        if scope == "local":
            tfidf_nd.typename = "TFIDF-CORPUS"
            tfidf_nd.name = "tfidf-c (in:%s)" % corpus.id
        elif scope == "global":
            tfidf_nd.typename = "TFIDF-GLOBAL"
            tfidf_nd.name = "tfidf-g (in type:%s)" % this_source_type
        session.add(tfidf_nd)
        session.commit()
        the_id = tfidf_nd.id

    # reflect that in NodeNodeNgrams
    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
    bulk_insert(
        NodeNodeNgram,
        ('node1_id', 'node2_id', 'ngram_id', 'score'),
        ((the_id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
    )

    return the_id
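
The ranking formula in isolation: tf * (log(N) - log(nd)) is just tf * log(N/nd). A standalone check with invented counts:

    from math import log, isclose

    total_docs = 1000          # N (assumed corpus size)
    tf, nd     = 7, 50         # term occurs 7 times, in 50 distinct docs
    score = tf * (log(total_docs) - log(nd))
    assert isclose(score, tf * log(total_docs / nd))
    print(score)               # ≈ 20.97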
gargantext/util/toolchain/ngram_coocs.py (new file, mode 100644)

from gargantext.models        import Node, NodeNgram, NodeNgramNgram
from gargantext.util.lists    import WeightedMatrix
from gargantext.util.db       import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.constants     import DEFAULT_COOC_THRESHOLD

def compute_coocs(corpus,
                  overwrite_id    = None,
                  threshold       = DEFAULT_COOC_THRESHOLD,
                  mainlist_id     = None,
                  stoplist_id     = None,
                  symmetry_filter = True):
    """
    Count how often some extracted terms appear
    together in a small context (document)
    throughout a larger context (corpus).

            [NodeNgram]                       [NodeNgramNgram]
    node_id | ngram_id | weight       ngram1_id | ngram2_id | score |
    --------+----------+--------      ----------+-----------+-------+
     MYDOCA |    487   |   1     =>       487   |    294    |   2   |
     MYDOCA |    294   |   3
     MYDOCB |    487   |   1
     MYDOCB |    294   |   4

    Fill that info in DB:
      - a *new* COOCCURRENCES node
      - and all corresponding NodeNgramNgram rows

    worst case complexity ~ O(N²/2) with N = number of ngrams

    If a mainlist is provided, we filter doc ngrams to those also in the list.

    Parameters:
      - the corpus node
      - overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
                      (all hyperdata and previous NodeNgramNgram rows will be replaced)
      - threshold: on output cooc count (previously called hapax)
      - mainlist_id: mainlist to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
                     (normally unnecessary if a mainlist is provided)

    (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
      - isMonopartite: ?? used a nodes_hyperdata_ngrams table ???

    basic idea for one doc
    ======================
    each pair of ngrams sharing same doc (node_id)
        SELECT idx1.ngram_id, idx2.ngram_id
        FROM nodes_ngrams AS idx1, nodes_ngrams AS idx2
        ---------------------------------
        WHERE idx1.node_id = idx2.node_id      <== that's cooc
        ---------------------------------
          AND idx1.ngram_id <> idx2.ngram_id
          AND idx1.node_id = MY_DOC;

    on entire corpus
    ================
    coocs for each doc:
      - each given pair like (termA, termB) will likely appear several times
        => we do GROUP BY (x1.ngram_id, x2.ngram_id)
      - we count unique appearances of the pair (cooc)
    """
    # - TODO cvalue_id: allow a metric as additional input filter
    # - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
    # - TODO start, end : filter on document date
    # - TODO weighted: if False normal cooc to be saved as result
    #                  if True  weighted cooc (experimental)

    # /!\ big combinatorial complexity /!\
    # for 8439 rows in the nodes_ngrams index, of which 1442 have occ > 1:
    #   - 1,859,408 rows for the simple cooc query
    #   -    71,134 rows when limited to ngrams with occ > 1 (weight)

    # docs of our corpus
    docids_subquery = (session
                        .query(Node.id)
                        .filter(Node.parent_id == corpus.id)
                        .filter(Node.typename == "DOCUMENT")
                        .subquery()
                       )

    # 2 x the occurrence index table
    x1 = aliased(NodeNgram)
    x2 = aliased(NodeNgram)

    # cooccurrences columns definition
    ucooc = func.count(x1.ngram_id).label("ucooc")

    # 1) MAIN DB QUERY
    coocs_query = (session.query(x1.ngram_id, x2.ngram_id, ucooc)
                   .filter(x1.node_id == x2.node_id)          # <- by definition of cooc
                   .filter(x1.ngram_id != x2.ngram_id)        # <- b/c not with itself
                   .filter(x1.node_id.in_(docids_subquery))   # <- b/c within corpus
                   .group_by(x1.ngram_id, x2.ngram_id)
                   )

    # 2) INPUT FILTERS (reduce N before O(N²))
    if mainlist_id:
        main_subquery = (session
                          .query(NodeNgram.ngram_id)
                          .filter(NodeNgram.node_id == mainlist_id)
                          .subquery()
                         )
        coocs_query = (coocs_query
                       .filter(x1.ngram_id.in_(main_subquery))
                       .filter(x2.ngram_id.in_(main_subquery))
                       )

    if stoplist_id:
        stop_subquery = (session
                          .query(NodeNgram.ngram_id)
                          .filter(NodeNgram.node_id == stoplist_id)
                          .subquery()
                         )
        coocs_query = (coocs_query
                       .filter(~x1.ngram_id.in_(stop_subquery))
                       .filter(~x2.ngram_id.in_(stop_subquery))
                       )

    if symmetry_filter:
        # 1 filter taking symmetry into account
        #   -> halves the work!!
        #   -> but prevents direct access to x2's cooccurrences
        #   -> they get scattered: stored in the x1 rows that preceded x2
        #   -> retrieval will be more costly, via OR queries such as:
        #      WHERE ngram1 = my_ngram OR ngram2 = my_ngram
        coocs_query = coocs_query.filter(x1.ngram_id < x2.ngram_id)

    # ------------
    # 2 possible upstream filters to reduce the combinatorics
    # - for example 929k rows => 35k rows
    # - here on weight, but it degrades the results
    #   => conceivable on another metric (cvalue or tfidf?)
    # coocs_query = coocs_query.filter(x1.weight > 1)
    # coocs_query = coocs_query.filter(x2.weight > 1)
    # ------------

    # 3) OUTPUT FILTERS
    # ------------------
    # threshold
    # £TODO adjust COOC_THRESHOLD a posteriori:
    # ex: sometimes 2 sometimes 4 depending on sparsity
    coocs_query = coocs_query.having(ucooc >= threshold)

    # 4) EXECUTE QUERY
    # ----------------
    # => storage in our matrix structure
    matrix = WeightedMatrix(coocs_query.all())

    # fyi
    # shape_0 = len({pair[0] for pair in matrix.items})
    # shape_1 = len({pair[1] for pair in matrix.items})
    # print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

    # 5) SAVE
    # --------
    # saving the parameters of the analysis in the Node JSON
    new_hyperdata = {'corpus': corpus.id,
                     'threshold': threshold
                     }
    if overwrite_id:
        # overwrite pre-existing id
        the_cooc = cache.Node[overwrite_id]
        the_cooc.hyperdata = new_hyperdata
        the_cooc.save_hyperdata()
        session.commit()
        the_id = overwrite_id
    else:
        # create the new cooc node
        the_cooc = corpus.add_child(
            typename  = "COOCCURRENCES",
            name      = "Coocs (in:%s)" % corpus.name[0:10],
            hyperdata = new_hyperdata,
        )
        session.add(the_cooc)
        session.commit()
        the_id = the_cooc.id

    # ==> save all NodeNgramNgram with link to new cooc node id
    matrix.save(the_id)

    return the_id
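
The docstring's toy table, counted with plain Python instead of SQL (ngram ids 487 and 294 as above; the symmetry filter keeps the ordered pair only):

    from collections import Counter
    from itertools import combinations

    doc_ngrams = {'MYDOCA': [487, 294], 'MYDOCB': [487, 294]}
    ucooc = Counter()
    for ngrams in doc_ngrams.values():
        for n1, n2 in combinations(sorted(ngrams), 2):   # n1 < n2: symmetry filter
            ucooc[(n1, n2)] += 1
    print(ucooc)   # Counter({(294, 487): 2})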
gargantext/util/toolchain/ngram_groups.py (new file, mode 100644)

"""
For initial ngram groups via stemming
Example:
  - groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
  - groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
"""
from gargantext.models     import Node, NodeNgram, NodeNgramNgram
from gargantext.util.db    import session
from gargantext.util.lists import Translations

# to convert fr => french :/
from gargantext.util.languages import languages

from re          import split as resplit
from collections import defaultdict, Counter

from nltk.stem.snowball import SnowballStemmer

def prepare_stemmers(corpus):
    """
    Returns *several* stemmers (one for each language in the corpus)
    (as a dict of stemmers with key = language_iso2)
    """
    stemmers_by_lg = {
        # always get a generic stemmer in case language code unknown
        '__unknown__': SnowballStemmer("english")
    }
    for lgiso2 in corpus.hyperdata['languages'].keys():
        lgname = languages[lgiso2].name.lower()
        stemmers_by_lg[lgiso2] = SnowballStemmer(lgname)
    return stemmers_by_lg

def compute_groups(corpus, stoplist_id=None, overwrite_id=None):
    """
    1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
    2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
    3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)
    """
    stop_ngrams_ids = {}
    # we will need the ngrams of the stoplist to filter
    if stoplist_id is not None:
        for id in session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == stoplist_id).all():
            stop_ngrams_ids[id[0]] = True

    # 1) compute stems/lemmas
    #    and group if same stem/lemma
    stemmers = prepare_stemmers(corpus)

    # todo dict {lg => {ngrams_todo} }
    todo_ngrams_per_lg = defaultdict(set)

    # res dict { commonstem: {ngram_1:freq_1, ngram_2:freq_2, ngram_3:freq_3} }
    my_groups = defaultdict(Counter)

    # preloop per doc to sort ngrams by language
    for doc in corpus.children():
        if ('language_iso2' in doc.hyperdata):
            lgid = doc.hyperdata['language_iso2']
        else:
            lgid = "__unknown__"

        # doc.ngrams is an sql query (ugly but useful intermediate step)
        # FIXME: move the counting and stoplist filtering up here
        for ngram_pack in doc.ngrams.all():
            todo_ngrams_per_lg[lgid].add(ngram_pack)

    # --------------------
    # long loop per ngrams
    for (lgid, todo_ngs) in todo_ngrams_per_lg.items():
        # fun: word::str => stem::str
        stem_it = stemmers[lgid].stem

        for ng in todo_ngs:
            doc_wei = ng[0]
            ngram   = ng[1]     # Ngram obj

            # skip if in STOPLIST
            if ngram.id in stop_ngrams_ids:
                continue

            lexforms = [lexunit for lexunit in resplit(r'\W+', ngram.terms)]

            # STEM IT, and this term's stems will become a new grouping key...
            stemseq = " ".join([stem_it(lexfo) for lexfo in lexforms])

            # ex:
            # groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
            # groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
            my_groups[stemseq][ngram.id] += doc_wei

    del todo_ngrams_per_lg

    # now serializing all groups to a list of couples
    ng_couples = []
    addcouple = ng_couples.append
    for grped_ngramids in my_groups.values():
        if len(grped_ngramids) > 1:
            # first find most frequent term in the counter
            winner_id = grped_ngramids.most_common(1)[0][0]
            for ngram_id in grped_ngramids:
                if ngram_id != winner_id:
                    addcouple((winner_id, ngram_id))
    del my_groups

    # 2) the list node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
    # or create the new id
    else:
        the_group = corpus.add_child(
            typename = "GROUPLIST",
            name     = "Group (src:%s)" % corpus.name[0:10]
        )
        # and save the node
        session.add(the_group)
        session.commit()
        the_id = the_group.id

    # 3) Save each grouping couple to DB thanks to Translations.save() table
    ndngng_list = Translations(
        [(sec, prim) for (prim, sec) in ng_couples],
        just_items = True
    )

    # ...referring to the list node we just got
    ndngng_list.save(the_id)

    return the_id
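
The grouping key in miniature (requires nltk; uses the English example from the module docstring):

    from nltk.stem.snowball import SnowballStemmer

    stem_it = SnowballStemmer("english").stem
    for term in ("copper engraving", "coppers engraver"):
        stemseq = " ".join(stem_it(w) for w in term.split())
        print(term, "->", stemseq)    # both map to 'copper engrav'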
gargantext/util/toolchain/parsing.py

@@ -2,11 +2,16 @@ from gargantext.util.db import *
 from gargantext.models import *
 from gargantext.constants import *

+from collections import defaultdict
+

 def parse(corpus):
     try:
         documents_count = 0
         corpus.status('parsing', progress=0)
+
+        # will gather info about languages
+        observed_languages = defaultdict(int)
+
         # retrieve resource information
         for resource in corpus.resources():
             # information about the resource

@@ -22,6 +27,7 @@ def parse(corpus):
                     hyperdata = hyperdata,
                 )
                 session.add(document)
+                observed_languages[hyperdata["language_iso2"]] += 1
                 if documents_count % BATCH_PARSING_SIZE == 0:
                     corpus.status('parsing', progress=documents_count)
                     corpus.save_hyperdata()

@@ -29,6 +35,8 @@ def parse(corpus):
                 documents_count += 1
             # update info about the resource
             resource['extracted'] = True
+        # add a corpus-level info about languages
+        corpus.hyperdata['languages'] = observed_languages
         # commit all changes
         corpus.status('parsing', progress=documents_count, complete=True)
         corpus.save_hyperdata()
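
The language bookkeeping added here, in isolation (document languages invented):

    from collections import defaultdict

    observed_languages = defaultdict(int)
    for hyperdata in ({"language_iso2": "en"}, {"language_iso2": "fr"}, {"language_iso2": "en"}):
        observed_languages[hyperdata["language_iso2"]] += 1

    print(dict(observed_languages))   # {'en': 2, 'fr': 1}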
gargantext/views/pages/projects.py

@@ -94,23 +94,36 @@ def project(request, project_id):
             )
             session.add(corpus)
             session.commit()

             # parse_extract: fileparsing -> ngram extraction -> lists
-            scheduled(parse_extract)(corpus.id)
+            scheduled(parse_extract_indexhyperdata)(corpus.id)
+            #scheduled(parse_extract)(corpus.id)

     # corpora within this project
     corpora = project.children('CORPUS').all()
     sourcename2corpora = defaultdict(list)
     for corpus in corpora:
         # we only consider the first resource of the corpus to determine its type
-        resource = corpus.resources()[0]
-        resource_type_name = RESOURCETYPES[resource['type']]['name']
+        resources = corpus.resources()
+        if len(resources):
+            resource = resources[0]
+            resource_type_name = RESOURCETYPES[resource['type']]['name']
+        else:
+            print("(WARNING) PROJECT view: no listed resource")

         # add some data for the viewer
         corpus.count = corpus.children('DOCUMENT').count()
         status = corpus.status()
         if status is not None and not status['complete']:
-            corpus.status_message = '(in progress: %s, %d complete)' % (
-                status['action'].replace('_', ' '),
-                status['progress'],
-            )
+            if not status['error']:
+                corpus.status_message = '(in progress: %s, %d complete)' % (
+                    status['action'].replace('_', ' '),
+                    status['progress'],
+                )
+            else:
+                corpus.status_message = '(aborted: "%s" after %i docs)' % (
+                    status['error'][-1],
+                    status['progress']
+                )
         else:
             corpus.status_message = ''
         # add