Commit 89c8268c authored by Romain Loth

prototype ngram toolchain in __init__.py (no mainlist nor maplist yet :/)

parent 61237884
......@@ -92,6 +92,8 @@ RESOURCETYPES = [
# },
]
# linguistic extraction parameters
DEFAULT_COOC_THRESHOLD = 4
# other parameters
# default number of docs POSTed to scrappers.views.py
......
......@@ -178,6 +178,8 @@ class WeightedContextIndex(_BaseClass):
def __init__(self, source=None):
self.items = defaultdict(float)
# £TODO
......@@ -222,7 +224,7 @@ class WeightedMatrix(_BaseClass):
# insert new data
bulk_insert(
NodeNgramNgram,
('node_id', 'ngram1_id', 'ngram2_id', 'score'),
('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
((node_id, key1, key2, value) for key1, key2, value in self)
)
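# Illustration (hypothetical values, not from the source): with node_id = 999 and a
# matrix entry (487, 294) -> 3.0, the generator above yields one row per entry:
#     (999, 487, 294, 3.0)    # i.e. (node_id, ngram1_id, ngram2_id, weight)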
......
from .parsing import parse
from .ngrams_extraction import extract_ngrams
from .ngram_scores import compute_occurrences_local, compute_tfidf_local
from .list_stop import compute_stop
from .ngram_scores import compute_occurrences_local, compute_tfidf
from .ngram_coocs_tempo import compute_coocs
from .score_specificity import compute_specificity
from .list_map import compute_mapList # TEST
from .ngram_groups import compute_groups
from gargantext.util.db import session
from gargantext.models import Node
from datetime import datetime
def parse_extract(corpus):
# retrieve corpus from database from id
if isinstance(corpus, int):
......@@ -21,16 +28,47 @@ def parse_extract(corpus):
extract_ngrams(corpus)
print('CORPUS #%d: extracted ngrams' % (corpus.id))
# -------------------------------
# temporary ngram lists workflow
# -------------------------------
print('CORPUS #%d: [%s] starting ngram lists computation' % (corpus.id, t()))
# -> stoplist: compute + write (=> Node and NodeNgram)
stop_id = compute_stop(corpus)
print('CORPUS #%d: [%s] new stoplist node #%i' % (corpus.id, t(), stop_id))
# -> write local tfidf to Node and NodeNodeNgram
ltfidf_id = compute_tfidf(corpus, scope="local")
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# -> write global tfidf to Node and NodeNodeNgram
gtfidf_id = compute_tfidf(corpus, scope="global")
print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))
# write occurrences to Node and NodeNodeNgram
occnd_id = compute_occurrences_local(corpus)
print('CORPUS #%d: new occs node #%i' % (corpus.id, occnd_id))
# ?? mainlist: compute + write (to Node and NodeNgram)
# mainlist_id = compute_mainlist(corpus)
# print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
# write local tfidf to Node and NodeNodeNgram
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: new localtfidf node #%i' % (corpus.id, ltfidf_id))
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id = compute_coocs(corpus, stop_id = None)
print('CORPUS #%d: [%s] new cooccs node #%i' % (corpus.id, t(), cooc_id))
# write groups to Node and NodeNgramNgram
# ?? specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(cooc_id=cooc_id, corpus=corpus)
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram)
# map_id = compute_mapList(corpus)
# print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occurrences_local(corpus)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# -> write groups to Node and NodeNgramNgram
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: new grouplist node #%i' % (corpus.id, group_id))
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
def t():
return datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
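# Usage sketch for the prototype toolchain above. parse_extract accepts a Node or a
# corpus id (see the isinstance(corpus, int) branch); the import path and the corpus
# id 1234 below are assumptions for illustration only:
#
#     from gargantext.util.toolchain import parse_extract   # package path assumed
#     parse_extract(1234)   # parse + extract_ngrams + the ngram list steps above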
......@@ -2,15 +2,13 @@ from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.models.users import User
from gargantext.models.nodes import Node
from gargantext.models.ngrams import Ngram, NodeNgram
from gargantext.util.db import session, aliased, func
from gargantext.util.lists import WeightedMatrix
from gargantext.models import User, Node, Ngram, NodeNgram
import re
import sqlalchemy as sa
from sqlalchemy.sql import func
from sqlalchemy.orm import aliased
from sqlalchemy import desc, asc, or_, and_, Date, cast, select, literal_column
from sqlalchemy import desc, asc
#from ngram.tools import insert_ngrams
def isStopWord(ngram, stop_words=None):
......@@ -23,20 +21,16 @@ def isStopWord(ngram, stop_words=None):
if word in stop_words:
return(True)
def test_match(word, regex):
format_regex = re.compile(regex)
if format_regex.match(word) :
return(True)
compiled_regexes = [] # to compile them only once
for regex in [
"^.{1,2}$"
, "(.*)\d(.*)"
, "(.*)(\.)(.*)"
# , "(.*)(\.)(.*)" trop fort (enlève les sigles !)
, "(.*)(\,)(.*)"
, "(.*)(< ?/?p ?>)(.*)" # marques de paragraphes
, "(.*)(study)(.*)"
, "(.*)(xx|xi|xv)(.*)"
, "(.*)\b(xx|xi|xv)\b(.*)"
, "(.*)(result)(.*)"
, "(.*)(année|nombre|moitié)(.*)"
, "(.*)(temps)(.*)"
......@@ -47,9 +41,15 @@ def isStopWord(ngram, stop_words=None):
, "(.*)(travers)(.*)"
, "(.*)(:|\|)(.*)"
] :
if test_match(word, regex) is True :
compiled_regexes.append(re.compile(regex))
for format_regex in compiled_regexes:
if format_regex.match(word):
# print("STOPLIST += '%s' (regex: %s)" % (word, format_regex.pattern))
return(True)
return False
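# Usage sketch (assuming the ngram argument is an (id, terms, frequency) tuple as
# produced by the query in compute_stop, and that its terms string is what gets
# matched against the regexes above; the values are hypothetical):
#
#     isStopWord((42, "year 2015", 7))   # True: "year 2015" matches "(.*)\d(.*)"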
def create_gargantua_resources():
gargantua_id = session.query(User.id).filter(User.username=="gargantua").first()
project = Node(
......@@ -61,32 +61,33 @@ def create_gargantua_resources():
session.add(stopList)
session.commit()
def compute_stop(corpus_id,stopList_id=None,limit=2000,debug=False):
def compute_stop(corpus, stopList_id=None, debug=False):
'''
Create list of stop words.
TODO write a function to get all stop words with social scores
'''
# Get the StopList if it exists or create a new one
# At this stage of development, a new StopList should be created
if stopList_id == None:
stopList_id = session.query(Node.id).filter(
Node.parent_id==corpus_id,
Node.parent_id==corpus.id,
Node.typename == "STOPLIST"
).first()
if stopList_id == None:
corpus = cache.Node[corpus_id]
user_id = corpus.user_id
stopList = Node(name="STOPLIST", parent_id=corpus_id, user_id=user_id, typename="STOPLIST")
stopList = Node(name="STOPLIST",
parent_id=corpus.id,
user_id=corpus.user_id,
typename="STOPLIST")
session.add(stopList)
session.commit()
stopList_id = stopList.id
# For tests only
if debug == True:
session.query(Node).filter(Node.id==stopList_id).delete()
session.commit()
# Get common resources, all common StopWords on the platform
## First get the id of the StopList of Gargantua super user
gargantua_id = session.query(User.id).filter(User.username=="gargantua").first()
......@@ -101,16 +102,16 @@ def compute_stop(corpus_id,stopList_id=None,limit=2000,debug=False):
.filter(NodeNgram.node_id == rootStopList_id)
.all()
)
print([n for n in stop_words])
# print([n for n in stop_words])
## Get the ngrams
## ngrams :: [(Int, String, Int)]
frequency = sa.func.count( NodeNgram.weight )
frequency = func.count( NodeNgram.weight )
ngrams = (session.query( Ngram.id, Ngram.terms, frequency )
.join( NodeNgram, NodeNgram.ngram_id == Ngram.id )
.join( Node, Node.id == NodeNgram.node_id )
.filter( Node.parent_id == corpus_id,
.filter( Node.parent_id == corpus.id,
Node.typename == "DOCUMENT")
.group_by( Ngram.id )
.order_by( desc( frequency ) )
......@@ -119,9 +120,10 @@ def compute_stop(corpus_id,stopList_id=None,limit=2000,debug=False):
)
ngrams_to_stop = filter(lambda x: isStopWord(x,stop_words=stop_words), ngrams)
print([n for n in ngrams_to_stop])
# print([n for n in ngrams_to_stop])
stop = LISTTYPES["STOPLIST"]({ n[0] : -1 for n in ngrams_to_stop})
# stop = LISTTYPES["STOPLIST"]([n[0] for n in ngrams_to_stop])
stop.save(stopList_id)
#
return stopList_id
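# Usage sketch (hedged), mirroring the call made in the toolchain __init__.py above:
#
#     stop_id = compute_stop(corpus)   # creates or reuses the corpus STOPLIST node
#     # stopped ngram ids are then saved through LISTTYPES["STOPLIST"] as NodeNgram rows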
from gargantext.models import Node, NodeNgram, NodeNgramNgram
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.constants import DEFAULT_COOC_THRESHOLD
def compute_coocs(corpus,
threshold = DEFAULT_COOC_THRESHOLD,
weighted = False,
our_id = None,
stop_id = None,
symmetry_filter = True):
"""
Count how often some extracted terms appear
together in a small context (document)
throughout a larger context (corpus).
node_id | ngram_id | weight        ngram1_id | ngram2_id | ucooc | wcooc |
--------+----------+--------       ----------+-----------+-------+-------+
 MYDOC  |   487    |   1       =>     487    |    294    |   1   |   4   |
 MYDOC  |   294    |   3
Fill that info in DB:
- a *new* COOCCURRENCES node
- and all corresponding NodeNgramNgram rows
worst case complexity ~ O(N²/2) with N = number of ngrams
Parameters:
- threshold: on output ucooc count (previously called hapax)
- weighted: if False normal cooc to be saved as result
if True weighted cooc (experimental)
- stop_id: stoplist for filtering input ngrams
- TODO cvalue_id: allow a metric as input filter
- TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
- TODO start, end : filter on document date
(deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
- isMonopartite: ?? used a nodes_hyperdata_ngrams table ???
basic idea for one doc
======================
each pair of ngrams sharing same doc (node_id)
SELECT idx1.ngram_id, idx2.ngram_id
FROM nodes_ngrams AS idx1, nodes_ngrams AS idx2
---------------------------------
WHERE idx1.node_id = idx2.node_id <== that's cooc
---------------------------------
AND idx1.ngram_id <> idx2.ngram_id
AND idx1.node_id = MY_DOC ;
on entire corpus
=================
coocs for each doc :
- each given pair like (termA, termB) will likely appear several times
=> we do GROUP BY (x1.ngram_id, x2.ngram_id)
- normally we can count unique appearances of the pair (ucooc)
- and we can sum the pair's weights over those docs (wcooc or cofreq)
TODO
====
use WeightedMatrix
"""
# /!\ big combinatorial complexity /!\
# for 8439 rows in the nodes_ngrams index, of which 1442 with occ > 1
# 1,859,408 rows for the simple cooc query
# 71,134 rows when restricting to ngrams with occ > 1 (weight)
# docs of our corpus
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
)
# 2 x the occurrence index table
x1 = aliased(NodeNgram)
x2 = aliased(NodeNgram)
# cooccurrences columns definition
ucooc = func.count(x1.ngram_id).label("ucooc")
# 1) MAIN DB QUERY
coocs_query = (
session.query(x1.ngram_id, x2.ngram_id, ucooc)
.filter(x1.node_id == x2.node_id) # <- by definition of cooc
.filter(x1.ngram_id != x2.ngram_id) # <- b/c not with itself
.filter(x1.node_id.in_(docids_subquery)) # <- b/c within corpus
.group_by(x1.ngram_id, x2.ngram_id)
)
# 2) INPUT FILTERS (reduce N before O(N²))
# £TODO add possibility to restrict to the mainlist
if stop_id:
stop_subquery = (
session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == stop_id)
.subquery()
)
coocs_query = ( coocs_query
.filter( ~ x1.ngram_id.in_(stop_subquery) )
.filter( ~ x2.ngram_id.in_(stop_subquery) )
)
if symmetry_filter:
# 1. filter taking symmetry into account
# -> halves the work !!
# -> but will prevent direct access to the cooccurrences of x2
# -> they will be scattered: recorded under the x1 that came before x2
# -> retrieving them will be more costly, via OR queries like:
# WHERE ngram1 = my_ngram OR ngram2 = my_ngram
coocs_query = coocs_query.filter(x1.ngram_id < x2.ngram_id)
# ------------
# 2. possible upstream filters to reduce the combinatorics
# - for example 929k rows => 35k rows
# - here on weight, but it degrades the results
# => conceivable with another metric (cvalue or tfidf?)
# coocs_query = coocs_query.filter(x1.weight > 1)
# coocs_query = coocs_query.filter(x2.weight > 1)
# ------------
# 3) OUTPUT FILTERS
# ------------------
# threshold
#
coocs_query = coocs_query.having(ucooc > threshold)
# 4) EXECUTE QUERY
# ----------------
# => storage in our matrix structure
matrix = WeightedMatrix(coocs_query.all())
# 5) SAVE
# --------
if our_id:
# use pre-existing id
the_id = our_id
else:
# create the new cooc node
the_cooc = Node(
typename = "COOCCURRENCES",
name = "Coocs (in:%s)" % corpus.name[0:10],
parent_id = corpus.id,
user_id = corpus.user_id,
# saving the parameters of the analysis in the Node JSON
hyperdata = { 'corpus': corpus.id,
'threshold': threshold }
)
session.add(the_cooc)
session.commit()
the_id = the_cooc.id
# ==> save all NodeNgramNgram with link to new cooc node id
matrix.save(the_id)
return the_id
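# Usage sketch (hedged), mirroring the call made in the toolchain __init__.py above:
#
#     cooc_id = compute_coocs(corpus, threshold=DEFAULT_COOC_THRESHOLD, stop_id=None)
#     # -> id of the new COOCCURRENCES node; the pair counts live in NodeNgramNgram rows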
......@@ -32,7 +32,7 @@ def compute_groups(corpus, stoplist_id = None):
stop_ngrams_ids = {}
# we will need the ngrams of the stoplist to filter
if stoplist_id is not None:
for id in session.query(NodeNgram.id).filter(NodeNgram.node_id == stoplist_id).all():
for id in session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == stoplist_id).all():
stop_ngrams_ids[id[0]] = True
......
from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session, bulk_insert
from sqlalchemy import text
# £TODO
# from gargantext.util.lists import WeightedContextIndex
......@@ -57,19 +57,48 @@ def compute_occurrences_local(corpus):
return occnode.id
def compute_tfidf_local(corpus):
def compute_tfidf(corpus, scope="local"):
"""
Calculates tfidf within the current corpus
"""
# ?? FIXME could we keep the docids somehow from previous computations ??
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
)
Parameter:
- scope: {"local" or "global"}
"""
# local <=> within this corpus
if scope == "local":
# All docs of this corpus
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
)
# global <=> within all corpora of this source
elif scope == "global":
this_source_type = corpus.resources()[0]['type']
# all corpora with the same source type
# (we need raw SQL query for postgres JSON operators) (TODO test speed)
same_source_corpora_query = (session
.query(Node.id)
.from_statement(text(
"""
SELECT id FROM nodes
WHERE hyperdata->'resources' @> '[{"type": %s}]'
""" % this_source_type
))
)
# All docs **in all corpora of the same source**
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id.in_(same_source_corpora_query))
.filter(Node.typename == "DOCUMENT")
.subquery()
)
# N
total_docs = session.query(docids_subquery).count()
# or perhaps at least do the occurrences right now at the same time
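# Formula sketch (hedged; the actual computation sits in lines this hunk does not show).
# With N = total_docs above, tf = count of the ngram in a document and df = number of
# documents containing it, the usual form would be:
#     tfidf(ngram, doc) = tf(ngram, doc) * log(N / df(ngram))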
......@@ -93,12 +122,14 @@ def compute_tfidf_local(corpus):
# -------------------------------------------------
# create the new TFIDF-CORPUS node
ltfidf = Node()
ltfidf.typename = "TFIDF-CORPUS"
ltfidf.name = "tfidf (in:%s)" % corpus.id
ltfidf.parent_id = corpus.id
ltfidf.user_id = corpus.user_id
session.add(ltfidf)
tfidf_nd = Node(parent_id = corpus.id, user_id = corpus.user_id)
if scope == "local":
tfidf_nd.typename = "TFIDF-CORPUS"
tfidf_nd.name = "tfidf-c (in:%s)" % corpus.id
elif scope == "global":
tfidf_nd.typename = "TFIDF-GLOBAL"
tfidf_nd.name = "tfidf-g (in type:%s)" % this_source_type
session.add(tfidf_nd)
session.commit()
# reflect that in NodeNodeNgrams
......@@ -106,7 +137,7 @@ def compute_tfidf_local(corpus):
bulk_insert(
NodeNodeNgram,
('node1_id' , 'node2_id', 'ngram_id', 'score'),
((ltfidf.id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
((tfidf_nd.id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
)
return ltfidf.id
return tfidf_nd.id
from gargantext.util.db import *
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.util.analysis.cooccurrences import do_cooc
from gargantext.models.ngrams import Ngram, NodeNgram,\
NodeNgramNgram, NodeNodeNgram
from gargantext.models import Node, Ngram, NodeNgramNgram, NodeNodeNgram
import numpy as np
import pandas as pd
from collections import defaultdict
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
def specificity(cooc_id=None, corpus=None, limit=100, session=None):
def compute_specificity(corpus, cooc_id, limit=100):
'''
Compute the specificity (a simple calculation).
'''
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
.order_by(NodeNgramNgram.score)
.limit(limit)
# no filtering here: the new choice is to filter on tfidf before creation
# .order_by(NodeNgramNgram.weight)
# .limit(limit)
)
matrix = defaultdict(lambda : defaultdict(float))
# £TODO re-rename weight => score
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
x = pd.DataFrame(matrix).fillna(0)
x = x / x.sum(axis=1)
xs = x.sum(axis=1)
ys = x.sum(axis=0)
m = ( xs - ys) / (2 * (x.shape[0] - 1))
m = m.sort(inplace=False)
#node = get_or_create_node(nodetype='Specificity',corpus=corpus)
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
nb_ngrams = len(matrix)
d = pd.DataFrame(matrix).fillna(0)
# proba (x/y) ( <= each column is divided by its total)
d = d / d.sum(axis=0)
# d:Matrix => v: Vector (len = nb_ngrams)
v = d.sum(axis=1)
## d ##
#######
#              Grenelle  biodiversité  kilomètres  site  élus  île
# Grenelle            0             0           4     0     0    0
# biodiversité        0             0           0     0     4    0
# kilomètres          4             0           0     0     4    0
# site                0             0           0     0     4    6
# élus                0             4           4     4     0    0
# île                 0             0           0     6     0    0
## d.sum(axis=1) ##
###################
# Grenelle        4
# biodiversité    4
# kilomètres      8
# site           10
# élus           12
# île             6
# temporary result
# ----------------
# for now we use the row sums as the specificity ranking
# (**same** order as with the pre-refactoring formula, but a simpler computation)
# TODO check the mathematical AND semantic coherence of this indicator
v.sort_values(inplace=True)
# [ ('biodiversité' , 0.333 ),
# ('Grenelle' , 0.5 ),
# ('île' , 0.599 ),
# ('kilomètres' , 1.333 ),
# ('site' , 1.333 ),
# ('élus' , 1.899 ) ]
# ----------------
# specificity node
node = session.query(Node).filter(
Node.parent_id==corpus_id,
Node.parent_id==corpus.id,
Node.typename == "SPECIFICITY"
).first()
if node == None:
corpus = cache.Node[corpus_id]
user_id = corpus.user_id
node = Node(name="SPECIFICITY", parent_id=corpus_id, user_id=user_id, typename="SPECIFICITY")
node = Node(name="Specif (in:%i)" % corpus.id,
parent_id=corpus.id,
user_id=user_id,
typename="SPECIFICITY")
session.add(node)
session.commit()
data = zip( [node.id for i in range(1,m.shape[0])]
, [corpus.id for i in range(1,m.shape[0])]
, m.index.tolist()
, m.values.tolist()
data = zip( [node.id] * nb_ngrams
, [corpus.id] * nb_ngrams
, v.index.tolist()
, v.values.tolist()
)
session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id==node.id).delete()
session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id==node.id).delete()
session.commit()
bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [d for d in data])
bulk_insert(NodeNodeNgram, ['node1_id', 'node2_id', 'ngram_id', 'score'], [d for d in data])
return(node.id)
def compute_specificity(corpus,limit=100, session=None):
'''
Computing specificities as NodeNodeNgram.
All workflow is the following:
1) Compute the cooc matrix
2) Compute the specificity score, saving it in database, return its Node
'''
#dbg = DebugTime('Corpus #%d - specificity' % corpus.id)
#list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus)
cooc_id = do_cooc(corpus=corpus, cvalue_id=list_cvalue.id,limit=limit)
specificity(cooc_id=cooc_id,corpus=corpus,limit=limit,session=session)
#dbg.show('specificity')
#corpus=session.query(Node).filter(Node.id==244250).first()
#compute_specificity(corpus)
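# Usage sketch (hedged) for the refactored signature compute_specificity(corpus, cooc_id, limit=100),
# mirroring the call made in the toolchain __init__.py:
#
#     spec_id = compute_specificity(corpus=corpus, cooc_id=cooc_id)
#     # -> id of the SPECIFICITY node; the ranking is saved as NodeNodeNgram rows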