Commit 85aeadfb authored by PkSM3

Merge branch 'unstable' of ssh://delanoe.org:1979/gargantext into samuel

parents 273dd6b0 ce193205
# Without this, we couldn't use the Django environment
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext_web.settings")
os.environ.setdefault("DJANGO_HSTORE_GLOBAL_REGISTER", "False")
from admin.utils import PrintException
# database tools
from node import models
from gargantext_web.db import *
from parsing.corpustools import *
#!/bin/bash
git checkout unstable
git checkout testing
git merge unstable
git checkout prod-dev
git merge testing
git checkout prod
git merge prod-dev
git checkout unstable
echo "Push ? (yes)"
read y
if [[ $y == "yes" ]]; then
echo "je push"
git push origin prod prod-dev testing unstable
fi
from env import *
from gargantext_web.db import *
from parsing.corpustools import *
from gargantext_web.views import move_to_trash, empty_trash
def do_empty():
corpus_ids = (session.query(Node.id)
.filter(Node.type_id == cache.NodeType['Corpus'].id)
.all()
)
for (corpus_id,) in corpus_ids:
doc_count = int()
doc_count = (session.query(Node.id)
.filter(Node.parent_id == corpus_id)
.filter(Node.type_id == cache.NodeType['Document'].id)
.count()
)
if doc_count == 0 :
move_to_trash(corpus_id)
empty_trash()
do_empty()
def extract_again():
corpus_ids = (session.query(Node.id)
.join(Node_Resource, Node_Resource.node_id == Node.id)
.join(Resource, Node_Resource.resource_id == Resource.id )
.filter(or_(Resource.name == 'Europress (French)',
Resource.name == 'Europress (English)'))
.filter(Node.type_id == cache.NodeType['Corpus'].id )
#.filter(Node.resource_id == cache.NodeType['Corpus'].id) # looks unintended: compares a resource id with a NodeType id
.all()
)
print(corpus_ids)
extract_again()
#add_resource(corpus,
# # file = './data_samples/pubmed_result.xml',
# file = '/srv/gargantext_lib/data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
# type_id = cache.ResourceType['pubmed'].id,
#)
#parse_resources(corpus)
#extract_ngrams(corpus, ('title', ))
#
#
#
## print(corpus)
## corpus = session.query(Node).filter(Node.id == 72771).first()
## corpus = session.query(Node).filter(Node.id == 73017).first()
# compute_tfidf(corpus)
@@ -28,3 +28,4 @@ def PrintException():
line = linecache.getline(filename, lineno, f.f_globals)
print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj))
from env import *
from admin.utils import PrintException
from gargantext_web.db import NodeNgram
from gargantext_web.db import *
from parsing.corpustools import *
import sqlalchemy
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
# from gargantext_web.db import Node, get_cursor
def cooccurrences(user_id=None, corpus_id=None,
mainlist_id=None, stoplist_id=None,
lem=False, stem=True, cvalue=False,
date_begin=None, date_end=None,
size=10, n_min=2, n_max=3):
'''
Function to create a cooccurrence Node
---------------------------------------------------
cooccurrences :: [Text] -> [Word] -> [[Word]]
user_id :: Integer, User.id who creates the cooccurrence matrix
corpus_id :: Integer, Node.id with NodeType "Corpus"
mainlist_id :: Integer, Node.id with NodeType "MainList" and with parent_id=corpus_id
stoplist_id :: Integer, Node.id with NodeType "StopList" and with parent_id=corpus_id
lem :: False | True, if lemmatization should be taken into account
stem :: False | True, if stemming should be taken into account
cvalue :: False | True, if C-value should be taken into account
group :: False | True, if manual groups should be taken into account
date_begin :: Datetime, format YYYY-MM-DD, start of the date range used to split the corpus
date_end :: Datetime, format YYYY-MM-DD, end of the date range used to split the corpus
size :: Integer, size of the cooccurrence list
n_min :: Integer, minimal ngram size n
n_max :: Integer, maximal ngram size n
'''
# We create a new node of Type cooccurrence
if corpus_id is not None and user_id is not None:
node_cooc = session.query(Node).filter(
Node.parent_id==corpus_id,
Node.type_id == cache.NodeType['Cooccurrence'].id
).first()
if node_cooc is None:
node_cooc = Node(user_id = user_id,
parent_id=corpus_id,
type_id=cache.NodeType['Cooccurrence'].id,
name="Cooccurrences corpus " + str(corpus_id))
session.add(node_cooc)
session.commit()
else:
print("Usage (Warning): Need corpus_id and user_id")
# Getting the main lists here, by default create or take the first one.
# Getting nodes for lems, stems and cvalue, if needed.
if stem is True:
node_stem = session.query(Node).filter(
Node.type_id==cache.NodeType['Stem'].id).first()
miamNgram = aliased(NodeNgram)
stopNgram = aliased(NodeNgram)
groupNgram = aliased(NodeNgramNgram)
stemNgram = aliased(NodeNgramNgram)
lemNgram = aliased(NodeNgramNgram)
cvalueNgram = aliased(NodeNgramNgram)
# Literal query here
query = (session.query(Node.id, Ngram.id.label('x'), Ngram.id.label('y'), func.count().label('score'))
.join(NodeNgram, NodeNgram.node_id == Node.id)
.join(Ngram, Ngram.id == NodeNgram.ngram_id)
#.outerjoin(stopNgram, stopNgram.ngram_id == Ngram.id)
.filter(Node.parent_id == corpus_id)
.filter(Node.type_id == cache.NodeType['Document'].id)
#.filter(Ngram.n > n_max)
.group_by(Node.id, Ngram.id) # group by node and ngram so the count() is well-defined
#.limit(size)
.all()
)
return(query)
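
# Usage sketch (illustration only, not part of the original commit): run the
# query on the first corpus found in the database; session, Node and cache all
# come from the imports above.
if __name__ == '__main__':
    corpus = session.query(Node).filter(
        Node.type_id == cache.NodeType['Corpus'].id).first()
    if corpus is not None:
        rows = cooccurrences(user_id=corpus.user_id, corpus_id=corpus.id)
        print(rows[:10])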
@@ -17,23 +17,23 @@ def create_blacklist(user, corpus):
def create_synonymes(user, corpus):
pass
size = 1000
def create_whitelist(user, corpus_id, size=size, count_min=2):
cursor = connection.cursor()
whitelist_type_id = cache.NodeType['WhiteList'].id
blacklist_type_id = cache.NodeType['BlackList'].id
type_document_id = cache.NodeType['Document'].id
white_list = Node(name='WhiteList Corpus ' + str(corpus_id), user_id=user.id, parent_id=corpus_id, type_id=whitelist_type_id)
black_list = Node(name='BlackList Corpus ' + str(corpus_id), user_id=user.id, parent_id=corpus_id, type_id=blacklist_type_id)
session.add(white_list)
session.add(black_list)
session.commit()
# delete first to avoid duplicates
# try:
@@ -105,21 +105,21 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start
COUNT(*) AS score
FROM
node_node AS n -- the nodes that are direct children of the corpus
INNER JOIN
node_node_ngram AS nngX ON nngX.node_id = n.id -- list of ngrams contained in the node
INNER JOIN
node_node_ngram AS whitelistX ON whitelistX.ngram_id = nngX.ngram_id -- list of ngrams contained in the whitelist and in the node
INNER JOIN
node_ngram AS ngX ON ngX.id = whitelistX.ngram_id -- ngrams which are in both
INNER JOIN
node_node_ngram AS nngY ON nngY.node_id = n.id
INNER JOIN
node_node_ngram AS whitelistY ON whitelistY.ngram_id = nngY.ngram_id
INNER JOIN
node_ngram AS ngY ON ngY.id = whitelistY.ngram_id
WHERE
n.parent_id = %s
AND
@@ -128,13 +128,13 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start
whitelistY.node_id = %s
AND
nngX.ngram_id < nngY.ngram_id -- so we only get distinct pairs of ngrams
GROUP BY
ngX.id,
ngX.terms,
ngY.id,
ngY.terms
ORDER BY
score DESC
LIMIT
@@ -153,9 +153,9 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
import networkx as nx
from networkx.readwrite import json_graph
from gargantext_web.api import JsonHttpResponse
from analysis.louvain import best_partition
#print(corpus_id, cooc_id)
try:
@@ -172,7 +172,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=size)
else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
@@ -192,41 +192,41 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
x = pd.DataFrame(matrix).fillna(0)
y = pd.DataFrame(matrix).fillna(0)
#xo = diag_null(x)
#y = diag_null(y)
x = x / x.sum(axis=1)
y = y / y.sum(axis=0)
#print(x)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top included or excluded
n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific
m = ( xs - ys) / (2 * (x.shape[0] - 1))
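# Reading of the two scores (an assumption about the intent, not stated in the
# original code): n averages the row and column sums, i.e. how broadly a term
# co-occurs; m is their difference, i.e. whether a term is more generic (high)
# or more specific (low).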
n = n.sort(inplace=False)
m = m.sort(inplace=False)
print(n)
print(m)
nodes_included = 300 #int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific = 300 #int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use the inclusion score for the node size
n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
# Generic:
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
# Specific:
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
x_index = pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)]
@@ -236,26 +236,28 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
# Removing unconnected nodes
xxx = xx.values
threshold = min(xxx.max(axis=1))
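# (reading, as an assumption: the threshold is the smallest row maximum, so
#  every node keeps at least its strongest edge after the filtering below)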
matrix_filtered = np.where(xxx >= threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
except:
PrintException()
try:
G = nx.from_numpy_matrix(matrix_filtered, create_using=nx.MultiDiGraph())
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(xx.columns)])))
#print(G)
# Removing too connected nodes (find automatic way to do it)
#edges_to_remove = [ e for e in G.edges_iter() if
degree = G.degree()
nodes_to_remove = [n for n in degree if degree[n] <= 1]
G.remove_nodes_from(nodes_to_remove)
partition = best_partition(G)
except:
PrintException()
if type == "node_link":
@@ -270,7 +272,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
# G.add_edge(node, "cluster " + str(partition[node]), weight=3)
except Exception as error:
print("error01: ",error)
data = json_graph.node_link_data(G)
links = []
@@ -285,7 +287,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
# print(data)
data["links"] = []
data["links"] = links
elif type == "adjacency":
for node in G.nodes():
try:
@@ -298,7 +300,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
except Exception as error:
print("error02: ",error)
data = json_graph.node_link_data(G)
# data = json_graph.node_link_data(G, attrs={\
# 'source':'source',\
@@ -30,8 +30,8 @@ Install the requirements
5) Type: deactivate
In PostgreSQL, version 9.4 needed
---------------------------------
1) Ensure postgres is started: sudo /etc/init.d/postgresql start
@@ -48,9 +48,8 @@ In PostreSQL
6) psql gargandb
7) CREATE EXTENSION hstore;
8) Ctrl + D
Populate the database
@@ -58,6 +57,8 @@ Populate the database
python manage.py syncdb
run as postgres or gargantua user:
psql -d gargandb -f /srv/gargantext/init/sql/changeDateformat.sql
Last steps of configuration
---------------------------
@@ -65,65 +66,25 @@ Last steps of configuration
1) If your project is not in /srv/gargantext:
ln -s [the project folder] /srv/gargantext
2) Install the libraries:
cd /srv
wget http://dl.gargantext.org/gargantext_lib.tar.bz2
tar xvjf gargantext_lib.tar.bz2
rm gargantext_lib.tar.bz2
3) Init nodetypes and main variables:
/srv/gargantext/manage.py shell < /srv/gargantext/init/init.py
4) Patch CTE:
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/patches/cte_tree.models.diff
5) DO NOT use the default aldjemy package:
cd /tmp
git clone https://github.com/mathieurodic/aldjemy
cd aldjemy
python3 setup.py install
Extras
=======
Last steps of configuration:
----------------------------
1) If your project is not in /srv/gargantext:
ln -s [the project folder] /srv/gargantext
2) build gargantext_lib
wget http://docs.delanoe.org/gargantext_lib.tar.bz2
cd /srv/
sudo tar xvjf gargantext_lib.tar.bz2
sudo chown user:user /srv/gargantext_lib
3) Explorer:
mkdir /srv/gargantext_lib/js
sudo chown -R user:user /srv/gargantext_lib/
cd /srv/gargantext_lib/js
git clone git@github.com:PkSM3/garg.git
4) Adapt all symlinks:
ln -s [your folder for tree tagger] [the project folder]/parsing/Tagger/treetagger
Warning: for ln, paths have to be absolute!
5) patch CTE
patch /srv/gargantext_env/lib/python3.4/site-packages/cte_tree/models.py /srv/gargantext/init/cte_tree.models.diff
6) init nodetypes and main variables
/srv/gargantext/manage.py shell < /srv/gargantext/init/init.py
Start Turbo parser server
-------------------------
@@ -82,9 +82,8 @@ print('Initialize node types...')
node_types = [
'Root', 'Trash',
'Project', 'Corpus', 'Document',
'MiamList', 'StopList', 'MainList',
'Stem', 'Lem', 'Group', 'Tfidf',
'Cooccurrence', 'WhiteList', 'BlackList'
]
@@ -93,6 +92,20 @@ for node_type in node_types:
# Integration: resource types
print('Initialize users...')
me = session.query(User).filter(User.username=='alexandre').first()
gargantua = session.query(User).filter(User.username=='gargantua').first()
node_root = Node(user_id=gargantua.id, type_id=cache.NodeType['Root'].id, name='Root')
session.add(node_root)
session.commit()
# node_root.id is only available after the commit above
node_stem = Node(user_id=gargantua.id, type_id=cache.NodeType['Stem'].id, name='Stem', parent_id=node_root.id)
node_lem = Node(user_id=gargantua.id, type_id=cache.NodeType['Lem'].id, name='Lem', parent_id=node_root.id)
session.add(node_stem)
session.add(node_lem)
session.commit()
print('Initialize resource...')
from parsing.parsers_config import parsers
-- BASIC: computing cooccurrences without taking equivalent stems into account
--
-- SELECT
-- -- %d as node_id,
-- ngX.id,
-- ngY.id,
-- COUNT(*) AS score
--FROM
-- node_node AS n -- the nodes that are direct children of the corpus
--
--INNER JOIN
-- node_node_ngram AS nngX ON nngX.node_id = n.id -- list of ngrams contained in the node
--INNER JOIN
-- node_node_ngram AS mainlistX ON mainlistX.ngram_id = nngX.ngram_id -- list of ngrams contained in the mainlist and in the node
--INNER JOIN
-- node_ngram AS ngX ON ngX.id = mainlistX.ngram_id -- ngrams which are in both
--
--INNER JOIN
-- node_node_ngram AS nngY ON nngY.node_id = n.id
--INNER JOIN
-- node_node_ngram AS mainlistY ON mainlistY.ngram_id = nngY.ngram_id
--INNER JOIN
-- node_ngram AS ngY ON ngY.id = mainlistY.ngram_id
--
--WHERE
-- n.parent_id = 1298
--AND
-- n.type_id = 5
--AND
-- mainlistX.node_id = 1382
--AND
-- mainlistY.node_id = 1382
--AND
-- nngX.ngram_id < nngY.ngram_id -- so we only get distinct pairs of ngrams
--
--GROUP BY
-- ngX.id,
-- ngX.terms,
-- ngY.id,
-- ngY.terms
--
--ORDER BY score DESC
--LIMIT 3
--;
--
-- computing cooccurrences taking equivalent stems into account
SELECT
-- %d as node_id,
ngX.id,
ngY.id,
COUNT(*) AS score
FROM
node_node AS n -- the nodes that are direct children of the corpus
INNER JOIN
node_node_ngram AS nngX ON nngX.node_id = n.id -- list of ngrams contained in the node
INNER JOIN
node_node_ngram AS mainlistX ON mainlistX.ngram_id = nngX.ngram_id -- list of ngrams contained in the mainlist and in the node
INNER JOIN
node_ngram AS ngX ON ngX.id = mainlistX.ngram_id -- ngrams which are in both
LEFT JOIN
node_nodengramngram AS nggXX ON nggXX.node_id = 94
AND nggXX.ngramx_id = ngX.id
LEFT JOIN
node_nodengramngram AS nggXY ON nggXY.node_id = 94
AND nggXX.ngramy_id = nggXY.ngramy_id
AND nggXX.ngramx_id < nggXY.ngramx_id
INNER JOIN
node_node_ngram AS nngY ON nngY.node_id = n.id
INNER JOIN
node_node_ngram AS mainlistY ON mainlistY.ngram_id = nngY.ngram_id
INNER JOIN
node_ngram AS ngY ON ngY.id = mainlistY.ngram_id
LEFT JOIN
node_nodengramngram AS nggYX ON nggYX.node_id = 94
AND nggYX.ngramx_id = ngY.id
LEFT JOIN
node_nodengramngram AS nggYY ON nggYY.node_id = 94
AND nggYX.ngramy_id = nggYY.ngramy_id
AND nggYX.ngramx_id < nggYY.ngramx_id
WHERE
n.parent_id = 1298
AND
n.type_id = 5
AND
mainlistX.node_id = 1382
AND
mainlistY.node_id = 1382
AND
nngX.ngram_id < nngY.ngram_id -- so we only get distinct pairs of ngrams
--AND
-- nggYY.id is NULL
--AND
-- nggXY.id is NULL
GROUP BY
ngX.id,
ngX.terms,
ngY.id,
ngY.terms
ORDER BY score DESC
LIMIT 3
;
-- select all distinct ngrams of the miam list
SELECT count(*) FROM
(
SELECT ngram_id FROM node_node_ngram
WHERE node_id = 1380 --> node.id of the miam list
GROUP BY ngram_id
) as global
;
-- select all ngrams of a corpus that have a stem
SELECT count(*) FROM
(
SELECT ngramx_id FROM node_nodengramngram as ng
INNER JOIN node_node_ngram as nn
ON nn.ngram_id = ng.ngramx_id
INNER JOIN node_node as n
ON n.id = nn.node_id
AND n.parent_id = 1298 --> node.id of the corpus
WHERE ng.node_id = 94 --> node.id of the stem list
GROUP BY ng.ngramx_id
) as global
;
--- select only the distinct ngrams that have equivalent stems
-- inclusive LEFT JOIN on the ngrams that have a stem
-- exclusive LEFT JOIN on the ngrams that share a stem
select count(*) from
(
SELECT ngram_id FROM node_node_ngram as nn
INNER JOIN node_node as n
ON nn.node_id = n.id
AND n.parent_id = 1298 --> node.id of the corpus
LEFT JOIN node_nodengramngram AS nx
ON nx.node_id = 94 --> node.id Stem
AND nx.ngramx_id = nn.ngram_id
LEFT JOIN node_nodengramngram AS ny
ON nx.ngramy_id = ny.ngramy_id
AND nx.node_id = 94 --> node.id Stem
AND nx.ngramx_id < ny.ngramx_id --> to remove duplicates
WHERE nn.node_id = 1380 --> node.id of the miam list
-- AND ny.id is NULL
GROUP BY nn.ngram_id, nx.ngramx_id --, ny.ngramx_id
) as global
;
@@ -35,7 +35,6 @@ extract_ngrams(corpus, ('title', ))
# print(corpus)
# corpus = session.query(Node).filter(Node.id == 72771).first()
# corpus = session.query(Node).filter(Node.id == 73017).first()
compute_tfidf(corpus)
import sys
from admin.utils import PrintException
from gargantext_web.db import NodeNgram
from gargantext_web.db import *
from parsing.corpustools import *
import sqlalchemy
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
# from gargantext_web.db import Node, get_cursor
def nodeList(user_id=None, corpus_id=None, typeList='MiamList'):
'''
nodeList : get or create NodeList.
nodeList :: Integer -> Integer -> String -> [Node]
user_id :: Integer
corpus_id :: Integer
typeList :: String, Type of the Node that should be created
[Node] :: List of (Int, String) tuples, returned or created by the function
'''
if corpus_id is not None and user_id is not None:
# Nodes are either in root_list or user_list
root_list = ['Stem', 'Lem']
user_list = ['MiamList', 'StopList', 'MainList']
if typeList in user_list:
nodes = session.query(Node).filter(
Node.user_id == user_id,
Node.parent_id==corpus_id,
Node.type_id == cache.NodeType[typeList].id
).order_by(desc(Node.id)).all()
elif typeList in root_list:
nodes = session.query(Node).filter(
Node.type_id == cache.NodeType[typeList].id
).order_by(desc(Node.id)).all()
else:
print('typeList not supported yet')
sys.exit(0)
if nodes == []:
node = Node(user_id = user_id,
parent_id=corpus_id,
type_id=cache.NodeType[typeList].id,
name="First default Node " + str(typeList))
session.add(node)
session.commit()
return([(node.id, node.name),])
else:
return([(node.id, node.name) for node in nodes])
else:
print("Usage (Warning): Need corpus_id and user_id")
def stopList(user_id=None, corpus_id=None,
stop_id=None,
reset=False, limit=None
):
'''
Compute the stopList and return its Node.id
'''
if stop_id is None:
stop_id = nodeList(user_id=user_id,
corpus_id=corpus_id,
typeList='StopList')[0][0]
# according to type of corpus, choose the right default stopList
def doList(
type_list='miam',
user_id=None, corpus_id=None,
miam_id=None, stop_id=None, main_id=None,
lem_id=None, stem_id=None, cvalue_id=None, group_id=None,
reset=True, limit=None
):
'''
Compute the miamList and returns its Node.id
miamList = allList - stopList
where:
allList = all Ngrams
stopList = all Stop Ngrams
OR
Compute the mainList : main Forms
mainList = miamList - (stem|lem|group|cvalue) List
where:
group = Words grouped manually by user
stem = equivalent Words which are stemmed (but the main form)
lem = equivalent Words which are lemmatized (but the main form)
cvalue = equivalent N-Words according to C-Value (but the main form)
'''
if type_list not in ['miam', 'main']:
print('Type List supported: \'miam\' or \'main\'')
sys.exit(0)
try:
list_dict = {
'miam' : { 'type' : 'MiamList', 'id' : miam_id},
'stop' : { 'type' : 'StopList', 'id' : stop_id},
}
if 'main' == type_list:
list_dict.update(
{
'main' : { 'type' : 'MainList', 'id' : main_id},
'stem' : { 'type' : 'Stem', 'id' : stem_id},
#'lem' : { 'type' : 'LemList', 'id' : lem_id},
#'group' : { 'type' : 'Group', 'id' : group_id},
}
)
for list_ in list_dict.keys():
if list_dict[list_]['id'] is None:
list_dict[list_]['id'] = nodeList(user_id=user_id,
corpus_id=corpus_id,
typeList=list_dict[list_]['type'])[0][0]
# Delete previous List ?
# By default, miamList is computed each time
if reset is True:
session.query(NodeNgram).filter(
NodeNgram.node_id == list_dict[type_list]['id']
).delete()
except:
PrintException()
stopNgram = aliased(NodeNgram)
if 'miam' == type_list:
query = (session.query(
literal_column(str(list_dict['miam']['id'])).label("node_id"),
Ngram.id,
func.count(),
)
.select_from(Ngram)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, NodeNgram.node_id == Node.id)
.outerjoin(stopNgram,
and_(stopNgram.ngram_id == Ngram.id,
stopNgram.node_id == list_dict['stop']['id']))
.filter(Node.parent_id == corpus_id)
.filter(Node.type_id == cache.NodeType['Document'].id)
.filter(stopNgram.id == None )
.group_by(Ngram.id)
)
elif 'main' == type_list:
# Query to get Ngrams for main list
query = (session.query(
literal_column(str(list_dict['main']['id'])).label("node_id"),
Ngram.id,
func.count(),
)
.select_from(Ngram)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.filter(NodeNgram.node_id == list_dict['miam']['id'])
)
if stem_id is not None:
# Query with stems; result needs to be checked before prod
snn1 = aliased(NodeNgramNgram)
snn2 = aliased(NodeNgramNgram)
query = (query.outerjoin(snn1,
and_(snn1.ngramx_id == Ngram.id,
snn1.node_id == list_dict['stem']['id']
)
)
.outerjoin(snn2,
and_(snn1.ngramy_id == snn2.ngramy_id,
snn2.node_id == list_dict['stem']['id'],
snn1.ngramx_id < snn2.ngramx_id
)
)
.filter(snn2.id == None)
)
# Specific group by:
if stem_id is not None:
query = query.group_by(Ngram.id, snn1.ngramx_id)
else:
query = query.group_by(Ngram.id)
# here add filter for size of the ngram
# Order result by occurrences descending
query = query.order_by(desc(func.count()))
# Adding specific filters
if limit is not None:
query = query.limit(limit)
else:
query = query.all()
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], query)
return(list_dict[type_list]['id'])
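
# Usage sketch (illustration only, not part of the original commit): build the
# miam list and then the main list of a corpus, letting doList() create the
# list nodes through nodeList(); the corpus is simply the first one found.
if __name__ == '__main__':
    corpus = session.query(Node).filter(
        Node.type_id == cache.NodeType['Corpus'].id).first()
    if corpus is not None:
        miam_id = doList(type_list='miam', user_id=corpus.user_id,
                         corpus_id=corpus.id, limit=150)
        main_id = doList(type_list='main', user_id=corpus.user_id,
                         corpus_id=corpus.id, miam_id=miam_id, limit=150)
        print('miam:', miam_id, 'main:', main_id)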
from admin.utils import PrintException
from gargantext_web.db import *
from parsing.corpustools import *
from gargantext_web.db import NodeNgram
from sqlalchemy import and_
from gargantext_web.db import get_cursor, bulk_insert
def get_ngramogram(corpus, limit=None):
"""
Ngram is a composition of ograms (ogram = 1gram)
"""
try:
query = (session
.query(Ngram.id, Ngram.terms)
.outerjoin(NgramNgram, NgramNgram.ngram_id == Ngram.id)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, NodeNgram.node_id == Node.id)
.filter(Node.parent_id == corpus.id, Node.type_id == cache.NodeType['Document'].id)
.filter(Ngram.n > 1)
.filter(NgramNgram.id == None)
.group_by(Ngram.id, Ngram.terms)
)
#print(str(query))
if isinstance(limit, (int,)):
query = query.limit(limit)
return(query.all())
except Exception as error:
PrintException()
def split_ngram(ngram):
if isinstance(ngram, str):
count = 0
result = list()
ngram_splitted = ngram.split(' ')
for x in ngram_splitted:
if count <= len(ngram_splitted):
result.append((ngram_splitted[count], count))
count += 1
return(result)
else:
print("Parameter should be a string.")
def insert_ngramngram(ngramngram):
ngrams = list()
for n in ngramngram:
for i in split_ngram(n[1]):
ngrams.append((n[0], i[0], 1, i[1]))
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__ngram (
id INT,
ngram_id INT,
terms VARCHAR(255) NOT NULL,
terms_id INT,
n INT,
position INT
);
''')
bulk_insert('tmp__ngram', ['ngram_id', 'terms', 'n', 'position'], ngrams, cursor=cursor)
cursor.execute('''
UPDATE
tmp__ngram
SET
terms_id = ngram.id
FROM
%s AS ngram
WHERE
tmp__ngram.terms = ngram.terms
''' % (Ngram.__table__.name,))
cursor.execute('''
INSERT INTO
%s (n, terms)
SELECT
n, terms
FROM
tmp__ngram
WHERE
terms_id IS NULL
''' % (Ngram.__table__.name,))
cursor.execute('''
UPDATE
tmp__ngram
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngram.terms
AND
tmp__ngram.id IS NULL
''' % (Ngram.__table__.name,))
ngram_ids = dict()
cursor.execute('SELECT id, terms FROM tmp__ngram')
for row in cursor.fetchall():
ngram_ids[row[1]] = row[0]
db.commit()
return(ngram_ids)
return(result)
def get_ngrams(corpus, unstemmed=True, unlemmatized=False, n=1, limit=None, count_all=False):
'''
Node with NodeType 'Stem' should be created at the root of the project.
'''
if unstemmed is True:
node_ = session.query(Node).filter(Node.type_id == cache.NodeType['Stem'].id).first()
try:
query = (session
.query(Ngram.id, Ngram.terms)
.outerjoin(NodeNgramNgram, and_(
NodeNgramNgram.ngramx_id == Ngram.id,
NodeNgramNgram.node_id==node_.id)
)
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, NodeNgram.node_id == Node.id)
.filter(Node.parent_id == corpus.id, Node.type_id == cache.NodeType['Document'].id)
.filter(NodeNgramNgram.id == None)
.filter(Ngram.n == n)
.group_by(Ngram.id, Ngram.terms)
)
#print(str(query))
if isinstance(limit, (int,)):
query = query.limit(limit)
if count_all is True:
return(query.count())
else:
return(query.all())
except Exception as error:
print("Error Query:", error)
def get_stems(corpus, n=1, limit=None,
node_stem=session.query(Node).filter(
Node.type_id==cache.NodeType['Stem'].id).first()):
'''
get_stems :: Corpus -> [Stem]
'''
result = set()
if corpus.language_id is None or corpus.language_id == cache.Language['en'].id:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
#stemmer.stem('honeybees')
elif corpus.language_id == cache.Language['fr'].id:
from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()
#stemmer.stem('abeilles')
for ngram_id, word in get_ngrams(corpus, limit=limit, n=n):
result.add((node_stem.id, ngram_id, stemmer.stem(word), n))
return(result)
def get_lems(corpus, n=1, limit=None, node_stem=cache.Node['Lem']):
'''
get_lems :: Corpus -> [Lem]
'''
result = set()
if corpus.language_id is None or corpus.language_id == cache.Language['en'].id:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
#stemmer.stem('honeybees')
elif corpus.language_id == cache.Language['fr'].id:
from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()
#stemmer.stem('abeilles')
for ngram_id, word in get_ngrams(corpus, limit=limit, n=n):
result.add((node_stem.id, ngram_id, stemmer.stem(word), n))
return(result)
def insert_ngrams(stems):
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__ngram (
id INT,
terms VARCHAR(255) NOT NULL,
n INT
);
''')
bulk_insert('tmp__ngram', ['terms', 'n'], stems, cursor=cursor)
cursor.execute('''
UPDATE
tmp__ngram
SET
id = ngram.id
FROM
%s AS ngram
WHERE
tmp__ngram.terms = ngram.terms
''' % (Ngram.__table__.name,))
cursor.execute('''
INSERT INTO
%s (n, terms)
SELECT
n, terms
FROM
tmp__ngram
WHERE
id IS NULL
''' % (Ngram.__table__.name,))
cursor.execute('''
UPDATE
tmp__ngram
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngram.terms
AND
tmp__ngram.id IS NULL
''' % (Ngram.__table__.name,))
ngram_ids = dict()
cursor.execute('SELECT id, terms FROM tmp__ngram')
for row in cursor.fetchall():
ngram_ids[row[1]] = row[0]
db.commit()
return(ngram_ids)
def insert_nodengramstem(node_ngram_stem):
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__nnn (
id INT,
node_id INT,
ngramx_id INT,
ngramy_id INT
);
''')
bulk_insert('tmp__nnn',
['node_id', 'ngramx_id', 'ngramy_id'],
node_ngram_stem, cursor=cursor)
# nnn = NodeNgramNgram
cursor.execute('''
UPDATE
tmp__nnn
SET
id = nnn.id
FROM
%s AS nnn
WHERE
tmp__nnn.node_id = nnn.node_id
AND
tmp__nnn.ngramx_id = nnn.ngramx_id
AND
tmp__nnn.ngramy_id = nnn.ngramy_id
''' % (NodeNgramNgram.__table__.name,))
cursor.execute('''
INSERT INTO
%s (node_id, ngramx_id, ngramy_id, score)
SELECT
node_id, ngramx_id, ngramy_id, 1
FROM
tmp__nnn
WHERE
id is NULL
''' % (NodeNgramNgram.__table__.name,))
db.commit()
def stem_corpus(corpus_id=None):
'''
Returns Int as id of the Stem Node
stem_corpus :: Int
'''
corpus = session.query(Node).filter(Node.id == corpus_id).first()
print('Number of new ngrams to stem:',
get_ngrams(corpus, n=2, count_all=True))
if corpus is not None:
try:
result = get_stems(corpus, n=2)
stems = set([(stem[2], stem[3]) for stem in result])
print('Number of new stems', len(stems))
stem_ids = insert_ngrams(stems)
node_ngram_stem = set([ (ngram[0],
ngram[1],
stem_ids[ngram[2]]
) for ngram in list(result) ])
print(list(node_ngram_stem)[:3])
insert_nodengramstem(node_ngram_stem)
except:
PrintException()
else:
print('Usage: stem_corpus(corpus_id=corpus.id)')
# Without this, we couldn't use the Django environment
from admin.env import *
from ngram.stemLem import *
from ngram.lists import *
#from cooccurrences import *
#from gargantext_web.views import empty_trash
#empty_trash()
#
#user = session.query(User).all()[0]
user = session.query(User).filter(User.username=='alexandre').first()
print('Current user is:', user.username)
project = session.query(Node).filter(Node.name == 'Test').first()
if project is None:
project = Node(
name = 'Test',
type_id = cache.NodeType['Project'].id,
user_id = user.id
)
session.add(project)
session.commit()
#corpora = session.query(Node).filter(Node.parent_id == project.id,
# Node.type_id == cache.NodeType['Corpus'].id
# ).delete()
#
#models.Node.objects(parent_id = project.id, type_id = cache.NodeType['Corpus']).all().delete()
#
corpus = session.query(Node).filter(Node.parent_id == project.id,
Node.type_id == cache.NodeType['Corpus'].id).first()
if corpus is None:
corpus = Node(
parent_id = project.id,
name = 'Test Corpus',
type_id = cache.NodeType['Corpus'].id,
user_id = user.id
)
session.add(corpus)
session.commit()
add_resource(corpus,
file = '/srv/gargantext_lib/data_samples/pubmed.zip',
# #file = '/srv/gargantext_lib/data_samples/pubmed_2013-04-01_HoneyBeesBeeBees.xml',
type_id = cache.ResourceType['Pubmed (xml format)'].id,
)
parse_resources(corpus)
extract_ngrams(corpus, ('title', 'abstract'))
compute_tfidf(corpus)
# Stemming the corpus
print('Working on corpus:', corpus.id, corpus.name)
stem_id = stem_corpus(corpus_id=corpus.id)
print('Stem Node.id is', stem_id)
for typeList in ['MiamList', 'StopList', 'MainList', 'Stem']:
n = nodeList(user_id=user.id,
corpus_id=corpus.id,
typeList=typeList)
print(n)
type_list='miam'
try:
d = doList(type_list=type_list, user_id = user.id, corpus_id = corpus.id, stem_id=stem_id, limit=150)
print('Size of the ' + type_list + ' list:',
session.query(NodeNgram).filter(NodeNgram.node_id == d).count()
)
except:
PrintException()