humanities / gargantext

Commit fe23f25f, authored Sep 12, 2017 by Alexandre Delanoë

Merge branch 'testing' into stable

parents a3e8e25d c12a0dae

Showing 31 changed files with 3172 additions and 401 deletions (+3172 / -401)
docs/contribution.md                                           +1    -1
gargantext/constants.py                                        +1    -1
gargantext/models/base.py                                      +21   -1
gargantext/models/nodes.py                                     +6    -2
gargantext/util/crawlers/HAL.py                                +30   -19
gargantext/util/crawlers/sparql/bool2sparql.py                 +2    -1
gargantext/util/db.py                                          +0    -14
gargantext/util/group_tools.py                                 +17   -14
gargantext/util/http.py                                        +2    -1
gargantext/util/lists.py                                       +3    -0
gargantext/util/ngramlists_tools.py                            +47   -33
gargantext/util/parsers/CSV.py                                 +51   -112
gargantext/util/parsers/HAL.py                                 +50   -36
gargantext/util/toolchain/ngrams_extraction.py                 +38   -37
gargantext/views/api/ngramlists.py                             +5    -0
gargantext/views/api/urls.py                                   +46   -38
install/gargamelle/requirements.txt                            +1    -0
install/notebook.run                                           +1    -1
install/notebook/Dockerfile                                    +29   -24
install/notebook/gargantext_notebook.py                        +142  -32
moissonneurs/cern.py                                           +4    -1
moissonneurs/hal.py                                            +4    -1
moissonneurs/isidore.py                                        +4    -1
moissonneurs/istex.py                                          +4    -1
moissonneurs/multivac.py                                       +4    -1
moissonneurs/pubmed.py                                         +5    -1
...ooks/.ipynb_checkpoints/AdvancedTutorial-checkpoint.ipynb   +31   -2
notebooks/AdvancedTutorial.ipynb                               +759  -0
notebooks/gargantext_core_tutorial.ipynb                       +1820 -0
templates/pages/projects/overview.html                         +1    -0
templates/pages/projects/project.html                          +43   -26
docs/contribution.md

```diff
@@ -2,7 +2,7 @@
 ## Community
 * [http://gargantext.org/about](http://gargantext.org/about)
-* IRC Chat: (OFTC/FreeNode) #gargantex
+* IRC Chat: (OFTC/FreeNode) #gargantext
 ##Tools
 * gogs
```
gargantext/constants.py

```diff
@@ -263,7 +263,7 @@ RESOURCETYPES = [
     },
     {   "type": 11,
-        "name": 'HAL [API]',
+        "name": 'HAL (english) [API]',
         "parser": "HalParser",
         "format": 'JSON',
         'file_formats': ["zip", "json"],
```
gargantext/models/base.py

```diff
 from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
-from sqlalchemy.orm import relationship
+from sqlalchemy.orm import relationship, validates
 from sqlalchemy.types import TypeDecorator, \
                              Integer, Float, Boolean, DateTime, String, Text
 from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
@@ -7,6 +7,7 @@ from sqlalchemy.ext.mutable import MutableDict, MutableList
 from sqlalchemy.ext.declarative import declarative_base

 __all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship",
+           "validates", "ValidatorMixin",
            "Integer", "Float", "Boolean", "DateTime", "String", "Text",
            "TypeDecorator",
            "JSONB", "Double",
@@ -18,6 +19,25 @@ __all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship",
 # all tables handled by Alembic migration scripts.
 Base = declarative_base()

 # To be used by tables already handled by Django ORM, such as User model. We
 # separate them in order to keep those out of Alembic sight.
 DjangoBase = declarative_base()
+
+
+class ValidatorMixin(object):
+    def enforce_length(self, key, value):
+        """Truncate a string according to its column length
+
+        Usage example:
+
+        .. code-block:: python
+
+            @validates('some_column')
+            def validate_some_column(self, key, value):
+                self.enforce_length(key, value)
+        """
+        max_len = getattr(self.__class__, key).prop.columns[0].type.length
+        if value and len(value) > max_len:
+            return value[:max_len]
+        return value
```
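A minimal sketch of the new mixin in action, outside Gargantext (the Book model and its 10-character column are hypothetical, for illustration only):

```python
from sqlalchemy.orm import validates
from sqlalchemy.schema import Column
from sqlalchemy.types import Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class ValidatorMixin(object):
    def enforce_length(self, key, value):
        # Read the length limit declared on the mapped column
        max_len = getattr(self.__class__, key).prop.columns[0].type.length
        if value and len(value) > max_len:
            return value[:max_len]
        return value

class Book(ValidatorMixin, Base):  # hypothetical model
    __tablename__ = 'books'
    id = Column(Integer, primary_key=True)
    name = Column(String(10))

    @validates('name')
    def validate_name(self, key, value):
        return self.enforce_length(key, value)

b = Book(name="a very long title indeed")
print(b.name)  # 'a very lon' -- silently truncated to the 10-char limit
```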
gargantext/models/nodes.py

```diff
@@ -9,7 +9,7 @@ from datetime import datetime
 from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
                   Integer, Float, String, DateTime, JSONB, \
-                  MutableList, MutableDict
+                  MutableList, MutableDict, validates, ValidatorMixin
 from .users import User

 __all__ = ['Node', 'NodeNode', 'CorpusNode']
@@ -26,7 +26,7 @@ class NodeType(TypeDecorator):
         return NODETYPES[typeindex]


-class Node(Base):
+class Node(ValidatorMixin, Base):
     """This model can fit many purposes:

         myFirstCorpus = session.query(CorpusNode).first()
@@ -112,6 +112,10 @@ class Node(Base):
                'user_id={0.user_id}, parent_id={0.parent_id}, ' \
                'name={0.name!r}, date={0.date})>'.format(self)

+    @validates('name')
+    def validate_name(self, key, value):
+        return self.enforce_length(key, value)
+
     @property
     def ngrams(self):
         """Pseudo-attribute allowing to retrieve a node's ngrams.
```
gargantext/util/crawlers/HAL.py

```diff
@@ -14,12 +14,12 @@ from gargantext.util.files import save
 class HalCrawler(Crawler):
     ''' HAL API CLIENT'''

     def __init__(self):
         # Main EndPoints
         self.BASE_URL = "https://api.archives-ouvertes.fr"
         self.API_URL  = "search"

         # Final EndPoints
         # TODO : Change endpoint according type of database
         self.URL = self.BASE_URL + "/" + self.API_URL
@@ -29,28 +29,39 @@ class HalCrawler(Crawler):
         '''formating the query'''
         #search_field="title_t"
-        search_field = "abstract_t"
-        return (search_field + ":" + "(" + query + ")")
+        #search_field="abstract_t"
+        #return (search_field + ":" + "(" + query + ")")
+        return "(" + query + ")"

     def _get(self, query, fromPage=1, count=10, lang=None):
         # Parameters
-        fl = """ title_s
+        fl = """ docid
+               , title_s
                , abstract_s
+               , en_title_s
+               , en_abstract_s
                , submittedDate_s
                , journalDate_s
                , authFullName_s
                , uri_s
                , isbn_s
                , issue_s
+               , journalTitle_s
+               , language_s
+               , doiId_s
+               , authId_i
+               , instStructId_i
+               , deptStructId_i
+               , labStructId_i
+               , rteamStructId_i
                , docType_s
-               , journalPublisher_s
            """
                #, authUrl_s
                #, type_s

         wt = "json"

         querystring = { "q"  : query
@@ -59,18 +70,18 @@ class HalCrawler(Crawler):
                       , "fl" : fl
                       , "wt" : wt
                       }

         # Specify Headers
         headers = { "cache-control" : "no-cache" }

         # Do Request and get response
         response = requests.request( "GET"
                                    , self.URL
                                    , headers = headers
                                    , params  = querystring
                                    )
         #print(querystring)
         # Validation : 200 if ok else raise Value
         if response.status_code == 200:
@@ -81,27 +92,27 @@ class HalCrawler(Crawler):
             return (json.loads(response.content.decode(charset)))
         else:
             raise ValueError(response.status_code, response.reason)

     def scan_results(self, query):
         '''
         scan_results : Returns the number of results
         Query String -> Int
         '''
         self.results_nb = 0
         total = ( self._get(query)
                       .get("response", {})
                       .get("numFound", 0)
                 )
         self.results_nb = total
         return self.results_nb

     def download(self, query):
         downloaded = False
         self.status.append("fetching results")
         corpus = []
@@ -113,9 +124,9 @@ class HalCrawler(Crawler):
             msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
                                                             , QUERY_SIZE_N_MAX
                                                             )
-            print("ERROR (scrap: Multivac d/l ): " , msg)
+            print("ERROR (scrap: HAL d/l ): " , msg)
             self.query_max = QUERY_SIZE_N_MAX

         #for page in range(1, trunc(self.query_max / 100) + 2):
         for page in range(0, self.query_max, paging):
             print("Downloading page %s to %s results" % (page, paging))
@@ -132,5 +143,5 @@ class HalCrawler(Crawler):
                          , basedir=UPLOAD_DIRECTORY
                          )
             downloaded = True

         return downloaded
```
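For reference, the same public endpoint can be queried standalone. A minimal sketch (the field list is shortened, and the Solr-style `start`/`rows` paging parameters are an assumption; only `q`, `fl` and `wt` are confirmed by the diff above):

```python
import requests

url = "https://api.archives-ouvertes.fr/search"
params = {
    "q": "(graph theory)",   # query already wrapped in parentheses, as in _format_query
    "fl": "docid,title_s,en_title_s,submittedDate_s",
    "wt": "json",
    "start": 0,              # assumed paging parameters
    "rows": 10,
}
response = requests.get(url, params=params, headers={"cache-control": "no-cache"})
response.raise_for_status()

# Same response shape that scan_results() reads: response -> numFound
print(response.json()["response"]["numFound"])
```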
gargantext/util/crawlers/sparql/bool2sparql.py

```diff
@@ -2,6 +2,7 @@
 import subprocess
 import re
 from .sparql import Service
+from gargantext.settings import BOOL_TOOLS_PATH

 #from sparql import Service

 def bool2sparql(rawQuery, count=False, offset=None, limit=None):
@@ -12,7 +13,7 @@ def bool2sparql(rawQuery, count=False, offset=None, limit=None):
     See: https://github.com/delanoe/bool2sparql
     """
     query = re.sub("\"", "\'", rawQuery)
-    bashCommand = ["/srv/gargantext/gargantext/util/crawlers/sparql/bool2sparql-exe", "-q", query]
+    bashCommand = [BOOL_TOOLS_PATH + "/bool2sparql-exe", "-q", query]

     if count is True:
         bashCommand.append("-c")
```
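The hardcoded binary path moves into Django settings. A sketch of the corresponding entry (the exact value is an assumption, taken from the directory that was removed above):

```python
# gargantext/settings.py -- sketch, not the actual file
BOOL_TOOLS_PATH = "/srv/gargantext/gargantext/util/crawlers/sparql"
```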
gargantext/util/db.py

```diff
@@ -5,15 +5,9 @@ from gargantext.util.json import json_dumps
 ########################################################################
 # get engine, session, etc.
 ########################################################################
-import sqlalchemy as sa
 from sqlalchemy.orm import sessionmaker, scoped_session
-from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy import delete

-# To make Full Text search possible, uncomment lines below
-# (and install it with pip before)
-#from sqlalchemy_searchable import make_searchable

 def get_engine():
     from sqlalchemy import create_engine
     return create_engine(settings.DATABASES['default']['URL']
@@ -24,16 +18,8 @@ def get_engine():
 engine = get_engine()

-# To make Full Text search possible, uncomment lines below
-# https://sqlalchemy-searchable.readthedocs.io/
-#sa.orm.configure_mappers()
-Base = declarative_base()
-#Base.metadata.create_all(engine)
-#make_searchable()

 session = scoped_session(sessionmaker(bind=engine))

 ########################################################################
 # useful for queries
 ########################################################################
```
gargantext/util/group_tools.py

```diff
@@ -7,7 +7,7 @@ from gargantext.util.db import session, aliased
 from gargantext.models import Ngram, NodeNgramNgram

 from igraph import Graph  # for group_union

-def query_groups(groupings_id, details=False):
+def query_groups(groupings_id, details=False, sort=False):
     """
     Listing of couples (mainform, subform)
       aka (ngram1_id, ngram2_id)
@@ -15,24 +15,27 @@ def query_groups(groupings_id, details=False):
     Parameter:
       - details: if False, just send the array of couples
                  if True, send quadruplets with (ngram1_id, term1, ngram2_id, term2)
+      - sort: order results by terms of ngram1 then ngram2
     """
+    if details or sort:
+        Ngram1, Ngram2 = Ngram, aliased(Ngram)
+
     if not details:
         # simple contents
-        query = session.query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
+        columns = (NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
     else:
         # detailed contents (id + terms)
-        Ngram1 = aliased(Ngram)
-        Ngram2 = aliased(Ngram)
-        query = (session
-                    .query(
-                        NodeNgramNgram.ngram1_id,
-                        Ngram1.terms,
-                        NodeNgramNgram.ngram2_id,
-                        Ngram2.terms,
-                     )
-                    .join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
-                    .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id)
-                )
+        columns = (Ngram1.id, Ngram1.terms,
+                   Ngram2.id, Ngram2.terms)
+
+    query = session.query(*columns)
+
+    if details or sort:
+        query = (query.join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
+                      .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id))
+
+    if sort:
+        query = query.order_by(Ngram1.terms, Ngram2.terms)

     # main filter
     # -----------
```
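A usage sketch of the new keyword, as the CSV export now calls it (the grouping-node id 42 is made up):

```python
from gargantext.util.group_tools import query_groups

# couples (ngram1_id, ngram2_id), unsorted -- same behavior as before
couples = query_groups(42).all()

# quadruplets (ngram1_id, term1, ngram2_id, term2), ordered by term1 then term2
quadruplets = query_groups(42, details=True, sort=True).all()
```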
gargantext/util/http.py

```diff
@@ -73,7 +73,8 @@ from rest_framework.views import APIView
 from gargantext.util.json import json_encoder

 def JsonHttpResponse(data, status=200):
     return HttpResponse(
-        content      = json_encoder.encode(data),
+        content      = data.encode('utf-8') if isinstance(data, str) else \
+                       json_encoder.encode(data),
         content_type = 'application/json; charset=utf-8',
         status       = status
     )
```
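The change lets callers pass an already-serialized JSON string straight through instead of double-encoding it. A standalone sketch of the two paths (the encoder stub stands in for gargantext's json_encoder):

```python
import json

class json_encoder_stub:
    # stand-in for gargantext.util.json.json_encoder, for illustration only
    @staticmethod
    def encode(data):
        return json.dumps(data)

def content_for(data):
    # mirrors the new JsonHttpResponse logic
    return data.encode('utf-8') if isinstance(data, str) else \
           json_encoder_stub.encode(data)

print(content_for('{"already": "serialized"}'))  # bytes, passed through as-is
print(content_for({"log": ["ok"]}))              # dict, encoded to JSON
```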
gargantext/util/lists.py

```diff
@@ -50,6 +50,9 @@ class _BaseClass:
         else:
             return NotImplemented

+    def __len__(self):
+        return len(self.items)
+
     def __repr__(self):
         items = self.items
         if isinstance(items, defaultdict):
```
gargantext/util/ngramlists_tools.py

```diff
@@ -8,8 +8,7 @@ Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
 """
 from gargantext.util.group_tools import query_groups, group_union
-from gargantext.util.db         import session, desc, func, \
-                                       bulk_insert_ifnotexists
+from gargantext.util.db         import session, bulk_insert_ifnotexists
 from gargantext.models          import Ngram, NodeNgram, NodeNodeNgram, \
                                        NodeNgramNgram, Node
@@ -25,7 +24,6 @@ from gargantext.util.toolchain.ngrams_extraction import normalize_forms
 # merge will also index the new ngrams in the docs of the corpus
 from gargantext.util.toolchain.ngrams_addition import index_new_ngrams

-from sqlalchemy.sql    import exists
 from os                import path
 from csv               import writer, reader, QUOTE_MINIMAL
 from collections       import defaultdict
@@ -35,8 +33,8 @@ from celery import shared_task
 def query_list(list_id,
                pagination_limit=None, pagination_offset=None,
-               details=False, scoring_metric_id=None, groupings_id=None
-               ):
+               details=False, scoring_metric_id=None, groupings_id=None,
+               sort=False):
     """
     Paginated listing of ngram_ids in a NodeNgram lists.
@@ -51,6 +49,7 @@ def query_list(list_id,
                  (for details and sorting)
       - groupings_id: optional id of a list of grouping relations (synonyms)
                       (each synonym will be added to the list if not already in there)
+      - sort: order by Ngram.terms (not possible if details is False)

     FIXME: subforms appended recently and not generalized enough
            => add a common part for all "if groupings_id"
@@ -125,7 +124,10 @@ def query_list(list_id,
         query = query.limit(pagination_limit)

     if pagination_offset:
-        query = query.offset(pagination_offsets)
+        query = query.offset(pagination_offset)
+
+    if details and sort:
+        query = query.order_by(Ngram.terms)

     return query
@@ -186,9 +188,7 @@ def ngrams_to_csv_rows(ngram_objs, ngram_dico={}, group_infos={},
         # 3 columns = |status,      | mainform,  | forms
         #   (type_of_list)   ( term )    ( subterm1|&|subterm2 )
-        csv_rows.append(
-                [list_type, ng_obj.terms, this_grouped_terms]
-                )
+        csv_rows.append([list_type, ng_obj.terms, this_grouped_terms])

     return csv_rows
@@ -231,9 +231,10 @@ def export_ngramlists(node,fname=None,delimiter=DEFAULT_CSV_DELIM,titles=True):
     #    listes de ngram_ids correspondantes
     # ------------------------------------
     #  contenu: liste des objets ngrammes [(2562,"monterme",1),...]
-    stop_ngrams  = query_list(stoplist_node.id, details=True, groupings_id=group_node.id).all()
-    main_ngrams  = query_list(mainlist_node.id, details=True, groupings_id=group_node.id).all()
-    map_ngrams   = query_list(maplist_node.id,  details=True, groupings_id=group_node.id).all()
+    stop_ngrams, main_ngrams, map_ngrams = (
+        query_list(n.id, details=True, groupings_id=group_node.id, sort=True).all()
+        for n in (stoplist_node, mainlist_node, maplist_node)
+    )

     # pour debug ---------->8 --------------------
     #~ stop_ngrams = stop_ngrams[0:10]
@@ -250,7 +251,7 @@ def export_ngramlists(node,fname=None,delimiter=DEFAULT_CSV_DELIM,titles=True):
     #  for the groups we got couples of ids in the DB
     # -------------------
     # ex: [(3544, 2353), (2787, 4032), ...]
-    group_ngram_id_couples = query_groups(group_node.id).all()
+    group_ngram_id_couples = query_groups(group_node.id, sort=True)

     # we expend this to double structure for groups lookup
     #  1) g['links'] = k couples (x,y_i) as a set [x => {y1,y2}]
@@ -397,6 +398,9 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
     NB: To merge the imported lists into a corpus node's lists,
         chain this function with merge_ngramlists()
     '''
+    list_types = ['stop','main','map']
+
     # ---------------
     #  ngram storage
     # ---------------
@@ -461,7 +465,6 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
         # headers
         if i == 0:
-            n_cols = len(csv_row)
             for j, colname in enumerate(csv_row):
                 if colname in ['label', 'status', 'forms']:
                     columns[colname] = j
@@ -508,31 +511,30 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
             continue

         # --- check correct list type
-        if not this_list_type in ['stop','main','map']:
+        if not this_list_type in list_types:
             print("IMPORT WARN: (skip line) wrong list type at CSV %s:l.%i" % (fname, i))
             continue

         # subforms can be duplicated (in forms and another label)
         # but we must take care of unwanted other duplicates too
-        if this_row_label in imported_unique_ngramstrs:
-            print("TODO IMPORT DUPL: (skip line) term appears more than once at CSV %s:l.%i"
-                    % (fname, i))
+        if imported_unique_ngramstrs.get(this_row_label) == 1:
+            print("TODO IMPORT DUPL: (skip line) term %r appears more than once at CSV %s:l.%i"
+                    % (this_row_label, fname, i))

         # ================= Store the data ====================
         # the ngram census
-        imported_unique_ngramstrs[this_row_label] = True
+        imported_unique_ngramstrs[this_row_label] = 1

         # and the "list to ngram" relation
         imported_nodes_ngrams[this_list_type].append(this_row_label)

         # ====== Store synonyms from the import (if any) ======
         if len(this_row_forms) != 0:
-            other_terms = []
             for raw_term_str in this_row_forms.split(group_delimiter):

                 # each subform is also like an ngram declaration
                 term_str = normalize_forms(normalize_chars(raw_term_str))
-                imported_unique_ngramstrs[term_str] = True
+                imported_unique_ngramstrs[term_str] = 2
                 imported_nodes_ngrams[this_list_type].append(term_str)

                 # the optional repeated mainform doesn't interest us
@@ -610,7 +612,10 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
            % (n_total_ng, n_added_ng, n_total_ng-n_added_ng) )
     print("IMPORT: read %i grouping relations" % n_group_relations)

+    # print("IMPORT RESULT", result)
+    list_counts = [(typ, len(result.get(typ))) for typ in list_types]
+    list_counts.append(('total', sum(x[1] for x in list_counts)))
+    print("IMPORT: " + '; '.join('%s %s' % stats for stats in list_counts))

     return result
@@ -718,9 +723,11 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
     # ======== Merging all involved ngrams =========

-    # all memberships with resolved conflicts of interfering memberships
+    # all ngram memberships with resolved conflicts of interfering memberships
+    # (associates ngram ids with list types -- see linfos definition above)
     resolved_memberships = {}

+    # iterates over each ngram of each list type for both old and new lists
     for list_set in [old_lists, new_lists]:
         for lid, info in enumerate(linfos):
             list_type = info['key']
@@ -749,12 +756,15 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
     # ======== Merging old and new groups =========
     # get the arcs already in the target DB (directed couples)
-    previous_links = session.query(
-       NodeNgramNgram.ngram1_id,
-       NodeNgramNgram.ngram2_id
-    ).filter(
-       NodeNgramNgram.node_id == old_group_id
-    ).all()
+    if 'groupings' in del_originals:
+        previous_links = []
+    else:
+        previous_links = session.query(
+           NodeNgramNgram.ngram1_id,
+           NodeNgramNgram.ngram2_id
+        ).filter(
+           NodeNgramNgram.node_id == old_group_id
+        ).all()

     n_links_previous = len(previous_links)
@@ -822,7 +832,7 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
             list_type = linfos[lid]['key']
             merged_results[list_type].items.add(ng_id)

-    # print("IMPORT: added %i elements in the lists indices" % added_nd_ng)
+    print("IMPORT: added %i elements in the lists indices" % added_nd_ng)

     # ======== Overwrite old data with new =========
     for lid, info in enumerate(linfos):
@@ -845,13 +855,17 @@ def import_and_merge_ngramlists(file_contents, onto_corpus_id, overwrite=False):
     """
     A single function to run import_ngramlists and merge_ngramlists together
     """
-    print("import list")
+    print("IMPORT CSV termlists file with %s lines in corpus %s (%s)" % (
+        len(file_contents), onto_corpus_id, 'overwrite' if overwrite else 'merge'))
+
     new_lists = import_ngramlists(file_contents)

-    corpus_node = session.query(Node).filter(Node.id == onto_corpus_id).first()
+    corpus_node = session.query(Node).get(onto_corpus_id)

     # merge the new_lists onto those of the target corpus
-    del_originals = ['stop', 'main', 'map'] if overwrite else []
+    del_originals = ['stop', 'main', 'map', 'groupings'] if overwrite else []
     log_msg = merge_ngramlists(new_lists, onto_corpus=corpus_node, del_originals=del_originals)
     return log_msg
```
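The census dict switches from booleans to small ints so the importer can tell how a term was first seen (1 = as a row label, 2 = as a subform) and only warn on label/label duplicates. A standalone sketch of that rule:

```python
imported_unique_ngramstrs = {}

def see_label(term):
    # warn only when the same *label* was already seen as a label
    if imported_unique_ngramstrs.get(term) == 1:
        print("duplicate label: %r" % term)
    imported_unique_ngramstrs[term] = 1

def see_subform(term):
    imported_unique_ngramstrs[term] = 2

see_label("banana")
see_subform("bananas")
see_label("bananas")   # no warning: first seen as a subform (2)
see_label("banana")    # warning: already seen as a label (1)
```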
gargantext/util/parsers/CSV.py

```diff
@@ -4,128 +4,67 @@ import sys
 import csv
 csv.field_size_limit(sys.maxsize)
 import numpy as np
-import os


 class CSVParser(Parser):
+    DELIMITERS = ", \t;|:"

-    def CSVsample(self, small_contents, delim):
-        reader = csv.reader(small_contents, delimiter=delim)
-
-        Freqs = []
-        for row in reader:
-            Freqs.append(len(row))
-
-        return Freqs
+    def detect_delimiter(self, lines, sample_size=10):
+        sample = lines[:sample_size]
+
+        # Compute frequency of each delimiter on each input line
+        delimiters_freqs = {
+            d: [line.count(d) for line in sample]
+            for d in self.DELIMITERS
+        }
+
+        # Select delimiters with a standard deviation of zero, ie. delimiters
+        # for which we have the same number of fields on each line
+        selected_delimiters = [
+            (d, np.sum(freqs))
+            for d, freqs in delimiters_freqs.items()
+            if any(freqs) and np.std(freqs) == 0
+        ]
+
+        if selected_delimiters:
+            # Choose the delimiter with highest frequency amongst selected ones
+            sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1])
+            return sorted_delimiters[-1][0]

     def parse(self, filebuf):
         print("CSV: parsing (assuming UTF-8 and LF line endings)")
         contents = filebuf.read().decode("UTF-8").split("\n")
-        sample_size = 10
-        sample_contents = contents[0:sample_size]
-        hyperdata_list = []
-
-        # # = = = = [ Getting delimiters frequency ] = = = = #
-        PossibleDelimiters = [ ',',' ','\t', ';', '|', ':' ]
-        AllDelimiters = {}
-        for delim in PossibleDelimiters:
-            AllDelimiters[delim] = self.CSVsample(sample_contents, delim)
-        # # = = = = [ / Getting delimiters frequency ] = = = = #
-        # # OUTPUT example:
-        # #  AllDelimiters = {
-        # #     '\t': [1, 1, 1, 1, 1],
-        # #     ' ': [1, 13, 261, 348, 330],
-        # #     ',': [15, 15, 15, 15, 15],
-        # #     ';': [1, 1, 1, 1, 1],
-        # #     '|': [1, 1, 1, 1, 1]
-        # #  }
-
-        # # = = = = [ Stand.Dev=0 & Sum of delimiters ] = = = = #
-        Delimiters = []
-        for d in AllDelimiters:
-            freqs = AllDelimiters[d]
-            suma = np.sum(freqs)
-            if suma > 0:
-                std = np.std(freqs)
-                # print [ d , suma , len(freqs) , std]
-                if std == 0:
-                    Delimiters.append([d, suma, len(freqs), std])
-        # # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = #
-        # # OUTPUT example:
-        # #  Delimiters = [
-        # #     ['\t', 5, 5, 0.0],
-        # #     [',', 75, 5, 0.0],
-        # #     ['|', 5, 5, 0.0]
-        # #  ]
-
-        # # = = = = [ Delimiter selection ] = = = = #
-        Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True)
-        HighestDelim = Sorted_Delims[0][0]
-        # HighestDelim = ","
-        print("CSV selected delimiter:",[HighestDelim])
-        # # = = = = [ / Delimiter selection ] = = = = #
-
-        # # = = = = [ First data coordinate ] = = = = #
-        Coords = {"row": -1, "column": -1}
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum % 250 == 0:
-                print("CSV row: ", rownum)
-            joined_tokens = "".join(tokens)
-            if Coords["row"] < 0 and len(joined_tokens) > 0:
-                Coords["row"] = rownum
-                for columnum in range(len(tokens)):
-                    t = tokens[columnum]
-                    if len(t) > 0:
-                        Coords["column"] = columnum
-                        break
-        # # = = = = [ / First data coordinate ] = = = = #
-
-        # # = = = = [ Setting Headers ] = = = = #
-        Headers_Int2Str = {}
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum >= Coords["row"]:
-                for columnum in range(Coords["column"], len(tokens)):
-                    t = tokens[columnum]
-                    Headers_Int2Str[columnum] = t
-                break
-        # print("Headers_Int2Str")
-        # print(Headers_Int2Str)
-        # # = = = = [ / Setting Headers ] = = = = #
-        # # OUTPUT example:
-        # #  Headers_Int2Str = {
-        # #     0: 'publication_date',
-        # #     1: 'publication_month',
-        # #     2: 'publication_second',
-        # #     3: 'abstract'
-        # #  }
-
-        # # = = = = [ Reading the whole CSV and saving ] = = = = #
-        hyperdata_list = []
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum > Coords["row"]:
-                RecordDict = {}
-                for columnum in range(Coords["column"], len(tokens)):
-                    data = tokens[columnum]
-                    RecordDict[Headers_Int2Str[columnum]] = data
-                if len(RecordDict.keys()) > 0:
-                    hyperdata_list.append(RecordDict)
-        # # = = = = [ / Reading the whole CSV and saving ] = = = = #
-
-        return hyperdata_list
+
+        # Filter out empty lines
+        contents = [line for line in contents if line.strip()]
+
+        # Delimiter auto-detection
+        delimiter = self.detect_delimiter(contents, sample_size=10)
+
+        if delimiter is None:
+            raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")
+
+        print("CSV: selected delimiter: %r" % delimiter)
+
+        # Parse CSV
+        reader = csv.reader(contents, delimiter=delimiter)
+
+        # Get first not empty row and its fields (ie. header row), or (0, [])
+        first_row, headers = \
+            next(((i, fields) for i, fields in enumerate(reader) if any(fields)),
+                 (0, []))
+
+        # Get first not empty column of the first row, or 0
+        first_col = next((i for i, field in enumerate(headers) if field), 0)
+
+        # Strip out potential empty fields in headers
+        headers = headers[first_col:]
+
+        # Return a generator of dictionaries with column labels as keys,
+        # filtering out empty rows
+        for i, fields in enumerate(reader):
+            if i % 500 == 0:
+                print("CSV: parsing row #%s..." % (i+1))
+            if any(fields):
+                yield dict(zip(headers, fields[first_col:]))
```
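A quick check of the detection rule on a toy sample (a standalone re-implementation of the same logic; the sample lines are made up):

```python
import numpy as np

DELIMITERS = ", \t;|:"

def detect_delimiter(lines, sample_size=10):
    sample = lines[:sample_size]
    freqs = {d: [line.count(d) for line in sample] for d in DELIMITERS}
    # keep delimiters appearing the same number of times on every line
    candidates = [(d, np.sum(f)) for d, f in freqs.items()
                  if any(f) and np.std(f) == 0]
    return max(candidates, key=lambda x: x[1])[0] if candidates else None

lines = ["title;year;abstract",
         "A;2001;some text, with commas",
         "B;2002;more text"]
print(detect_delimiter(lines))  # ';' -- comma counts differ across lines, so it loses
```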
gargantext/util/parsers/HAL.py

```diff
@@ -11,25 +11,26 @@ from datetime import datetime
 import json

 class HalParser(Parser):

-    def parse(self, filebuf):
-        '''
-        parse :: FileBuff -> [Hyperdata]
-        '''
-        contents = filebuf.read().decode("UTF-8")
-        data = json.loads(contents)
-        filebuf.close()
-        json_docs = data
+    def _parse(self, json_docs):
         hyperdata_list = []

-        hyperdata_path = { "id"       : "isbn_s"
-                         , "title"    : "title_s"
-                         , "abstract" : "abstract_s"
-                         , "source"   : "journalPublisher_s"
+        hyperdata_path = { "id"       : "docid"
+                         , "title"    : ["en_title_s", "title_s"]
+                         , "abstract" : ["en_abstract_s", "abstract_s"]
+                         , "source"   : "journalTitle_s"
                          , "url"      : "uri_s"
                          , "authors"  : "authFullName_s"
+                         , "isbn_s"   : "isbn_s"
+                         , "issue_s"  : "issue_s"
+                         , "language_s"      : "language_s"
+                         , "doiId_s"  : "doiId_s"
+                         , "authId_i" : "authId_i"
+                         , "instStructId_i"  : "instStructId_i"
+                         , "deptStructId_i"  : "deptStructId_i"
+                         , "labStructId_i"   : "labStructId_i"
+                         , "rteamStructId_i" : "rteamStructId_i"
+                         , "docType_s" : "docType_s"
                          }

         uris = set()
@@ -37,29 +38,32 @@ class HalParser(Parser):
         for doc in json_docs:
             hyperdata = {}

             for key, path in hyperdata_path.items():
-                field = doc.get(path, "NOT FOUND")
-                if isinstance(field, list):
-                    hyperdata[key] = ", ".join(field)
+                # A path can be a field name or a sequence of field names
+                if isinstance(path, (list, tuple)):
+                    # Get first non-empty value of fields in path sequence, or None
+                    field = next((x for x in (doc.get(p) for p in path) if x), None)
                 else:
-                    hyperdata[key] = field
+                    # Get field value
+                    field = doc.get(path)
+
+                if field is None:
+                    field = "NOT FOUND"
+
+                if isinstance(field, list):
+                    hyperdata[key] = ", ".join(map(str, field))
+                else:
+                    hyperdata[key] = str(field)

             if hyperdata["url"] in uris:
                 print("Document already parsed")
             else:
                 uris.add(hyperdata["url"])

-                # hyperdata["authors"] = ", ".join(
-                #             [ p.get("person", {})
-                #                .get("name"  , "")
-                #
-                #               for p in doc.get("hasauthor", [])
-                #             ]
-                #            )
-                #
                 maybeDate = doc.get("submittedDate_s", None)

                 if maybeDate is not None:
                     date = datetime.strptime(maybeDate, "%Y-%m-%d %H:%M:%S")
                 else:
@@ -69,7 +73,17 @@ class HalParser(Parser):
                 hyperdata["publication_year"]  = str(date.year)
                 hyperdata["publication_month"] = str(date.month)
                 hyperdata["publication_day"]   = str(date.day)

                 hyperdata_list.append(hyperdata)

         return hyperdata_list
+
+    def parse(self, filebuf):
+        '''
+        parse :: FileBuff -> [Hyperdata]
+        '''
+        contents = filebuf.read().decode("UTF-8")
+        data = json.loads(contents)
+        return self._parse(data)
```
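A standalone sketch of the new list-valued path lookup, which lets the parser prefer the English title/abstract and fall back to the default one (the sample doc is made up):

```python
doc = {"title_s": ["Un titre"], "en_title_s": None, "docid": 12345}

def lookup(doc, path):
    # Sequence of names: first non-empty value wins; single name: plain get
    if isinstance(path, (list, tuple)):
        return next((x for x in (doc.get(p) for p in path) if x), None)
    return doc.get(path)

print(lookup(doc, ["en_title_s", "title_s"]))  # ['Un titre'] -- fell back to title_s
print(lookup(doc, "docid"))                    # 12345
```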
gargantext/util/toolchain/ngrams_extraction.py

```diff
@@ -81,44 +81,45 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                 corpus.hyperdata["skipped_docs"].append(document.id)
                 corpus.save_hyperdata()
                 continue
-            else:
-                # ready !
-                tagger = tagger_bots[language_iso2]
+
+            # ready !
+            tagger = tagger_bots[language_iso2]

             # to do verify if document has no KEYS to index
             # eg: use set intersect (+ loop becomes direct! with no continue)
             for key in keys:
                 try:
                     value = document.hyperdata[str(key)]
                     if not isinstance(value, str):
                         #print("DBG wrong content in doc for key", key)
                         continue
                     # get ngrams
                     for ngram in tagger.extract(value):
-                        tokens = tuple(normalize_forms(token[0]) for token in ngram)
+                        normal_forms = (normalize_forms(t[0]) for t in ngram)
+                        tokens = tuple(nf for nf in normal_forms if nf)
                         if do_subngrams:
                             # ex tokens = ["very", "cool", "exemple"]
                             #    subterms = [['very', 'cool'],...]
                             subterms = subsequences(tokens)
                         else:
                             subterms = [tokens]

                         for seqterm in subterms:
                             ngram = ' '.join(seqterm)
                             nbwords = len(seqterm)
                             nbchars = len(ngram)
                             if nbchars > 1:
                                 if nbchars > 255:
                                     # max ngram length (DB constraint)
                                     ngram = ngram[:255]
                                 # doc <=> ngram index
                                 nodes_ngrams_count[(document.id, ngram)] += 1
                                 # add fields : terms n
                                 ngrams_data.add((ngram, nbwords,))
                 except:
                     #value not in doc
                     continue

         # integrate ngrams and nodes-ngrams
         if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
```
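The rewritten token line drops forms that normalize to the empty string, instead of letting them slip into the ngram. A standalone sketch (the normalize_forms stand-in is hypothetical; the real one lives in this module):

```python
def normalize_forms(form):
    # hypothetical stand-in: strip punctuation-only forms down to ''
    return form.strip(".,;:!?")

ngram = [("very", "ADV"), ("...", "PUNCT"), ("cool", "ADJ")]

normal_forms = (normalize_forms(t[0]) for t in ngram)
tokens = tuple(nf for nf in normal_forms if nf)
print(tokens)  # ('very', 'cool') -- empty normalizations are skipped
```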
gargantext/views/api/ngramlists.py

```diff
@@ -155,7 +155,12 @@ class CSVLists(APIView):
             try:
                 # merge the source_lists onto those of the target corpus
                 delete = todo_lists if bool(params.get('overwrite')) else []
+
+                if len(delete) == len(list_types):
+                    delete.append('groupings')
+
                 log_msg = merge_ngramlists(source_lists, onto_corpus=corpus_node, del_originals=delete)
+
                 return JsonHttpResponse({
                     'log': log_msg,
                     }, 200)
```
gargantext/views/api/urls.py

```diff
 from django.conf.urls import url
+from rest_framework_jwt.views import obtain_jwt_token

 from . import nodes
 from . import projects
 from . import corpora
@@ -10,78 +12,81 @@ from . import ngramlists
 from . import analytics
 from graph.rest import Graph
```

The body of `urlpatterns` is realigned in this hunk: closing parentheses are tightened from `()  )` to `())`, section comments are normalized from `#Projects` style to `# Projects`, annotations such as `#?view=resource` and `#?view=docs` are added next to the project routes, and the commented-out Sources routes are kept as comments. The existing routes (`^nodes$`, `^nodes/(\d+)$`, `having`, `status`, projects, corpora, facets, favorites, metrics, explorer, `^ngrams/?$`, histories, `hyperdata$`, the ngramlists export/import/change/groups/family/maplist routes and `^user/parameters/$`) are otherwise unchanged. The one functional addition is a JWT route at the end of the list:

```diff
               , url(r'^user/parameters/$', users.UserParameters.as_view())
+
+              , url('^auth/token$', obtain_jwt_token)
               ]
```
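With djangorestframework-jwt wired in, a client can obtain a token from the new route and reuse it. A sketch (the host, mount point, username and password are made up; `obtain_jwt_token` returning `{"token": ...}` and the `JWT` header prefix are that library's defaults):

```python
import requests

base = "http://localhost:8000/api"  # assumed mount point of these urlpatterns

# obtain_jwt_token expects credentials and returns {"token": "..."}
resp = requests.post(base + "/auth/token",
                     data={"username": "alice", "password": "secret"})
token = resp.json()["token"]

# subsequent API calls carry the token in the Authorization header
nodes = requests.get(base + "/nodes",
                     headers={"Authorization": "JWT " + token})
print(nodes.status_code)
```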
install/gargamelle/requirements.txt

```diff
@@ -11,6 +11,7 @@ django-celery==3.2.1
 django-pgfields==1.4.4
 django-pgjsonb==0.0.23
 djangorestframework==3.5.3
+djangorestframework-jwt==1.9.0
 html5lib==0.9999999
 python-igraph>=0.7.1
 jdatetime==1.7.2
```
install/notebook.run

```diff
@@ -16,7 +16,7 @@ sudo docker run \
     --env POSTGRES_HOST=localhost \
     -v /srv/gargantext:/srv/gargantext \
     -it garg-notebook:latest \
-    /bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /srv/gargantext/ && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser'"
+    /bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /home/notebooks && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser'"

 #    #&& jupyter nbextension enable --py widgetsnbextension --sys-prefix
 #/bin/bash -c "/bin/su notebooks -c 'source /env_3-5/bin/activate && cd /srv/gargantext/ && jupyter notebook --port=8899 --ip=0.0.0.0 --no-browser --notebook-dir=/home/notebooks/'"
```
install/notebook/Dockerfile

```diff
@@ -78,32 +78,8 @@ RUN . /env_3-5/bin/activate && pip3 install -r requirements.txt
 #RUN ./psql_configure.sh
 #RUN ./django_configure.sh

 RUN chown notebooks:notebooks -R /env_3-5

-########################################################################
-### Notebook IHaskell and IPYTHON ENVIRONNEMENT
-########################################################################
-#RUN apt-get update && apt-get install -y \
-#    libtinfo-dev \
-#    libzmq3-dev \
-#    libcairo2-dev \
-#    libpango1.0-dev \
-#    libmagic-dev \
-#    libblas-dev \
-#    liblapack-dev
-#RUN curl -sSL https://get.haskellstack.org/ | sh
-#RUN stack setup
-#RUN git clone https://github.com/gibiansky/IHaskell
-#RUN . /env_3-5/bin/activate \
-#    && cd IHaskell \
-#    && stack install gtk2hs-buildtools \
-#    && stack install --fast \
-#    && /root/.local/bin/ihaskell install --stack
-#
-#
 ########################################################################
 ### POSTGRESQL DATA (as ROOT)
 ########################################################################
@@ -115,3 +91,32 @@ RUN chown notebooks:notebooks -R /env_3-5
 EXPOSE 5432 8899

 VOLUME ["/srv/","/home/notebooks/"]
+
+########################################################################
+### Notebook IHaskell and IPYTHON ENVIRONNEMENT
+########################################################################
+RUN apt-get update && apt-get install -y \
+    libtinfo-dev \
+    libzmq3-dev \
+    libcairo2-dev \
+    libpango1.0-dev \
+    libmagic-dev \
+    libblas-dev \
+    liblapack-dev
+
+USER notebooks
+
+RUN cd /home/notebooks \
+    && curl -sSL https://get.haskellstack.org/ | sh \
+    && stack setup \
+    && git clone https://github.com/gibiansky/IHaskell \
+    && . /env_3-5/bin/activate \
+    && cd IHaskell \
+    && stack install gtk2hs-buildtools \
+    && stack install --fast \
+    && /root/.local/bin/ihaskell install --stack
```
install/notebook/gargantext_notebook.py
View file @
fe23f25f
+#!/usr/bin/env python
 """
 Gargantext Software Copyright (c) 2016-2017 CNRS ISC-PIF -
 http://iscpif.fr
...
@@ -6,45 +7,33 @@ http://gitlab.iscpif.fr/humanities/gargantext/blob/stable/LICENSE )
 - In France : a CECILL variant affero compliant
 - GNU aGPLV3 for all other countries
 """
-#!/usr/bin/env python
-import sys
 import os
+import django
 
+# Django settings
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
+django.setup()
-dirname = os.path.dirname(os.path.realpath(__file__))
-os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gargantext.settings")
-
-# initialize Django application
-from django.core.wsgi import get_wsgi_application
-application = get_wsgi_application()
-
-from gargantext.util.toolchain.main import parse_extract_indexhyperdata
-from gargantext.util.db import *
-from gargantext.models import Node
-from gargantext.models import *
-from nltk.tokenize import wordpunct_tokenize
-from nltk.tokenize import word_tokenize
-import nltk as nltk
-from statistics import mean
-from math import log
-from collections import defaultdict
-import matplotlib.pyplot as plt
-import numpy as np
-import datetime
-from collections import Counter
-from langdetect import detect as detect_lang
+from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
+from gargantext.models import ProjectNode, DocumentNode
+from gargantext.util.db import session, get_engine
+from collections import Counter
+import importlib
+from django.http import Http404
 
+# Import those to be available by notebook user
+from langdetect import detect as detect_lang
+from gargantext.models import UserNode, User
 
+class NotebookError(Exception):
+    pass
 
 def documents(corpus_id):
-    return (session.query(Node).filter(Node.parent_id == corpus_id
-                                       , Node.typename == "DOCUMENT")
-            #.order_by(Node.hyperdata['publication_date'])
-            .all())
+    return (session.query(DocumentNode).filter_by(parent_id=corpus_id)
+            # .order_by(Node.hyperdata['publication_date'])
+            .all()
+            )
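For orientation, a tiny hedged sketch of the reworked helper in use; the corpus id and the 'title' hyperdata key are illustrative, not part of the commit:

    docs = documents(corpus_id=123)          # DocumentNode rows of one corpus
    for doc in docs[:3]:
        print(doc.id, doc.hyperdata.get('title'))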
 #import seaborn as sns
...
@@ -63,13 +52,134 @@ def scan_hal(request):
     hal = HalCrawler()
     return hal.scan_results(request)
 
 def scan_gargantext(corpus_id, lang, request):
     connection = get_engine().connect()
     # TODO add some sugar to the request (ideally request should be the same for hal and garg)
     query = """select count(n.id) from nodes n
                where to_tsvector('%s', hyperdata ->> 'abstract' || 'title')
                @@ to_tsquery('%s')
                AND n.parent_id = %s;""" % (lang, request, corpus_id)
     return [i for i in connection.execute(query)][0][0]
     connection.close()
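Two caveats in the committed function: the values are interpolated straight into the SQL string, and connection.close() sits after the return, so it never runs. A hedged rework (the function name and the use of bound parameters are ours, not the commit's; psycopg2/SQLAlchemy accept %s placeholders with a parameter tuple):

    def scan_gargantext_safe(corpus_id, lang, request):
        # Same count query, with driver-side parameter binding and a
        # close() that actually executes (try/finally instead of dead code).
        connection = get_engine().connect()
        try:
            query = """SELECT count(n.id) FROM nodes n
                       WHERE to_tsvector(%s, hyperdata ->> 'abstract' || 'title')
                             @@ to_tsquery(%s)
                         AND n.parent_id = %s;"""
            return connection.execute(query, (lang, request, corpus_id)).scalar()
        finally:
            connection.close()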
+
+def myProject_fromUrl(url):
+    """
+    myProject :: String -> Project
+    """
+    project_id = url.split("/")[4]
+    project = session.query(ProjectNode).get(project_id)
+    return project
+
+def newCorpus(project, source, name=None, query=None):
+    error = False
+
+    if name is None:
+        name = query
+
+    if not isinstance(project, ProjectNode):
+        error = "a valid project"
+    if not isinstance(source, int) and not isinstance(source, str):
+        error = "a valid source identifier: id or name"
+    elif not isinstance(query, str):
+        error = "a valid query"
+    elif not isinstance(name, str):
+        error = "a valid name"
+
+    if error:
+        raise NotebookError("Please provide %s." % error)
+
+    resource = get_resource(source) if isinstance(source, int) else \
+               get_resource_by_name(source)
+    moissonneur_name = get_moissonneur_name(resource) if resource else \
+                       source.lower()
+
+    try:
+        moissonneur = get_moissonneur(moissonneur_name)
+    except ImportError:
+        raise NotebookError("Invalid source identifier: %r" % source)
+
+    return run_moissonneur(moissonneur, project, name, query)
+
+def get_moissonneur_name(ident):
+    """ Return moissonneur module name from RESOURCETYPE or crawler name """
+    # Does it quack like a RESOURCETYPE?
+    if hasattr(ident, 'get'):
+        ident = ident.get('crawler')
+    # Extract name from crawler class name, otherwise assume ident is already
+    # a moissonneur name.
+    if isinstance(ident, str) and ident.endswith('Crawler'):
+        return ident[:-len('Crawler')].lower()
+
+def get_moissonneur(name):
+    """ Return moissonneur module from its name """
+    if not isinstance(name, str) or not name.islower():
+        raise NotebookError("Invalid moissonneur name: %r" % name)
+    module = importlib.import_module('moissonneurs.%s' % name)
+    module.name = name
+    return module
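The two helpers above resolve a source to a crawler module. A small hedged sketch of the round trip; the dict stands in for a RESOURCETYPE entry and assumes moissonneurs.hal is importable:

    resource = {'crawler': 'HalCrawler'}    # stand-in RESOURCETYPE-like dict
    name = get_moissonneur_name(resource)   # 'HalCrawler' -> 'hal'
    hal = get_moissonneur(name)             # imports moissonneurs.hal
    print(hal.name)                         # 'hal', tagged on by get_moissonneur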
+
+def run_moissonneur(moissonneur, project, name, query):
+    """ Run moissonneur and return resulting corpus """
+    # XXX Uber-kludge with gory details. Spaghetti rulezzzzz!
+    class Dummy(object):
+        pass
+
+    request = Dummy()
+    request.method = 'POST'
+    request.path = 'nowhere'
+    request.META = {}
+    # XXX 'string' only has an effect on moissonneurs.pubmed; its value is
+    # added when processing the request client-side, take a deep breath and
+    # see templates/projects/project.html for more details.
+    request.POST = {'string': name,
+                    'query': query,
+                    'N': QUERY_SIZE_N_MAX}
+    request.user = Dummy()
+    request.user.id = project.user_id
+    request.user.is_authenticated = lambda: True
+
+    if moissonneur.name == 'istex':
+        # Replace ALL spaces by plus signs
+        request.POST['query'] = '+'.join(filter(None, query.split(' ')))
+
+    try:
+        import json
+        r = moissonneur.query(request)
+        raw_json = r.content.decode('utf-8')
+        data = json.loads(raw_json)
+
+        if moissonneur.name == 'pubmed':
+            count = sum(x['count'] for x in data)
+            request.POST['query'] = raw_json
+        elif moissonneur.name == 'istex':
+            count = data.get('total', 0)
+        else:
+            count = data.get('results_nb', 0)
+
+        if count > 0:
+            corpus = moissonneur.save(request, project.id, return_corpus=True)
+        else:
+            return None
+    except (ValueError, Http404) as e:
+        raise e
+
+    # Sometimes strange things happen...
+    if corpus.name != name:
+        corpus.name = name
+        session.commit()
+
+    return corpus
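Together these helpers give notebooks a small scripting API around the web moissonneurs. A minimal hedged usage sketch; the URL, project id and query are illustrative:

    project = myProject_fromUrl("http://localhost:8000/projects/42/")
    corpus = newCorpus(project, "hal", name="complexity", query="complex systems")
    if corpus is None:
        print("query returned no results")
    else:
        print(corpus.name, "->", len(documents(corpus.id)), "documents")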
moissonneurs/cern.py
...
@@ -30,7 +30,7 @@ def query( request):
     #ids = crawlerbot.get_ids(query)
     return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
 
-def save(request, project_id):
+def save(request, project_id, return_corpus=False):
     '''save'''
     if request.method == "POST":
...
@@ -101,6 +101,9 @@ def save(request, project_id):
             session.rollback()
         # --------------------------------------------
 
+    if return_corpus:
+        return corpus
+
     return render(
         template_name = 'pages/projects/wait.html',
         request = request,
...
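The same return_corpus flag is threaded through every moissonneur below (hal, isidore, istex, multivac, pubmed). A short sketch of the two calling modes it enables, with request as built by run_moissonneur above:

    response = save(request, project_id)                    # web path: renders wait.html
    corpus = save(request, project_id, return_corpus=True)  # notebook path: corpus node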
moissonneurs/hal.py
...
@@ -33,7 +33,7 @@ def query( request):
     print(results)
     return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
 
-def save(request, project_id):
+def save(request, project_id, return_corpus=False):
     '''save'''
     if request.method == "POST":
...
@@ -103,6 +103,9 @@ def save(request, project_id):
             session.rollback()
         # --------------------------------------------
 
+    if return_corpus:
+        return corpus
+
     return render(
         template_name = 'pages/projects/wait.html',
         request = request,
...
moissonneurs/isidore.py
...
@@ -29,7 +29,7 @@ def query( request):
     #ids = crawlerbot.get_ids(query)
     return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
 
-def save(request, project_id):
+def save(request, project_id, return_corpus=False):
     '''save'''
     if request.method == "POST":
...
@@ -100,6 +100,9 @@ def save(request, project_id):
             session.rollback()
         # --------------------------------------------
 
+    if return_corpus:
+        return corpus
+
     return render(
         template_name = 'pages/projects/wait.html',
         request = request,
...
moissonneurs/istex.py
...
@@ -52,7 +52,7 @@ def query( request ):
-def save(request , project_id):
+def save(request , project_id, return_corpus=False):
     print("testISTEX:")
     print(request.method)
     alist = ["bar", "foo"]
...
@@ -171,6 +171,9 @@ def save(request , project_id):
             session.rollback()
         # --------------------------------------------
 
+    if return_corpus:
+        return corpus
+
     return render(
         template_name = 'pages/projects/wait.html',
         request = request,
...
moissonneurs/multivac.py
...
@@ -33,7 +33,7 @@ def query( request):
     print(results)
     return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
 
-def save(request, project_id):
+def save(request, project_id, return_corpus=False):
     '''save'''
     if request.method == "POST":
...
@@ -104,6 +104,9 @@ def save(request, project_id):
             session.rollback()
         # --------------------------------------------
 
+    if return_corpus:
+        return corpus
+
     return render(
         template_name = 'pages/projects/wait.html',
         request = request,
...
moissonneurs/pubmed.py
...
@@ -69,7 +69,7 @@ def query( request ):
     return JsonHttpResponse(data)
 
-def save( request , project_id ) :
+def save( request , project_id , return_corpus=False ) :
     # implicit global session
     # do we have a valid project id?
     try:
...
@@ -164,6 +164,10 @@ def save( request , project_id ) :
             session.rollback()
         # --------------------------------------------
     sleep(1)
 
+    if return_corpus:
+        return corpus
+
     return HttpResponseRedirect('/projects/' + str(project_id))
 
     data = alist
...
AdvancedTutorial.ipynb → notebooks/.ipynb_checkpoints/AdvancedTutorial-checkpoint.ipynb
...
@@ -2,11 +2,38 @@
"cells": [
"cells": [
{
{
"cell_type": "markdown",
"cell_type": "markdown",
"metadata": {},
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"source": [
"# Advanced Gargantext Tutorial (Python)"
"# Advanced Gargantext Tutorial (Python)"
]
]
},
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "TypeError",
"evalue": "'list' object is not callable",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-3-a8e3501c9a54>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'/srv/gargantext'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m: 'list' object is not callable"
]
}
],
"source": [
"import sys\n",
"sys.pa"
]
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 1,
"execution_count": 1,
...
@@ -28,7 +55,9 @@
...
@@ -28,7 +55,9 @@
"cell_type": "code",
"cell_type": "code",
"execution_count": 8,
"execution_count": 8,
"metadata": {
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
},
"outputs": [
"outputs": [
{
{
...
...
notebooks/AdvancedTutorial.ipynb
0 → 100644 (new file; diff collapsed, not shown)
notebooks/gargantext_core_tutorial.ipynb
0 → 100644 (new file; diff collapsed, not shown)
templates/pages/projects/overview.html
...
@@ -203,6 +203,7 @@
      // do something…
      resetStatusForm("#createForm");
     })
+    return false;
   })
...
templates/pages/projects/project.html
...
@@ -57,7 +57,7 @@
<center
id=
"corpus"
class=
"help"
>
<center
id=
"corpus"
class=
"help"
>
<a
data-toggle=
"modal"
href=
"#addcorpus"
>
<a
data-toggle=
"modal"
href=
"#addcorpus"
>
<button
<button
type=
"button"
type=
"button"
...
@@ -440,11 +440,12 @@
...
@@ -440,11 +440,12 @@
// in the form "Add a corpus"
// in the form "Add a corpus"
var
type
=
$
(
"#id_type"
).
val
()
var
type
=
$
(
"#id_type"
).
val
()
var
file
=
$
(
"#id_file"
).
val
()
// 5 booleans
// 5 booleans
var
nameField
=
$
(
"#id_name"
).
val
()
!=
""
var
nameField
=
$
(
"#id_name"
).
val
()
!=
""
var
typeField
=
(
type
!=
""
)
&&
(
type
!=
"0"
)
var
typeField
=
(
type
!=
""
)
&&
(
type
!=
"0"
)
var
fileField
=
$
(
"#id_file"
).
val
()
!=
""
var
fileField
=
file
!=
""
var
wantfileField
=
$
(
"#file_yes"
).
prop
(
"checked"
)
var
wantfileField
=
$
(
"#file_yes"
).
prop
(
"checked"
)
var
crawling
=
((
type
==
3
)
||
(
type
==
8
)
||
(
type
==
9
))
&&
!
wantfileField
var
crawling
=
((
type
==
3
)
||
(
type
==
8
)
||
(
type
==
9
))
&&
!
wantfileField
...
@@ -457,6 +458,23 @@
...
@@ -457,6 +458,23 @@
     if (!crawling) {
         $("#submit_thing").prop('disabled', !(nameField && typeField && fileField))
     }
+
+    // Automatically select CSV when type is undefined
+    // and we have a .csv file
+    if (!typeField && file && file.match(/.csv$/i)) {
+        // Get CSV type id
+        var csv = $('#id_type > option')
+            .filter(function () { return $(this).text() === 'CSV' })
+            .attr('value')
+
+        // Select CSV type
+        $('#id_type').val(csv)
+
+        // Focus on name field
+        setTimeout(function () { $("#id_name").focus() })
+    }
 }
 
 function bringDaNoise() {
...
@@ -532,7 +550,7 @@
             $("#submit_thing").html("Process a {{ query_size }} sample!")
 
             thequeries = data
-            var N = 0, k = 0;
+            var N = 0;
             for (var i in thequeries) N += thequeries[i].count
             if (N > 0) {
...
@@ -571,12 +589,11 @@
             $("#submit_thing").html("Process a {{ query_size }} sample!")
 
             thequeries = data
-            var N = data.length, k = 0;
+            var N = data.total;
+            // for(var i in thequeries) N += thequeries[i].count
-            if (N > 1) {
-                var total = JSON.parse(data).total
-                console.log("N: " + total)
-                $("#theresults").html("<i> <b>" + pubmedquery + "</b>: " + total + " publications.</i><br>")
+            if (N > 0) {
+                console.log("N: " + N)
+                $("#theresults").html("<i> <b>" + pubmedquery + "</b>: " + N + " publications.</i><br>")
                 $('#submit_thing').prop('disabled', false);
             } else {
                 $("#theresults").html("<i> <b>" + data[0] + "</b></i><br>")
...
@@ -661,7 +678,7 @@
.
log
(
data
)
console
.
log
(
data
)
console
.
log
(
"SUCCESS"
)
console
.
log
(
"SUCCESS"
)
console
.
log
(
"enabling "
+
"#"
+
value
.
id
)
console
.
log
(
"enabling "
+
"#"
+
value
.
id
)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$
(
"#submit_thing"
).
prop
(
'disabled'
,
false
)
$
(
"#submit_thing"
).
prop
(
'disabled'
,
false
)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
//$("#submit_thing").html("Process a {{ query_size }} sample!")
...
@@ -721,7 +738,7 @@
...
@@ -721,7 +738,7 @@
             console.log(data)
             console.log("SUCCESS")
             console.log("enabling " + "#" + value.id)
             // $("#"+value.id).attr('onclick','getGlobalResults(this);');
             $("#submit_thing").prop('disabled', false)
             //$("#submit_thing").html("Process a {{ query_size }} sample!")
...
@@ -781,7 +798,7 @@
             console.log(data)
             console.log("SUCCESS")
             console.log("enabling " + "#" + value.id)
             // $("#"+value.id).attr('onclick','getGlobalResults(this);');
             $("#submit_thing").prop('disabled', false)
             //$("#submit_thing").html("Process a {{ query_size }} sample!")
...
@@ -876,12 +893,12 @@
         console.log("selected:", selectedId);
         // by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN
         if (selectedId == "3"
             || selectedId == "8"
             || selectedId == "9"
             || selectedId == "10"
             || selectedId == "11"
             || selectedId == "12"
            ) {
             console.log("show the button for: " + selectedId)
             $("#div-fileornot").css("visibility", "visible");
...
@@ -1019,16 +1036,16 @@
 function saveMultivac(query, N) {
     console.log("In Multivac")
     if (!query || query == "") return;
     console.log(query)
     //var origQuery = query
     var data = {"query": query, "N": N};
     // Replace all the slashes
     var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
     console.log(data)
     $.ajax({
         dataType: 'json',
...
@@ -1066,16 +1083,16 @@
 function save(query, N, urlGarg) {
     console.log("In Gargantext")
     if (!query || query == "") return;
     console.log(query)
     //var origQuery = query
     var data = {"query": query, "N": N};
     // Replace all the slashes
     var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
     console.log(data)
     $.ajax({
         dataType: 'json',
...