Commit 0bd9237e authored by delanoe

[STABLE] Update from current unstable.

parents 515aa18b 9087e23c
......@@ -28,6 +28,5 @@ see [install procedure](install.md)
2. Create a new branch <username>-refactoring
3. Run the gargantext-box
4. Code
5. Test
6. Commit
......@@ -26,7 +26,7 @@ git clone ssh://gitolite@delanoe.org:1979/gargantext /srv/gargantext \
## Install
```bash
# go into the directory
user@computer: cd /srv/gargantext/
#git inside installation folder
......@@ -34,20 +34,31 @@ git clone ssh://gitolite@delanoe.org:1979/gargantext /srv/gargantext \
#execute the installation
user@computer: ./install
```
During installation an admin account for Gargantext is created: you will be asked for a username and a password.
Remember them to access the Gargantext platform.
The prompts look like this:
```bash
Username (leave blank to use 'gargantua'):
#email is not mandatory
Email address:
Password:
Password (again):
```
If this step completes successfully, you should see:
```bash
Superuser created successfully.
[ ok ] Stopping PostgreSQL 9.5 database server: main.
```
## Run
Once the installation is done, the Gargantext platform will be available at localhost:8000.
Start it by running the start executable:
```bash
# go into the directory
user@computer: cd /srv/gargantext/
#run the start command
user@computer: ./start
#type ctrl+d or simply type exit to quit
```
Then open up a chromium browser and go to localhost:8000
......@@ -55,7 +66,3 @@ Click on "Enter Gargantext"
Log in with the username and password you created
Enjoy! ;)
# resources
Adding a new source to Gargantext requires declaring the source
inside constants.py first:
```python
RESOURCETYPES= [
{ "type":9, #give a unique type int
"name": 'SCOAP [XML]', #resource name as proposed into the add corpus FORM [generic format]
"parser": "CernParser", #name of the new parser class inside a CERN.py file (set to None if not implemented)
"format": 'MARC21', #specific format
'file_formats':["zip","xml"],# accepted file format
"crawler": "CernCrawler", #name of the new crawler class inside a CERN.py file (set to None if no Crawler implemented)
'default_languages': ['en', 'fr'], #supported defaut languages of the source
},
...
]
```
## adding a new parser
Once you have declared your new parser inside constants.py,
add your new parser file into /srv/gargantext/util/parsers/
following this naming convention:
* Filename must be in uppercase, without the Parser suffix,
e.g. MailParser => MAIL.py
* Inside this file, the Parser class must be named exactly as declared in the parser field of constants.py
* Your new parser shall inherit from the base class Parser and provide a parse(filebuffer) method
(a hypothetical constants.py entry is sketched after the example below)
```python
#!/usr/bin/python3 env
# filename: /srv/gargantext/util/parsers/MAIL.py
from ._Parser import Parser

class MailParser(Parser):
    def parse(self, file):
        ...
```
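For reference, the matching declaration to append to RESOURCETYPES in constants.py could look like the sketch below; the type id, name and file formats are hypothetical and must be adapted to your source:
```python
# hypothetical RESOURCETYPES entry for the MailParser example above
{ "type": 11,                        # next unused integer id
  "name": 'Mail [MBOX]',             # label shown in the add-corpus form
  "format": 'MBOX',                  # specific format
  "parser": "MailParser",            # class defined in util/parsers/MAIL.py
  "crawler": None,                   # set to a class name if a crawler exists
  'file_formats': ["zip", "mbox"],   # accepted file formats
  'default_languages': ['en', 'fr'], # supported default languages
},
```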
## adding a new crawler
Once you have declared your new crawler inside constants.py,
add your new crawler file into /srv/gargantext/util/crawlers/
following this naming convention:
* Filename must be in uppercase, without the Crawler suffix,
e.g. MailCrawler => MAIL.py
* Inside this file, the Crawler class must be named exactly as declared in the crawler field of constants.py
* Your new crawler shall inherit from the base class Crawler and provide three methods:
    * scan_results => ids
    * sample => yes/no
    * fetch
(a sketch of how the base class drives these methods follows the example below)
```python
#!/usr/bin/python3 env
# filename: /srv/gargantext/util/crawlers/MAIL.py
from ._Crawler import Crawler

class MailCrawler(Crawler):
    def scan_results(self, query):
        ...
        self.ids = set()

    def sample(self, results_nb):
        ...

    def fetch(self, ids):
        ...
```
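For context, the base class Crawler (shown later in this commit, in _Crawler.py) ties these three methods together roughly as in the simplified sketch below; it illustrates the call flow, not the exact implementation:
```python
# simplified sketch of the call flow in the base Crawler class
class Crawler:
    def fetch(self):
        # scan_results() has already stored self.results_nb / self.ids for the query,
        # and sample() may have restricted the download to a subset of them
        if self.download():        # the subclass writes the raw results to self.path
            self.create_corpus()   # the base class builds the CORPUS node from self.path
            return self.corpus_id
```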
# WARNING: to ensure consistency and backward compatibility, lists should keep the
# initial order (i.e., new elements should be appended at the end of the lists)
import importlib
from gargantext.util.lists import *
from gargantext.util.tools import datetime, convert_to_date
import re
# types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
LISTTYPES = {
'DOCUMENT' : WeightedList,
'GROUPLIST' : Translations, # todo remove "LIST" from name
......@@ -19,16 +20,16 @@ LISTTYPES = {
'TFIDF-CORPUS' : WeightedIndex,
'TFIDF-GLOBAL' : WeightedIndex,
'TIRANK-LOCAL' : WeightedIndex, # could be WeightedList
'TIRANK-GLOBAL' : WeightedIndex, # could be WeightedList
}
# 'OWNLIST' : UnweightedList, # £TODO use this for any term-level tags
NODETYPES = [
# TODO separate id not array index, read by models.node
None,
None, # 0
# documents hierarchy
'USER', # 1
'PROJECT', # 2
#RESOURCE should be here but last
'CORPUS', # 3
'DOCUMENT', # 4
# lists
......@@ -45,11 +46,12 @@ NODETYPES = [
'TFIDF-GLOBAL', # 14
# docs subset
'FAVORITES', # 15
# more scores (sorry!)
'TIRANK-LOCAL', # 16
'TIRANK-GLOBAL', # 17
'GENCLUSION', # 18
'RESOURCE', # 19
]
INDEXED_HYPERDATA = {
......@@ -114,114 +116,178 @@ INDEXED_HYPERDATA = {
}
# resources ---------------------------------------------
def get_resource(sourcetype):
'''resource :: type => resource dict'''
for n in RESOURCETYPES:
if int(n["type"]) == int(sourcetype):
return n
return None
def get_resource_by_name(sourcename):
'''resource :: name => resource dict'''
for n in RESOURCETYPES:
if str(n["name"]) == str(sourcename):
return n
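# usage (illustrative): get_resource(3)["name"] == 'Pubmed [XML]'
#                       get_resource_by_name('Pubmed [XML]')["type"] == 3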
# taggers -----------------------------------------------
def get_tagger(lang):
'''
lang => default language => Tagger
#from gargantext.util.taggers import FrenchMeltTagger, TurboTagger
from gargantext.util.taggers import NltkTagger
LANGUAGES = {
'en': {
#'tagger': EnglishMeltTagger,
#'tagger': TurboTagger,
'tagger': NltkTagger,
},
'fr': {
#'tagger': FrenchMeltTagger,
#'tagger': TreeTagger,
'tagger': NltkTagger,
},
}
'''
name = LANGUAGES[lang]["tagger"]
module = "gargantext.util.taggers.%s" %(name)
module = importlib.import_module(module, "")
tagger = getattr(module, name)
return tagger()
from gargantext.util.parsers import \
EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser, RepecParser
def resourcetype(name):
'''
resourcetype :: String -> Int
Usage : resourcetype("Europress (English)") == 1
Examples in scrapers scripts (Pubmed or ISTex for instance).
'''
return [n[0] for n in enumerate(r['name'] for r in RESOURCETYPES) if n[1] == name][0]
def resourcename(corpus):
'''
resourcetype :: Corpus -> String
Usage : resourcename(corpus) == "ISTex"
'''
resource = corpus.resources()[0]
resourcename = RESOURCETYPES[resource['type']]['name']
return re.sub(r'\(.*', '', resourcename)
RESOURCETYPES = [
# type 0
{ 'name': 'Select database below',
'parser': None,
'default_language': None,
},
# type 1
{ 'name': 'Europress (English)',
'parser': EuropressParser,
'default_language': 'en',
{ "type":1,
'name': 'Europress',
'format': 'Europress',
'parser': "EuropressParser",
'file_formats':["zip"],
'crawler': None,
'default_languages': ['en', 'fr'],
},
# type 2
{ 'name': 'Europress (French)',
'parser': EuropressParser,
'default_language': 'fr',
{ 'type': 2,
'name': 'Jstor [RIS]',
'format': 'RIS',
'parser': "RISParser",
'file_formats':["zip"],
'crawler': None,
'default_languages': ['en'],
},
# type 3
{ 'name': 'Jstor (RIS format)',
'parser': RISParser,
'default_language': 'en',
{ 'type': 3,
'name': 'Pubmed [XML]',
'format': 'Pubmed',
'parser': "PubmedParser",
'file_formats':["zip", "xml"],
'crawler': "PubmedCrawler",
'default_languages': ['en'],
},
# type 4
{ 'name': 'Pubmed (XML format)',
'parser': PubmedParser,
'default_language': 'en',
{ 'type':4,
'name': 'Scopus [RIS]',
'format': 'RIS',
'parser': "RISParser",
'file_formats':["zip"],
'crawler': None,
'default_languages': ['en'],
},
# type 5
{ 'name': 'Scopus (RIS format)',
'parser': RISParser,
'default_language': 'en',
{ 'type':5,
'name': 'Web of Science [ISI]',
'format': 'ISI',
'parser': "ISIParser",
'file_formats':["zip"],
#'crawler': "ISICrawler",
'crawler': None,
'default_languages': ['en'],
},
# type 6
{ 'name': 'Web of Science (ISI format)',
'parser': ISIParser,
'default_language': 'en',
{ 'type':6,
'name': 'Zotero [RIS]',
'format': 'RIS',
'parser': 'RISParser',
'file_formats':["zip", "ris", "txt"],
'crawler': None,
'default_languages': ['en'],
},
# type 7
{ 'name': 'Zotero (RIS format)',
'parser': RISParser,
'default_language': 'en',
{ 'type':7,
'name': 'CSV',
'format': 'CSV',
'parser': 'CSVParser',
'file_formats':["zip", "csv"],
'crawler': None,
'default_languages': ['en'],
},
# type 8
{ 'name': 'CSV',
'parser': CSVParser,
'default_language': 'en',
{ 'type': 8,
'name': 'ISTex [ISI]',
'format': 'ISI',
'parser': "ISTexParser",
'file_formats':["zip"],
#'crawler': "ISICrawler",
'crawler': None,
'default_languages': ['en', 'fr'],
},
# type 9
{ 'name': 'ISTex',
'parser': ISTexParser,
'default_language': 'en',
{ "type":9,
"name": 'SCOAP [XML]',
"parser": "CernParser",
"format": 'MARC21',
'file_formats':["zip","xml"],
"crawler": "CernCrawler",
'default_languages': ['en'],
},
{ "type":10,
"name": 'REPEC [RIS]',
"parser": "RisParser",
"format": 'RIS',
'file_formats':["zip","ris", "txt"],
"crawler": None,
'default_languages': ['en'],
},
]
#shortcut for resources declaration in template
PARSERS = [(n["type"],n["name"]) for n in RESOURCETYPES if n["parser"] is not None]
CRAWLERS = [(n["type"],n["name"]) for n in RESOURCETYPES if n["crawler"] is not None]
def load_parser(resource):
'''given a resource load the corresponding Parser
resource(dict) > Parser(object)
example with resource ISTexParser
PARSER filename: ISTEX
PARSER object: ISTexParser
'''
filename = resource["parser"].replace("Parser", '').upper()
module = 'gargantext.util.parsers.%s' %(filename)
module = importlib.import_module(module)
return getattr(module, resource["parser"])
def load_crawler(resource):
'''given a resource load the corresponding Crawler
resource(dict) > Crawler(object)
example with resource ISTexCrawler
CRAWLER filename: ISTEX
CRAWLER object: ISTexCrawler
'''
filename = resource["crawler"].replace("Crawler", "").upper()
module = 'gargantext.util.crawlers.%s' %(filename)
module = importlib.import_module(module)
return getattr(module, resource["crawler"])
    # type 10
    { "type":10,
      "name": 'SCOAP (XML MARC21 Format)',
      "parser": CernParser,
      "default_language": "en",
      'accepted_formats':["zip","xml"],
      #~ "scrapper": CernScrapper,
      #~ "base_url": "http://api.scoap3.org/search?",
    },
    # type 11
    { 'name': 'REPEC (RIS format)',
      'parser': RepecParser,
      'default_language': 'en',
    },
]

# Supported languages and taggers ---------------------------------------------
#first declare the tagger using a string
#and it will be imported into gargantext.util.taggers
LANGUAGES = {
    'en': {
        #'tagger': 'EnglishMeltTagger',
        #'tagger': "TurboTagger",
        'tagger': 'NltkTagger',
    },
    'fr': {
        #'tagger': "FrenchMeltTagger",
        #'tagger': 'TreeTagger',
        'tagger': 'NltkTagger',
    },
}

def load_tagger(lang):
    '''
    given a LANG load the corresponding tagger
    lang(str) > Tagger(Object)
    '''
    filename = LANGUAGES[lang]["tagger"]
    module = 'gargantext.util.taggers.%s' %(filename)
    module = importlib.import_module(module)
    return getattr(module, filename)
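# usage (illustrative): load_tagger('en')() instantiates NltkTagger
# (module gargantext.util.taggers.NltkTagger)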
# linguistic extraction parameters ---------------------------------------------
DEFAULT_RANK_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
DEFAULT_RANK_HARD_LIMIT = 5000 # MAINLIST maximum terms abs
......@@ -257,19 +323,24 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# "cool example".
# (all 1 to n-1 length ngrams,
# at indexing after extraction)
# Default INDEXED fields for ngrams extraction
# (put the longest field first to make language detection more efficient)
DEFAULT_INDEX_FIELDS = ('abstract', 'title')
# Grammar rules for chunking
RULE_JJNN = "{<JJ.*>*<NN.*|>+<JJ.*>*}"
RULE_JJDTNN = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
RULE_TINA = "^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)\
+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?\
,){0,2}?(N.?.?,|\?,)+?)+?)*?$"
# ngram lists import/export parameters -----------------------------------------
DEFAULT_CSV_DELIM = '\t' # for import/export CSV defaults
DEFAULT_CSV_DELIM_GROUP = '|&|'
# ------------------------------------------------------------------------------
# other parameters
# default number of docs POSTed to scrappers.views.py
# (at page project > add a corpus > scan/process sample)
QUERY_SIZE_N_DEFAULT = 1000
# Files ----------------------------------------------------------------
import os
from .settings import BASE_DIR
# uploads/.gitignore prevents corpora indexing
......@@ -278,7 +349,7 @@ UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads/corpora')
UPLOAD_LIMIT = 1024 * 1024 * 1024
DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
# Processing -----------------------------------------------------------
# about batch processing...
BATCH_PARSING_SIZE = 256
BATCH_NGRAMSEXTRACTION_SIZE = 3000 # how many distinct ngrams before INTEGRATE
......@@ -305,5 +376,3 @@ graph_constraints = {'corpusMax' : 400
,'corpusMin' : 10
,'mapList' : 50
}
......@@ -14,6 +14,7 @@ djangorestframework==3.3.2
html5lib==0.9999999
jdatetime==1.7.2
kombu==3.0.33
langdetect==1.0.6
lxml==3.5.0
networkx==1.11
nltk==3.1
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** CERN Scraper *****
# ****************************
# Author:c24b
# Date: 27/05/2015
from ._Crawler import Crawler
import hmac, hashlib
import requests
import os
import random
import urllib.parse as uparse
from lxml import etree
from gargantext.settings import API_TOKENS
#from gargantext.util.files import build_corpus_path
from gargantext.util.db import session
from gargantext.models import Node
class CernCrawler(Crawler):
'''CERN SCOAP3 API Interaction'''
def __generate_signature__(self, url):
'''create the signature'''
#hmac-sha1 salted with secret
return hmac.new(self.secret,url, hashlib.sha1).hexdigest()
def __format_query__(self, query, of="xm", fields= None):
''' for query filters params
see doc https://scoap3.org/scoap3-repository/xml-api/
'''
#dict_q = uparse.parse_qs(query)
dict_q = {}
#by default: search by pattern
dict_q["p"] = query
if fields is not None and isinstance(fields, list):
fields = ",".join(fields)
dict_q["f"] = fields
#outputformat: "xm", "xmt", "h", "html"
dict_q["of"]= of
return dict_q
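#e.g. (illustrative) __format_query__("higgs boson") -> {"p": "higgs boson", "of": "xm"}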
def __format_url__(self, dict_q):
'''format the url with encoded query'''
#add the apikey
dict_q["apikey"] = [self.apikey]
params = "&".join([(str(k)+"="+str(uparse.quote(v[0]))) for k,v in sorted(dict_q.items())])
return self.BASE_URL+params
def sign_url(self, dict_q):
'''add signature'''
API = API_TOKENS["CERN"]
self.apikey = API["APIKEY"]
self.secret = API["APISECRET"].encode("utf-8")
self.BASE_URL = u"http://api.scoap3.org/search?"
url = self.__format_url__(dict_q)
return url+"&signature="+self.__generate_signature__(url.encode("utf-8"))
def create_corpus(self):
#create a corpus
corpus = Node(
name = self.query,
#user_id = self.user_id,
parent_id = self.project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : self.type["default_language"]
}
)
#add the resource
corpus.add_resource(
type = self.type["type"],
name = self.type["name"],
path = self.path)
try:
print("PARSING")
# p = eval(self.type["parser"])()
session.add(corpus)
session.commit()
self.corpus_id = corpus.id
parse_extract_indexhyperdata(corpus.id)
return self
except Exception as error:
print('WORKFLOW ERROR')
print(error)
session.rollback()
return self
def download(self):
import time
self.path = "/tmp/results.xml"
query = self.__format_query__(self.query)
url = self.sign_url(query)
start = time.time()
r = requests.get(url, stream=True)
downloaded = False
#the long part
with open(self.path, 'wb') as f:
print("Downloading file")
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
#print("===")
f.write(chunk)
downloaded = True
end = time.time()
#print (">>>>>>>>>>LOAD results", end-start)
return downloaded
def scan_results(self):
'''scan the number of results by fetching a single record
(only the author of page 1 is displayed);
the total count is read from the comment at the top of the page
'''
import time
self.results_nb = 0
query = self.__format_query__(self.query, of="hb")
query["ot"] = "100"
query["jrec"]='1'
query["rg"]='1'
url = self.sign_url(query)
print(url)
#start = time.time()
r = requests.get(url)
#end = time.time()
#print (">>>>>>>>>>LOAD results_nb", end-start)
if r.status_code == 200:
self.results_nb = int(r.text.split("-->")[0].split(': ')[-1][:-1])
return self.results_nb
else:
raise ValueError(r.status_code)
from ._Crawler import *
import json
class ISTexCrawler(Crawler):
"""
ISTEX Crawler
"""
def __format_query__(self,query=None):
'''format the query (spaces replaced by "+"; could use urlquote instead)'''
if query is not None:
query = query.replace(" ","+")
return query
else:
self.query = self.query.replace(" ","+")
return self.query
def scan_results(self):
#get the number of results
self.results_nb = 0
self.query = self.__format_query__()
_url = "http://api.istex.fr/document/?q="+self.query+"&size=0"
#"&output=id,title,abstract,pubdate,corpusName,authors,language"
r = requests.get(_url)
print(r)
if r.status_code == 200:
self.results_nb = int(r.json()["total"])
self.status.append("fetching results")
return self.results_nb
else:
self.status.append("error")
raise ValueError(r.status_code)
def download(self):
'''fetching items'''
downloaded = False
def get_hits(future):
'''here we directly get the result hits'''
response = future.result()
if response.status_code == 200:
return response.json()["hits"]
else:
return None
#session = FuturesSession()
#self.path = "/tmp/results.json"
self.status.append("fetching results")
paging = 100
self.query_max = self.results_nb
if self.query_max > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (self.query_max, QUERY_SIZE_N_MAX)
print("ERROR (scrap: istex d/l ): ",msg)
self.query_max = QUERY_SIZE_N_MAX
#urlreqs = []
with open(self.path, 'wb') as f:
for i in range(0, self.query_max, paging):
url_base = "http://api.istex.fr/document/?q="+self.query+"&output=*&from=%i&size=%i" %(i, paging)
r = requests.get(url_base)
if r.status_code == 200:
downloaded = True
f.write(r.text.encode("utf-8"))
else:
downloaded = False
self.status.insert(0, "error fetching ISTEX "+ r.status)
break
return downloaded
# ****************************
# ***** Medline Scraper *****
# ****************************
# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
# between 9 pm and 5 am Eastern Time weekdays
# from datetime import datetime
from time import sleep
import json
import datetime
from os import path
import threading
from traceback import print_tb
#from gargantext.settings import MEDIA_ROOT, BASE_DIR
from ._Crawler import Crawler
import requests
from lxml import etree
class PubmedCrawler(Crawler):
#self.pubMedEutilsURL =
#self.pubMedDB = 'Pubmed'
#self.reportType = 'medline'
def __format_query__(self, query= None):
if query is not None:
#origQuery = self.query
query = query.replace(' ', '%20')
return query
else:
self.query = self.query.replace(' ', '%20')
return self.query
def get_records_by_year(self):
'''
Calculate the offset results <retmax> for each year by:
- getting the last_n_years result counts by year
- respecting each year's proportion of the global results
- sampling on the MAX_RESULTS basis
as follows:
pub_nb = sum([pub_nb_by_years])
retmax = (pub_nb_year /sum([pub_nb_by_years]))*MAX_RESULTS
'''
_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
stats = {}
for i in range(self.n_last_years):
maxyear = self.YEAR -i
minyear = maxyear-1
#mindate = str(maxyear-1)+"/"+str(self.MONTH)
#maxdate = str(maxyear)+"/"+str(self.MONTH)
#print(mindate,"-",maxdate)
params = { "db":"pubmed",
"term":self.query,
"datetype":"pdat",
"retmax":1,
"usehistory":'y',
'mindate':minyear,
'maxdate':maxyear,
}
r = requests.get(_url, params)
if r.status_code == 200:
data = (r.text).encode("utf-8")
root = etree.XML(data)
findcount = etree.XPath("/eSearchResult/Count/text()")
count = int(findcount(root)[0])
stats[minyear] = count
return stats
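# returns e.g. (illustrative) {2015: 120, 2014: 98, ...}: publication counts keyed by min year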
def sampling(self):
stats = self.get_records_by_year()
_url = 'http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
self.results_nb = sum(list(stats.values()))
if self.results_nb == 0:
self.status.insert(0, "[SAMPLING error] no results found by year")
downloaded = False
return False
self.paths= []
for minyear, count in stats.items():
print(minyear, minyear+1)
path = "/tmp/"+str(minyear-1)+"_results.xml"
maxyear = minyear+1
#mindate = str(maxyear-1)+"/"+self.MONTH
#maxdate = str(maxyear)+"/"+self.MONTH
ratio = (count/self.results_nb)*self.MAX_RESULTS
params = { "email": 'youremail@example.org',
'rettype': 'abstract',
"retstart":0,
'retmax':round(ratio),
"db":"pubmed",
"term": self.query,
#"query_key": self.queryKey,
#"WebEnv": self.webEnv,
"rettype":"abstract",
"datetype":"pdat",
"mindate": str(minyear),
"maxdate": str(maxyear),
"usehistory": 'n',
}
r = requests.get(_url, params, stream=True)
with open(path, 'wb') as f:
print(path)
if r.status_code == 200:
for chunk in r.iter_content(chunk_size=1024):
f.write(chunk)
downloaded = True
self.paths.append(path)
else:
downloaded = False
self.status.insert(0, "error fetching PUBMED "+ str(r))
break
return downloaded
def scan_results(self):
self.__format_query__()
self.base_url = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
self.base_db= 'Pubmed'
self.base_format = 'medline'
self.results_nb = 0
self.webEnv = None
self.results_nb = 0
self.queryKey = None
self.retMax = 1
_url = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
% ( self.base_url, self.base_db, self.query )
r = requests.get(_url)
print(r.url)
if r.status_code == 200:
data = (r.text).encode("utf-8")
root = etree.XML(data)
findcount = etree.XPath("/eSearchResult/Count/text()")
self.results_nb = findcount(root)[0]
findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
self.queryKey = findquerykey(root)[0]
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
self.webEnv = findwebenv(root)[0]
findretmax = etree.XPath("/eSearchResult/RetMax/text()")
self.retMax = findretmax(root)[0]
return self
def download(self):
#print(self.results_nb, self.queryKey, self.webEnv)
"Fetch medline result for query 'query', saving results to file every 'retmax' articles"
paging = 100
self.query = self.query.replace(' ', '') # No space in directory and file names, avoids stupid errors
# print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
print(self.results_nb, self.queryKey, self.webEnv)
if self.results_nb > self.MAX_RESULTS:
#Search results nb over the past N_YEARS
msg = "Invalid sample size N = %i (max = %i)" % (self.results_nb, self.MAX_RESULTS)
#print("ERROR (scrap: istex d/l ): ",msg)
stats = self.sampling()
#print(stats)
#self.query_max = QUERY_SIZE_N_MAX
return True
else:
#retstart = 0
#retmax = 0
_url = 'http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
params = { "email": 'youremail@example.org',
'rettype': 'abstract',
"retstart":0,
'retmax':self.MAX_RESULTS,
"db":"Pubmed",
"query_key": self.queryKey,
"WebEnv": self.webEnv,
"rettype":"abstract",
}
r = requests.get(_url, params, stream=True)
print(r.url)
#print(r.text)
with open(self.path, 'wb') as f:
if r.status_code == 200:
for chunk in r.iter_content(chunk_size=1024):
downloaded = True
f.write(chunk)
else:
downloaded = False
self.status.insert(0, "error fetching PUBMED "+ r.status)
return downloaded
def query( request ):
"""
Pubmed year by year results
# alist = [
# {'string': '2011[dp] serendipity', 'queryKey': '1',
# 'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
# {'string': '2012[dp] serendipity', 'queryKey': '1',
# 'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
# ... ]
(reused as thequeries in query_save)
"""
print(request.method)
alist = []
if request.method == "POST":
query = request.POST["query"]
N = int(request.POST["N"])
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR(scrap: pubmed stats): ",msg)
raise ValueError(msg)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
instancia = Scraper()
# serialFetcher (n_last_years, query, query_size)
alist = instancia.serialFetcher( 5, query , N )
data = alist
return JsonHttpResponse(data)
def save( request , project_id ) :
# implicit global session
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = session.query( Node ).filter(Node.id == project_id).first()
if project is None:
raise Http404()
user = cache.User[request.user.id]
if not user.owns(project):
raise HttpResponseForbidden()
if request.method == "POST":
queries = request.POST["query"]
name = request.POST["string"]
# here we just realize queries already prepared by getGlobalStats
# ===> no need to repeat N parameter like in testISTEX <===
instancia = Scraper()
thequeries = json.loads(queries)
# fyi the sum of our prepared yearly proportional quotas
sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
print("Scrapping Pubmed: '%s' (N=%i)" % (name,sampled_sum))
urlreqs = []
for yearquery in thequeries:
urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
alist = ["tudo fixe" , "tudo bem"]
# corpus node instanciation as a Django model
corpus = project.add_child( name=name
, typename = "CORPUS"
)
# """
# urlreqs: List of urls to query.
# - Then, to each url in urlreqs you do:
# eFetchResult = urlopen(url)
# eFetchResult.read() # this will output the XML... normally you write this to a XML-file.
# """
tasks = Scraper()
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
tasks.q.put( url ) #put a task in the queue
tasks.q.join() # wait until everything is finished
dwnldsOK = 0
for filename in tasks.firstResults :
print(filename)
if filename != False:
# add the uploaded resource to the corpus
corpus.add_resource( type = resourcetype('Pubmed (XML format)')
, path = filename
, url = None
)
print("Adding the resource")
dwnldsOK+=1
session.add(corpus)
session.commit()
corpus_id = corpus.id
if dwnldsOK == 0 :
return JsonHttpResponse(["fail"])
try:
scheduled(parse_extract_indexhyperdata)(corpus_id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
sleep(1)
return HttpResponseRedirect('/projects/' + str(project_id))
data = alist
return JsonHttpResponse(data)
# Scrapers config
QUERY_SIZE_N_MAX = 1000
from gargantext.constants import get_resource
from gargantext.util.scheduling import scheduled
from gargantext.util.db import session
from requests_futures.sessions import FuturesSession
from gargantext.util.db import session
import requests
from gargantext.models.nodes import Node
#from gargantext.util.toolchain import parse_extract_indexhyperdata
from datetime import date
class Crawler:
"""Base class for performing search and add corpus file depending on the type
"""
def __init__(self, record):
#the name of the corpus
#that will be built in case of internal file parsing
self.record = record
self.name = record["corpus_name"]
self.project_id = record["project_id"]
self.user_id = record["user_id"]
self.resource = record["source"]
self.type = get_resource(self.resource)
self.query = record["query"]
#format the sampling
self.n_last_years = 5
self.YEAR = date.today().year
#not pretty, but the easy version
self.MONTH = str(date.today().month)
if len(self.MONTH) == 1:
self.MONTH = "0"+self.MONTH
self.MAX_RESULTS = 1000
try:
self.results_nb = int(record["count"])
except KeyError:
#does not exist yet
self.results_nb = 0
try:
self.webEnv = record["webEnv"]
self.queryKey = record["queryKey"]
self.retMax = record["retMax"]
except KeyError:
#does not exist yet
self.queryKey = None
self.webEnv = None
self.retMax = 1
self.status = [None]
self.path = "/tmp/results.txt"
def tmp_file(self):
'''the results should be stored here,
depending on the format type'''
raise NotImplementedError
def parse_query(self):
'''parse the query parameters depending on the type
and retrieve the set of activated search options
'''
raise NotImplementedError
def fetch(self):
if self.download():
self.create_corpus()
return self.corpus_id
def get_sampling_dates(self):
'''Create a sample list of min and max dates based on Y and M
for N_LAST_YEARS results'''
dates = []
for i in range(self.n_last_years):
maxyear = self.YEAR -i
mindate = str(maxyear-1)+"/"+str(self.MONTH)
maxdate = str(maxyear)+"/"+str(self.MONTH)
print(mindate,"-",maxdate)
dates.append((mindate, maxdate))
return dates
def create_corpus(self):
#create a corpus
corpus = Node(
name = self.query,
user_id = self.user_id,
parent_id = self.project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data",
"language_id" : self.type["default_language"],
}
)
self.corpus_id = corpus.id
if len(self.paths) > 0:
for path in self.paths:
#add the resource
corpus.add_resource(
type = self.type["type"],
name = self.type["name"],
path = path
)
session.add(corpus)
session.commit()
scheduled(parse_extract_indexhyperdata)(corpus.id)
else:
#add the resource
corpus.add_resource(
type = self.type["type"],
name = self.type["name"],
path = self.path
)
session.add(corpus)
session.commit()
scheduled(parse_extract_indexhyperdata)(corpus.id)
return corpus
import importlib
from gargantext.constants import RESOURCETYPES
from gargantext.settings import DEBUG
#if DEBUG: print("Loading available Crawlers")
base_parser = "gargantext.util.crawlers"
for resource in RESOURCETYPES:
if resource["crawler"] is not None:
try:
name =resource["crawler"]
#crawler is type basename+"Crawler"
filename = name.replace("Crawler", "").lower()
module = base_parser+".%s" %(filename)
importlib.import_module(module)
#if DEBUG: print("\t-", name)
except Exception as e:
print("Check constants.py RESOURCETYPES declaration %s \nCRAWLER %s is not available for %s" %(str(e), resource["crawler"], resource["name"]))
#initial import
#from .cern import CernCrawler
#from .istex import ISTexCrawler
#from .pubmed import PubmedCrawler
from gargantext.constants import *
from langdetect import detect, DetectorFactory
class Language:
def __init__(self, iso2=None, iso3=None, name=None):
def __init__(self, iso2=None, iso3=None,full_name=None, name=None):
self.iso2 = iso2
self.iso3 = iso3
self.name = name
self.implemented = iso2 in LANGUAGES
def __str__(self):
result = '<Language'
for key, value in self.__dict__.items():
......@@ -16,6 +16,7 @@ class Language:
return result
__repr__ = __str__
class Languages(dict):
def __missing__(self, key):
key = key.lower()
......@@ -25,6 +26,10 @@ class Languages(dict):
languages = Languages()
def detect_lang(text):
DetectorFactory.seed = 0
return languages[detect(text)].iso2
import pycountry
pycountry_keys = (
('iso639_3_code', 'iso3', ),
......@@ -49,3 +54,4 @@ languages['fre'] = languages['fr']
languages['ger'] = languages['de']
languages['Français'] = languages['fr']
languages['en_US'] = languages['en']
languages['english'] = languages['en']
......@@ -2,6 +2,8 @@ from ._Parser import Parser
from datetime import datetime
from bs4 import BeautifulSoup
from lxml import etree
#import asyncio
#q = asyncio.Queue(maxsize=0)
class CernParser(Parser):
#mapping MARC21 ==> hyperdata
......@@ -38,24 +40,34 @@ class CernParser(Parser):
"856": {"u":"pdf_source"},
}
def format_date(self, hyperdata):
'''formatting pubdate'''
prefix = "publication"
date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
#hyperdata[prefix + "_year"] = date.strftime('%Y')
hyperdata[prefix + "_month"] = date.strftime("%m")
hyperdata[prefix + "_day"] = date.strftime("%d")
hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S")
hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
print("Date", hyperdata["publication_date"])
return hyperdata
# def format_date(self, hyperdata):
# '''formatting pubdate'''
# prefix = "publication"
# try:
# date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m-%d")
# except ValueError:
# date = datetime.strptime(hyperdata[prefix + "_date"], "%Y-%m")
# date.day = "01"
# hyperdata[prefix + "_year"] = date.strftime('%Y')
# hyperdata[prefix + "_month"] = date.strftime("%m")
# hyperdata[prefix + "_day"] = date.strftime("%d")
#
# hyperdata[prefix + "_hour"] = date.strftime("%H")
# hyperdata[prefix + "_minute"] = date.strftime("%M")
# hyperdata[prefix + "_second"] = date.strftime("%S")
# hyperdata[prefix + "_date"] = date.strftime("%Y-%m-%d %H:%M:%S")
# #print("Date", hyperdata["publication_date"])
# return hyperdata
#@asyncio.coroutine
def parse(self, file):
#print("PARSING")
hyperdata_list = []
doc = file.read()
soup = BeautifulSoup(doc.decode("utf-8"), "lxml")
#print(doc[:35])
soup = BeautifulSoup(doc, "lxml")
#print(soup.find("record"))
for record in soup.find_all("record"):
hyperdata = {v:[] for v in self.MARC21["100"].values()}
hyperdata["uid"] = soup.find("controlfield").text
......@@ -86,8 +98,8 @@ class CernParser(Parser):
hyperdata["authors_affiliations"] = (",").join(hyperdata["authors_affiliations"])
hyperdata["authors"] = (",").join(hyperdata["authors"])
hyperdata["authors_mails"] = (",").join(hyperdata["authors_mails"])
hyperdata = self.format_date(hyperdata)
#hyperdata = self.format_date(hyperdata)
hyperdata = self.format_hyperdata_languages(hyperdata)
hyperdata = self.format_hyperdata_dates(hyperdata)
hyperdata_list.append(hyperdata)
return hyperdata_list
from .Ris import RISParser
from .RIS import RISParser
class ISIParser(RISParser):
_begin = 3
_parameters = {
b"ER": {"type": "delimiter"},
b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
......@@ -17,4 +17,3 @@ class ISIParser(RISParser):
b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
b"WC": {"type": "hyperdata", "key": "fields"},
}
......@@ -31,6 +31,7 @@ class PubmedParser(Parser):
if isinstance(file, bytes):
file = BytesIO(file)
xml = etree.parse(file, parser=self.xml_parser)
#print(xml.find("PubmedArticle"))
xml_articles = xml.findall('PubmedArticle')
# initialize the list of hyperdata
hyperdata_list = []
......
......@@ -36,6 +36,7 @@ class RISParser(Parser):
last_values = []
# browse every line of the file
for line in file:
if len(line) > 2 :
# extract the parameter key
parameter_key = line[:2]
......
......@@ -20,14 +20,9 @@ class Parser:
self._file = file
def __del__(self):
self._file.close()
if hasattr(self, '_file'):
self._file.close()
def detect_format(self, afile, a_formats):
#import magic
print("Detecting format")
#print(magic.from_file(afile))
return
def detect_encoding(self, string):
"""Useful method to detect the encoding of a document.
......@@ -167,6 +162,8 @@ class Parser:
def __iter__(self, file=None):
"""Parse the file, and its children files found in the file.
C24B comment: the file storage/extraction should be done upstream,
and this method is a bit obscure
"""
if file is None:
file = self._file
......
from .Ris import RISParser
from .Ris_repec import RepecParser
from .Isi import ISIParser
# from .Jstor import JstorParser
# from .Zotero import ZoteroParser
from .Pubmed import PubmedParser
# # 2015-12-08: parser 2 en 1
from .Europress import EuropressParser
from .ISTex import ISTexParser
from .CSV import CSVParser
from .Cern import CernParser
import importlib
from gargantext.constants import RESOURCETYPES
from gargantext.settings import DEBUG
if DEBUG:
print("Loading available PARSERS:")
base_parser = "gargantext.util.parsers"
for resource in RESOURCETYPES:
if resource["parser"] is not None:
#parser filename is the parser name without the "Parser" suffix
fname = resource["parser"].replace("Parser", "")
#parser filename is uppercase
module = base_parser+".%s" %(fname.upper())
#parser class name is as declared in constants
parser = importlib.import_module(module)
if DEBUG:
print("\t-", resource["parser"])
getattr(parser,resource["parser"])
......@@ -3,9 +3,9 @@ When started, it initiates the parser;
when passed text, the text is piped to the parser.
When ended, the parser is closed and the tagged word returned as a tuple.
"""
from gargantext.constants import RULE_JJNN, DEFAULT_MAX_NGRAM_LEN
import re
import nltk
class Tagger:
......@@ -19,7 +19,28 @@ class Tagger:
| [][.,;"'?!():-_`] # these are separate tokens
''', re.UNICODE | re.MULTILINE | re.DOTALL)
self.buffer = []
self.start()
#self.start()
def clean_text(self, text):
"""Clean the text for better POS tagging.
For now, only removes (short) XML tags.
"""
return re.sub(r'<[^>]{0,45}>', '', text)
def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
self.text = self.clean_text(text)
grammar = nltk.RegexpParser(label + ': ' + rule)
tagged_tokens = list(self.tag_text(self.text))
if len(tagged_tokens):
grammar_parsed = grammar.parse(tagged_tokens)
for subtree in grammar_parsed.subtrees():
if subtree.label() == label:
if len(subtree) < max_n_words:
yield subtree.leaves()
# ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
def __del__(self):
self.stop()
......@@ -29,6 +50,8 @@ class Tagger:
This method is called by the constructor, and can be overriden by
inherited classes.
"""
print("START")
self.extract(self.text)
def stop(self):
"""Ends the tagger.
......
from .TurboTagger import TurboTagger
from .NltkTagger import NltkTagger
from .TreeTagger import TreeTagger
from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
#version2
#imported as needed
#Version 1
#~ import importlib
#~ from gargantext.constants import LANGUAGES
#~ from gargantext.settings import DEBUG
#~ if DEBUG:
#~ print("Loading available Taggers:")
#~ for lang, tagger in LANGUAGES.items():
#~ tagger = tagger["tagger"]
#~ filename = "gargantext.util.taggers.%s" %(tagger)
#~ if DEBUG:
#~ print("\t-%s (%s)" %(tagger, lang))
#~ getattr(importlib.import_module(filename), tagger)()
#VERSION 0
#~ #initally a manual import declaration
#~ from .TurboTagger import TurboTagger
#~ from .NltkTagger import NltkTagger
#~ from .TreeTagger import TreeTagger
#~ from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
......@@ -102,7 +102,7 @@ def do_maplist(corpus,
if n_ngrams == 0:
raise ValueError("No ngrams in cooc table ?")
#return
# results, with same structure as quotas
chosen_ngrams = {
'topgen':{'monograms':[], 'multigrams':[]},
......
......@@ -82,6 +82,7 @@ def parse_extract_indexhyperdata(corpus):
favs = corpus.add_child(
typename='FAVORITES', name='favorite docs in "%s"' % corpus.name
)
session.add(favs)
session.commit()
print('CORPUS #%d: [%s] new favorites node #%i' % (corpus.id, t(), favs.id))
......@@ -265,7 +266,7 @@ def recount(corpus):
# -> specclusion/genclusion: compute + write (=> NodeNodeNgram)
(spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs,
spec_overwrite_id = old_spec_id,
spec_overwrite_id = old_spec_id,
gen_overwrite_id = old_gen_id)
print('RECOUNT #%d: [%s] updated spec-clusion node #%i' % (corpus.id, t(), spec_id))
......
#!/usr/bin/python3 env
"""
For initial ngram groups via stemming
Example:
......@@ -21,16 +22,13 @@ def prepare_stemmers(corpus):
"""
Returns *several* stemmers (one for each language in the corpus)
(as a dict of stemmers with key = language_iso2)
languages have been previously filtered by supported source languages
and formatted
"""
stemmers_by_lg = {
# always get a generic stemmer in case language code unknown
'__unknown__' : SnowballStemmer("english")
}
for lgiso2 in corpus.hyperdata['languages'].keys():
if (lgiso2 != '__skipped__'):
lgname = languages[lgiso2].name.lower()
stemmers_by_lg[lgiso2] = SnowballStemmer(lgname)
return stemmers_by_lg
stemmers = {lang:SnowballStemmer(languages[lang].name.lower()) for lang \
in corpus.languages.keys() if lang !="__skipped__"}
stemmers['__unknown__'] = SnowballStemmer("english")
return stemmers
def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
"""
......@@ -57,16 +55,17 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
my_groups = defaultdict(Counter)
# preloop per doc to sort ngrams by language
for doc in corpus.children():
if ('language_iso2' in doc.hyperdata):
lgid = doc.hyperdata['language_iso2']
else:
lgid = "__unknown__"
# doc.ngrams is an sql query (ugly but useful intermediate step)
# FIXME: move the counting and stoplist filtering up here
for ngram_pack in doc.ngrams.all():
todo_ngrams_per_lg[lgid].add(ngram_pack)
for doc in corpus.children('DOCUMENT'):
if doc.id not in corpus.skipped_docs:
if ('language_iso2' in doc.hyperdata):
lgid = doc.hyperdata['language_iso2']
else:
lgid = "__unknown__"
# doc.ngrams is an sql query (ugly but useful intermediate step)
# FIXME: move the counting and stoplist filtering up here
for ngram_pack in doc.ngrams.all():
todo_ngrams_per_lg[lgid].add(ngram_pack)
# --------------------
# long loop per ngrams
......
from gargantext.util.db import *
from gargantext.models import *
from gargantext.constants import *
from gargantext.util.ngramsextractors import ngramsextractors
from collections import defaultdict
from re import sub
from gargantext.util.scheduling import scheduled
def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
......@@ -36,7 +33,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
db.commit()
def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_INDEX_SUBGRAMS):
def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_INDEX_SUBGRAMS):
"""Extract ngrams for every document below the given corpus.
Default language is given by the resource type.
The result is then inserted into database.
......@@ -46,57 +43,50 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
db, cursor = get_cursor()
nodes_ngrams_count = defaultdict(int)
ngrams_data = set()
# extract ngrams
resource_type_index = corpus.resources()[0]['type']
#1 corpus = 1 resource
resource = corpus.resources()[0]
documents_count = 0
resource_type = RESOURCETYPES[resource_type_index]
default_language_iso2 = resource_type['default_language']
for documents_count, document in enumerate(corpus.children('DOCUMENT')):
# get ngrams extractor for the current document
language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
try:
# this looks for a parser in constants.LANGUAGES
ngramsextractor = ngramsextractors[language_iso2]
except KeyError:
# skip document
print('Unsupported language: `%s` (doc #%i)' % (language_iso2, document.id))
# and remember that for later processes (eg stemming)
document.hyperdata['__skipped__'] = 'ngrams_extraction'
document.save_hyperdata()
session.commit()
if language_iso2 in corpus.hyperdata['languages']:
skipped_lg_infos = corpus.hyperdata['languages'].pop(language_iso2)
corpus.hyperdata['languages']['__skipped__'][language_iso2] = skipped_lg_infos
corpus.save_hyperdata()
session.commit()
continue
# extract ngrams on each of the considered keys
for key in keys:
value = document.hyperdata.get(key, None)
if not isinstance(value, str):
continue
# get ngrams
for ngram in ngramsextractor.extract(value):
tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],
# ['very', 'cool', 'exemple'],
# ['cool', 'exemple']]
subterms = subsequences(tokens)
else:
subterms = [tokens]
for seqterm in subterms:
ngram = ' '.join(seqterm)
if len(ngram) > 1:
# doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1
# add fields : terms n
ngrams_data.add((ngram[:255], len(seqterm), ))
source = get_resource(resource["type"])
#load available taggers for the source default languages
docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
tagger_bots = {lang: load_tagger(lang)() for lang in corpus.languages if lang != "__skipped__"}
#sort docs by lang?
for lang, tagger in tagger_bots.items():
for documents_count, document in enumerate(docs):
language_iso2 = document.hyperdata.get('language_iso2', lang)
#print(language_iso2)
for key in keys:
try:
value = document[str(key)]
if not isinstance(value, str):
continue
# get ngrams
for ngram in tagger.extract(value):
tokens = tuple(normalize_forms(token[0]) for token in ngram)
if do_subngrams:
# ex tokens = ["very", "cool", "exemple"]
# subterms = [['very', 'cool'],
# ['very', 'cool', 'exemple'],
# ['cool', 'exemple']]
subterms = subsequences(tokens)
else:
subterms = [tokens]
for seqterm in subterms:
ngram = ' '.join(seqterm)
if len(ngram) > 1:
# doc <=> ngram index
nodes_ngrams_count[(document.id, ngram)] += 1
# add fields : terms n
ngrams_data.add((ngram[:255], len(seqterm), ))
except:
#value not in doc
pass
# except AttributeError:
# print("ERROR NO language_iso2")
# document.status("NGRAMS", error="No lang detected skipped Ngrams")
# corpus.skipped_docs.append(document.id)
# integrate ngrams and nodes-ngrams
if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
......@@ -105,12 +95,14 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
corpus.status('Ngrams', progress=documents_count+1)
corpus.save_hyperdata()
session.add(corpus)
session.commit()
# integrate ngrams and nodes-ngrams
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
corpus.status('Ngrams', progress=documents_count+1, complete=True)
corpus.save_hyperdata()
session.commit()
else:
# integrate ngrams and nodes-ngrams
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
corpus.status('Ngrams', progress=documents_count+1, complete=True)
corpus.save_hyperdata()
session.commit()
except Exception as error:
corpus.status('Ngrams', error=error)
corpus.save_hyperdata()
......
from gargantext.util.db import *
from gargantext.models import *
from gargantext.constants import *
from collections import defaultdict
#from gargantext.util.parsers import *
from collections import defaultdict, Counter
from re import sub
from gargantext.util.languages import languages, detect_lang
def parse(corpus):
try:
documents_count = 0
corpus.status('Docs', progress=0)
# will gather info about languages
observed_languages = defaultdict(int)
# retrieve resource information
for resource in corpus.resources():
# information about the resource
if resource['extracted']:
continue
resource_parser = RESOURCETYPES[resource['type']]['parser']
resource_path = resource['path']
# extract and insert documents from corpus resource into database
for hyperdata in resource_parser(resource_path):
# uniformize the text values for easier POStagging and processing
for k in ['abstract', 'title']:
if k in hyperdata:
try :
hyperdata[k] = normalize_chars(hyperdata[k])
except Exception as error :
print("Error normalize_chars", error)
# save as DB child
# ----------------
document = corpus.add_child(
typename = 'DOCUMENT',
name = hyperdata.get('title', '')[:255],
hyperdata = hyperdata,
)
session.add(document)
# a simple census to raise language info at corpus level
if "language_iso2" in hyperdata:
observed_languages[hyperdata["language_iso2"]] += 1
#1 corpus => 1 resource
resources = corpus.resources()
#get the sources capabilities for a given corpus resource
sources = [get_resource(resource["type"]) for resource in corpus.resources() if resource["extracted"] is False]
if len(sources) == 0:
#>>> documents have already been parsed?????
return
if len(sources) > 0:
#>>> necessarily 1 corpus = 1 source in the current architecture
source = sources[0]
resource = resources[0]
#source.extend(resource)
if source["parser"] is None:
#corpus.status(error)
raise ValueError("Resource '%s' has no Parser" %resource["name"])
else:
#observed languages in corpus docs
corpus.languages = defaultdict.fromkeys(source["default_languages"], 0)
#remember the skipped docs in parsing
skipped_languages = []
corpus.skipped_docs = []
session.add(corpus)
session.commit()
#load the corresponding parser
parserbot = load_parser(source)
# extract and insert documents from resource.path into database
for hyperdata in parserbot(resource["path"]):
# indexed text fields defined in CONSTANTS
for k in DEFAULT_INDEX_FIELDS:
if k in hyperdata.keys():
try:
hyperdata[k] = normalize_chars(hyperdata[k])
except Exception as error :
hyperdata["error"] = "Error normalize_chars"
indexed = False
# a simple census to raise language info at corpus level
for l in ["iso2", "iso3", "full_name"]:
if indexed is True:
break
lang_field = "language_"+l
if lang_field in hyperdata.keys():
if l == "iso2":
try:
corpus.languages[hyperdata["language_iso2"]] += 1
indexed = True
except KeyError:
hyperdata["error"] = "Error: unsupported language"
skipped_languages.append(hyperdata["language_iso2"])
else:
lang = languages[hyperdata[lang_field].lower()].iso2
try:
corpus.languages[lang] += 1
indexed = True
except KeyError:
hyperdata["error"] = "Error: unsupported language"
skipped_languages.append(lang)
if indexed is False:
#no language have been indexed
#detectlang by index_fields
for k in DEFAULT_INDEX_FIELDS:
if indexed is True:
break
if k in hyperdata.keys():
try:
if len(hyperdata[k]) > 10:
#print("> detected on",k, ":", detect_lang(hyperdata[k]))
hyperdata["language_iso2"] = detect_lang(hyperdata[k])
corpus.languages[hyperdata["language_iso2"]] += 1
indexed = True
break
except KeyError:
hyperdata["error"] = "Error: unsupported language"
skipped_languages.append(hyperdata["language_iso2"])
indexed = True
except Exception as error :
print(error)
pass
# save as DB child
# ----------------
document = corpus.add_child(
typename = 'DOCUMENT',
name = hyperdata.get('title', '')[:255],
hyperdata = hyperdata,
)
session.add(document)
if "error" in hyperdata.keys():
#document.status("error")
document.status('Parsing', error= document.hyperdata["error"])
document.save_hyperdata()
session.commit()
#adding skipped_docs for later processing
corpus.skipped_docs.append(document.id)
documents_count += 1
# logging
if documents_count % BATCH_PARSING_SIZE == 0:
corpus.status('Docs', progress=documents_count)
corpus.save_hyperdata()
session.add(corpus)
session.commit()
documents_count += 1
# update info about the resource
resource['extracted'] = True
# add a corpus-level info about languages...
corpus.hyperdata['languages'] = observed_languages
# ...with a special key inside for skipped languages at ngrams_extraction
corpus.hyperdata['languages']['__skipped__'] = {}
# add a corpus-level info about languages adding a __skipped__ info
corpus.languages['__skipped__'] = Counter(skipped_languages)
for n in corpus.languages.items():
print(n)
# commit all changes
corpus.status('Docs', progress=documents_count, complete=True)
corpus.save_hyperdata()
session.add(corpus)
session.commit()
except Exception as error:
corpus.status('Docs', error=error)
......
......@@ -37,7 +37,7 @@ def docs_by_titles(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus': corpus,
'resourcename' : resourcename(corpus),
'resourcename' : get_resource_by_name(corpus.resources()[0]),
'view': 'titles',
'user': request.user
},
......@@ -65,7 +65,7 @@ def docs_by_journals(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus' : corpus,
'resourcename' : resourcename(corpus),
'resourcename' : get_resource_by_name(corpus.resources()[0]),
'view': 'journals'
},
)
......@@ -84,11 +84,8 @@ def analytics(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus': corpus,
'resourcename' : resourcename(corpus),
'resourcename' : get_resource_by_name(corpus.resources()[0]),
'view': 'analytics',
'user': request.user
},
)
......@@ -59,12 +59,17 @@ def overview(request):
class NewCorpusForm(forms.Form):
#mapping choices based on ressource.type
source_list = [(resource["type"], resource["name"]) for resource in RESOURCETYPES]
source_list.insert(0, (0,"Select a database below"))
type = forms.ChoiceField(
choices = enumerate(resource_type['name'] for resource_type in RESOURCETYPES),
choices = source_list,
widget = forms.Select(attrs={ 'onchange' :'CustomForSelect( $("option:selected", this).text() );'})
)
name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
file = forms.FileField()
def clean_resource(self):
file_ = self.cleaned_data.get('file')
def clean_file(self):
file_ = self.cleaned_data.get('file')
if len(file_) > 1024 ** 3 : # we don't accept more than 1GB
......@@ -117,7 +122,8 @@ def project(request, project_id):
resources = corpus.resources()
if len(resources):
resource = resources[0]
resource_type_name = RESOURCETYPES[resource['type']]['name']
#resource_type_name = RESOURCETYPES[resource['type']]['name']
resource_type_name = get_resource(resource["type"])["name"]
else:
print("(WARNING) PROJECT view: no listed resource")
# add some data for the viewer
......@@ -172,5 +178,3 @@ def project(request, project_id):
'query_size': QUERY_SIZE_N_DEFAULT,
},
)
......@@ -2,7 +2,7 @@ from gargantext.util.http import requires_auth, render, settings
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.models import Node
from gargantext.constants import resourcename
from gargantext.constants import get_resource_by_name
from datetime import datetime
@requires_auth
......@@ -42,7 +42,7 @@ def ngramtable(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus' : corpus,
'resourcename' : resourcename(corpus),
'resourcename' : get_resource_by_name(corpus),
'view': 'terms',
# for the CSV import modal
......
......@@ -55,8 +55,13 @@ def notify_user(username, email, password):
La nouvelle version de Gargantext sort en septembre prochain.
Vous êtes actuellement sur la version de développement, vos retours
seront précieux pour stabiliser la plateforme; merci d'avance!
seront précieux pour stabiliser la plateforme: merci d'avance!
Foire aux questions de Gargantext:
https://gogs.iscpif.fr/humanities/faq_gargantext/wiki/FAQ
Rapporter un bogue:
https://gogs.iscpif.fr/humanities/faq_gargantext/issues
Nous restons à votre disposition pour tout complément d'information.
Cordialement
......
# Install Instructions for Gargamelle
**Gargamelle** is the gargantext platform toolbox: it installs a full gargantext system with minimal modules inside a **docker** container.
First you need to get the source code to install it
The destination folder will be `/srv/gargantext`:
* docs contains all information on gargantext
(`/srv/gargantext/docs/`)
* install contains all the installation files
`/srv/gargantext/install/`
Help needed?
See [http://gargantext.org/about](http://gargantext.org/about) and [tools](./contribution_guide.md) for the community
## Get the source code
......@@ -27,36 +26,30 @@ git clone ssh://gitolite@delanoe.org:1979/gargantext /srv/gargantext \
## Install
```bash
# go into the directory
user@computer: cd /srv/gargantext/
# get inside installation folder
user@computer: cd install
# execute the installation script
user@computer: ./install
```
During installation an admin account for gargantext will be created by asking you a username and a password.
Remember it to access the Gargantext platform.
## Run
Once you're done with the installation, **Gargantext** platform will be available at `http://localhost:8000`
simply by running the `start` executable file
```bash
# go into the directory
user@computer: cd /srv/gargantext/
# run the start command
user@computer: ./start
# type ctrl+d or "exit" command to exit
```
Then open up a chromium browser and go to localhost:8000
Click on "Enter Gargantext"
Login in with you created username and pasword
Then open up a chromium browser and go to `http://localhost:8000`
Click on "Enter Gargantext"
Log in with the username and password you created.
Enjoy! ;)
......@@ -9,6 +9,7 @@ MAINTAINER ISCPIF <gargantext@iscpif.fr>
USER root
### Update and install base dependencies
RUN echo "############ DEBIAN LIBS ###############"
RUN apt-get update && \
apt-get install -y \
apt-utils ca-certificates locales \
......@@ -19,33 +20,37 @@ RUN apt-get update && \
postgresql-9.5 postgresql-client-9.5 postgresql-contrib-9.5
RUN echo "############ DEBIAN LIBS ###############"
### Configure timezone and locale
RUN echo "Europe/Paris" > /etc/timezone && \
dpkg-reconfigure -f noninteractive tzdata && \
sed -i -e 's/# en_GB.UTF-8 UTF-8/en_GB.UTF-8 UTF-8/' /etc/locale.gen && \
RUN echo "########### LOCALES & TZ #################"
RUN echo "Europe/Paris" > /etc/timezone
ENV TZ "Europe/Paris"
RUN sed -i -e 's/# en_GB.UTF-8 UTF-8/en_GB.UTF-8 UTF-8/' /etc/locale.gen && \
sed -i -e 's/# fr_FR.UTF-8 UTF-8/fr_FR.UTF-8 UTF-8/' /etc/locale.gen && \
echo 'LANG="fr_FR.UTF-8"' > /etc/default/locale && \
dpkg-reconfigure --frontend=noninteractive locales && \
update-locale LANG=fr_FR.UTF-8
echo 'LANG="fr_FR.UTF-8"' > /etc/default/locale
ENV LANG fr_FR.UTF-8
ENV LANGUAGE fr_FR.UTF-8
ENV LC_ALL fr_FR.UTF-8
RUN echo "########### LOCALES & TZ #################"
### Install main dependencies and python packages based on Debian distrib
RUN echo "############# PYTHON DEPENDENCIES ###############"
RUN apt-get update && apt-get install -y \
libxml2-dev xml-core libgfortran-5-dev \
libpq-dev \
python3.5 \
python3-dev \
# for numpy, pandas and numpyperf
python3-six python3-numpy python3-setuptools \
# ^for numpy, pandas and numpyperf
python3-numexpr \
#python dependencies
# python dependencies
python3-pip \
# for lxml
libxml2-dev libxslt-dev
#libxslt1-dev zlib1g-dev
RUN echo "############# PYTHON DEPENDENCIES ###############"
#UPDATE AND CLEAN
# UPDATE AND CLEAN
RUN apt-get update && apt-get autoclean &&\
rm -rf /var/lib/apt/lists/*
#NB: removing /var/lib/apt/lists/* avoids significantly filling up the /var/ folder on your native system
......@@ -65,9 +70,8 @@ ADD psql_configure.sh /
ADD django_configure.sh /
RUN . /env_3-5/bin/activate && pip3 install -r requirements.txt && \
pip3 install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1 &&\
python3 -m nltk.downloader averaged_perceptron_tagger -d /usr/local/share/nltk_data;
# nltk.data.path.append('path_to_nltk_data')
pip3 install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1 && \
python3 -m nltk.downloader averaged_perceptron_tagger -d /usr/local/share/nltk_data
RUN chown gargantua:gargantua -R /env_3-5
......@@ -81,6 +85,4 @@ RUN echo "listen_addresses='*'" >> /etc/postgresql/9.5/main/postgresql.conf
EXPOSE 5432 8000
VOLUME ["/srv/",]
# VOLUME ["/srv/",]
#!/bin/bash
# opens a console + virtualenv inside the already active docker container
# (to use after start)
sudo docker exec -it gargamelle_box bash --rcfile 'env_3-5/bin/activate'
#!/bin/bash
sudo docker run \
-v /srv/:/srv/\
--name=gargamelle_box \
-v /srv/gargantext:/srv/gargantext \
-v /srv/gargandata:/srv/gargandata \
-v /srv/gargantext_lib:/srv/gargantext_lib \
-p 8000:8000 \
-p 5432 \
-it gargamelle:latest \
/bin/bash -c "service postgresql start; /bin/su gargantua -c 'source /env_3-5/bin/activate && /srv/gargantext/manage.py runserver 0.0.0.0:8000' && bin/bash"
sudo docker rm -f `docker ps -a | grep -v CONTAINER | awk '{print $1 }'`
sudo docker rm gargamelle_box
......@@ -16,13 +16,12 @@ echo "::::: DJANGO :::::"
/bin/su gargantua -c 'source /env_3-5/bin/activate &&\
echo "Activated env" &&\
./srv/gargantext/manage.py makemigrations &&\
./srv/gargantext/manage.py migrate && \
/srv/gargantext/manage.py makemigrations &&\
/srv/gargantext/manage.py migrate && \
echo "migrations ok" &&\
./srv/gargantext/dbmigrate.py && \
./srv/gargantext/dbmigrate.py && \
./srv/gargantext/dbmigrate.py && \
./srv/gargantext/manage.py createsuperuser'
/srv/gargantext/dbmigrate.py && \
/srv/gargantext/dbmigrate.py && \
/srv/gargantext/dbmigrate.py && \
/srv/gargantext/manage.py createsuperuser'
/usr/sbin/service postgresql stop
......@@ -14,6 +14,7 @@ html5lib==0.9999999
python-igraph>=0.7.1
jdatetime==1.7.2
kombu==3.0.33 # messaging
langdetect==1.0.6 # detecting language
nltk==3.1
numpy==1.10.4
psycopg2==2.6.1
......
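The `langdetect` pin added above is what provides the language detection its comment mentions. For orientation only, here is a generic usage sketch of the library; this is plain langdetect API usage, not Gargantext's actual integration code:

```python
# Generic usage sketch for the newly pinned langdetect==1.0.6 dependency.
# This is standard langdetect API usage, not Gargantext's own integration.
from langdetect import detect, DetectorFactory

DetectorFactory.seed = 0  # detection is non-deterministic by default; fix the seed

print(detect("War doesn't show who's right, just who's left."))  # -> 'en'
print(detect("Ein, zwei, drei, vier"))                           # -> 'de'
```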
......@@ -43,6 +43,22 @@ function uncompress_lib {
#~ esac
echo "::: CREATE GROUP :::";
if grep -q 'gargantua' /etc/group
then
echo "Using existing group 'gargantua'"
else
sudo groupadd gargantua
fi
# adding the users to the group
current_user=$(who -m | cut -d' ' -f1)
sudo usermod -aG gargantua "$current_user"
sudo usermod -aG gargantua gargantua
# changing the group of the sourcedir
sudo chown -R :gargantua /srv/gargantext
echo "::: SETUP ENV :::";
for dir in "/srv/gargantext_lib" "/srv/gargantext_static" "/srv/gargantext_media"; do
sudo mkdir -p $dir ;
......@@ -59,12 +75,17 @@ sudo docker build -t gargamelle:latest ./gargamelle
echo ':::: CONFIGURE ::::'
sudo docker run \
-v /srv/:/srv/ \
--name=gargamelle_box \
-v /srv/gargantext:/srv/gargantext \
-v /srv/gargandata:/srv/gargandata \
-v /srv/gargantext_lib:/srv/gargantext_lib \
-p 8000:8000 \
-p 5432 \
-it gargamelle:latest \
/bin/bash -c "./psql_configure.sh; ./django_configure.sh ; exit"
sudo docker rm -f `docker ps -a | grep -v CONTAINER | awk '{print $1 }'`
sudo docker rm gargamelle_box
# creating the "start" copy + giving it normal ownership (because we're probably sudo)
cp ./run /srv/gargantext/start
chown $current_user:gargantua /srv/gargantext/start
#!/bin/bash
sudo docker run \
-v /srv/:/srv/\
--name=gargamelle_box \
-v /srv/gargantext:/srv/gargantext \
-v /srv/gargandata:/srv/gargandata \
-v /srv/gargantext_lib:/srv/gargantext_lib \
-p 8000:8000 \
-p 5432 \
-it gargamelle:latest \
/bin/bash -c "service postgresql start; /bin/su gargantua -c 'source /env_3-5/bin/activate && /srv/gargantext/manage.py runserver 0.0.0.0:8000'"
sudo docker rm -f `docker ps -a | grep -v CONTAINER | awk '{print $1 }'`
sudo docker rm gargamelle_box
......@@ -8,7 +8,7 @@ from traceback import print_tb
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX
from gargantext.constants import get_resource_by_name, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
......@@ -133,7 +133,7 @@ def save(request , project_id):
if filename!=False:
# add the uploaded resource to the corpus
corpus.add_resource(
type = resourcetype('ISTex')
type = get_resource_by_name('ISTex [ISI]')["type"]
, path = filename
)
dwnldsOK+=1
......
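For readers tracking the rename from `resourcetype(...)` to `get_resource_by_name(...)["type"]` in these views (the Pubmed view below follows the same pattern), here is a minimal sketch of what such a helper could look like, inferred only from how it is called in these hunks and from the project's RESOURCETYPES resource declarations; the actual implementation in `gargantext/constants.py` may differ:

```python
# Hypothetical sketch only -- not the actual gargantext/constants.py code.
# RESOURCETYPES stands in for the list of resource dicts declared there;
# the "type" integers below are illustrative, not the project's real values.
RESOURCETYPES = [
    {"type": 3, "name": "Pubmed [XML]"},
    {"type": 8, "name": "ISTex [ISI]"},
]

def get_resource_by_name(name):
    """Return the full resource dict whose "name" matches, or None if unknown."""
    for resource in RESOURCETYPES:
        if resource["name"] == name:
            return resource
    return None

# Callers then pick the field they need, e.g. the integer resource type:
print(get_resource_by_name("ISTex [ISI]")["type"])  # -> 8
```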
......@@ -18,7 +18,7 @@ from traceback import print_tb
from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX
from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
......@@ -134,7 +134,7 @@ def save( request , project_id ) :
print(filename)
if filename != False:
# add the uploaded resource to the corpus
corpus.add_resource( type = resourcetype('Pubmed (XML format)')
corpus.add_resource( type = get_resource_by_name('Pubmed [XML]')["type"]
, path = filename
, url = None
)
......
......@@ -174,12 +174,28 @@
title="Export terms table in CSV">
Export terms table &nbsp; <span class="glyphicon glyphicon-download" aria-hidden="true"></span>
</a>
{% elif view == 'titles' %}
<a href="https://gogs.iscpif.fr/humanities/faq_gargantext/wiki/FAQ#import--export-a-dataset" class="pull-right btn btn-lg">
<span class="glyphicon glyphicon-question-sign" aria-hidden="true"></span>
</a>
<a class="btn btn-primary exportbtn pull-right" role="button"
href="/api/nodes?parent_id={{corpus.id}}&types[]=DOCUMENT&pagination_limit=100000&formated=csv"
title="Export full corpus in CSV">
Export corpus &nbsp; <span class="glyphicon glyphicon-download" aria-hidden="true"></span>
Export corpus &nbsp;
<span class="glyphicon glyphicon-download" aria-hidden="true"></span>
</a>
{% else %}
<!-- TODO export journal table -->
{% endif %}
......@@ -187,6 +203,7 @@
</div>
<div class="row">
<div class="col-md-1">
</span>
</div>
<div class="col-md-6">
<h3>
......
......@@ -212,12 +212,19 @@
<div class="modal-content">
<div class="modal-header">
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
<h3>Add a Corpus</h3>
<h3>Add a Corpus <a href="https://gogs.iscpif.fr/humanities/faq_gargantext/wiki/FAQ#import--export-a-dataset">
<span class="glyphicon glyphicon-question-sign" aria-hidden="true"></span>
</a>
</h3>
</div>
<div class="modal-body">
<!-- FAQ -->
<form id="id_form" enctype="multipart/form-data" action="/projects/{{project.id}}/" method="post">
{% csrf_token %}
<table cellpadding="5">
{% for field in form %}
<tr>
<th>{{field.label_tag}}</th>
......