Commit 2828aa22 authored by delanoe

[MERGE/FIX] Fix merge urls

parents 8d97e039 a9731641
File mode changed from 100644 to 100755 (10 files)
File mode changed from 100755 to 100644 (1 file)
@@ -47,59 +47,59 @@ def convert_to_date(date):
return dateutil.parser.parse(date)
INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing
# (type, convert_to_db, convert_from_db)
'count':
{ 'id' : 1
, 'type' : int
, 'convert_to_db' : int
, 'convert_from_db': int
},
'publication_date':
{ 'id' : 2
, 'type' : datetime.datetime
, 'convert_to_db' : convert_to_date
, 'convert_from_db': datetime.datetime.fromtimestamp
},
'title':
{ 'id' : 3
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'authors':
{ 'id' : 4
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'journal':
{ 'id' : 5
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'abstract':
{ 'id' : 6
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'text':
{ 'id' : 7
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'page':
{ 'id' : 8
, 'type' : int
@@ -160,10 +160,10 @@ RESOURCETYPES = [
'parser': CSVParser,
'default_language': 'en',
},
# { 'name': 'ISTex',
# # 'parser': ISTexParser,
# 'default_language': 'en',
# },
{ 'name': 'ISTex',
'parser': ISTexParser,
'default_language': 'en',
},
]
# linguistic extraction parameters ---------------------------------------------
@@ -179,11 +179,11 @@ DEFAULT_MAPLIST_MAX = 300 # MAPLIST maximum terms
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5 # part of monograms in MAPLIST
DEFAULT_MAX_NGRAM_LEN = 7 # limit used after POStagging rule
# (initial ngrams number is a power law of this /!\)
# (and most longer ngrams have tiny freq anyway)
DEFAULT_ALL_LOWERCASE_FLAG = True # lowercase ngrams before recording
# them to their DB table
# (potentially bad for acronyms but
# good for variants like same term
@@ -198,7 +198,9 @@ QUERY_SIZE_N_DEFAULT = 1000
import os
from .settings import BASE_DIR
UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads')
# uploads/.gitignore prevents corpora indexing
# corpora can be either a folder or a symlink towards a specific partition
UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads/corpora')
UPLOAD_LIMIT = 1024 * 1024 * 1024
DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
@@ -206,3 +208,9 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
# about batch processing...
BATCH_PARSING_SIZE = 256
BATCH_NGRAMSEXTRACTION_SIZE = 1024
# Scrapers config
QUERY_SIZE_N_MAX = 1000
QUERY_SIZE_N_DEFAULT = 1000
@@ -71,7 +71,7 @@ class NodeHyperdata(Base):
value_flt = Column( Double() , index=True )
value_utc = Column( DateTime(timezone=True) , index=True )
value_str = Column( String(255) , index=True )
value_txt = Column( Text , index=True )
value_txt = Column( Text , index=False )
def __init__(self, node=None, key=None, value=None):
@@ -51,6 +51,7 @@ INSTALLED_APPS = [
'djcelery',
'annotations',
'graphExplorer',
'scrapers',
]
MIDDLEWARE_CLASSES = [
@@ -8,9 +8,9 @@ Views are shared between these modules:
- `graph explorer`, to explore graphs
"""
from django.conf.urls import include, url
from django.contrib import admin
import gargantext.views.api.urls
import gargantext.views.generated.urls
@@ -18,30 +18,34 @@ import gargantext.views.pages.urls
# Module Annotation
## tempo: unchanged doc-annotations --
from annotations import urls as annotations_urls
from annotations.views import main as annotations_main_view
# Module "Graph Explorer"
# Module "Graph Explorer"
#from graphExplorer import urls as graphExplorer_urls
from graphExplorer.rest import Graph
from graphExplorer.views import explorer
urlpatterns = [
url(r'^admin/', admin.site.urls),
url(r'^generated/', include(gargantext.views.generated.urls)),
url(r'^api/', include(gargantext.views.api.urls)),
url(r'^', include(gargantext.views.pages.urls)),
# Module Annotation
# tempo: unchanged doc-annotations routes --
url(r'^annotations/', include(annotations_urls)),
url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view),
# Module "Graph Explorer"
url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer),
url(r'^projects/(\d+)/corpora/(\d+)/graph$', Graph.as_view()),
# to be removed:
url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
#url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
#url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer_urls))
]
# Module Scrapers
from scrapers import urls as scrapers_urls
urlpatterns = [ url(r'^admin/' , admin.site.urls )
, url(r'^generated/' , include( gargantext.views.generated.urls ))
, url(r'^api/' , include( gargantext.views.api.urls ) )
, url(r'^' , include( gargantext.views.pages.urls ) )
# Module Annotation
# tempo: unchanged doc-annotations routes --
, url(r'^annotations/', include( annotations_urls ) )
, url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view)
# Module "Graph Explorer"
, url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer )
, url(r'^projects/(\d+)/corpora/(\d+)/graph$' , Graph.as_view())
# to be removed:
, url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
#url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
# Scrapers module
, url(r'^scrapers/' , include( scrapers_urls ) )
]
from gargantext.constants import *
from gargantext.util.digest import str_digest
from gargantext.util import http
def save(contents, name='', basedir=''):
@@ -29,7 +29,7 @@ import urllib.request
def get(url):
response = urllib.request.urlopen(url)
html = response.read()
return response.read()
# retrieve GET parameters from a request
@@ -4,7 +4,7 @@ from datetime import datetime
from io import BytesIO
import json
class ISTex(Parser):
class ISTexParser(Parser):
def parse(self, thefile):
json_data=open(thefile,"r")
@@ -84,16 +84,16 @@ class ISTex(Parser):
# ---------------------------------------------------
if len(hyperdata["language_iso3"])>0 and hyperdata["language_iso3"][0] != "unknown" :
hyperdata["language_iso3"] = hyperdata["language_iso3"][0]
# default value = eng
# possibly even better: langid.classify(abstract)
else:
# NB: 97% of ISTex docs are eng, hence the default
# ----------------------------------------------
hyperdata["language_iso3"] = "eng"
# (cf. api.istex.fr/document/?q=*&facet=language
# and langid tests on the language=["unknown"] docs)
if "publication_date" in hyperdata:
RealDate = hyperdata["publication_date"]
......
......@@ -7,5 +7,5 @@ from .Pubmed import PubmedParser
# # 2015-12-08: parser 2 en 1
from .Europress import EuropressParser
# from .ISTex import ISTexParser
from .ISTex import ISTexParser
from .CSV import CSVParser
def suggest(keywords):
return ['Suggestion #1', 'Suggestion #2', 'Suggestion #3', 'Suggestion #4', 'Suggestion #5']
def count(keywords):
return 42
def query_save(keywords):
return 'path/to/query.xml'
/srv/gargantext_lib/taggers/nlpserver/turboparser.cpython-34m.so
\ No newline at end of file
import os
from gargantext.settings import MEDIA_ROOT
def ensure_dir(user):
'''
If the user is new, the folder does not exist yet, so create it
'''
dirpath = '%s/corpora/%s' % (MEDIA_ROOT, user.username)
if not os.path.exists(dirpath):
print("Creating folder %s" % dirpath)
os.makedirs(dirpath)
@@ -3,30 +3,26 @@ from django.conf.urls import url
from . import nodes
from . import ngramlists
urlpatterns = [ url(r'^nodes$' , nodes.NodeListResource.as_view())
, url(r'^nodes/(\d+)$' , nodes.NodeResource.as_view() )
, url(r'^nodes/(\d+)/facets$' , nodes.CorpusFacet.as_view() )
, url(r'^nodes/(\d+)/having$' , nodes.NodeListHaving.as_view() )
urlpatterns = [
url(r'^nodes$' , nodes.NodeListResource.as_view()),
url(r'^nodes/(\d+)$' , nodes.NodeResource.as_view()),
# get a list of ngram_ids or ngram_infos by list_id
# url(r'^ngramlists/(\d+)$', ngramlists.List.as_view()),
url(r'^nodes/(\d+)/facets$' , nodes.CorpusFacet.as_view()),
url(r'^nodes/(\d+)/having$' , nodes.NodeListHaving.as_view()),
, url(r'^ngramlists/groups$', ngramlists.GroupChange.as_view())
# modify grouping couples of a group node
# ex: POST ngramlists/groups?node=43
# post data looks like : {"767":[209,640],"779":[436,265,385]}"
# add or remove ngram from a list
# ex: add <=> PUT ngramlists/change?list=42&ngrams=1,2
# rm <=> DEL ngramlists/change?list=42&ngrams=1,2
url(r'^ngramlists/change$', ngramlists.ListChange.as_view()),
# modify grouping couples of a group node
# ex: POST ngramlists/groups?node=43
# post data looks like : {"767":[209,640],"779":[436,265,385]}"
url(r'^ngramlists/groups$', ngramlists.GroupChange.as_view()),
, url(r'^ngramlists/family$' , ngramlists.ListFamily.as_view())
# entire combination of lists from a corpus
# (or any combination of lists that go together :
# - a mainlist
# - an optional stoplist
# - an optional maplist
# - an optional grouplist
# get entire combination of lists from a corpus
# (or any combination of lists that go together :
# - a mainlist
# - an optional stoplist
# - an optional maplist
# - an optional grouplist)
url(r'^ngramlists/family$', ngramlists.ListFamily.as_view()),
]
]
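For illustration only, a minimal client sketch of the ngram-list routes documented in the comments above; the base URL, the authenticated session and the `corpus` parameter of the last call are assumptions (not part of this commit), while the other endpoints and parameters come from the route comments.
import requests

API = "http://localhost:8000/api"   # assumed development host, not from this commit
s = requests.Session()               # assumes authentication/CSRF are handled elsewhere

# add ngrams 1 and 2 to list 42, then remove them again
s.put(API + "/ngramlists/change", params={"list": 42, "ngrams": "1,2"})
s.delete(API + "/ngramlists/change", params={"list": 42, "ngrams": "1,2"})

# modify grouping couples of group node 43 (post data as documented above)
s.post(API + "/ngramlists/groups", params={"node": 43},
       json={"767": [209, 640], "779": [436, 265, 385]})

# fetch the entire combination of lists of a corpus (parameter name is an assumption)
s.get(API + "/ngramlists/family", params={"corpus": 42})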
@@ -94,7 +94,7 @@ def project(request, project_id):
)
session.add(corpus)
session.commit()
# parse_extract: fileparsing -> ngram extraction -> lists
scheduled(parse_extract_indexhyperdata)(corpus.id)
File mode changed from 100644 to 100755 (3 files)
# from datetime import datetime
from time import sleep
import datetime
import threading
#from gargantext.settings import MEDIA_ROOT, BASE_DIR
from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import RESOURCETYPES, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.tools import ensure_dir
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from scrapers.util import Scraper
def query( request ):
"""
ISTex: simply returns the total number of hits for a query
(not reused in testISTEX)
"""
print(request.method)
alist = ["bar","foo"]
if request.method == "POST":
query = request.POST["query"]
N = int(request.POST["N"])
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
query_string = query.replace(" ","+")
url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"
tasks = Scraper()
try:
thedata_path = tasks.download( url )
thedata = open(thedata_path, "rb")
alist = thedata.read().decode('utf-8')
except Exception as error:
alist = [str(error)]
data = alist
return JsonHttpResponse(data)
def save(request , project_id):
print("testISTEX:")
print(request.method)
alist = ["bar","foo"]
# implicit global session
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = (session
.query(Node)
.filter(Node.id == project_id)
.filter(Node.typename == 'PROJECT')
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/auth/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
if request.method == "POST":
query = "-"
query_string = "-"
N = 0
if "query" in request.POST:
query = request.POST["query"]
query_string = query.replace(" ","+") # url encoded q
if "N" in request.POST:
N = int(request.POST["N"]) # query_size from views_opti
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR (scrap: istex d/l ): ",msg)
raise ValueError(msg)
print("Scrapping Istex: '%s' (%i)" % (query_string , N))
urlreqs = []
pagesize = 50
tasks = Scraper()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
# corpus node instantiation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scraping data"
, "language_id" : None
}
)
session.add(corpus)
session.commit()
corpus_id = corpus.id
print("NEW CORPUS", corpus_id)
ensure_dir(request.user)
tasks = Scraper()
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
tasks.q.put( url ) # put a task in the queue
tasks.q.join() # wait until everything is finished
dwnldsOK = 0
for filename in tasks.firstResults:
if filename!=False:
# add the uploaded resource to the corpus
corpus.add_resource( type = 3
, path = filename
)
dwnldsOK+=1
if dwnldsOK == 0 :
return JsonHttpResponse(["fail"])
###########################
###########################
try:
scheduled(parse_extract_indexhyperdata)(corpus_id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
sleep(1)
return HttpResponseRedirect('/projects/' + str(project_id))
data = [query_string,query,N]
return JsonHttpResponse(data)
# ****************************
# ***** Medline Scraper *****
# ****************************
# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
# between 9 pm and 5 am Eastern Time weekdays
# from datetime import datetime
from time import sleep
import json
import datetime
from os import path
import threading
#from gargantext.settings import MEDIA_ROOT, BASE_DIR
from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import RESOURCETYPES, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.tools import ensure_dir
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from scrapers.util import Scraper
def query( request ):
"""
Pubmed year by year results
# alist = [
# {'string': '2011[dp] serendipity', 'queryKey': '1',
# 'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
# {'string': '2012[dp] serendipity', 'queryKey': '1',
# 'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
# ... ]
(reused as thequeries in query_save)
"""
print(request.method)
alist = []
if request.method == "POST":
query = request.POST["query"]
N = int(request.POST["N"])
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR(scrap: pubmed stats): ",msg)
raise ValueError(msg)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
instancia = Scraper()
# serialFetcher (n_last_years, query, query_size)
alist = instancia.serialFetcher( 5, query , N )
data = alist
return JsonHttpResponse(data)
def save( request , project_id ) :
# implicit global session
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = (session.query( Node )
.filter(Node.id == project_id)
.filter(Node.typename == 'PROJECT')
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/auth/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
if request.method == "POST":
queries = request.POST["query"]
name = request.POST["string"]
# here we just run the queries already prepared by getGlobalStats
# ===> no need to repeat N parameter like in testISTEX <===
instancia = Scraper()
thequeries = json.loads(queries)
# fyi the sum of our prepared yearly proportional quotas
sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
print("Scrapping Pubmed: '%s' (N=%i)" % (name,sampled_sum))
urlreqs = []
for yearquery in thequeries:
urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
alist = ["tudo fixe" , "tudo bem"]
# corpus node instantiation as a Django model
corpus = Node(
name = name,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scraping data"
, "language_id" : None
}
)
session.add(corpus)
session.commit()
corpus_id = corpus.id
# """
# urlreqs: List of urls to query.
# - Then, to each url in urlreqs you do:
# eFetchResult = urlopen(url)
# eFetchResult.read() # this will output the XML... normally you write this to a XML-file.
# """
ensure_dir(request.user)
tasks = Scraper()
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
tasks.q.put( url ) #put a task in the queue
tasks.q.join() # wait until everything is finished
dwnldsOK = 0
for filename in tasks.firstResults :
print(filename)
if filename != False:
# add the uploaded resource to the corpus
corpus.add_resource( type = 3
, path = filename
)
dwnldsOK+=1
if dwnldsOK == 0 :
return JsonHttpResponse(["fail"])
try:
scheduled(parse_extract_indexhyperdata)(corpus_id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
sleep(1)
return HttpResponseRedirect('/projects/' + str(project_id))
data = alist
return JsonHttpResponse(data)
from django.conf.urls import url
import scrapers.pubmed as pubmed
import scrapers.istex as istex
#import scrapers.cern as cern
#import scrapers.hal as hal
# Scraping: getting data from external databases
# Available databases: Pubmed, ISTex (next: CERN)
# /!\ URL patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
, url(r'^pubmed/save/(\d+)' , pubmed.save )
, url(r'^istex/query$' , istex.query )
, url(r'^istex/save/(\d+)' , istex.save )
# TODO REST API for the scrapers
#, url(r'^rest$' , scraping.Target.as_view() )
,
]
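A minimal sketch, for illustration, of how these scraper routes might be called from a client; the host, session handling and the project id are assumptions (not part of this commit), while the endpoints and the query/N/string POST fields mirror what pubmed.query, pubmed.save, istex.query and istex.save read above.
import json
import requests

BASE = "http://localhost:8000/scrapers"   # assumed development host
s = requests.Session()                     # assumes login and CSRF are handled elsewhere

# Pubmed: get year-by-year counts and quotas for a query, capped at N documents overall
stats = s.post(BASE + "/pubmed/query", data={"query": "serendipity", "N": 100}).json()

# Pubmed: create a corpus in project 1 (assumed id) from those prepared yearly queries
s.post(BASE + "/pubmed/save/1", data={"query": json.dumps(stats), "string": "serendipity"})

# ISTex: count hits for a query, then create a corpus the same way
s.post(BASE + "/istex/query", data={"query": "serendipity", "N": 100})
s.post(BASE + "/istex/save/1", data={"query": "serendipity", "N": 100})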
#def count(keywords):
# return 42
#
#def query_save(keywords):
# return 'path/to/query.xml'
#
from gargantext.util.files import download
import sys
import time
import threading
from queue import Queue
from lxml import etree
if sys.version_info >= (3, 0):
from urllib.request import urlopen
else:
from urllib import urlopen
class Scraper :
def __init__(self):
self.queue_size = 8
self.q = Queue()
self.firstResults = []
self.lock = threading.Lock() # lock to serialize console output
self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
self.pubMedDB = 'Pubmed'
self.reportType = 'medline'
# Return the globalResults!:
# - count =
# - queryKey =
# - webEnv =
def medlineEsearch(self , query):
# print ("MedlineFetcher::medlineEsearch :")
"Get number of results for query 'query' in variable 'count'"
"Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
# print(query)
origQuery = query
query = query.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
% ( self.pubMedEutilsURL, self.pubMedDB, query )
try:
eSearchResult = urlopen(eSearch)
data = eSearchResult.read()
root = etree.XML(data)
findcount = etree.XPath("/eSearchResult/Count/text()")
count = findcount(root)[0]
findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
queryKey = findquerykey(root)[0]
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
except Exception as Error:
print(Error)
count = 0
queryKey = False
webEnv = False
origQuery = False
values = { "query" : origQuery
, "count" : int(count)
, "queryKey" : queryKey
, "webEnv" : webEnv
}
return values
# RETMAX:
# Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
# maximum of 100,000 records
def medlineEfetchRAW( self , fullquery):
query = fullquery [ "string" ]
retmax = fullquery [ "retmax" ]
count = fullquery [ "count" ]
queryKey = fullquery [ "queryKey"]
webEnv = fullquery [ "webEnv" ]
"Fetch medline result for query 'query', saving results to file every 'retmax' articles"
queryNoSpace = query.replace(' ', '') # No space in directory and file names, avoids stupid errors
# print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
retstart = 0
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
return eFetch
# generic!
def download(self, url):
print(url)
filename = download(url)
with self.lock:
print(threading.current_thread().name, filename+" OK")
return filename
# generic!
def do_work(self,item):
# time.sleep(1) # pretend to do some lengthy work.
returnvalue = self.medlineEsearch(item)
with self.lock:
# print(threading.current_thread().name, item)
return returnvalue
# The worker thread pulls an item from the queue and processes it
def worker(self):
while True:
item = self.q.get()
self.firstResults.append(self.do_work(item))
self.q.task_done()
def worker2(self):
while True:
item = self.q.get()
results = []
try:
result = self.download(item)
except Exception as error :
print(error)
result = False
self.firstResults.append(result)
self.q.task_done()
def chunks(self , l , n):
print("chunks:")
for i in range(0, len(l), n):
yield l[i:i+n]
# GLOBALLIMIT:
# I will retrieve exactly this amount of publications.
# The number of publications retrieved per year will be:
# (k/N)*GlobalLimit
# \_ this is used as RETMAX
# - k : number of publications of year x (according to Pubmed)
# - N : sum of every k belonging to {X} (total number of publications according to Pubmed)
# - GlobalLimit : the number of publications I want.
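# A hypothetical worked example of the quota rule above (illustration, not from this commit):
# with N = 1000 publications over the sampled years and GlobalLimit = 100, a year counting
# k = 300 publications gets retmax = round((300/1000) * 100) = 30, while a year with k = 4
# gets round(0.4) = 0 and is then bumped to 1 by the retmax == 0 guard in serialFetcher.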
def serialFetcher(self , yearsNumber , query, globalLimit):
# Create the queue and thread pool.
for i in range(self.queue_size):
t = threading.Thread(target=self.worker)
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
start = time.perf_counter()
N = 0
# print ("MedlineFetcher::serialFetcher :")
thequeries = []
globalresults = []
for i in range(yearsNumber):
year = str(2015 - i)
# print ('YEAR ' + year)
# print ('---------\n')
pubmedquery = str(year) + '[dp] '+query
self.q.put( pubmedquery ) #put task in the queue
self.q.join()
print('time:',time.perf_counter() - start)
Total = 0
Fails = 0
for globalresults in self.firstResults:
# globalresults = self.medlineEsearch(pubmedquery)
Total += 1
if globalresults["queryKey"]==False:
Fails += 1
if globalresults["count"] > 0 :
N+=globalresults["count"]
queryhyperdata = { "string" : globalresults["query"]
, "count" : globalresults["count"]
, "queryKey" : globalresults["queryKey"]
, "webEnv" : globalresults["webEnv"]
, "retmax" : 0
}
thequeries.append ( queryhyperdata )
print("Total Number:", N,"publications")
print("And i want just:",globalLimit,"publications")
print("---------------------------------------\n")
for i,query in enumerate(thequeries):
k = query["count"]
proportion = k/float(N)
retmax_forthisyear = int(round(globalLimit*proportion))
query["retmax"] = retmax_forthisyear
if query["retmax"] == 0 : query["retmax"]+=1
print(query["string"],"\t[",k,">",query["retmax"],"]")
if ((Fails+1)/(Total+1)) == 1 : # for identifying the epic fail or connection error
thequeries = [False]
return thequeries
File mode changed from 100644 to 100755 (46 files)
@@ -260,7 +260,7 @@
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/project/"+projectid+"/pubmedquery/go",
url: window.location.origin+"/scrapers/pubmed/save/"+projectid,
data: pubmedifiedQuery,
type: 'POST',
beforeSend: function(xhr) {
@@ -290,7 +290,7 @@
var theType = $("#id_type option:selected").html();
console.log("consoling the typeeee: ")
console.log(theType)
if(theType=="Pubmed (xml format)") doTheQuery();
if(theType=="Pubmed (XML format)") doTheQuery();
if(theType=="ISTex") {
var origQuery = $("#id_name").val()
console.log("printing the results:")
@@ -329,10 +329,10 @@
var theType = $("#id_type option:selected").html();
if(theType=="Pubmed (xml format)") {
if(theType=="Pubmed (XML format)") {
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/pubmedquery",
url: window.location.origin+"/scrapers/pubmed/query",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
@@ -370,10 +370,10 @@
}
if(theType=="ISTex") {
console.log(window.location.origin+"tests/istextquery")
console.log(window.location.origin+"scrapers/istex/query")
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/istextquery",
url: window.location.origin+"/scrapers/istex/query",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
@@ -436,7 +436,7 @@
$( "#id_name" ).on('input',function(e){
console.log($(this).val())
if(theType=="Pubmed (xml format)")
if(theType=="Pubmed (XML format)")
testPUBMED( $(this).val() )
});
}
@@ -504,7 +504,7 @@
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/project/"+projectid+"/ISTEXquery/go",
url: window.location.origin+"/scrapers/istex/save/"+projectid,
data: postQuery,
type: 'POST',
beforeSend: function(xhr) {
File mode changed from 100644 to 100755