humanities / gargantext · Commits

Commit b94e4312, authored Feb 06, 2018 by sim
Revert "Remove moissonneurs module"
This reverts commit fde04dab.

Parent: 5df80fbb
Showing 9 changed files with 1099 additions and 0 deletions (+1099 / -0)
__init__.py    gargantext/moissonneurs/__init__.py    +0    -0
cern.py        gargantext/moissonneurs/cern.py        +119  -0
hal.py         gargantext/moissonneurs/hal.py         +122  -0
isidore.py     gargantext/moissonneurs/isidore.py     +118  -0
istex.py       gargantext/moissonneurs/istex.py       +189  -0
multivac.py    gargantext/moissonneurs/multivac.py    +123  -0
pubmed.py      gargantext/moissonneurs/pubmed.py      +174  -0
urls.py        gargantext/moissonneurs/urls.py        +44   -0
util.py        gargantext/moissonneurs/util.py        +210  -0
gargantext/moissonneurs/__init__.py  0 → 100644  (new empty file)
gargantext/moissonneurs/cern.py  0 → 100644

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# *****   CERN Crawler   *****
# ****************************
RESOURCE_TYPE_SCOAP = 9

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata


def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)
        return JsonHttpResponse({"results_nb": crawlerbot.results_nb})


def save(request, project_id, return_corpus=False):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instanciation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data",
                                 "language_id": "en"})

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"]
                            #, name = source["name"]
                            , path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
gargantext/moissonneurs/hal.py  0 → 100644

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# *****    HAL Crawler   *****
# ****************************
# LICENCE: GARGANTEXT.org Licence
RESOURCE_TYPE_HAL = 11

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect \
                      , HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata


def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_HAL)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)
            print(results)
        return JsonHttpResponse({"results_nb": crawlerbot.results_nb})


def save(request, project_id, return_corpus=False):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_HAL)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instanciation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data"})

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"]
                            #, name = source["name"]
                            , path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
gargantext/moissonneurs/isidore.py  0 → 100644

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# *****  ISIDORE Crawler *****
# ****************************
RESOURCE_TYPE_ISIDORE = 12

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata


def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_ISIDORE)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)
        return JsonHttpResponse({"results_nb": crawlerbot.results_nb})


def save(request, project_id, return_corpus=False):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_ISIDORE)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instanciation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data",
                                 "language_id": "fr"})

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"]
                            #, name = source["name"]
                            , path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
gargantext/moissonneurs/istex.py  0 → 100644

from datetime import datetime
from time import sleep
import datetime
import threading
from traceback import print_tb
#from gargantext.settings import MEDIA_ROOT, BASE_DIR

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata

from .util import Scraper

RESOURCE_TYPE_ISTEX = 8


def query(request):
    """
    ISTEX simply the total of hits for a query

    (not reused in testISTEX)
    """
    print(request.method)
    alist = ["bar", "foo"]

    if request.method == "POST":
        query = request.POST["query"]
        if request.POST["N"] == "NaN":
            N = QUERY_SIZE_N_MAX
        else:
            N = int(request.POST["N"])

        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
        query_string = query.replace(" ", "+")
        url = "http://api.istex.fr/document/?q=" + query_string + "&output=id,title,abstract,pubdate,corpusName,authors,language"

        tasks = Scraper()

        try:
            thedata_path = tasks.download(url)
            thedata = open(thedata_path, "rb")
            alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)


def save(request, project_id, return_corpus=False):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = (session.query(Node)
                      .filter(Node.id == project_id)
                      .filter(Node.typename == 'PROJECT')
              ).first()
    if project is None:
        raise Http404()
    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    query_string = ""
    if request.method == "POST":
        query = "-"
        query_string = "-"
        #N = QUERY_SIZE_N_MAX

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url encoded q

        if "N" in request.POST:
            if request.POST["N"] == "NaN":
                N = QUERY_SIZE_N_MAX
            else:
                N = int(request.POST["N"])  # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                N = QUERY_SIZE_N_MAX
                #msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                #print("ERROR (scrap: istex d/l ): ",msg)
                #raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                           + "&output=id,corpusName,title,genre,language,doi,host,publicationDate,abstract,author&"
                           + "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instanciation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data",
                                 "language_id": None})

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in th queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(type=get_resource(RESOURCE_TYPE_ISTEX)["type"]
                                    , path=filename)
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])
        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
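The paging loop in save() above slices the requested sample into ISTEX API pages of at most 50 records. A minimal sketch of the resulting from/size parameters, with a made-up N (illustration only, not part of the commit):

# Illustration only: how N requested documents map to ISTEX page requests of at
# most 50 records, mirroring Scraper.chunks(range(N), 50) plus the last-page
# adjustment in save() above.
N = 120
pagesize = 50
for start in range(0, N, pagesize):
    size = min(pagesize, N - start)
    print("from=%d&size=%d" % (start, size))   # from=0&size=50, from=50&size=50, from=100&size=20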
gargantext/moissonneurs/multivac.py  0 → 100644

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** MULTIVAC Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence
RESOURCE_TYPE_MULTIVAC = 10

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata


def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_MULTIVAC)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)
            print(results)
        return JsonHttpResponse({"results_nb": crawlerbot.results_nb})


def save(request, project_id, return_corpus=False):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_MULTIVAC)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instanciation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data",
                                 "language_id": "en"})

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"]
                            #, name = source["name"]
                            , path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
gargantext/moissonneurs/pubmed.py  0 → 100644

# ****************************
# *****  Medline Scraper *****
# ****************************
# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
#  between 9 pm and 5 am Eastern Time weekdays

# from datetime import datetime
from time import sleep
import json
import datetime
from os import path
import threading
from traceback import print_tb
#from gargantext.settings import MEDIA_ROOT, BASE_DIR

from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource_by_name, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata

from .util import Scraper


def query(request):
    """
    Pubmed year by year results

    # alist = [
    #     {'string': '2011[dp] serendipity', 'queryKey': '1',
    #      'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
    #     {'string': '2012[dp] serendipity', 'queryKey': '1',
    #      'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
    #     ... ]

    (reused as thequeries in query_save)
    """
    print(request.method)
    alist = []

    if request.method == "POST":
        query = request.POST["query"]
        if request.POST["N"] == "NaN":
            N = QUERY_SIZE_N_MAX
        else:
            N = int(request.POST["N"])

        if N > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
            print("ERROR(scrap: pubmed stats): ", msg)
            raise ValueError(msg)

        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
        instancia = Scraper()

        # serialFetcher (n_last_years, query, query_size)
        alist = instancia.serialFetcher(5, query, N)

    data = alist
    return JsonHttpResponse(data)


def save(request, project_id, return_corpus=False):
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()
    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()

    if request.method == "POST":
        queries = request.POST["query"]
        name = request.POST["string"]

        # here we just realize queries already prepared by getGlobalStats
        #    ===> no need to repeat N parameter like in testISTEX <===

        instancia = Scraper()
        thequeries = json.loads(queries)

        # fyi the sum of our prepared yearly proportional quotas
        sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
        print("Scrapping Pubmed: '%s' (N=%i)" % (name, sampled_sum))

        urlreqs = []
        for yearquery in thequeries:
            urlreqs.append(instancia.medlineEfetchRAW(yearquery))
        alist = ["tudo fixe", "tudo bem"]

        # corpus node instanciation as a Django model
        corpus = project.add_child(name=name, typename="CORPUS")

        # """
        # urlreqs: List of urls to query.
        # - Then, to each url in urlreqs you do:
        #     eFetchResult = urlopen(url)
        #     eFetchResult.read()  # this will output the XML... normally you write this to a XML-file.
        # """

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            print(filename)
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(type=get_resource_by_name('Pubmed [XML]')["type"]
                                    , path=filename
                                    , url=None)
                print("Adding the resource")
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])
        try:
            scheduled(parse_extract_indexhyperdata)(corpus_id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        sleep(1)
        if return_corpus:
            return corpus
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = alist
    return JsonHttpResponse(data)
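For reference, save() expects request.POST["query"] to carry the JSON list that query() returned via serialFetcher. A minimal sketch of that round trip, reusing the values from the docstring example above (illustration only, not part of the commit):

# Illustration only: the yearly queries prepared by query()/serialFetcher,
# posted back to save() as a JSON string (values taken from the docstring example).
import json

queries = json.dumps([
    {"string": "2011[dp] serendipity", "queryKey": "1",
     "webEnv": "NCID_1_11...._F_1", "count": 475, "retmax": 6},
    {"string": "2012[dp] serendipity", "queryKey": "1",
     "webEnv": "NCID_1_14..._F_1", "count": 345, "retmax": 4},
])

thequeries = json.loads(queries)
sampled_sum = sum(year_q['retmax'] for year_q in thequeries)
print(sampled_sum)   # 10 records would be fetched across the prepared years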
gargantext/moissonneurs/urls.py  0 → 100644

#  ____  ____ ____    _  _  ____  _____ ____   _
# / ___|/ ___|  _ \  | || || _ \___ /| _ \   | |
# \___ \| |  | |_) | | || |_| |_) ||_ \| |_) / __)
#  ___) | |__|  _ <  |__   _|  __/___) | _ <\__ \
# |____/ \____|_| \_\   |_| |_|  |____/|_| \_( /
#                                            |_|
#
# moissonneurs == getting data from external databases

from django.conf.urls import url

# Available databases :
import gargantext.moissonneurs.pubmed   as pubmed
import gargantext.moissonneurs.istex    as istex
import gargantext.moissonneurs.cern     as cern
import gargantext.moissonneurs.multivac as multivac
import gargantext.moissonneurs.hal      as hal
import gargantext.moissonneurs.isidore  as isidore

# TODO : ISIDORE

# /!\ urls patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$'       , pubmed.query  )
              , url(r'^pubmed/save/(\d+)'   , pubmed.save   )
              , url(r'^istex/query$'        , istex.query   )
              , url(r'^istex/save/(\d+)'    , istex.save    )
              , url(r'^cern/query$'         , cern.query    )
              , url(r'^cern/save/(\d+)'     , cern.save     )
              , url(r'^multivac/query$'     , multivac.query)
              , url(r'^multivac/save/(\d+)' , multivac.save )
              , url(r'^hal/query$'          , hal.query     )
              , url(r'^hal/save/(\d+)'      , hal.save      )
              , url(r'^isidore/query$'      , isidore.query )
              , url(r'^isidore/save/(\d+)'  , isidore.save  )
              ]
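The root URLconf that mounts these patterns is not part of this commit; a minimal sketch, assuming they are included under a hypothetical moissonneurs/ prefix with Django's include(), could look like this:

# Hypothetical root urls.py excerpt (not part of this commit): mounting the
# moissonneurs URL patterns under a "moissonneurs/" prefix.
from django.conf.urls import include, url

import gargantext.moissonneurs.urls as moissonneurs

urlpatterns = [
    # e.g. /moissonneurs/pubmed/query and /moissonneurs/pubmed/save/<project_id>
    url(r'^moissonneurs/', include(moissonneurs.urlpatterns)),
]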
gargantext/moissonneurs/util.py  0 → 100644

from gargantext.util.files import download

import sys
import time
import threading
from queue import Queue
from lxml import etree

if sys.version_info >= (3, 0):
    from urllib.request import urlopen
else:
    from urllib import urlopen


class Scraper:

    def __init__(self):
        self.queue_size = 8
        self.q = Queue()
        self.firstResults = []
        self.lock = threading.Lock()  # lock to serialize console output
        self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
        self.pubMedDB = 'Pubmed'
        self.reportType = 'medline'

    # Return the globalResults!:
    # - count =
    # - queryKey =
    # - webEnv =
    def medlineEsearch(self, query):
        # print ("MedlineFetcher::medlineEsearch :")

        "Get number of results for query 'query' in variable 'count'"
        "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"

        # print(query)
        origQuery = query
        query = query.replace(' ', '%20')

        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
                  % (self.pubMedEutilsURL, self.pubMedDB, query)

        try:
            eSearchResult = urlopen(eSearch)
            data = eSearchResult.read()
            root = etree.XML(data)
            findcount = etree.XPath("/eSearchResult/Count/text()")
            count = findcount(root)[0]
            findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
            queryKey = findquerykey(root)[0]
            findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
            webEnv = findwebenv(root)[0]
        except Exception as Error:
            print(Error)
            count = 0
            queryKey = False
            webEnv = False
            origQuery = False

        values = {"query": origQuery, "count": int(count),
                  "queryKey": queryKey, "webEnv": webEnv}
        return values

    # RETMAX:
    # Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
    # maximum of 100,000 records
    def medlineEfetchRAW(self, fullquery):
        query = fullquery["string"]
        retmax = fullquery["retmax"]
        count = fullquery["count"]
        queryKey = fullquery["queryKey"]
        webEnv = fullquery["webEnv"]

        "Fetch medline result for query 'query', saving results to file every 'retmax' articles"
        queryNoSpace = query.replace(' ', '')  # No space in directory and file names, avoids stupid errors

        # print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')

        retstart = 0
        eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' % (self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
        return eFetch

    # generic!
    def download(self, url):
        print(url)
        filename = download(url)
        with self.lock:
            print(threading.current_thread().name, filename + " OK")
        return filename

    # generic!
    def do_work(self, item):
        # time.sleep(1) # pretend to do some lengthy work.
        returnvalue = self.medlineEsearch(item)
        with self.lock:
            # print(threading.current_thread().name, item)
            return returnvalue

    # The worker thread pulls an item from the queue and processes it
    def worker(self):
        while True:
            item = self.q.get()
            self.firstResults.append(self.do_work(item))
            self.q.task_done()

    def worker2(self):
        while True:
            item = self.q.get()
            results = []
            try:
                result = self.download(item)
            except Exception as error:
                print(error)
                result = False
            self.firstResults.append(result)
            self.q.task_done()

    def chunks(self, l, n):
        print("chunks:")
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # GLOBALLIMIT:
    # I will retrieve this exact amount of publications.
    # The publications per year i'll retrieve per year will be :
    #        (k/N)*GlobalLimit
    #                  \_ this is used as RETMAX
    # - k : Number of publications of x year (according to pubmed)
    # - N : Sum of every k belonging to {X} (total number of pubs according to pubmed)
    # - GlobalLimit : Number of publications i want.
    def serialFetcher(self, yearsNumber, query, globalLimit):
        # Create the queue and thread pool.
        for i in range(self.queue_size):
            t = threading.Thread(target=self.worker)
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        start = time.perf_counter()
        N = 0
        # print ("MedlineFetcher::serialFetcher :")
        thequeries = []
        globalresults = []
        for i in range(yearsNumber):
            year = str(2015 - i)
            # print ('YEAR ' + year)
            # print ('---------\n')
            pubmedquery = str(year) + '[dp] ' + query
            self.q.put(pubmedquery)  # put task in the queue

        self.q.join()
        print('time:', time.perf_counter() - start)

        Total = 0
        Fails = 0
        for globalresults in self.firstResults:
            # globalresults = self.medlineEsearch(pubmedquery)
            Total += 1
            if globalresults["queryKey"] == False:
                Fails += 1
            if globalresults["count"] > 0:
                N += globalresults["count"]
                queryhyperdata = {"string": globalresults["query"],
                                  "count": globalresults["count"],
                                  "queryKey": globalresults["queryKey"],
                                  "webEnv": globalresults["webEnv"],
                                  "retmax": 0}
                thequeries.append(queryhyperdata)

        print("Total Number:", N, "publications")
        print("And i want just:", globalLimit, "publications")
        print("---------------------------------------\n")

        for i, query in enumerate(thequeries):
            k = query["count"]
            proportion = k / float(N)
            retmax_forthisyear = int(round(globalLimit * proportion))
            query["retmax"] = retmax_forthisyear
            if query["retmax"] == 0:
                query["retmax"] += 1
            print(query["string"], "\t[", k, ">", query["retmax"], "]")

        if ((Fails + 1) / (Total + 1)) == 1:  # for identifying the epic fail or connection error
            thequeries = [False]

        return thequeries
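The GLOBALLIMIT comment above describes the per-year quota rule, retmax = round((k / N) * globalLimit). A small worked example with made-up yearly counts (illustration only, not part of the commit):

# Illustration only: proportional retmax allocation as described in serialFetcher.
yearly_counts = {"2015[dp] serendipity": 475,
                 "2014[dp] serendipity": 345,
                 "2013[dp] serendipity": 180}
global_limit = 100
N = sum(yearly_counts.values())          # 1000 publications reported in total

for query_string, k in yearly_counts.items():
    retmax = int(round(global_limit * (k / float(N))))
    retmax = max(retmax, 1)              # serialFetcher bumps a zero quota to 1
    print(query_string, k, "->", retmax)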