Commit 68b1707b authored by delanoe

[FEAT] Adding scrappers (not finished yet).

parent ab0aa5ba
......@@ -51,6 +51,7 @@ INSTALLED_APPS = [
    'djcelery',
    'annotations',
    'graphExplorer',
    'scrappers',
]
MIDDLEWARE_CLASSES = [
......
......@@ -21,27 +21,29 @@ import gargantext.views.pages.urls
from annotations import urls as annotations_urls
from annotations.views import main as annotations_main_view
# Module "Graph Explorer"
# Module "Graph Explorer"
#from graphExplorer import urls as graphExplorer_urls
from graphExplorer.rest import Graph
from graphExplorer.views import explorer
urlpatterns = [
    url(r'^admin/', admin.site.urls),
    url(r'^generated/', include(gargantext.views.generated.urls)),
    url(r'^api/', include(gargantext.views.api.urls)),
    url(r'^', include(gargantext.views.pages.urls)),

    # Module Annotation
    # tempo: unchanged doc-annotations routes --
    url(r'^annotations/', include(annotations_urls)),
    url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view),

    # Module "Graph Explorer"
    url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer),
    url(r'^projects/(\d+)/corpora/(\d+)/graph$', Graph.as_view()),
    # to be removed:
    url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
    #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
    #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer_urls))
]
from scrappers import urls as scrappers_urls

urlpatterns = [ url(r'^admin/', admin.site.urls)
              , url(r'^generated/', include(gargantext.views.generated.urls))
              , url(r'^api/', include(gargantext.views.api.urls))
              , url(r'^', include(gargantext.views.pages.urls))

              # Module Annotation
              # tempo: unchanged doc-annotations routes --
              , url(r'^annotations/', include(annotations_urls))
              , url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view)

              # Module "Graph Explorer"
              , url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer)
              , url(r'^projects/(\d+)/corpora/(\d+)/graph$', Graph.as_view())
              # to be removed:
              , url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
              #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
              #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer_urls))

              , url(r'^scrappers/', include(scrappers_urls))
              ]
import os
from gargantext.settings import MEDIA_ROOT

def ensure_dir(user):
    '''
    If the user is new, their folder does not exist yet: create it.
    '''
    dirpath = '%s/corpora/%s' % (MEDIA_ROOT, user.username)
    if not os.path.exists(dirpath):
        print("Creating folder %s" % dirpath)
        os.makedirs(dirpath)
# ****************************
# *****  Medline Fetcher *****
# ****************************

# MEDLINE USER REQUIREMENT: run retrieval scripts on weekends or
# between 9 pm and 5 am Eastern Time on weekdays.

import sys
if sys.version_info >= (3, 0):
    from urllib.request import urlopen
else:
    from urllib import urlopen

import os
import time
import datetime
import codecs
import threading
from queue import Queue

from lxml import etree
from django.core.files import File
class MedlineFetcher:

    def __init__(self):
        self.queue_size = 8
        self.q = Queue()
        self.firstResults = []
        self.lock = threading.Lock()  # lock to serialize console output
        self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
        self.pubMedDB = 'Pubmed'
        self.reportType = 'medline'

    # Returns the global results:
    #  - count    : number of results for the query
    #  - queryKey : NCBI history token, reused when fetching
    #  - webEnv   : NCBI history token, reused when fetching
    def medlineEsearch(self, query):
        """
        Get the number of results for 'query' in variable 'count',
        plus the 'queryKey' and 'webEnv' history tokens that
        'medlineEfetchRAW' reuses.
        """
        origQuery = query
        query = query.replace(' ', '%20')
        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
                  % (self.pubMedEutilsURL, self.pubMedDB, query)
        try:
            eSearchResult = urlopen(eSearch)
            data = eSearchResult.read()
            root = etree.XML(data)
            findcount = etree.XPath("/eSearchResult/Count/text()")
            count = findcount(root)[0]
            findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
            queryKey = findquerykey(root)[0]
            findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
            webEnv = findwebenv(root)[0]
        except Exception:
            count = 0
            queryKey = False
            webEnv = False
            origQuery = False

        values = {"query": origQuery, "count": int(str(count)),
                  "queryKey": queryKey, "webEnv": webEnv}
        return values
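    # Illustrative return value (shape only, mirroring the example in
    # getGlobalStats; tokens and counts are placeholders):
    #   {'query': '2011[dp] serendipity', 'count': 475,
    #    'queryKey': '1', 'webEnv': 'NCID_1_11...._F_1'}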
    # RETMAX:
    # Total number of UIDs from the retrieved set to be shown in the XML
    # output (default=20, maximum of 100,000 records).
    def medlineEfetchRAW(self, fullquery):
        """
        Build the efetch URL for 'query'; results are then fetched
        by batches of 'retmax' articles.
        """
        query = fullquery["string"]
        retmax = fullquery["retmax"]
        count = fullquery["count"]
        queryKey = fullquery["queryKey"]
        webEnv = fullquery["webEnv"]

        retstart = 0
        eFetch = ('%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml'
                  '&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s'
                  % (self.pubMedEutilsURL, self.reportType, retstart, retmax,
                     self.pubMedDB, queryKey, webEnv))
        return eFetch
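    # Illustrative expansion of the URL built above (query_key and WebEnv
    # values are placeholders, not real tokens):
    #   http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?email=youremail@example.org
    #     &rettype=medline&retmode=xml&retstart=0&retmax=6&db=Pubmed
    #     &query_key=1&WebEnv=NCID_1_11...._F_1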
    def ensure_dir(self, f):
        d = os.path.dirname(f)
        if not os.path.exists(d):
            os.makedirs(d)

    # generic!
    def downloadFile(self, item):
        url = item[0]
        filename = item[1]
        data = urlopen(url)
        f = codecs.open(filename, "w", encoding='utf-8')
        myfile = File(f)
        myfile.write(data.read().decode('utf-8'))
        myfile.close()
        f.close()
        with self.lock:
            print(threading.current_thread().name, filename + " OK")
        return filename

    # generic!
    def test_downloadFile(self, item):
        url = item[0]
        filename = item[1]  # kept for symmetry with downloadFile; unused here
        data = urlopen(url)
        return data
    # generic!
    def do_work(self, item):
        returnvalue = self.medlineEsearch(item)
        with self.lock:
            # print(threading.current_thread().name, item)
            return returnvalue

    # The worker thread pulls an item from the queue and processes it.
    def worker(self):
        while True:
            item = self.q.get()
            self.firstResults.append(self.do_work(item))
            self.q.task_done()

    def worker2(self):
        while True:
            item = self.q.get()
            try:
                result = self.downloadFile(item)
            except Exception:
                result = False
            self.firstResults.append(result)
            self.q.task_done()

    def chunks(self, l, n):
        print("chunks:")
        for i in range(0, len(l), n):
            yield l[i:i+n]
    # GLOBALLIMIT:
    # We retrieve exactly this amount of publications.
    # The number of publications retrieved per year, used as RETMAX,
    # is (k/N) * globalLimit, where:
    #  - k : number of publications for year x (according to PubMed)
    #  - N : sum of every k (total number of publications according to PubMed)
    #  - globalLimit : number of publications wanted.
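    # Worked example of the quota (illustrative numbers only):
    # with globalLimit = 100 and yearly counts k = [500, 300, 200]
    # (so N = 1000), the retmax quotas come out as:
    #   round(100 * 500/1000) = 50
    #   round(100 * 300/1000) = 30
    #   round(100 * 200/1000) = 20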
    def serialFetcher(self, yearsNumber, query, globalLimit):
        # Create the queue and thread pool.
        for i in range(self.queue_size):
            t = threading.Thread(target=self.worker)
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        start = time.perf_counter()
        N = 0
        thequeries = []

        for i in range(yearsNumber):
            year = str(2015 - i)
            pubmedquery = str(year) + '[dp] ' + query
            self.q.put(pubmedquery)  # put task in the queue
        self.q.join()
        print('time:', time.perf_counter() - start)

        Total = 0
        Fails = 0
        for globalresults in self.firstResults:
            Total += 1
            if globalresults["queryKey"] == False:
                Fails += 1
            if globalresults["count"] > 0:
                N += globalresults["count"]
                queryhyperdata = {
                    "string":   globalresults["query"],
                    "count":    globalresults["count"],
                    "queryKey": globalresults["queryKey"],
                    "webEnv":   globalresults["webEnv"],
                    "retmax":   0,
                }
                thequeries.append(queryhyperdata)

        print("Total number:", N, "publications")
        print("And I want just:", globalLimit, "publications")
        print("---------------------------------------\n")

        for i, query in enumerate(thequeries):
            k = query["count"]
            proportion = k / float(N)
            retmax_forthisyear = int(round(globalLimit * proportion))
            query["retmax"] = retmax_forthisyear
            if query["retmax"] == 0:
                query["retmax"] += 1
            print(query["string"], "\t[", k, ">", query["retmax"], "]")

        if ((Fails + 1) / (Total + 1)) == 1:  # identify the epic fail / connection error
            thequeries = [False]
        return thequeries
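# Minimal usage sketch (assumes network access to NCBI eutils; the query
# string is an example, matching how getGlobalStats calls this class):
#
#   fetcher = MedlineFetcher()
#   thequeries = fetcher.serialFetcher(5, 'serendipity', 100)
#   # -> list of dicts with keys: string, count, queryKey, webEnv, retmax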
from scrappers.MedlineFetcher import MedlineFetcher

from time import sleep
import json
import datetime
from os import path
import threading

from gargantext.settings import MEDIA_ROOT, BASE_DIR

from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import RESOURCETYPES
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.tools import ensure_dir
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata

# to read the [scrappers] section of gargantext.ini:
#from configparser import ConfigParser

# --------------------------------------------------------------------
# importing constants from the config file
#CONF = ConfigParser()
#with open(path.join(BASE_DIR, 'gargantext.ini')) as inifile:
#    CONF.read_file(inifile)

QUERY_SIZE_N_MAX = 100  # int(CONF['scrappers']['QUERY_SIZE_N_MAX'])
# QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])
# --------------------------------------------------------------------
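# Expected shape of the ini section once the ConfigParser code above is
# enabled (a sketch; the DEFAULT value is an assumption, only the keys
# appear in the commented code):
#
#   [scrappers]
#   QUERY_SIZE_N_MAX     = 100
#   QUERY_SIZE_N_DEFAULT = 20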
def getGlobalStats(request):
    """
    Pubmed year-by-year results, e.g.:
      alist = [
        {'string': '2011[dp] serendipity', 'queryKey': '1',
         'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
        {'string': '2012[dp] serendipity', 'queryKey': '1',
         'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
        ...
      ]
    (reused as 'thequeries' in doTheQuery)
    """
    print(request.method)
    alist = []

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        if N > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
            print("ERROR (scrap: pubmed stats):", msg)
            raise ValueError(msg)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
        instancia = MedlineFetcher()
        # serialFetcher(n_last_years, query, query_size)
        alist = instancia.serialFetcher(5, query, N)

    data = alist
    return JsonHttpResponse(data)
def getGlobalStatsISTEXT(request):
    """
    ISTEX: simply the total number of hits for a query
    (not reused in testISTEX)
    """
    print(request.method)
    alist = ["bar", "foo"]

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
        query_string = query.replace(" ", "+")
        url = ("http://api.istex.fr/document/?q=" + query_string
               + "&output=id,title,abstract,pubdate,corpusName,authors,language")
        tasks = MedlineFetcher()
        filename = MEDIA_ROOT + '/corpora/%s/%s' \
                   % (request.user, str(datetime.datetime.now().isoformat()))
        try:
            thedata = tasks.test_downloadFile([url, filename])
            alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)
def doTheQuery(request, project_id):
    # implicit global session
    alist = []

    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
                .query(Node)
                .filter(Node.id == project_id)
                .filter(Node.typename == 'PROJECT')
              ).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    if request.method == "POST":
        queries = request.POST["query"]
        name = request.POST["string"]

        # here we just realize queries already prepared by getGlobalStats
        # ===> no need to repeat the N parameter as in testISTEX <===
        instancia = MedlineFetcher()
        thequeries = json.loads(queries)

        # fyi the sum of our prepared yearly proportional quotas
        sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
        print("Scraping Pubmed: '%s' (N=%i)" % (name, sampled_sum))

        urlreqs = []
        for yearquery in thequeries:
            urlreqs.append(instancia.medlineEfetchRAW(yearquery))
        alist = ["tudo fixe", "tudo bem"]

        resourcetype = RESOURCETYPES['name']['Pubmed (xml format)']

        # corpus node instantiation as a Django model
        corpus = Node(
            name=name,
            user_id=request.user.id,
            parent_id=project_id,
            typename='CORPUS',
            language_id=None,
            hyperdata={'Processing': "Parsing documents"},
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id

        # urlreqs: list of urls to query; then, for each url in urlreqs:
        #   eFetchResult = urlopen(url)
        #   eFetchResult.read()  # outputs the XML (normally written to an XML file)
        ensure_dir(request.user)
        tasks = MedlineFetcher()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            filename = MEDIA_ROOT + '/corpora/%s/%s' \
                       % (request.user, str(datetime.datetime.now().isoformat()))
            tasks.q.put([url, filename])  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the downloaded resource to the corpus
                # (method call on the corpus node, same pattern as in testISTEX)
                corpus.add_resource(
                    user_id=request.user.id,
                    type_id=resourcetype.id,
                    file=filename,
                )
                dwnldsOK += 1
        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/project/' + str(project_id))

    data = alist
    return JsonHttpResponse(data)
def testISTEX(request, project_id):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]
    query = "-"
    query_string = "-"
    N = 0

    # implicit global session

    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
                .query(Node)
                .filter(Node.id == project_id)
                .filter(Node.typename == 'PROJECT')
              ).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    if request.method == "POST":
        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url-encoded q
        if "N" in request.POST:
            N = int(request.POST["N"])  # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l):", msg)
                raise ValueError(msg)
        print("Scraping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = MedlineFetcher()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                           + "&output=*&" + "from=" + str(k[0])
                           + "&size=" + str(pagesize))

        resourcetype = RESOURCETYPES["name"]["ISTex"]

        # corpus node instantiation as a Django model
        corpus = Node(
            name=query,
            user_id=request.user.id,
            parent_id=project_id,
            typename='CORPUS',
            language_id=None,
            hyperdata={'Processing': "Parsing documents"},
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        print("NEW CORPUS", corpus_id)

        ensure_dir(request.user)
        tasks = MedlineFetcher()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            filename = MEDIA_ROOT + '/corpora/%s/%s' \
                       % (request.user, str(datetime.datetime.now().isoformat()))
            tasks.q.put([url, filename])  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the downloaded resource to the corpus
                corpus.add_resource(
                    user_id=request.user.id,
                    type_id=resourcetype.id,
                    file=filename,
                )
                dwnldsOK += 1
        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/project/' + str(project_id))

    data = [query_string, query, N]
    return JsonHttpResponse(data)
from django.conf.urls import url

import scrappers.pubmed as pubmed

# /!\ url patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$', pubmed.getGlobalStats)
              ]
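# Example request against this route (a sketch: host/port, the session
# cookie and CSRF handling are assumptions; the view reads the POST
# fields "query" and "N"):
#
#   curl -X POST \
#        -b "sessionid=<your-session>" \
#        -d "query=serendipity" -d "N=100" \
#        http://localhost:8000/scrappers/pubmed/query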
......@@ -3,15 +3,16 @@
{% block css %}
{% load staticfiles %}
<link rel="stylesheet" href="{% static "css/bootstrap.css" %}">
<script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script>
<script type="text/javascript" src="{% static "js/gargantext/garganrest.js" %}"></script>
<link rel="stylesheet" href="http://code.jquery.com/ui/1.11.2/themes/smoothness/jquery-ui.css">
<script type="text/javascript" src="{% static "js/morris.min.js" %}"></script>
<script type="text/javascript" src="{% static "js/morris.min.js" %}"></script>
<link rel="stylesheet" href="{% static "css/morris.css" %}">
<script src="{% static "js/raphael-min.js"%}"></script>
<script type="text/javascript" src="{% static "js/morris.min.js" %}"></script>
<script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script>
<script type="text/javascript" src="{% static "js/gargantext/garganrest.js" %}"></script>
<link rel="stylesheet" href="http://code.jquery.com/ui/1.11.2/themes/smoothness/jquery-ui.css">
<style type="text/css">
.ui-autocomplete {
z-index: 5000;
......@@ -203,7 +204,7 @@
<div id="pubmedcrawl" style="visibility: hidden;">
Do you have a file already? &nbsp;
<input type="radio" id="file_yes" name="file1" onclick="FileOrNotFile(this.value);" class="file1" value="true" checked>Yes </input>
<input type="radio" id="file_no" name="file1" onclick="FileOrNotFile(this.value);" class="file1" value="false">No </input>
<input type="radio" id="file_no" name="file1" onclick="FileOrNotFile(this.value);" class="file1" value="false">No </input>
</div>
</td>
</tr>
......@@ -329,10 +330,10 @@
var theType = $("#id_type option:selected").html();
if(theType=="Pubmed (xml format)") {
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/pubmedquery",
url: window.location.origin+"/scrappers/pubmed/query",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
......