Commit 9eead9fa authored by delanoe

[FEAT] Scrapper pubmed: ok

parent 88036658
@@ -29,7 +29,7 @@ import urllib.request
 def get(url):
     response = urllib.request.urlopen(url)
-    html = response.read()
+    return response.read()
 # retrieve GET parameters from a request
@@ -2,10 +2,15 @@
 # ***** Medline Fetcher *****
 # ****************************
-# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays
+# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
+# between 9 pm and 5 am Eastern Time weekdays
+from gargantext.util.files import download
 import sys
 if sys.version_info >= (3, 0): from urllib.request import urlopen
 else: from urllib import urlopen
 import os
 import time
 # import libxml2
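Note: the MEDLINE requirement quoted above (weekends, or 9 pm to 5 am Eastern Time on weekdays) is easy to enforce with a small guard before launching the fetch. A minimal sketch, not part of this commit; the helper name and the use of pytz are assumptions:

    # Hypothetical helper (not in this commit): is the MEDLINE run window open?
    import datetime
    import pytz  # assumed available; this repo does not import it itself

    def medline_window_open(now=None):
        et = pytz.timezone('US/Eastern')
        now = now or datetime.datetime.now(et)
        if now.weekday() >= 5:                 # Saturday/Sunday: always allowed
            return True
        return now.hour >= 21 or now.hour < 5  # weekdays: 9 pm - 5 am ET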
@@ -40,29 +45,41 @@ class MedlineFetcher:
         "Get number of results for query 'query' in variable 'count'"
         "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
         # print(query)
         origQuery = query
         query = query.replace(' ', '%20')
-        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' %(self.pubMedEutilsURL, self.pubMedDB, query)
+        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
+                  % ( self.pubMedEutilsURL, self.pubMedDB, query )
         try:
             eSearchResult = urlopen(eSearch)
             data = eSearchResult.read()
             root = etree.XML(data)
             findcount = etree.XPath("/eSearchResult/Count/text()")
             count = findcount(root)[0]
             findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
             queryKey = findquerykey(root)[0]
             findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
             webEnv = findwebenv(root)[0]
-        except:
-            count=0
-            queryKey=False
-            webEnv=False
-            origQuery=False
-        values = { "query":origQuery , "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
+        except Exception as Error:
+            print(Error)
+            count = 0
+            queryKey = False
+            webEnv = False
+            origQuery = False
+        values = { "query"    : origQuery
+                 , "count"    : int(count)
+                 , "queryKey" : queryKey
+                 , "webEnv"   : webEnv
+                 }
         return values
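Note: the ESearch call above can be reproduced standalone. With usehistory=y, NCBI returns Count, QueryKey and WebEnv; the latter two let subsequent EFetch calls page through the stored result set without resending the query. A minimal sketch using the same urllib/lxml machinery the module already imports (the host is NCBI's public E-utilities endpoint; the search term is arbitrary):

    from urllib.request import urlopen
    from lxml import etree

    url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
           '?db=pubmed&retmax=1&usehistory=y&term=microbiota%20gut')
    root = etree.XML(urlopen(url).read())
    count    = int(root.findtext('Count'))   # total hits for the query
    queryKey = root.findtext('QueryKey')     # handle to the stored result set
    webEnv   = root.findtext('WebEnv')       # session token reused by EFetch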
@@ -71,12 +88,11 @@ class MedlineFetcher:
     # maximum of 100,000 records
     def medlineEfetchRAW( self , fullquery):
-        query = fullquery["string"]
-        retmax = fullquery["retmax"]
-        count = fullquery["count"]
-        queryKey = fullquery["queryKey"]
-        webEnv = fullquery["webEnv"]
+        query    = fullquery [ "string"  ]
+        retmax   = fullquery [ "retmax"  ]
+        count    = fullquery [ "count"   ]
+        queryKey = fullquery [ "queryKey"]
+        webEnv   = fullquery [ "webEnv"  ]
         "Fetch medline result for query 'query', saving results to file every 'retmax' articles"
@@ -88,34 +104,15 @@ class MedlineFetcher:
         eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
         return eFetch
-    def ensure_dir(self , f):
-        d = os.path.dirname(f)
-        if not os.path.exists(d):
-            os.makedirs(d)
-    # generic!
-    def downloadFile(self, item):
-        url      = item[0]
-        filename = item[1]
-        # print("\tin test_downloadFile:")
-        # print(url,filename)
-        data = urlopen(url)
-        f = codecs.open(filename, "w" ,encoding='utf-8')
-        myfile = File(f)
-        myfile.write( data.read().decode('utf-8') )
-        myfile.close()
-        f.close()
+    def download(self, url):
+        print(url)
+        filename = download(url)
+        with self.lock:
+            print(threading.current_thread().name, filename+" OK")
+        return filename
-    # generic!
-    def test_downloadFile(self, item):
-        url      = item[0]
-        filename = item[1]
-        # print("\tin downloadFile:")
-        data = urlopen(url)
-        return data
     # generic!
     def do_work(self,item):
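Note: the eFetch URL built above pages through the stored result set by advancing retstart against a fixed WebEnv/query_key pair. An illustrative sketch of how the paging parameters evolve (all values are placeholders, not this class's state):

    # Illustrative paging over a 250-hit result set, 100 records per request:
    queryKey, webEnv = '1', 'NCID_example'   # placeholders; medlineEsearch supplies the real ones
    retmax = 100
    for retstart in range(0, 250, retmax):
        print('efetch.fcgi?...&retstart=%d&retmax=%d&query_key=%s&WebEnv=%s'
              % (retstart, retmax, queryKey, webEnv))
    # -> pages at retstart=0, 100, 200; the last request returns the final 50 records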
@@ -132,23 +129,24 @@ class MedlineFetcher:
             self.firstResults.append(self.do_work(item))
             self.q.task_done()
     def worker2(self):
         while True:
             item = self.q.get()
             results = []
-            try: result = self.downloadFile(item)
-            except: result = False
+            try:
+                result = self.download(item)
+            except Exception as error :
+                print(error)
+                result = False
             self.firstResults.append(result)
             self.q.task_done()
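Note: worker2 is the consumer half of the standard queue/daemon-thread pattern used throughout this commit: producers q.put() work items, each worker loops on q.get()/q.task_done(), and the producer blocks on q.join() until every item has been acknowledged. A self-contained sketch of the same pattern (names are generic, not this class's):

    import queue, threading

    q, results = queue.Queue(), []

    def worker():
        while True:
            item = q.get()
            results.append(item * 2)   # stand-in for self.download(item)
            q.task_done()

    for _ in range(8):
        threading.Thread(target=worker, daemon=True).start()
    for item in range(20):
        q.put(item)
    q.join()   # returns once every task_done() has been called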
     def chunks(self , l , n):
         print("chunks:")
         for i in range(0, len(l), n):
             yield l[i:i+n]
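Note: chunks slices a sequence into fixed-size pieces, yielding a shorter final piece when len(l) is not a multiple of n; range objects work too, which is what testISTEX passes in. A quick check with a standalone copy of the generator:

    def chunks(l, n):                         # standalone copy, for illustration
        for i in range(0, len(l), n):
            yield l[i:i+n]

    print(list(chunks([0, 1, 2, 3, 4, 5, 6], 3)))   # [[0, 1, 2], [3, 4, 5], [6]]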
     # GLOBALLIMIT:
     # I will retrieve this exact amount of publications.
-    # The publications per year i'll retrieve per year will be = (k/N)*GlobalLimit <- i'll use this as RETMAX
+    # The number of publications I'll retrieve per year will be:
+    #     (k/N) * GlobalLimit
+    #      \_ this is used as RETMAX
     # - k : number of publications in year x (according to PubMed)
     # - N : sum of every k over all years {X} (total number of publications according to PubMed)
     # - GlobalLimit : number of publications I want.
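Note: concretely, with GlobalLimit = 100 and a year holding k = 120 of N = 600 total publications, that year's RETMAX is round((120/600) * 100) = 20; the code further below also bumps a rounded-down 0 up to 1 so that no year is dropped entirely. A worked numeric check:

    k, N, globalLimit = 120, 600, 100
    retmax = int(round(globalLimit * (k / float(N))))   # -> 20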
@@ -183,14 +181,15 @@ class MedlineFetcher:
             Total += 1
             if globalresults["queryKey"]==False:
                 Fails += 1
-            if globalresults["count"]>0:
+            if globalresults["count"] > 0 :
                 N+=globalresults["count"]
-                queryhyperdata = {
-                    "string": globalresults["query"] ,
-                    "count": globalresults["count"] ,
-                    "queryKey":globalresults["queryKey"] ,
-                    "webEnv":globalresults["webEnv"] ,
-                    "retmax":0
+                queryhyperdata = { "string"   : globalresults["query"]
+                                 , "count"    : globalresults["count"]
+                                 , "queryKey" : globalresults["queryKey"]
+                                 , "webEnv"   : globalresults["webEnv"]
+                                 , "retmax"   : 0
                                  }
                 thequeries.append ( queryhyperdata )
@@ -203,10 +202,12 @@ class MedlineFetcher:
             proportion = k/float(N)
             retmax_forthisyear = int(round(globalLimit*proportion))
             query["retmax"] = retmax_forthisyear
-            if query["retmax"]==0: query["retmax"]+=1
+            if query["retmax"] == 0 : query["retmax"]+=1
             print(query["string"],"\t[",k,">",query["retmax"],"]")
-        if ((Fails+1)/(Total+1))==1 : # for identifying the epic fail or connection error
+        if ((Fails+1)/(Total+1)) == 1 : # identifies a total failure, e.g. a connection error
             thequeries = [False]
         return thequeries
def getGlobalStatsISTEXT(request ):
    """
    ISTEX: simply the total of hits for a query
    (not reused in testISTEX)
    """
    print(request.method)
    alist = ["bar","foo"]
    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
        query_string = query.replace(" ","+")
        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"
        tasks = MedlineFetcher()
        try:
            thedata_path = tasks.download( url )
            thedata = open(thedata_path, "rb")
            alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]
    data = alist
    return JsonHttpResponse(data)
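Note: the ISTEX endpoint queried here returns JSON, and the view simply forwards the raw body to the client. A minimal standalone sketch of the same call; the 'total' key is an assumption about the response shape, consistent with how the hit count is used elsewhere:

    import json
    from urllib.request import urlopen

    url = ('http://api.istex.fr/document/?q=microbiota'
           '&output=id,title,abstract,pubdate,corpusName,authors,language')
    reply = json.loads(urlopen(url).read().decode('utf-8'))
    print(reply.get('total'))   # assumed field carrying the total number of hits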
def testISTEX(request , project_id):
    print("testISTEX:")
    print(request.method)
    alist = ["bar","foo"]
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()
    if project is None:
        raise Http404()
    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()
    if request.method == "POST":
        query = "-"
        query_string = "-"
        N = 0
        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ","+") # url encoded q
        if "N" in request.POST:
            N = int(request.POST["N"]) # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ): ",msg)
                raise ValueError(msg)
        print("Scraping Istex: '%s' (%i)" % (query_string , N))
        urlreqs = []
        pagesize = 50
        tasks = MedlineFetcher()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0]+pagesize)>N: pagesize = N-k[0]
            urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
        resourcetype = RESOURCETYPES["name"]["ISTex"]
        # corpus node instantiation as a Django model
        corpus = Node(
            name = query,
            user_id = request.user.id,
            parent_id = project_id,
            typename = 'CORPUS',
            language_id = None,
            hyperdata = {'Processing' : "Parsing documents",}
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        print("NEW CORPUS", corpus_id)
        ensure_dir(request.user)
        tasks = MedlineFetcher()
        for i in range(8):
            t = threading.Thread(target=tasks.worker2) # thing to do
            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
            tasks.q.put( [url , filename]) # put a task in the queue
        tasks.q.join() # wait until everything is finished
        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename!=False:
                # add the uploaded resource to the corpus
                corpus.add_resource(corpus,
                    user_id = request.user.id,
                    type_id = resourcetype.id,
                    file = filename,
                )
                dwnldsOK+=1
        if dwnldsOK == 0: return JsonHttpResponse(["fail"])
        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/project/' + str(project_id))
    data = [query_string,query,N]
    return JsonHttpResponse(data)
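Note: the pagination loop above turns each chunk into a (from, size) pair for the ISTEX API, shrinking the last page so that from + size never overshoots N. For example, N = 120 with pagesize = 50 yields (0, 50), (50, 50), (100, 20):

    def chunks(l, n):                      # standalone copy of MedlineFetcher.chunks
        for i in range(0, len(l), n):
            yield l[i:i+n]

    N, pagesize = 120, 50
    for k in chunks(range(N), pagesize):
        if (k[0] + pagesize) > N:
            pagesize = N - k[0]
        print('from=%d&size=%d' % (k[0], pagesize))
    # -> from=0&size=50, from=50&size=50, from=100&size=20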
@@ -8,7 +8,7 @@ import json
 import datetime
 from os import path
 import threading
-from gargantext.settings import MEDIA_ROOT, BASE_DIR
+#from gargantext.settings import MEDIA_ROOT, BASE_DIR
 from django.shortcuts import redirect
 from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
@@ -18,7 +18,6 @@ from gargantext.models.nodes import Node
 from gargantext.util.db import session
 from gargantext.util.http import JsonHttpResponse
-from gargantext.util.tools import ensure_dir
 from gargantext.util.scheduling import scheduled
 from gargantext.util.toolchain import parse_extract_indexhyperdata
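Note: the next hunk sits under the module constants, where QUERY_SIZE_N_MAX = 1000 carries a commented-out int(CONF['scrappers']['QUERY_SIZE_N_MAX']), suggesting the limit was meant to come from a configparser-style CONF object. A hedged sketch of that intent; the config file name is an assumption, only the section and option names come from the comment:

    # Hypothetical config loading (not in this commit), mirroring the commented-out lines:
    import configparser
    CONF = configparser.ConfigParser()
    CONF.read('gargantext.ini')   # assumed config file name
    QUERY_SIZE_N_MAX = CONF.getint('scrappers', 'QUERY_SIZE_N_MAX', fallback=1000)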
@@ -37,7 +36,7 @@ QUERY_SIZE_N_MAX = 1000 # int(CONF['scrappers']['QUERY_SIZE_N_MAX'])
 # QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])
 # --------------------------------------------------------------------
-def getGlobalStats(request ):
+def getGlobalStats( request ):
"""
Pubmed year by year results
@@ -73,37 +72,8 @@ def getGlobalStats(request ):
     return JsonHttpResponse(data)
-def getGlobalStatsISTEXT(request ):
-    """
-    ISTEX simply the total of hits for a query
-    (not reused in testISTEX)
-    """
-    print(request.method)
-    alist = ["bar","foo"]
-    if request.method == "POST":
-        query = request.POST["query"]
-        N = int(request.POST["N"])
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
-        query_string = query.replace(" ","+")
-        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"
-        tasks = MedlineFetcher()
-        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-        try:
-            thedata = tasks.test_downloadFile( [url,filename] )
-            alist = thedata.read().decode('utf-8')
-        except Exception as error:
-            alist = [str(error)]
-    data = alist
-    return JsonHttpResponse(data)
-def doTheQuery(request , project_id):
+def doTheQuery( request , project_id ) :
     # implicit global session
     # do we have a valid project id?
     try:
@@ -111,8 +81,7 @@ def doTheQuery(request , project_id):
     except ValueError:
         raise Http404()
     # do we have a valid project?
-    project = (session
-        .query(Node)
+    project = (session.query( Node )
         .filter(Node.id == project_id)
         .filter(Node.typename == 'PROJECT')
     ).first()
@@ -147,7 +116,6 @@ def doTheQuery(request , project_id):
         urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
     alist = ["tudo fixe" , "tudo bem"]
-    resourcetype = RESOURCETYPES['name']['Pubmed (xml format)']
     # corpus node instantiation as a Django model
     corpus = Node(
@@ -155,8 +123,9 @@ def doTheQuery(request , project_id):
         user_id = request.user.id,
         parent_id = project_id,
         typename = 'CORPUS',
-        language_id = None,
-        hyperdata = {'Processing' : "Parsing documents",}
+        hyperdata = { "action"      : "Scraping data"
+                    , "language_id" : None
+                    }
     )
     session.add(corpus)
     session.commit()
@@ -177,22 +146,21 @@ def doTheQuery(request , project_id):
         t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
         t.start()
     for url in urlreqs:
-        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-        tasks.q.put( [url , filename]) #put a task in th queue
+        tasks.q.put( url ) #put a task in the queue
     tasks.q.join() # wait until everything is finished
     dwnldsOK = 0
-    for filename in tasks.firstResults:
-        if filename!=False:
+    for filename in tasks.firstResults :
+        print(filename)
+        if filename != False:
             # add the uploaded resource to the corpus
-            add_resource(corpus,
-                user_id = request.user.id,
-                type_id = resourcetype.id,
-                file = filename,
+            corpus.add_resource( type = 3
+                               , path = filename
             )
             dwnldsOK+=1
-    if dwnldsOK == 0: return JsonHttpResponse(["fail"])
+    if dwnldsOK == 0 :
+        return JsonHttpResponse(["fail"])
     try:
         scheduled(parse_extract_indexhyperdata(corpus_id,))
@@ -200,118 +168,10 @@ def doTheQuery(request , project_id):
         print('WORKFLOW ERROR')
         print(error)
         sleep(1)
-    return HttpResponseRedirect('/project/' + str(project_id))
+    return HttpResponseRedirect('/projects/' + str(project_id))
     data = alist
     return JsonHttpResponse(data)
-def testISTEX(request , project_id):
-    print("testISTEX:")
-    print(request.method)
-    alist = ["bar","foo"]
-    # implicit global session
-    # do we have a valid project id?
-    try:
-        project_id = int(project_id)
-    except ValueError:
-        raise Http404()
-    # do we have a valid project?
-    project = (session
-        .query(Node)
-        .filter(Node.id == project_id)
-        .filter(Node.typename == 'PROJECT')
-    ).first()
-    if project is None:
-        raise Http404()
-    # do we have a valid user?
-    user = request.user
-    if not user.is_authenticated():
-        return redirect('/auth/?next=%s' % request.path)
-    if project.user_id != user.id:
-        return HttpResponseForbidden()
-    if request.method == "POST":
-        query = "-"
-        query_string = "-"
-        N = 0
-        if "query" in request.POST:
-            query = request.POST["query"]
-            query_string = query.replace(" ","+") # url encoded q
-        if "N" in request.POST:
-            N = int(request.POST["N"]) # query_size from views_opti
-            if N > QUERY_SIZE_N_MAX:
-                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
-                print("ERROR (scrap: istex d/l ): ",msg)
-                raise ValueError(msg)
-        print("Scrapping Istex: '%s' (%i)" % (query_string , N))
-        urlreqs = []
-        pagesize = 50
-        tasks = MedlineFetcher()
-        chunks = list(tasks.chunks(range(N), pagesize))
-        for k in chunks:
-            if (k[0]+pagesize)>N: pagesize = N-k[0]
-            urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
-        resourcetype = RESOURCETYPES["name"]["ISTex"]
-        # corpus node instanciation as a Django model
-        corpus = Node(
-            name = query,
-            user_id = request.user.id,
-            parent_id = project_id,
-            typename = 'CORPUS',
-            language_id = None,
-            hyperdata = {'Processing' : "Parsing documents",}
-        )
-        session.add(corpus)
-        session.commit()
-        corpus_id = corpus.id
-        print("NEW CORPUS", corpus_id)
-        ensure_dir(request.user)
-        tasks = MedlineFetcher()
-        for i in range(8):
-            t = threading.Thread(target=tasks.worker2) #thing to do
-            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
-            t.start()
-        for url in urlreqs:
-            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-            tasks.q.put( [url , filename]) #put a task in th queue
-        tasks.q.join() # wait until everything is finished
-        dwnldsOK = 0
-        for filename in tasks.firstResults:
-            if filename!=False:
-                # add the uploaded resource to the corpus
-                corpus.add_resource(corpus,
-                    user_id = request.user.id,
-                    type_id = resourcetype.id,
-                    file = filename,
-                )
-                dwnldsOK+=1
-        if dwnldsOK == 0: return JsonHttpResponse(["fail"])
-        ###########################
-        ###########################
-        try:
-            scheduled(parse_extract_indexhyperdata(corpus_id,))
-        except Exception as error:
-            print('WORKFLOW ERROR')
-            print(error)
-        sleep(1)
-        return HttpResponseRedirect('/project/' + str(project_id))
-    data = [query_string,query,N]
-    return JsonHttpResponse(data)
@@ -10,7 +10,11 @@ import scrapers.pubmed as pubmed
 # Available databases : Pubmed, IsTex, (next: CERN)
 # /!\ urls patterns here are *without* the trailing slash
-urlpatterns = [ url(r'^pubmed/query$', pubmed.getGlobalStats)
+urlpatterns = [ url(r'^pubmed/query$' , pubmed.getGlobalStats )
+              , url(r'^pubmed/search/(\d+)' , pubmed.doTheQuery )
+              # , url(r'^istex/query$' , pubmed.getGlobalStatsISTEXT )
+              # , url(r'^istex/search/(\d+)' , pubmed.testISTEX )
+              #, url(r'^scraping$' , scraping.Target.as_view() )
+              ,
]
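Note: with these patterns, a request such as POST /scrapers/pubmed/search/123 matches r'^pubmed/search/(\d+)' and Django calls the view with the capture group as a positional argument, which is why the views begin with int(project_id). A resolution sketch; the '/scrapers/' prefix is inferred from the JavaScript change below:

    # What the resolver effectively does for POST /scrapers/pubmed/search/123:
    pubmed.doTheQuery(request, '123')   # regex capture groups arrive as strings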
@@ -260,7 +260,7 @@
             $.ajax({
                 // contentType: "application/json",
-                url: window.location.origin+"/tests/project/"+projectid+"/pubmedquery/go",
+                url: window.location.origin+"/scrapers/pubmed/search/"+projectid,
                 data: pubmedifiedQuery,
                 type: 'POST',
                 beforeSend: function(xhr) {