Commit 9eead9fa authored by delanoe

[FEAT] Scrapper pubmed: ok

parent 88036658
@@ -29,7 +29,7 @@ import urllib.request
 def get(url):
     response = urllib.request.urlopen(url)
-    html = response.read()
+    return response.read()
 # retrieve GET parameters from a request
@@ -2,10 +2,15 @@
 # ***** Medline Fetcher *****
 # ****************************
-# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays
+# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
+# between 9 pm and 5 am Eastern Time weekdays
+from gargantext.util.files import download
 import sys
 if sys.version_info >= (3, 0): from urllib.request import urlopen
 else: from urllib import urlopen
 import os
 import time
 # import libxml2
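Note: the MEDLINE requirement quoted above (weekends, or 9 pm to 5 am Eastern Time on weekdays) is easy to enforce with a small guard before launching the fetch. A minimal sketch, not part of this commit; the helper name and the use of pytz are assumptions:

    # Hypothetical helper (not in this commit): is the MEDLINE run window open?
    import datetime
    import pytz  # assumed available; this repo does not import it itself

    def medline_window_open(now=None):
        et = pytz.timezone('US/Eastern')
        now = now or datetime.datetime.now(et)
        if now.weekday() >= 5:                 # Saturday/Sunday: always allowed
            return True
        return now.hour >= 21 or now.hour < 5  # weekdays: 9 pm - 5 am ET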
@@ -40,29 +45,41 @@ class MedlineFetcher:
         "Get number of results for query 'query' in variable 'count'"
         "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
         # print(query)
         origQuery = query
         query = query.replace(' ', '%20')
-        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' %(self.pubMedEutilsURL, self.pubMedDB, query)
+        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
+                  % ( self.pubMedEutilsURL, self.pubMedDB, query )
         try:
             eSearchResult = urlopen(eSearch)
             data = eSearchResult.read()
             root = etree.XML(data)
             findcount = etree.XPath("/eSearchResult/Count/text()")
             count = findcount(root)[0]
             findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
             queryKey = findquerykey(root)[0]
             findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
             webEnv = findwebenv(root)[0]
-        except:
-            count=0
-            queryKey=False
-            webEnv=False
-            origQuery=False
-        values = { "query":origQuery , "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
+        except Exception as Error:
+            print(Error)
+            count = 0
+            queryKey = False
+            webEnv = False
+            origQuery = False
+        values = { "query"    : origQuery
+                 , "count"    : int(count)
+                 , "queryKey" : queryKey
+                 , "webEnv"   : webEnv
+                 }
         return values
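Note: the ESearch call above can be reproduced standalone. With usehistory=y, NCBI returns Count, QueryKey and WebEnv; the latter two let subsequent EFetch calls page through the stored result set without resending the query. A minimal sketch using the same urllib/lxml machinery the module already imports (the host is NCBI's public E-utilities endpoint; the search term is arbitrary):

    from urllib.request import urlopen
    from lxml import etree

    url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
           '?db=pubmed&retmax=1&usehistory=y&term=microbiota%20gut')
    root = etree.XML(urlopen(url).read())
    count    = int(root.findtext('Count'))   # total hits for the query
    queryKey = root.findtext('QueryKey')     # handle to the stored result set
    webEnv   = root.findtext('WebEnv')       # session token reused by EFetch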
@@ -71,12 +88,11 @@ class MedlineFetcher:
     # maximum of 100,000 records
     def medlineEfetchRAW( self , fullquery):
-        query = fullquery["string"]
-        retmax = fullquery["retmax"]
-        count = fullquery["count"]
-        queryKey = fullquery["queryKey"]
-        webEnv = fullquery["webEnv"]
+        query    = fullquery [ "string"  ]
+        retmax   = fullquery [ "retmax"  ]
+        count    = fullquery [ "count"   ]
+        queryKey = fullquery [ "queryKey"]
+        webEnv   = fullquery [ "webEnv"  ]
         "Fetch medline result for query 'query', saving results to file every 'retmax' articles"
@@ -88,34 +104,15 @@ class MedlineFetcher:
         eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
         return eFetch
-    def ensure_dir(self , f):
-        d = os.path.dirname(f)
-        if not os.path.exists(d):
-            os.makedirs(d)
-    # generic!
-    def downloadFile(self, item):
-        url      = item[0]
-        filename = item[1]
-        # print("\tin test_downloadFile:")
-        # print(url,filename)
-        data = urlopen(url)
-        f = codecs.open(filename, "w" ,encoding='utf-8')
-        myfile = File(f)
-        myfile.write( data.read().decode('utf-8') )
-        myfile.close()
-        f.close()
+    def download(self, url):
+        print(url)
+        filename = download(url)
+        with self.lock:
+            print(threading.current_thread().name, filename+" OK")
+        return filename
-    # generic!
-    def test_downloadFile(self, item):
-        url      = item[0]
-        filename = item[1]
-        # print("\tin downloadFile:")
-        data = urlopen(url)
-        return data
     # generic!
     def do_work(self,item):
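Note: the eFetch URL built above pages through the stored result set by advancing retstart against a fixed WebEnv/query_key pair. An illustrative sketch of how the paging parameters evolve (all values are placeholders, not this class's state):

    # Illustrative paging over a 250-hit result set, 100 records per request:
    queryKey, webEnv = '1', 'NCID_example'   # placeholders; medlineEsearch supplies the real ones
    retmax = 100
    for retstart in range(0, 250, retmax):
        print('efetch.fcgi?...&retstart=%d&retmax=%d&query_key=%s&WebEnv=%s'
              % (retstart, retmax, queryKey, webEnv))
    # -> pages at retstart=0, 100, 200; the last request returns the final 50 records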
@@ -132,23 +129,24 @@ class MedlineFetcher:
             self.firstResults.append(self.do_work(item))
             self.q.task_done()
     def worker2(self):
         while True:
             item = self.q.get()
             results = []
-            try: result = self.downloadFile(item)
-            except: result = False
+            try:
+                result = self.download(item)
+            except Exception as error :
+                print(error)
+                result = False
             self.firstResults.append(result)
             self.q.task_done()
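Note: worker2 is the consumer half of the standard queue/daemon-thread pattern used throughout this commit: producers q.put() work items, each worker loops on q.get()/q.task_done(), and the producer blocks on q.join() until every item has been acknowledged. A self-contained sketch of the same pattern (names are generic, not this class's):

    import queue, threading

    q, results = queue.Queue(), []

    def worker():
        while True:
            item = q.get()
            results.append(item * 2)   # stand-in for self.download(item)
            q.task_done()

    for _ in range(8):
        threading.Thread(target=worker, daemon=True).start()
    for item in range(20):
        q.put(item)
    q.join()   # returns once every task_done() has been called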
     def chunks(self , l , n):
         print("chunks:")
         for i in range(0, len(l), n):
             yield l[i:i+n]
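Note: chunks slices a sequence into fixed-size pieces, yielding a shorter final piece when len(l) is not a multiple of n; range objects work too, which is what testISTEX passes in. A quick check with a standalone copy of the generator:

    def chunks(l, n):                         # standalone copy, for illustration
        for i in range(0, len(l), n):
            yield l[i:i+n]

    print(list(chunks([0, 1, 2, 3, 4, 5, 6], 3)))   # [[0, 1, 2], [3, 4, 5], [6]]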
     # GLOBALLIMIT:
     # I will retrieve this exact amount of publications.
-    # The publications per year i'll retrieve per year will be = (k/N)*GlobalLimit <- i'll use this as RETMAX
+    # The number of publications I'll retrieve per year will be:
+    #     (k/N) * GlobalLimit
+    #      \_ this is used as RETMAX
     # - k : number of publications in year x (according to PubMed)
     # - N : sum of every k over all years {X} (total number of publications according to PubMed)
     # - GlobalLimit : number of publications I want.
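Note: concretely, with GlobalLimit = 100 and a year holding k = 120 of N = 600 total publications, that year's RETMAX is round((120/600) * 100) = 20; the code further below also bumps a rounded-down 0 up to 1 so that no year is dropped entirely. A worked numeric check:

    k, N, globalLimit = 120, 600, 100
    retmax = int(round(globalLimit * (k / float(N))))   # -> 20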
@@ -183,14 +181,15 @@ class MedlineFetcher:
             Total += 1
             if globalresults["queryKey"]==False:
                 Fails += 1
-            if globalresults["count"]>0:
+            if globalresults["count"] > 0 :
                 N+=globalresults["count"]
-                queryhyperdata = {
-                    "string": globalresults["query"] ,
-                    "count": globalresults["count"] ,
-                    "queryKey":globalresults["queryKey"] ,
-                    "webEnv":globalresults["webEnv"] ,
-                    "retmax":0
+                queryhyperdata = { "string"   : globalresults["query"]
+                                 , "count"    : globalresults["count"]
+                                 , "queryKey" : globalresults["queryKey"]
+                                 , "webEnv"   : globalresults["webEnv"]
+                                 , "retmax"   : 0
                                  }
                 thequeries.append ( queryhyperdata )
@@ -203,10 +202,12 @@ class MedlineFetcher:
             proportion = k/float(N)
             retmax_forthisyear = int(round(globalLimit*proportion))
             query["retmax"] = retmax_forthisyear
-            if query["retmax"]==0: query["retmax"]+=1
+            if query["retmax"] == 0 : query["retmax"]+=1
             print(query["string"],"\t[",k,">",query["retmax"],"]")
-        if ((Fails+1)/(Total+1))==1 : # for identifying the epic fail or connection error
+        if ((Fails+1)/(Total+1)) == 1 : # identifies a total failure, e.g. a connection error
             thequeries = [False]
         return thequeries
def getGlobalStatsISTEXT(request ):
    """
    ISTEX: simply the total of hits for a query
    (not reused in testISTEX)
    """
    print(request.method)
    alist = ["bar","foo"]
    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
        query_string = query.replace(" ","+")
        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"
        tasks = MedlineFetcher()
        try:
            thedata_path = tasks.download( url )
            thedata = open(thedata_path, "rb")
            alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]
    data = alist
    return JsonHttpResponse(data)
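Note: the ISTEX endpoint queried here returns JSON, and the view simply forwards the raw body to the client. A minimal standalone sketch of the same call; the 'total' key is an assumption about the response shape, consistent with how the hit count is used elsewhere:

    import json
    from urllib.request import urlopen

    url = ('http://api.istex.fr/document/?q=microbiota'
           '&output=id,title,abstract,pubdate,corpusName,authors,language')
    reply = json.loads(urlopen(url).read().decode('utf-8'))
    print(reply.get('total'))   # assumed field carrying the total number of hits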
def testISTEX(request , project_id):
    print("testISTEX:")
    print(request.method)
    alist = ["bar","foo"]
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()
    if project is None:
        raise Http404()
    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()
    if request.method == "POST":
        query = "-"
        query_string = "-"
        N = 0
        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ","+") # url encoded q
        if "N" in request.POST:
            N = int(request.POST["N"]) # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ): ",msg)
                raise ValueError(msg)
        print("Scraping Istex: '%s' (%i)" % (query_string , N))
        urlreqs = []
        pagesize = 50
        tasks = MedlineFetcher()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0]+pagesize)>N: pagesize = N-k[0]
            urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
        resourcetype = RESOURCETYPES["name"]["ISTex"]
        # corpus node instantiation as a Django model
        corpus = Node(
            name = query,
            user_id = request.user.id,
            parent_id = project_id,
            typename = 'CORPUS',
            language_id = None,
            hyperdata = {'Processing' : "Parsing documents",}
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        print("NEW CORPUS", corpus_id)
        ensure_dir(request.user)
        tasks = MedlineFetcher()
        for i in range(8):
            t = threading.Thread(target=tasks.worker2) # thing to do
            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
            tasks.q.put( [url , filename]) # put a task in the queue
        tasks.q.join() # wait until everything is finished
        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename!=False:
                # add the uploaded resource to the corpus
                corpus.add_resource(corpus,
                    user_id = request.user.id,
                    type_id = resourcetype.id,
                    file = filename,
                )
                dwnldsOK+=1
        if dwnldsOK == 0: return JsonHttpResponse(["fail"])
        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/project/' + str(project_id))
    data = [query_string,query,N]
    return JsonHttpResponse(data)
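Note: the pagination loop above turns each chunk into a (from, size) pair for the ISTEX API, shrinking the last page so that from + size never overshoots N. For example, N = 120 with pagesize = 50 yields (0, 50), (50, 50), (100, 20):

    def chunks(l, n):                      # standalone copy of MedlineFetcher.chunks
        for i in range(0, len(l), n):
            yield l[i:i+n]

    N, pagesize = 120, 50
    for k in chunks(range(N), pagesize):
        if (k[0] + pagesize) > N:
            pagesize = N - k[0]
        print('from=%d&size=%d' % (k[0], pagesize))
    # -> from=0&size=50, from=50&size=50, from=100&size=20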
@@ -8,7 +8,7 @@ import json
 import datetime
 from os import path
 import threading
-from gargantext.settings import MEDIA_ROOT, BASE_DIR
+#from gargantext.settings import MEDIA_ROOT, BASE_DIR
 from django.shortcuts import redirect
 from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
@@ -18,7 +18,6 @@ from gargantext.models.nodes import Node
 from gargantext.util.db import session
 from gargantext.util.http import JsonHttpResponse
-from gargantext.util.tools import ensure_dir
 from gargantext.util.scheduling import scheduled
 from gargantext.util.toolchain import parse_extract_indexhyperdata
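Note: the next hunk sits under the module constants, where QUERY_SIZE_N_MAX = 1000 carries a commented-out int(CONF['scrappers']['QUERY_SIZE_N_MAX']), suggesting the limit was meant to come from a configparser-style CONF object. A hedged sketch of that intent; the config file name is an assumption, only the section and option names come from the comment:

    # Hypothetical config loading (not in this commit), mirroring the commented-out lines:
    import configparser
    CONF = configparser.ConfigParser()
    CONF.read('gargantext.ini')   # assumed config file name
    QUERY_SIZE_N_MAX = CONF.getint('scrappers', 'QUERY_SIZE_N_MAX', fallback=1000)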
@@ -37,7 +36,7 @@ QUERY_SIZE_N_MAX = 1000 # int(CONF['scrappers']['QUERY_SIZE_N_MAX'])
 # QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])
 # --------------------------------------------------------------------
-def getGlobalStats(request ):
+def getGlobalStats( request ):
"""
Pubmed year by year results
@@ -73,37 +72,8 @@ def getGlobalStats(request ):
     return JsonHttpResponse(data)
-def getGlobalStatsISTEXT(request ):
-    """
-    ISTEX simply the total of hits for a query
-    (not reused in testISTEX)
-    """
-    print(request.method)
-    alist = ["bar","foo"]
-    if request.method == "POST":
-        query = request.POST["query"]
-        N = int(request.POST["N"])
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
-        query_string = query.replace(" ","+")
-        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"
-        tasks = MedlineFetcher()
-        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-        try:
-            thedata = tasks.test_downloadFile( [url,filename] )
-            alist = thedata.read().decode('utf-8')
-        except Exception as error:
-            alist = [str(error)]
-    data = alist
-    return JsonHttpResponse(data)
-def doTheQuery(request , project_id):
+def doTheQuery( request , project_id ) :
     # implicit global session
     # do we have a valid project id?
     try:
@@ -111,8 +81,7 @@ def doTheQuery(request , project_id):
     except ValueError:
         raise Http404()
     # do we have a valid project?
-    project = (session
-        .query(Node)
+    project = (session.query( Node )
         .filter(Node.id == project_id)
         .filter(Node.typename == 'PROJECT')
     ).first()
@@ -147,7 +116,6 @@ def doTheQuery(request , project_id):
         urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
     alist = ["tudo fixe" , "tudo bem"]
-    resourcetype = RESOURCETYPES['name']['Pubmed (xml format)']
     # corpus node instantiation as a Django model
     corpus = Node(
@@ -155,8 +123,9 @@ def doTheQuery(request , project_id):
         user_id = request.user.id,
         parent_id = project_id,
         typename = 'CORPUS',
-        language_id = None,
-        hyperdata = {'Processing' : "Parsing documents",}
+        hyperdata = { "action"      : "Scraping data"
+                    , "language_id" : None
+                    }
     )
     session.add(corpus)
     session.commit()
@@ -177,22 +146,21 @@ def doTheQuery(request , project_id):
         t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
         t.start()
     for url in urlreqs:
-        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-        tasks.q.put( [url , filename]) #put a task in th queue
+        tasks.q.put( url ) #put a task in the queue
     tasks.q.join() # wait until everything is finished
     dwnldsOK = 0
-    for filename in tasks.firstResults:
-        if filename!=False:
+    for filename in tasks.firstResults :
+        print(filename)
+        if filename != False:
             # add the uploaded resource to the corpus
-            add_resource(corpus,
-                user_id = request.user.id,
-                type_id = resourcetype.id,
-                file = filename,
+            corpus.add_resource( type = 3
+                               , path = filename
             )
             dwnldsOK+=1
-    if dwnldsOK == 0: return JsonHttpResponse(["fail"])
+    if dwnldsOK == 0 :
+        return JsonHttpResponse(["fail"])
     try:
         scheduled(parse_extract_indexhyperdata(corpus_id,))
@@ -200,118 +168,10 @@ def doTheQuery(request , project_id):
         print('WORKFLOW ERROR')
         print(error)
         sleep(1)
-    return HttpResponseRedirect('/project/' + str(project_id))
+    return HttpResponseRedirect('/projects/' + str(project_id))
     data = alist
     return JsonHttpResponse(data)
-def testISTEX(request , project_id):
-    print("testISTEX:")
-    print(request.method)
-    alist = ["bar","foo"]
-    # implicit global session
-    # do we have a valid project id?
-    try:
-        project_id = int(project_id)
-    except ValueError:
-        raise Http404()
-    # do we have a valid project?
-    project = (session
-        .query(Node)
-        .filter(Node.id == project_id)
-        .filter(Node.typename == 'PROJECT')
-    ).first()
-    if project is None:
-        raise Http404()
-    # do we have a valid user?
-    user = request.user
-    if not user.is_authenticated():
-        return redirect('/auth/?next=%s' % request.path)
-    if project.user_id != user.id:
-        return HttpResponseForbidden()
-    if request.method == "POST":
-        query = "-"
-        query_string = "-"
-        N = 0
-        if "query" in request.POST:
-            query = request.POST["query"]
-            query_string = query.replace(" ","+") # url encoded q
-        if "N" in request.POST:
-            N = int(request.POST["N"]) # query_size from views_opti
-            if N > QUERY_SIZE_N_MAX:
-                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
-                print("ERROR (scrap: istex d/l ): ",msg)
-                raise ValueError(msg)
-        print("Scrapping Istex: '%s' (%i)" % (query_string , N))
-        urlreqs = []
-        pagesize = 50
-        tasks = MedlineFetcher()
-        chunks = list(tasks.chunks(range(N), pagesize))
-        for k in chunks:
-            if (k[0]+pagesize)>N: pagesize = N-k[0]
-            urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
-        resourcetype = RESOURCETYPES["name"]["ISTex"]
-        # corpus node instanciation as a Django model
-        corpus = Node(
-            name = query,
-            user_id = request.user.id,
-            parent_id = project_id,
-            typename = 'CORPUS',
-            language_id = None,
-            hyperdata = {'Processing' : "Parsing documents",}
-        )
-        session.add(corpus)
-        session.commit()
-        corpus_id = corpus.id
-        print("NEW CORPUS", corpus_id)
-        ensure_dir(request.user)
-        tasks = MedlineFetcher()
-        for i in range(8):
-            t = threading.Thread(target=tasks.worker2) #thing to do
-            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
-            t.start()
-        for url in urlreqs:
-            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-            tasks.q.put( [url , filename]) #put a task in th queue
-        tasks.q.join() # wait until everything is finished
-        dwnldsOK = 0
-        for filename in tasks.firstResults:
-            if filename!=False:
-                # add the uploaded resource to the corpus
-                corpus.add_resource(corpus,
-                    user_id = request.user.id,
-                    type_id = resourcetype.id,
-                    file = filename,
-                )
-                dwnldsOK+=1
-        if dwnldsOK == 0: return JsonHttpResponse(["fail"])
-        ###########################
-        ###########################
-        try:
-            scheduled(parse_extract_indexhyperdata(corpus_id,))
-        except Exception as error:
-            print('WORKFLOW ERROR')
-            print(error)
-        sleep(1)
-        return HttpResponseRedirect('/project/' + str(project_id))
-    data = [query_string,query,N]
-    return JsonHttpResponse(data)
@@ -10,7 +10,11 @@ import scrapers.pubmed as pubmed
 # Available databases : Pubmed, IsTex, (next: CERN)
 # /!\ urls patterns here are *without* the trailing slash
-urlpatterns = [ url(r'^pubmed/query$', pubmed.getGlobalStats)
+urlpatterns = [ url(r'^pubmed/query$' , pubmed.getGlobalStats )
+              , url(r'^pubmed/search/(\d+)' , pubmed.doTheQuery )
+              # , url(r'^istex/query$' , pubmed.getGlobalStatsISTEXT )
+              # , url(r'^istex/search/(\d+)' , pubmed.testISTEX )
+              #, url(r'^scraping$' , scraping.Target.as_view() )
+              ,
]
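Note: with these patterns, a request such as POST /scrapers/pubmed/search/123 matches r'^pubmed/search/(\d+)' and Django calls the view with the capture group as a positional argument, which is why the views begin with int(project_id). A resolution sketch; the '/scrapers/' prefix is inferred from the JavaScript change below:

    # What the resolver effectively does for POST /scrapers/pubmed/search/123:
    pubmed.doTheQuery(request, '123')   # regex capture groups arrive as strings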
@@ -260,7 +260,7 @@
             $.ajax({
                 // contentType: "application/json",
-                url: window.location.origin+"/tests/project/"+projectid+"/pubmedquery/go",
+                url: window.location.origin+"/scrapers/pubmed/search/"+projectid,
                 data: pubmedifiedQuery,
                 type: 'POST',
                 beforeSend: function(xhr) {