Commit d81d7759 authored by delanoe

[FEAT] Scrapper pubmed: ok

parent 54636791
...
@@ -47,59 +47,59 @@ def convert_to_date(date):
    return dateutil.parser.parse(date)

INDEXED_HYPERDATA = {
    # TODO use properties during toolchain.hyperdata_indexing
    # (type, convert_to_db, convert_from_db)
    'count':
        { 'id'             : 1
        , 'type'           : int
        , 'convert_to_db'  : int
        , 'convert_from_db': int
        },
    'publication_date':
        { 'id'             : 2
        , 'type'           : datetime.datetime
        , 'convert_to_db'  : convert_to_date
        , 'convert_from_db': datetime.datetime.fromtimestamp
        },
    'title':
        { 'id'             : 3
        , 'type'           : str
        , 'convert_to_db'  : str
        , 'convert_from_db': str
        },
    'authors':
        { 'id'             : 4
        , 'type'           : str
        , 'convert_to_db'  : str
        , 'convert_from_db': str
        },
    'journal':
        { 'id'             : 5
        , 'type'           : str
        , 'convert_to_db'  : str
        , 'convert_from_db': str
        },
    'abstract':
        { 'id'             : 6
        , 'type'           : str
        , 'convert_to_db'  : str
        , 'convert_from_db': str
        },
    'text':
        { 'id'             : 7
        , 'type'           : str
        , 'convert_to_db'  : str
        , 'convert_from_db': str
        },
    'page':
        { 'id'             : 8
        , 'type'           : int
...
from gargantext.constants import *
from gargantext.util.digest import str_digest
from gargantext.util import http

def save(contents, name='', basedir=''):
...
...
@@ -29,7 +29,7 @@ import urllib.request
def get(url):
    response = urllib.request.urlopen(url)
-   html = response.read()
+   return response.read()

# retrieve GET parameters from a request
...
...
@@ -94,7 +94,7 @@ def project(request, project_id):
    )
    session.add(corpus)
    session.commit()

    # parse_extract: fileparsing -> ngram extraction -> lists
    scheduled(parse_extract_indexhyperdata)(corpus.id)
...
...
@@ -2,10 +2,15 @@
# ***** Medline Fetcher *****
# ****************************
-# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays
+# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
+#                            between 9 pm and 5 am Eastern Time weekdays

+from gargantext.util.files import download

import sys
if sys.version_info >= (3, 0): from urllib.request import urlopen
else: from urllib import urlopen
import os
import time
# import libxml2
...
@@ -21,48 +26,60 @@ from queue import Queue

class MedlineFetcher:

    def __init__(self):
        self.queue_size = 8
        self.q = Queue()
        self.firstResults = []
        self.lock = threading.Lock()  # lock to serialize console output
        self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
        self.pubMedDB = 'Pubmed'
        self.reportType = 'medline'

    # Return the globalResults!:
    # - count =
    # - queryKey =
    # - webEnv =
    def medlineEsearch(self , query):
        # print ("MedlineFetcher::medlineEsearch :")
        "Get number of results for query 'query' in variable 'count'"
        "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
        # print(query)
        origQuery = query
        query = query.replace(' ', '%20')

-       eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' %(self.pubMedEutilsURL, self.pubMedDB, query)
+       eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
+                 % ( self.pubMedEutilsURL, self.pubMedDB, query )

        try:
            eSearchResult = urlopen(eSearch)

            data = eSearchResult.read()
            root = etree.XML(data)

            findcount = etree.XPath("/eSearchResult/Count/text()")
            count = findcount(root)[0]

            findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
            queryKey = findquerykey(root)[0]

            findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
            webEnv = findwebenv(root)[0]

-       except:
+       except Exception as Error:
+           print(Error)
            count     = 0
            queryKey  = False
            webEnv    = False
            origQuery = False

-       values = { "query":origQuery , "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
+       values = { "query"    : origQuery
+                , "count"    : int(count)
+                , "queryKey" : queryKey
+                , "webEnv"   : webEnv
+                }

        return values
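For illustration, a minimal standalone sketch of the round trip medlineEsearch performs, using a hypothetical search term and assuming that etree here is lxml.etree (as the etree.XPath usage above suggests). Count, QueryKey and WebEnv are the fields NCBI's esearch returns when usehistory=y:

# Sketch only: hypothetical term; same E-utilities endpoint and fields as the method above.
from urllib.request import urlopen
from lxml import etree

term    = 'brain%20imaging%202014[dp]'          # hypothetical, already URL-encoded
eSearch = ('http://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
           '?db=Pubmed&retmax=1&usehistory=y&term=' + term)

root     = etree.XML(urlopen(eSearch).read())
count    = root.xpath('/eSearchResult/Count/text()')[0]     # e.g. "1234"
queryKey = root.xpath('/eSearchResult/QueryKey/text()')[0]  # history key reused by efetch
webEnv   = root.xpath('/eSearchResult/WebEnv/text()')[0]    # server-side session token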
...
@@ -70,52 +87,32 @@ class MedlineFetcher:
    # Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
    # maximum of 100,000 records
    def medlineEfetchRAW( self , fullquery):

        query    = fullquery [ "string"   ]
        retmax   = fullquery [ "retmax"   ]
        count    = fullquery [ "count"    ]
        queryKey = fullquery [ "queryKey" ]
        webEnv   = fullquery [ "webEnv"   ]

        "Fetch medline result for query 'query', saving results to file every 'retmax' articles"

        queryNoSpace = query.replace(' ', '')  # No space in directory and file names, avoids stupid errors
        # print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')

        retstart = 0
        eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)

        return eFetch
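A sketch of the URL medlineEfetchRAW builds from those values (the queryKey, webEnv and retmax below are hypothetical placeholders; the real ones come from medlineEsearch):

# Sketch only: hypothetical history token and sample size.
fullquery = { "string"   : "brain imaging 2014[dp]"
            , "count"    : 1234
            , "queryKey" : "1"
            , "webEnv"   : "NCID_1_example"
            , "retmax"   : 100
            }

# medlineEfetchRAW(fullquery) then returns a URL of this shape:
# http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?email=youremail@example.org
#     &rettype=medline&retmode=xml&retstart=0&retmax=100
#     &db=Pubmed&query_key=1&WebEnv=NCID_1_example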
-   def ensure_dir(self , f):
-       d = os.path.dirname(f)
-       if not os.path.exists(d):
-           os.makedirs(d)

    # generic!
-   def downloadFile(self, item):
-       url = item[0]
-       filename = item[1]
-       # print("\tin test_downloadFile:")
-       # print(url,filename)
-       data = urlopen(url)
-       f = codecs.open(filename, "w" ,encoding='utf-8')
-       myfile = File(f)
-       myfile.write( data.read().decode('utf-8') )
-       myfile.close()
-       f.close()
+   def download(self, url):
+       print(url)
+       filename = download(url)
        with self.lock:
            print(threading.current_thread().name, filename+" OK")
        return filename

-   # generic!
-   def test_downloadFile(self, item):
-       url = item[0]
-       filename = item[1]
-       # print("\tin downloadFile:")
-       data = urlopen(url)
-       return data

    # generic!
    def do_work(self,item):
...
@@ -132,23 +129,24 @@ class MedlineFetcher:
            self.firstResults.append(self.do_work(item))
            self.q.task_done()

    def worker2(self):
        while True:
            item = self.q.get()
            results = []
-           try: result = self.downloadFile(item)
-           except: result = False
+           try:
+               result = self.download(item)
+           except Exception as error :
+               print(error)
+               result = False
            self.firstResults.append(result)
            self.q.task_done()

    def chunks(self , l , n):
        print("chunks:")
        for i in range(0, len(l), n):
            yield l[i:i+n]
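A quick illustration of what the chunks helper yields (hypothetical N), the same slicing that testISTEX further down uses to page through ISTEX results 50 documents at a time:

# Sketch only: same slicing logic as the chunks method above, minus the debug print.
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i+n]

list(chunks(range(120), 50))
# -> [range(0, 50), range(50, 100), range(100, 120)]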
    # GLOBALLIMIT:
    # I will retrieve this exact amount of publications.
-   # The publications per year i'll retrieve per year will be = (k/N)*GlobalLimit <- i'll use this as RETMAX
+   # The publications per year i'll retrieve per year will be :
+   #    (k/N)*GlobalLimit
+   #        \_ this is used as RETMAX
    # - k : Number of publications of x year (according to pubmed)
    # - N : Sum of every k belonging to {X} (total number of pubs according to pubmed)
    # - GlobalLimit : Number of publications i want.
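A worked example of that allocation with hypothetical per-year counts: GlobalLimit = 100 and yearly counts k of 800, 150 and 50 (so N = 1000) give retmax values of 80, 15 and 5, with the same floor-to-1 guard that the loop further down applies:

# Sketch only: hypothetical per-year counts; mirrors the retmax computation below.
globalLimit = 100
counts      = {2013: 800, 2014: 150, 2015: 50}   # k per year, as returned by esearch
N           = sum(counts.values())               # 1000

retmax = {year: max(1, int(round(globalLimit * k / float(N))))
          for year, k in counts.items()}
# -> {2013: 80, 2014: 15, 2015: 5}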
...
@@ -172,7 +170,7 @@ class MedlineFetcher:
            # print ('---------\n')
            pubmedquery = str(year) + '[dp] '+query
            self.q.put( pubmedquery ) #put task in the queue
        self.q.join()
        print('time:',time.perf_counter() - start)
...
@@ -183,15 +181,16 @@
            Total += 1
            if globalresults["queryKey"]==False:
                Fails += 1
            if globalresults["count"] > 0 :
                N += globalresults["count"]

                queryhyperdata = { "string"   : globalresults["query"]
                                 , "count"    : globalresults["count"]
                                 , "queryKey" : globalresults["queryKey"]
                                 , "webEnv"   : globalresults["webEnv"]
                                 , "retmax"   : 0
                                 }
                thequeries.append ( queryhyperdata )

        print("Total Number:", N,"publications")
...
@@ -199,14 +198,16 @@
        print("---------------------------------------\n")
        for i,query in enumerate(thequeries):
            k = query["count"]
            proportion = k/float(N)
            retmax_forthisyear = int(round(globalLimit*proportion))
            query["retmax"] = retmax_forthisyear
            if query["retmax"] == 0 : query["retmax"]+=1
            print(query["string"],"\t[",k,">",query["retmax"],"]")

        if ((Fails+1)/(Total+1)) == 1 : # for identifying the epic fail or connection error
            thequeries = [False]

        return thequeries
def getGlobalStatsISTEXT(request ):
    """
    ISTEX simply the total of hits for a query
    (not reused in testISTEX)
    """
    print(request.method)
    alist = ["bar","foo"]

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])

        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )

        query_string = query.replace(" ","+")
        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"

        tasks = MedlineFetcher()
        try:
            thedata_path = tasks.download( url )
            thedata = open(thedata_path, "rb")
            alist = thedata.read().decode('utf-8')
        except Exception as error:
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)


def testISTEX(request , project_id):
    print("testISTEX:")
    print(request.method)
    alist = ["bar","foo"]

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()

    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    if request.method == "POST":
        query = "-"
        query_string = "-"
        N = 0

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ","+") # url encoded q

        if "N" in request.POST:
            N = int(request.POST["N"]) # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ): ",msg)
                raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string , N))

        urlreqs = []
        pagesize = 50
        tasks = MedlineFetcher()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0]+pagesize)>N: pagesize = N-k[0]
            urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))

        resourcetype = RESOURCETYPES["name"]["ISTex"]

        # corpus node instanciation as a Django model
        corpus = Node(
            name = query,
            user_id = request.user.id,
            parent_id = project_id,
            typename = 'CORPUS',
            language_id = None,
            hyperdata = {'Processing' : "Parsing documents",}
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        print("NEW CORPUS", corpus_id)

        ensure_dir(request.user)
        tasks = MedlineFetcher()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2) #thing to do
            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
            tasks.q.put( [url , filename]) #put a task in th queue
        tasks.q.join() # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename!=False:
                # add the uploaded resource to the corpus
                corpus.add_resource(corpus,
                    user_id = request.user.id,
                    type_id = resourcetype.id,
                    file = filename,
                )
                dwnldsOK+=1

        if dwnldsOK == 0: return JsonHttpResponse(["fail"])

        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/project/' + str(project_id))

    data = [query_string,query,N]
    return JsonHttpResponse(data)
...
@@ -8,19 +8,18 @@ import json
import datetime
from os import path
import threading
-from gargantext.settings import MEDIA_ROOT, BASE_DIR
+#from gargantext.settings import MEDIA_ROOT, BASE_DIR

from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import RESOURCETYPES
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.tools import ensure_dir
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
...
@@ -37,7 +36,7 @@ QUERY_SIZE_N_MAX = 1000 # int(CONF['scrappers']['QUERY_SIZE_N_MAX'])
# QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])
# --------------------------------------------------------------------

def getGlobalStats( request ):
    """
    Pubmed year by year results
...
@@ -73,37 +72,8 @@ def getGlobalStats(request ):
    return JsonHttpResponse(data)

-def getGlobalStatsISTEXT(request ):
-    """
-    ISTEX simply the total of hits for a query
-    (not reused in testISTEX)
-    """
-    print(request.method)
-    alist = ["bar","foo"]
-    if request.method == "POST":
-        query = request.POST["query"]
-        N = int(request.POST["N"])
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
-        query_string = query.replace(" ","+")
-        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"
-        tasks = MedlineFetcher()
-        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-        try:
-            thedata = tasks.test_downloadFile( [url,filename] )
-            alist = thedata.read().decode('utf-8')
-        except Exception as error:
-            alist = [str(error)]
-    data = alist
-    return JsonHttpResponse(data)

def doTheQuery( request , project_id ) :
    # implicit global session
    # do we have a valid project id?
    try:
...
@@ -111,11 +81,10 @@ def doTheQuery(request , project_id):
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session.query( Node )
               .filter(Node.id == project_id)
               .filter(Node.typename == 'PROJECT')
              ).first()

    if project is None:
        raise Http404()
...
@@ -130,7 +99,7 @@ def doTheQuery(request , project_id):
    if request.method == "POST":
        queries = request.POST["query"]
        name = request.POST["string"]

        # here we just realize queries already prepared by getGlobalStats
        # ===> no need to repeat N parameter like in testISTEX <===
...
@@ -147,7 +116,6 @@ def doTheQuery(request , project_id):
            urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
        alist = ["tudo fixe" , "tudo bem"]

-       resourcetype = RESOURCETYPES['name']['Pubmed (xml format)']

        # corpus node instanciation as a Django model
        corpus = Node(
...
@@ -155,8 +123,9 @@ def doTheQuery(request , project_id):
            user_id = request.user.id,
            parent_id = project_id,
            typename = 'CORPUS',
-           language_id = None,
-           hyperdata = {'Processing' : "Parsing documents",}
+           hyperdata = { "action"      : "Scraping data"
+                       , "language_id" : None
+                       }
        )
        session.add(corpus)
        session.commit()
...
@@ -177,22 +146,21 @@ def doTheQuery(request , project_id):
            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
-           filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-           tasks.q.put( [url , filename]) #put a task in th queue
+           tasks.q.put( url ) #put a task in the queue
        tasks.q.join() # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults :
+           print(filename)
            if filename != False:
                # add the uploaded resource to the corpus
-               add_resource(corpus,
-                   user_id = request.user.id,
-                   type_id = resourcetype.id,
-                   file = filename,
-               )
+               corpus.add_resource( type = 3
+                                  , path = filename
+                                  )
                dwnldsOK+=1

-       if dwnldsOK == 0: return JsonHttpResponse(["fail"])
+       if dwnldsOK == 0 :
+           return JsonHttpResponse(["fail"])

        try:
            scheduled(parse_extract_indexhyperdata(corpus_id,))
...
@@ -200,118 +168,10 @@ def doTheQuery(request , project_id):
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
-       return HttpResponseRedirect('/project/' + str(project_id))
+       return HttpResponseRedirect('/projects/' + str(project_id))

    data = alist
    return JsonHttpResponse(data)
-def testISTEX(request , project_id):
-    print("testISTEX:")
-    print(request.method)
-    alist = ["bar","foo"]
-
-    # implicit global session
-    # do we have a valid project id?
-    try:
-        project_id = int(project_id)
-    except ValueError:
-        raise Http404()
-
-    # do we have a valid project?
-    project = (session
-        .query(Node)
-        .filter(Node.id == project_id)
-        .filter(Node.typename == 'PROJECT')
-    ).first()
-
-    if project is None:
-        raise Http404()
-
-    # do we have a valid user?
-    user = request.user
-    if not user.is_authenticated():
-        return redirect('/auth/?next=%s' % request.path)
-    if project.user_id != user.id:
-        return HttpResponseForbidden()
-
-    if request.method == "POST":
-        query = "-"
-        query_string = "-"
-        N = 0
-
-        if "query" in request.POST:
-            query = request.POST["query"]
-            query_string = query.replace(" ","+") # url encoded q
-
-        if "N" in request.POST:
-            N = int(request.POST["N"]) # query_size from views_opti
-            if N > QUERY_SIZE_N_MAX:
-                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
-                print("ERROR (scrap: istex d/l ): ",msg)
-                raise ValueError(msg)
-
-        print("Scrapping Istex: '%s' (%i)" % (query_string , N))
-
-        urlreqs = []
-        pagesize = 50
-        tasks = MedlineFetcher()
-        chunks = list(tasks.chunks(range(N), pagesize))
-        for k in chunks:
-            if (k[0]+pagesize)>N: pagesize = N-k[0]
-            urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
-
-        resourcetype = RESOURCETYPES["name"]["ISTex"]
-
-        # corpus node instanciation as a Django model
-        corpus = Node(
-            name = query,
-            user_id = request.user.id,
-            parent_id = project_id,
-            typename = 'CORPUS',
-            language_id = None,
-            hyperdata = {'Processing' : "Parsing documents",}
-        )
-        session.add(corpus)
-        session.commit()
-        corpus_id = corpus.id
-        print("NEW CORPUS", corpus_id)
-
-        ensure_dir(request.user)
-        tasks = MedlineFetcher()
-
-        for i in range(8):
-            t = threading.Thread(target=tasks.worker2) #thing to do
-            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
-            t.start()
-
-        for url in urlreqs:
-            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-            tasks.q.put( [url , filename]) #put a task in th queue
-        tasks.q.join() # wait until everything is finished
-
-        dwnldsOK = 0
-        for filename in tasks.firstResults:
-            if filename!=False:
-                # add the uploaded resource to the corpus
-                corpus.add_resource(corpus,
-                    user_id = request.user.id,
-                    type_id = resourcetype.id,
-                    file = filename,
-                )
-                dwnldsOK+=1
-
-        if dwnldsOK == 0: return JsonHttpResponse(["fail"])
-
-        ###########################
-        ###########################
-        try:
-            scheduled(parse_extract_indexhyperdata(corpus_id,))
-        except Exception as error:
-            print('WORKFLOW ERROR')
-            print(error)
-        sleep(1)
-        return HttpResponseRedirect('/project/' + str(project_id))
-
-    data = [query_string,query,N]
-    return JsonHttpResponse(data)
...
@@ -10,7 +10,11 @@ import scrapers.pubmed as pubmed
# Available databases : Pubmed, IsTex, (next: CERN)

# /!\ urls patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$'          , pubmed.getGlobalStats )
+             , url(r'^pubmed/search/(\d+)'    , pubmed.doTheQuery )
+             # , url(r'^istex/query$'         , pubmed.getGlobalStatsISTEXT )
+             # , url(r'^istex/search/(\d+)'   , pubmed.testISTEX )
              #, url(r'^scraping$'             , scraping.Target.as_view() )
              ,
              ]
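These patterns appear to be mounted under a /scrapers/ prefix (the template change below posts to /scrapers/pubmed/search/<project_id>), so the client flow is: POST the raw query to pubmed/query (getGlobalStats) to get the per-year sub-queries, then POST them to pubmed/search/<project_id> (doTheQuery). A hypothetical sketch of that mounting in the project URLconf, assuming this file is scrapers/urls.py:

# Sketch only: hypothetical project-level urls.py; the /scrapers/ prefix is inferred
# from the template below, not shown in this commit.
from django.conf.urls import include, url

urlpatterns = [
    url(r'^scrapers/', include('scrapers.urls')),   # -> /scrapers/pubmed/query
                                                     #    /scrapers/pubmed/search/<project_id>
]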
...
@@ -260,7 +260,7 @@
        $.ajax({
            // contentType: "application/json",
-           url: window.location.origin+"/tests/project/"+projectid+"/pubmedquery/go",
+           url: window.location.origin+"/scrapers/pubmed/search/"+projectid,
            data: pubmedifiedQuery,
            type: 'POST',
            beforeSend: function(xhr) {
...