Commit b94e4312 authored by sim's avatar sim

Revert "Remove moissonneurs module"

This reverts commit fde04dab.
parent 5df80fbb
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** CERN Crawler *****
# ****************************
RESOURCE_TYPE_SCOAP = 9
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
def query( request):
'''get GlobalResults()'''
if request.method == "POST":
query = request.POST["query"]
source = get_resource(RESOURCE_TYPE_SCOAP)
if source["crawler"] is not None:
crawlerbot = load_crawler(source)()
#old raw way to get results_nb
results = crawlerbot.scan_results(query)
#ids = crawlerbot.get_ids(query)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id, return_corpus=False):
'''save'''
if request.method == "POST":
query = request.POST.get("query")
try:
N = int(request.POST.get("N"))
except:
N = 0
print(query, N)
#for next time
#ids = request.POST["ids"]
source = get_resource(RESOURCE_TYPE_SCOAP)
if N == 0:
raise Http404()
if N > QUERY_SIZE_N_MAX:
N = QUERY_SIZE_N_MAX
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = session.query( Node ).filter(Node.id == project_id).first()
if project is None:
raise Http404()
user = cache.User[request.user.id]
if not user.owns(project):
return HttpResponseForbidden()
# corpus node instanciation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : "en"
}
)
#download_file
crawler_bot = load_crawler(source)()
#for now no way to force downloading X records
#the long running command
filename = crawler_bot.download(query)
corpus.add_resource(
type = source["type"]
#, name = source["name"]
, path = crawler_bot.path
)
session.add(corpus)
session.commit()
#corpus_id = corpus.id
try:
scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
data = [query_string,query,N]
print(data)
return JsonHttpResponse(data)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** HAL Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence
RESOURCE_TYPE_HAL = 11
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect \
, HttpResponseForbidden
from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
def query( request):
'''get GlobalResults()'''
if request.method == "POST":
query = request.POST["query"]
source = get_resource(RESOURCE_TYPE_HAL)
if source["crawler"] is not None:
crawlerbot = load_crawler(source)()
#old raw way to get results_nb
results = crawlerbot.scan_results(query)
#ids = crawlerbot.get_ids(query)
print(results)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id, return_corpus=False):
'''save'''
if request.method == "POST":
query = request.POST.get("query")
try:
N = int(request.POST.get("N"))
except:
N = 0
print(query, N)
#for next time
#ids = request.POST["ids"]
source = get_resource(RESOURCE_TYPE_HAL)
if N == 0:
raise Http404()
if N > QUERY_SIZE_N_MAX:
N = QUERY_SIZE_N_MAX
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = session.query( Node ).filter(Node.id == project_id).first()
if project is None:
raise Http404()
user = cache.User[request.user.id]
if not user.owns(project):
return HttpResponseForbidden()
# corpus node instanciation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
}
)
#download_file
crawler_bot = load_crawler(source)()
#for now no way to force downloading X records
#the long running command
filename = crawler_bot.download(query)
corpus.add_resource(
type = source["type"]
#, name = source["name"]
, path = crawler_bot.path
)
session.add(corpus)
session.commit()
#corpus_id = corpus.id
try:
scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
data = [query_string,query,N]
print(data)
return JsonHttpResponse(data)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** ISIDORE Crawler *****
# ****************************
RESOURCE_TYPE_ISIDORE = 12
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
def query( request):
'''get GlobalResults()'''
if request.method == "POST":
query = request.POST["query"]
source = get_resource(RESOURCE_TYPE_ISIDORE)
if source["crawler"] is not None:
crawlerbot = load_crawler(source)()
#old raw way to get results_nb
results = crawlerbot.scan_results(query)
#ids = crawlerbot.get_ids(query)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id, return_corpus=False):
'''save'''
if request.method == "POST":
query = request.POST.get("query")
try:
N = int(request.POST.get("N"))
except:
N = 0
print(query, N)
#for next time
#ids = request.POST["ids"]
source = get_resource(RESOURCE_TYPE_ISIDORE)
if N == 0:
raise Http404()
if N > QUERY_SIZE_N_MAX:
N = QUERY_SIZE_N_MAX
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = session.query( Node ).filter(Node.id == project_id).first()
if project is None:
raise Http404()
user = cache.User[request.user.id]
if not user.owns(project):
return HttpResponseForbidden()
# corpus node instanciation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : "fr"
}
)
#download_file
crawler_bot = load_crawler(source)()
#for now no way to force downloading X records
#the long running command
filename = crawler_bot.download(query)
corpus.add_resource(
type = source["type"]
#, name = source["name"]
, path = crawler_bot.path
)
session.add(corpus)
session.commit()
#corpus_id = corpus.id
try:
scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
data = [query_string,query,N]
print(data)
return JsonHttpResponse(data)
from datetime import datetime
from time import sleep
import datetime
import threading
from traceback import print_tb
#from gargantext.settings import MEDIA_ROOT, BASE_DIR
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from .util import Scraper
RESOURCE_TYPE_ISTEX = 8
def query( request ):
"""
ISTEX simply the total of hits for a query
(not reused in testISTEX)
"""
print(request.method)
alist = ["bar","foo"]
if request.method == "POST":
query = request.POST["query"]
if request.POST["N"] == "NaN":
N = QUERY_SIZE_N_MAX
else:
N = int(request.POST["N"])
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
query_string = query.replace(" ","+")
url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"
tasks = Scraper()
try:
thedata_path = tasks.download( url )
thedata = open(thedata_path, "rb")
alist = thedata.read().decode('utf-8')
except Exception as error:
alist = [str(error)]
data = alist
return JsonHttpResponse(data)
def save(request , project_id, return_corpus=False):
print("testISTEX:")
print(request.method)
alist = ["bar","foo"]
# implicit global session
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = (session
.query(Node)
.filter(Node.id == project_id)
.filter(Node.typename == 'PROJECT')
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/auth/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
query_string = ""
if request.method == "POST":
query = "-"
query_string = "-"
#N = QUERY_SIZE_N_MAX
if "query" in request.POST:
query = request.POST["query"]
query_string = query.replace(" ","+") # url encoded q
if "N" in request.POST:
if request.POST["N"] == "NaN":
N = QUERY_SIZE_N_MAX
else:
N = int(request.POST["N"]) # query_size from views_opti
if N > QUERY_SIZE_N_MAX:
N = QUERY_SIZE_N_MAX
#msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
#print("ERROR (scrap: istex d/l ): ",msg)
#raise ValueError(msg)
print("Scrapping Istex: '%s' (%i)" % (query_string , N))
urlreqs = []
pagesize = 50
tasks = Scraper()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=id,corpusName,title,genre,language,doi,host,publicationDate,abstract,author&"+"from="+str(k[0])+"&size="+str(pagesize))
# corpus node instanciation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : None
}
)
tasks = Scraper()
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
tasks.q.put( url ) #put a task in th queue
tasks.q.join() # wait until everything is finished
dwnldsOK = 0
for filename in tasks.firstResults:
if filename!=False:
# add the uploaded resource to the corpus
corpus.add_resource(
type = get_resource(RESOURCE_TYPE_ISTEX)["type"]
, path = filename
)
dwnldsOK+=1
session.add(corpus)
session.commit()
#corpus_id = corpus.id
if dwnldsOK == 0 :
return JsonHttpResponse(["fail"])
###########################
###########################
try:
scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
data = [query_string,query,N]
print(data)
return JsonHttpResponse(data)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** MULTIVAC Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence
RESOURCE_TYPE_MULTIVAC = 10
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
def query( request):
'''get GlobalResults()'''
if request.method == "POST":
query = request.POST["query"]
source = get_resource(RESOURCE_TYPE_MULTIVAC)
if source["crawler"] is not None:
crawlerbot = load_crawler(source)()
#old raw way to get results_nb
results = crawlerbot.scan_results(query)
#ids = crawlerbot.get_ids(query)
print(results)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id, return_corpus=False):
'''save'''
if request.method == "POST":
query = request.POST.get("query")
try:
N = int(request.POST.get("N"))
except:
N = 0
print(query, N)
#for next time
#ids = request.POST["ids"]
source = get_resource(RESOURCE_TYPE_MULTIVAC)
if N == 0:
raise Http404()
if N > QUERY_SIZE_N_MAX:
N = QUERY_SIZE_N_MAX
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = session.query( Node ).filter(Node.id == project_id).first()
if project is None:
raise Http404()
user = cache.User[request.user.id]
if not user.owns(project):
return HttpResponseForbidden()
# corpus node instanciation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : "en"
}
)
#download_file
crawler_bot = load_crawler(source)()
#for now no way to force downloading X records
#the long running command
filename = crawler_bot.download(query)
corpus.add_resource(
type = source["type"]
#, name = source["name"]
, path = crawler_bot.path
)
session.add(corpus)
session.commit()
#corpus_id = corpus.id
try:
scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
if return_corpus:
return corpus
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
data = [query_string,query,N]
print(data)
return JsonHttpResponse(data)
# ****************************
# ***** Medline Scraper *****
# ****************************
# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
# between 9 pm and 5 am Eastern Time weekdays
# from datetime import datetime
from time import sleep
import json
import datetime
from os import path
import threading
from traceback import print_tb
#from gargantext.settings import MEDIA_ROOT, BASE_DIR
from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource_by_name, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from .util import Scraper
def query( request ):
"""
Pubmed year by year results
# alist = [
# {'string': '2011[dp] serendipity', 'queryKey': '1',
# 'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
# {'string': '2012[dp] serendipity', 'queryKey': '1',
# 'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
# ... ]
(reused as thequeries in query_save)
"""
print(request.method)
alist = []
if request.method == "POST":
query = request.POST["query"]
if request.POST["N"] == "NaN":
N = QUERY_SIZE_N_MAX
else:
N = int(request.POST["N"])
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR(scrap: pubmed stats): ",msg)
raise ValueError(msg)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
instancia = Scraper()
# serialFetcher (n_last_years, query, query_size)
alist = instancia.serialFetcher( 5, query , N )
data = alist
return JsonHttpResponse(data)
def save( request , project_id, return_corpus=False ) :
# implicit global session
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = session.query( Node ).filter(Node.id == project_id).first()
if project is None:
raise Http404()
user = cache.User[request.user.id]
if not user.owns(project):
return HttpResponseForbidden()
if request.method == "POST":
queries = request.POST["query"]
name = request.POST["string"]
# here we just realize queries already prepared by getGlobalStats
# ===> no need to repeat N parameter like in testISTEX <===
instancia = Scraper()
thequeries = json.loads(queries)
# fyi the sum of our prepared yearly proportional quotas
sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
print("Scrapping Pubmed: '%s' (N=%i)" % (name,sampled_sum))
urlreqs = []
for yearquery in thequeries:
urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
alist = ["tudo fixe" , "tudo bem"]
# corpus node instanciation as a Django model
corpus = project.add_child( name=name
, typename = "CORPUS"
)
# """
# urlreqs: List of urls to query.
# - Then, to each url in urlreqs you do:
# eFetchResult = urlopen(url)
# eFetchResult.read() # this will output the XML... normally you write this to a XML-file.
# """
tasks = Scraper()
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
tasks.q.put( url ) #put a task in the queue
tasks.q.join() # wait until everything is finished
dwnldsOK = 0
for filename in tasks.firstResults :
print(filename)
if filename != False:
# add the uploaded resource to the corpus
corpus.add_resource( type = get_resource_by_name('Pubmed [XML]')["type"]
, path = filename
, url = None
)
print("Adding the resource")
dwnldsOK+=1
session.add(corpus)
session.commit()
corpus_id = corpus.id
if dwnldsOK == 0 :
return JsonHttpResponse(["fail"])
try:
scheduled(parse_extract_indexhyperdata)(corpus_id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
sleep(1)
if return_corpus:
return corpus
return HttpResponseRedirect('/projects/' + str(project_id))
data = alist
return JsonHttpResponse(data)
# ____ ____ ____ _ _ ____ _____ ____ _
#/ ___| / ___| _ \| || | | _ \___ /| _ \ | |
#\___ \| | | |_) | || |_| |_) ||_ \| |_) / __)
# ___) | |___| _ <|__ _| __/___) | _ <\__ \
#|____/ \____|_| \_\ |_| |_| |____/|_| \_( /
# |_|
#
# moissonneurs == getting data from external databases
from django.conf.urls import url
# Available databases :
import gargantext.moissonneurs.pubmed as pubmed
import gargantext.moissonneurs.istex as istex
import gargantext.moissonneurs.cern as cern
import gargantext.moissonneurs.multivac as multivac
import gargantext.moissonneurs.hal as hal
import gargantext.moissonneurs.isidore as isidore
# TODO : ISIDORE
# /!\ urls patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
, url(r'^pubmed/save/(\d+)' , pubmed.save )
, url(r'^istex/query$' , istex.query )
, url(r'^istex/save/(\d+)' , istex.save )
, url(r'^cern/query$' , cern.query )
, url(r'^cern/save/(\d+)' , cern.save )
, url(r'^multivac/query$' , multivac.query )
, url(r'^multivac/save/(\d+)' , multivac.save )
, url(r'^hal/query$' , hal.query )
, url(r'^hal/save/(\d+)' , hal.save )
, url(r'^isidore/query$' , isidore.query )
, url(r'^isidore/save/(\d+)' , isidore.save )
]
from gargantext.util.files import download
import sys
import time
import threading
from queue import Queue
from lxml import etree
if sys.version_info >= (3, 0):
from urllib.request import urlopen
else:
from urllib import urlopen
class Scraper :
def __init__(self):
self.queue_size = 8
self.q = Queue()
self.firstResults = []
self.lock = threading.Lock() # lock to serialize console output
self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
self.pubMedDB = 'Pubmed'
self.reportType = 'medline'
# Return the globalResults!:
# - count =
# - queryKey =
# - webEnv =
def medlineEsearch(self , query):
# print ("MedlineFetcher::medlineEsearch :")
"Get number of results for query 'query' in variable 'count'"
"Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
# print(query)
origQuery = query
query = query.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
% ( self.pubMedEutilsURL, self.pubMedDB, query )
try:
eSearchResult = urlopen(eSearch)
data = eSearchResult.read()
root = etree.XML(data)
findcount = etree.XPath("/eSearchResult/Count/text()")
count = findcount(root)[0]
findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
queryKey = findquerykey(root)[0]
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
except Exception as Error:
print(Error)
count = 0
queryKey = False
webEnv = False
origQuery = False
values = { "query" : origQuery
, "count" : int(count)
, "queryKey" : queryKey
, "webEnv" : webEnv
}
return values
# RETMAX:
# Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
# maximum of 100,000 records
def medlineEfetchRAW( self , fullquery):
query = fullquery [ "string" ]
retmax = fullquery [ "retmax" ]
count = fullquery [ "count" ]
queryKey = fullquery [ "queryKey"]
webEnv = fullquery [ "webEnv" ]
"Fetch medline result for query 'query', saving results to file every 'retmax' articles"
queryNoSpace = query.replace(' ', '') # No space in directory and file names, avoids stupid errors
# print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
retstart = 0
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
return eFetch
# generic!
def download(self, url):
print(url)
filename = download(url)
with self.lock:
print(threading.current_thread().name, filename+" OK")
return filename
# generic!
def do_work(self,item):
# time.sleep(1) # pretend to do some lengthy work.
returnvalue = self.medlineEsearch(item)
with self.lock:
# print(threading.current_thread().name, item)
return returnvalue
# The worker thread pulls an item from the queue and processes it
def worker(self):
while True:
item = self.q.get()
self.firstResults.append(self.do_work(item))
self.q.task_done()
def worker2(self):
while True:
item = self.q.get()
results = []
try:
result = self.download(item)
except Exception as error :
print(error)
result = False
self.firstResults.append(result)
self.q.task_done()
def chunks(self , l , n):
print("chunks:")
for i in range(0, len(l), n):
yield l[i:i+n]
# GLOBALLIMIT:
# I will retrieve this exact amount of publications.
# The publications per year i'll retrieve per year will be :
# (k/N)*GlobalLimit
# \_ this is used as RETMAX
# - k : Number of publications of x year (according to pubmed)
# - N : Sum of every k belonging to {X} (total number of pubs according to pubmed)
# - GlobalLimit : Number of publications i want.
def serialFetcher(self , yearsNumber , query, globalLimit):
# Create the queue and thread pool.
for i in range(self.queue_size):
t = threading.Thread(target=self.worker)
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
start = time.perf_counter()
N = 0
# print ("MedlineFetcher::serialFetcher :")
thequeries = []
globalresults = []
for i in range(yearsNumber):
year = str(2015 - i)
# print ('YEAR ' + year)
# print ('---------\n')
pubmedquery = str(year) + '[dp] '+query
self.q.put( pubmedquery ) #put task in the queue
self.q.join()
print('time:',time.perf_counter() - start)
Total = 0
Fails = 0
for globalresults in self.firstResults:
# globalresults = self.medlineEsearch(pubmedquery)
Total += 1
if globalresults["queryKey"]==False:
Fails += 1
if globalresults["count"] > 0 :
N+=globalresults["count"]
queryhyperdata = { "string" : globalresults["query"]
, "count" : globalresults["count"]
, "queryKey" : globalresults["queryKey"]
, "webEnv" : globalresults["webEnv"]
, "retmax" : 0
}
thequeries.append ( queryhyperdata )
print("Total Number:", N,"publications")
print("And i want just:",globalLimit,"publications")
print("---------------------------------------\n")
for i,query in enumerate(thequeries):
k = query["count"]
proportion = k/float(N)
retmax_forthisyear = int(round(globalLimit*proportion))
query["retmax"] = retmax_forthisyear
if query["retmax"] == 0 : query["retmax"]+=1
print(query["string"],"\t[",k,">",query["retmax"],"]")
if ((Fails+1)/(Total+1)) == 1 : # for identifying the epic fail or connection error
thequeries = [False]
return thequeries
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment