Commit 0c6a4ea4 authored by delanoe's avatar delanoe

[MERGE/FIX] Fix merge urls

parents 6256e2ee 3aae3103
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100755 to 100644
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -47,59 +47,59 @@ def convert_to_date(date):
return dateutil.parser.parse(date)
INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing
# TODO use properties during toolchain.hyperdata_indexing
# (type, convert_to_db, convert_from_db)
'count':
{ 'id' : 1
, 'type' : int
, 'convert_to_db' : int
, 'convert_from_db': int
},
'publication_date':
{ 'id' : 2
, 'type' : datetime.datetime
, 'convert_to_db' : convert_to_date
, 'convert_from_db': datetime.datetime.fromtimestamp
},
'title':
{ 'id' : 3
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'authors':
{ 'id' : 4
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'journal':
{ 'id' : 5
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'abstract':
{ 'id' : 6
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'text':
{ 'id' : 7
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'page':
{ 'id' : 8
, 'type' : int
......@@ -160,10 +160,10 @@ RESOURCETYPES = [
'parser': CSVParser,
'default_language': 'en',
},
# { 'name': 'ISTex',
# # 'parser': ISTexParser,
# 'default_language': 'en',
# },
{ 'name': 'ISTex',
'parser': ISTexParser,
'default_language': 'en',
},
]
# linguistic extraction parameters ---------------------------------------------
......@@ -179,11 +179,11 @@ DEFAULT_MAPLIST_MAX = 300 # MAPLIST maximum terms
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5 # part of monograms in MAPLIST
DEFAULT_MAX_NGRAM_LEN = 7 # limit used after POStagging rule
DEFAULT_MAX_NGRAM_LEN = 7 # limit used after POStagging rule
# (initial ngrams number is a power law of this /!\)
# (and most longer ngrams have tiny freq anyway)
DEFAULT_ALL_LOWERCASE_FLAG = True # lowercase ngrams before recording
DEFAULT_ALL_LOWERCASE_FLAG = True # lowercase ngrams before recording
# them to their DB table
# (potentially bad for acronyms but
# good for variants like same term
......@@ -198,7 +198,9 @@ QUERY_SIZE_N_DEFAULT = 1000
import os
from .settings import BASE_DIR
UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads')
# uploads/.gitignore prevents corpora indexing
# copora can be either a folder or symlink towards specific partition
UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads/corpora')
UPLOAD_LIMIT = 1024 * 1024 * 1024
DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
......@@ -206,3 +208,9 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
# about batch processing...
BATCH_PARSING_SIZE = 256
BATCH_NGRAMSEXTRACTION_SIZE = 1024
# Scrapers config
QUERY_SIZE_N_MAX = 1000
QUERY_SIZE_N_DEFAULT = 1000
......@@ -71,7 +71,7 @@ class NodeHyperdata(Base):
value_flt = Column( Double() , index=True )
value_utc = Column( DateTime(timezone=True) , index=True )
value_str = Column( String(255) , index=True )
value_txt = Column( Text , index=True )
value_txt = Column( Text , index=False )
def __init__(self, node=None, key=None, value=None):
......
......@@ -8,9 +8,9 @@ Views are shared between these modules:
- `graph explorer`, to explore graphs
"""
from django.conf.urls import include, url
from django.conf.urls import include, url
from django.contrib import admin
from django.contrib import admin
import gargantext.views.api.urls
import gargantext.views.generated.urls
......@@ -18,30 +18,34 @@ import gargantext.views.pages.urls
# Module Annotation
## tempo: unchanged doc-annotations --
from annotations import urls as annotations_urls
from annotations.views import main as annotations_main_view
from annotations import urls as annotations_urls
from annotations.views import main as annotations_main_view
# Module "Graph Explorer"
# Module "Graph Explorer"
#from graphExplorer import urls as graphExplorer_urls
from graphExplorer.rest import Graph
from graphExplorer.rest import Graph
from graphExplorer.views import explorer
urlpatterns = [
url(r'^admin/', admin.site.urls),
url(r'^generated/', include(gargantext.views.generated.urls)),
url(r'^api/', include(gargantext.views.api.urls)),
url(r'^', include(gargantext.views.pages.urls)),
# Module Annotation
# tempo: unchanged doc-annotations routes --
url(r'^annotations/', include(annotations_urls)),
url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view),
# Module "Graph Explorer"
url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer),
url(r'^projects/(\d+)/corpora/(\d+)/graph$', Graph.as_view()),
# to be removed:
url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
#url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
#url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer_urls))
]
# Module Scrapers
from scrapers import urls as scrapers_urls
urlpatterns = [ url(r'^admin/' , admin.site.urls )
, url(r'^generated/' , include( gargantext.views.generated.urls ))
, url(r'^api/' , include( gargantext.views.api.urls ) )
, url(r'^' , include( gargantext.views.pages.urls ) )
# Module Annotation
# tempo: unchanged doc-annotations routes --
, url(r'^annotations/', include( annotations_urls ) )
, url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view)
# Module "Graph Explorer"
, url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer )
, url(r'^projects/(\d+)/corpora/(\d+)/graph$' , Graph.as_view())
# to be removed:
, url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
#url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
# Scrapers module
, url(r'^scrapers/' , include( scrapers_urls ) )
]
from gargantext.constants import *
from gargantext.constants import *
from gargantext.util.digest import str_digest
from gargantext.util import http
from gargantext.util import http
def save(contents, name='', basedir=''):
......
......@@ -29,7 +29,7 @@ import urllib.request
def get(url):
response = urllib.request.urlopen(url)
html = response.read()
return response.read()
# retrieve GET parameters from a request
......
......@@ -4,7 +4,7 @@ from datetime import datetime
from io import BytesIO
import json
class ISTex(Parser):
class ISTexParser(Parser):
def parse(self, thefile):
json_data=open(thefile,"r")
......@@ -84,16 +84,16 @@ class ISTex(Parser):
# ---------------------------------------------------
if len(hyperdata["language_iso3"])>0 and hyperdata["language_iso3"][0] != "unknown" :
hyperdata["language_iso3"] = hyperdata["language_iso3"][0]
# default value = eng
# possible even better: langid.classify(abstract)
else:
# NB 97% des docs istex sont eng donc par défaut
# ----------------------------------------------
hyperdata["language_iso3"] = "eng"
# (cf. api.istex.fr/document/?q=*&facet=language
# (cf. api.istex.fr/document/?q=*&facet=language
# et tests langid sur les language=["unknown"])
if "publication_date" in hyperdata:
RealDate = hyperdata["publication_date"]
......
......@@ -7,5 +7,5 @@ from .Pubmed import PubmedParser
# # 2015-12-08: parser 2 en 1
from .Europress import EuropressParser
# from .ISTex import ISTexParser
from .ISTex import ISTexParser
from .CSV import CSVParser
def suggest(keywords):
return ['Suggestion #1', 'Suggestion #2', 'Suggestion #3', 'Suggestion #4', 'Suggestion #5']
def count(keywords):
return 42
def query_save(keywords):
return 'path/to/query.xml'
/srv/gargantext_lib/taggers/nlpserver/turboparser.cpython-34m.so
\ No newline at end of file
import os
from gargantext.settings import MEDIA_ROOT
def ensure_dir(user):
'''
If user is new, folder does not exist yet, create it then
'''
dirpath = '%s/corpora/%s' % (MEDIA_ROOT, user.username)
if not os.path.exists(dirpath):
print("Creating folder %s" % dirpath)
os.makedirs(dirpath)
......@@ -3,30 +3,26 @@ from django.conf.urls import url
from . import nodes
from . import ngramlists
urlpatterns = [ url(r'^nodes$' , nodes.NodeListResource.as_view())
, url(r'^nodes/(\d+)$' , nodes.NodeResource.as_view() )
, url(r'^nodes/(\d+)/facets$' , nodes.CorpusFacet.as_view() )
, url(r'^nodes/(\d+)/having$' , nodes.NodeListHaving.as_view() )
urlpatterns = [
url(r'^nodes$' , nodes.NodeListResource.as_view()),
url(r'^nodes/(\d+)$' , nodes.NodeResource.as_view()),
# get a list of ngram_ids or ngram_infos by list_id
# url(r'^ngramlists/(\d+)$', ngramlists.List.as_view()),
url(r'^nodes/(\d+)/facets$' , nodes.CorpusFacet.as_view()),
url(r'^nodes/(\d+)/having$' , nodes.NodeListHaving.as_view()),
, url(r'^ngramlists/groups$', ngramlists.GroupChange.as_view())
# modify grouping couples of a group node
# ex: POST ngramlists/groups?node=43
# post data looks like : {"767":[209,640],"779":[436,265,385]}"
# add or remove ngram from a list
# ex: add <=> PUT ngramlists/change?list=42&ngrams=1,2
# rm <=> DEL ngramlists/change?list=42&ngrams=1,2
url(r'^ngramlists/change$', ngramlists.ListChange.as_view()),
# modify grouping couples of a group node
# ex: POST ngramlists/groups?node=43
# post data looks like : {"767":[209,640],"779":[436,265,385]}"
url(r'^ngramlists/groups$', ngramlists.GroupChange.as_view()),
, url(r'^ngramlists/family$' , ngramlists.ListFamily.as_view())
# entire combination of lists from a corpus
# (or any combination of lists that go together :
# - a mainlist
# - an optional stoplist
# - an optional maplist
# - an optional grouplist
# get entire combination of lists from a corpus
# (or any combination of lists that go together :
# - a mainlist
# - an optional stoplist
# - an optional maplist
# - an optional grouplist)
url(r'^ngramlists/family$', ngramlists.ListFamily.as_view()),
]
]
......@@ -94,7 +94,7 @@ def project(request, project_id):
)
session.add(corpus)
session.commit()
# parse_extract: fileparsing -> ngram extraction -> lists
scheduled(parse_extract_indexhyperdata)(corpus.id)
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
# from datetime import datetime
from time import sleep
import datetime
import threading
#from gargantext.settings import MEDIA_ROOT, BASE_DIR
from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import RESOURCETYPES, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.tools import ensure_dir
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from scrapers.util import Scraper
def query( request ):
"""
ISTEX simply the total of hits for a query
(not reused in testISTEX)
"""
print(request.method)
alist = ["bar","foo"]
if request.method == "POST":
query = request.POST["query"]
N = int(request.POST["N"])
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
query_string = query.replace(" ","+")
url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"
tasks = Scraper()
try:
thedata_path = tasks.download( url )
thedata = open(thedata_path, "rb")
alist = thedata.read().decode('utf-8')
except Exception as error:
alist = [str(error)]
data = alist
return JsonHttpResponse(data)
def save(request , project_id):
print("testISTEX:")
print(request.method)
alist = ["bar","foo"]
# implicit global session
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = (session
.query(Node)
.filter(Node.id == project_id)
.filter(Node.typename == 'PROJECT')
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/auth/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
if request.method == "POST":
query = "-"
query_string = "-"
N = 0
if "query" in request.POST:
query = request.POST["query"]
query_string = query.replace(" ","+") # url encoded q
if "N" in request.POST:
N = int(request.POST["N"]) # query_size from views_opti
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR (scrap: istex d/l ): ",msg)
raise ValueError(msg)
print("Scrapping Istex: '%s' (%i)" % (query_string , N))
urlreqs = []
pagesize = 50
tasks = Scraper()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
# corpus node instanciation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scraping data"
, "language_id" : None
}
)
session.add(corpus)
session.commit()
corpus_id = corpus.id
print("NEW CORPUS", corpus_id)
ensure_dir(request.user)
tasks = Scraper()
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
tasks.q.put( url ) #put a task in th queue
tasks.q.join() # wait until everything is finished
dwnldsOK = 0
for filename in tasks.firstResults:
if filename!=False:
# add the uploaded resource to the corpus
# add the uploaded resource to the corpus
corpus.add_resource( type = 3
, path = filename
)
dwnldsOK+=1
if dwnldsOK == 0 :
return JsonHttpResponse(["fail"])
###########################
###########################
try:
scheduled(parse_extract_indexhyperdata(corpus_id,))
except Exception as error:
print('WORKFLOW ERROR')
print(error)
sleep(1)
return HttpResponseRedirect('/projects/' + str(project_id))
data = [query_string,query,N]
return JsonHttpResponse(data)
# ****************************
# ***** Medline Scraper *****
# ****************************
# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or
# between 9 pm and 5 am Eastern Time weekdays
# from datetime import datetime
from time import sleep
import json
import datetime
from os import path
import threading
#from gargantext.settings import MEDIA_ROOT, BASE_DIR
from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import RESOURCETYPES, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.util.tools import ensure_dir
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from scrapers.util import Scraper
def query( request ):
"""
Pubmed year by year results
# alist = [
# {'string': '2011[dp] serendipity', 'queryKey': '1',
# 'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
# {'string': '2012[dp] serendipity', 'queryKey': '1',
# 'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
# ... ]
(reused as thequeries in query_save)
"""
print(request.method)
alist = []
if request.method == "POST":
query = request.POST["query"]
N = int(request.POST["N"])
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR(scrap: pubmed stats): ",msg)
raise ValueError(msg)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
instancia = Scraper()
# serialFetcher (n_last_years, query, query_size)
alist = instancia.serialFetcher( 5, query , N )
data = alist
return JsonHttpResponse(data)
def save( request , project_id ) :
# implicit global session
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = (session.query( Node )
.filter(Node.id == project_id)
.filter(Node.typename == 'PROJECT')
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/auth/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
if request.method == "POST":
queries = request.POST["query"]
name = request.POST["string"]
# here we just realize queries already prepared by getGlobalStats
# ===> no need to repeat N parameter like in testISTEX <===
instancia = Scraper()
thequeries = json.loads(queries)
# fyi the sum of our prepared yearly proportional quotas
sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
print("Scrapping Pubmed: '%s' (N=%i)" % (name,sampled_sum))
urlreqs = []
for yearquery in thequeries:
urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
alist = ["tudo fixe" , "tudo bem"]
# corpus node instanciation as a Django model
corpus = Node(
name = name,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scraping data"
, "language_id" : None
}
)
session.add(corpus)
session.commit()
corpus_id = corpus.id
# """
# urlreqs: List of urls to query.
# - Then, to each url in urlreqs you do:
# eFetchResult = urlopen(url)
# eFetchResult.read() # this will output the XML... normally you write this to a XML-file.
# """
ensure_dir(request.user)
tasks = Scraper()
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
tasks.q.put( url ) #put a task in the queue
tasks.q.join() # wait until everything is finished
dwnldsOK = 0
for filename in tasks.firstResults :
print(filename)
if filename != False:
# add the uploaded resource to the corpus
corpus.add_resource( type = 3
, path = filename
)
dwnldsOK+=1
if dwnldsOK == 0 :
return JsonHttpResponse(["fail"])
try:
scheduled(parse_extract_indexhyperdata(corpus_id,))
except Exception as error:
print('WORKFLOW ERROR')
print(error)
sleep(1)
return HttpResponseRedirect('/projects/' + str(project_id))
data = alist
return JsonHttpResponse(data)
from django.conf.urls import url
import scrapers.pubmed as pubmed
import scrapers.istex as istex
#import scrapers.cern as cern
#import scrapers.hal as hal
# Scraping : getting data from external database
# Available databases : Pubmed, IsTex, (next: CERN)
# /!\ urls patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
, url(r'^pubmed/save/(\d+)' , pubmed.save )
, url(r'^istex/query$' , istex.query )
, url(r'^istex/save/(\d+)' , istex.save )
# TODO REST API for the scrapers
#, url(r'^rest$' , scraping.Target.as_view() )
,
]
#def count(keywords):
# return 42
#
#def query_save(keywords):
# return 'path/to/query.xml'
#
from gargantext.util.files import download
import sys
import time
import threading
from queue import Queue
from lxml import etree
if sys.version_info >= (3, 0):
from urllib.request import urlopen
else:
from urllib import urlopen
class Scraper :
def __init__(self):
self.queue_size = 8
self.q = Queue()
self.firstResults = []
self.lock = threading.Lock() # lock to serialize console output
self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
self.pubMedDB = 'Pubmed'
self.reportType = 'medline'
# Return the globalResults!:
# - count =
# - queryKey =
# - webEnv =
def medlineEsearch(self , query):
# print ("MedlineFetcher::medlineEsearch :")
"Get number of results for query 'query' in variable 'count'"
"Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
# print(query)
origQuery = query
query = query.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
% ( self.pubMedEutilsURL, self.pubMedDB, query )
try:
eSearchResult = urlopen(eSearch)
data = eSearchResult.read()
root = etree.XML(data)
findcount = etree.XPath("/eSearchResult/Count/text()")
count = findcount(root)[0]
findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
queryKey = findquerykey(root)[0]
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
except Exception as Error:
print(Error)
count = 0
queryKey = False
webEnv = False
origQuery = False
values = { "query" : origQuery
, "count" : int(count)
, "queryKey" : queryKey
, "webEnv" : webEnv
}
return values
# RETMAX:
# Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
# maximum of 100,000 records
def medlineEfetchRAW( self , fullquery):
query = fullquery [ "string" ]
retmax = fullquery [ "retmax" ]
count = fullquery [ "count" ]
queryKey = fullquery [ "queryKey"]
webEnv = fullquery [ "webEnv" ]
"Fetch medline result for query 'query', saving results to file every 'retmax' articles"
queryNoSpace = query.replace(' ', '') # No space in directory and file names, avoids stupid errors
# print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
retstart = 0
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
return eFetch
# generic!
def download(self, url):
print(url)
filename = download(url)
with self.lock:
print(threading.current_thread().name, filename+" OK")
return filename
# generic!
def do_work(self,item):
# time.sleep(1) # pretend to do some lengthy work.
returnvalue = self.medlineEsearch(item)
with self.lock:
# print(threading.current_thread().name, item)
return returnvalue
# The worker thread pulls an item from the queue and processes it
def worker(self):
while True:
item = self.q.get()
self.firstResults.append(self.do_work(item))
self.q.task_done()
def worker2(self):
while True:
item = self.q.get()
results = []
try:
result = self.download(item)
except Exception as error :
print(error)
result = False
self.firstResults.append(result)
self.q.task_done()
def chunks(self , l , n):
print("chunks:")
for i in range(0, len(l), n):
yield l[i:i+n]
# GLOBALLIMIT:
# I will retrieve this exact amount of publications.
# The publications per year i'll retrieve per year will be :
# (k/N)*GlobalLimit
# \_ this is used as RETMAX
# - k : Number of publications of x year (according to pubmed)
# - N : Sum of every k belonging to {X} (total number of pubs according to pubmed)
# - GlobalLimit : Number of publications i want.
def serialFetcher(self , yearsNumber , query, globalLimit):
# Create the queue and thread pool.
for i in range(self.queue_size):
t = threading.Thread(target=self.worker)
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
start = time.perf_counter()
N = 0
# print ("MedlineFetcher::serialFetcher :")
thequeries = []
globalresults = []
for i in range(yearsNumber):
year = str(2015 - i)
# print ('YEAR ' + year)
# print ('---------\n')
pubmedquery = str(year) + '[dp] '+query
self.q.put( pubmedquery ) #put task in the queue
self.q.join()
print('time:',time.perf_counter() - start)
Total = 0
Fails = 0
for globalresults in self.firstResults:
# globalresults = self.medlineEsearch(pubmedquery)
Total += 1
if globalresults["queryKey"]==False:
Fails += 1
if globalresults["count"] > 0 :
N+=globalresults["count"]
queryhyperdata = { "string" : globalresults["query"]
, "count" : globalresults["count"]
, "queryKey" : globalresults["queryKey"]
, "webEnv" : globalresults["webEnv"]
, "retmax" : 0
}
thequeries.append ( queryhyperdata )
print("Total Number:", N,"publications")
print("And i want just:",globalLimit,"publications")
print("---------------------------------------\n")
for i,query in enumerate(thequeries):
k = query["count"]
proportion = k/float(N)
retmax_forthisyear = int(round(globalLimit*proportion))
query["retmax"] = retmax_forthisyear
if query["retmax"] == 0 : query["retmax"]+=1
print(query["string"],"\t[",k,">",query["retmax"],"]")
if ((Fails+1)/(Total+1)) == 1 : # for identifying the epic fail or connection error
thequeries = [False]
return thequeries
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -260,7 +260,7 @@
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/project/"+projectid+"/pubmedquery/go",
url: window.location.origin+"/scrapers/pubmed/save/"+projectid,
data: pubmedifiedQuery,
type: 'POST',
beforeSend: function(xhr) {
......@@ -290,7 +290,7 @@
var theType = $("#id_type option:selected").html();
console.log("consoling the typeeee: ")
console.log(theType)
if(theType=="Pubmed (xml format)") doTheQuery();
if(theType=="Pubmed (XML format)") doTheQuery();
if(theType=="ISTex") {
var origQuery = $("#id_name").val()
console.log("printing the results:")
......@@ -329,10 +329,10 @@
var theType = $("#id_type option:selected").html();
if(theType=="Pubmed (xml format)") {
if(theType=="Pubmed (XML format)") {
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/pubmedquery",
url: window.location.origin+"/scrapers/pubmed/query",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
......@@ -370,10 +370,10 @@
}
if(theType=="ISTex") {
console.log(window.location.origin+"tests/istextquery")
console.log(window.location.origin+"scrapers/istex/query")
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/istextquery",
url: window.location.origin+"/scrapers/istex/query",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
......@@ -436,7 +436,7 @@
$( "#id_name" ).on('input',function(e){
console.log($(this).val())
if(theType=="Pubmed (xml format)")
if(theType=="Pubmed (XML format)")
testPUBMED( $(this).val() )
});
}
......@@ -504,7 +504,7 @@
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/project/"+projectid+"/ISTEXquery/go",
url: window.location.origin+"/scrapers/istex/save/"+projectid,
data: postQuery,
type: 'POST',
beforeSend: function(xhr) {
......
File mode changed from 100644 to 100755
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment