Commit f2f0ce75 authored by delanoe's avatar delanoe

[FEAT] Istex scraper ok, need parser now.

parent 9eead9fa
......@@ -160,10 +160,10 @@ RESOURCETYPES = [
'parser': CSVParser,
'default_language': 'en',
},
# { 'name': 'ISTex',
# # 'parser': ISTexParser,
# 'default_language': 'en',
# },
{ 'name': 'ISTex',
'parser': ISTexParser,
'default_language': 'en',
},
]
# linguistic extraction parameters ---------------------------------------------
......
......@@ -4,7 +4,7 @@ from datetime import datetime
from io import BytesIO
import json
class ISTex(Parser):
class ISTexParser(Parser):
def parse(self, thefile):
json_data=open(thefile,"r")
......
......@@ -7,5 +7,5 @@ from .Pubmed import PubmedParser
# # 2015-12-08: 2-in-1 parser
from .Europress import EuropressParser
# from .ISTex import ISTexParser
from .ISTex import ISTexParser
from .CSV import CSVParser
......@@ -142,6 +142,13 @@ class MedlineFetcher:
self.firstResults.append(result)
self.q.task_done()
def chunks(self , l , n):
    """
    Yield successive slices of ``l`` holding at most ``n`` items each.

    ``l`` must support ``len()`` and slicing (a list, or a ``range`` object);
    each chunk is whatever slicing ``l`` produces, and the last chunk may be
    shorter than ``n``. An empty ``l`` yields nothing.

    This is a generator: the trace line is printed, and any error is raised,
    when iteration starts rather than when the method is called.

    Raises:
        ValueError: if ``n`` is smaller than 1. A zero step already made
            ``range`` raise; a negative one used to produce an empty range,
            which silently dropped every item.
    """
    print("chunks:")
    if n < 1:
        raise ValueError("chunk size must be a positive integer, got %r" % (n,))
    for start in range(0, len(l), n):
        yield l[start:start + n]
# GLOBALLIMIT:
# I will retrieve exactly this number of publications.
# The number of publications I'll retrieve per year will be:
......
def getGlobalStatsISTEXT(request ):
    """
    ISTEX: forward a query to the ISTEX API and hand back its raw answer
    (not reused in testISTEX).

    For a POST request, ``query`` and ``N`` are read from ``request.POST``,
    the query is URL-encoded and sent to ``api.istex.fr`` through
    ``MedlineFetcher.download``, and the downloaded file is decoded as UTF-8
    text. If the download or the read fails, the payload becomes a
    one-element list holding the error message. For any other HTTP method a
    placeholder list is returned.

    Returns:
        JsonHttpResponse wrapping the decoded text, the error list, or the
        placeholder list.

    Raises:
        ValueError: if the posted ``N`` is not an integer literal.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote_plus

    print(request.method)
    alist = ["bar","foo"]  # placeholder payload for non-POST requests

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])  # only logged here, not sent to the API
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )

        # quote_plus still maps spaces to "+", but also escapes "&", "#",
        # quotes and non-ASCII characters that would corrupt the URL.
        query_string = quote_plus(query)
        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"

        tasks = MedlineFetcher()
        try:
            thedata_path = tasks.download( url )
            # Context manager so the file handle is always released.
            with open(thedata_path, "rb") as thedata:
                alist = thedata.read().decode('utf-8')
        except Exception as error:
            # Best effort: report the failure to the client instead of a 500.
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)
def testISTEX(request , project_id):
    """
    Scrape ISTEX for a posted query and attach the downloads to a new corpus.

    The project id is validated, the project is looked up through the
    implicit global ``session``, and the requesting user is checked. On POST
    the function builds paged ISTEX URLs (50 documents per page, up to ``N``),
    creates a CORPUS node under the project, downloads the pages with eight
    daemon worker threads, registers each successful download as a corpus
    resource, and runs the parsing workflow.

    Returns:
        - a redirect to the auth page if the user is not authenticated;
        - HttpResponseForbidden if the project belongs to another user;
        - JsonHttpResponse(["fail"]) if no download succeeded;
        - a redirect to ``/project/<id>`` after a successful POST;
        - JsonHttpResponse([query_string, query, N]) with default values for
          any non-POST request.

    Raises:
        Http404: if ``project_id`` is not an integer or no such project exists.
        ValueError: if ``N`` is not an integer or exceeds QUERY_SIZE_N_MAX.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote_plus

    print("testISTEX:")
    print(request.method)

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    # Bound before the method check: the fallback response at the bottom
    # reads these names for non-POST requests as well.
    query = "-"
    query_string = "-"
    N = 0

    if request.method == "POST":
        if "query" in request.POST:
            query = request.POST["query"]
            # Real URL encoding: spaces still become "+", and "&", "#",
            # quotes and non-ASCII characters are escaped too.
            query_string = quote_plus(query)

        if "N" in request.POST:
            N = int(request.POST["N"]) # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ): ",msg)
                raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string , N))

        # One URL per page; the last page may be shorter, so the chunk's
        # own length is used as the page size.
        urlreqs = []
        pagesize = 50
        tasks = MedlineFetcher()
        for page in tasks.chunks(range(N), pagesize):
            urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(page[0])+"&size="+str(len(page)))

        # NOTE(review): the RESOURCETYPES shown elsewhere in this change is a
        # list of dicts; indexing it with strings would fail on such a list,
        # confirm what this name is bound to in this module.
        resourcetype = RESOURCETYPES["name"]["ISTex"]

        # corpus node instantiation
        corpus = Node(
            name = query,
            user_id = request.user.id,
            parent_id = project_id,
            typename = 'CORPUS',
            language_id = None,
            hyperdata = {'Processing' : "Parsing documents",}
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        print("NEW CORPUS", corpus_id)

        ensure_dir(request.user)

        # Fresh fetcher whose queue is consumed by eight daemon workers.
        tasks = MedlineFetcher()
        for _ in range(8):
            t = threading.Thread(target=tasks.worker2) #thing to do
            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
            tasks.q.put( [url , filename]) #put a task in the queue
        tasks.q.join() # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            # Skip entries equal to False; presumably worker2 stores False
            # for a failed download, verify against MedlineFetcher.
            if filename != False:
                # add the downloaded resource to the corpus
                corpus.add_resource(corpus,
                    user_id = request.user.id,
                    type_id = resourcetype.id,
                    file = filename,
                )
                dwnldsOK += 1

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        try:
            # NOTE(review): parse_extract_indexhyperdata is called right here
            # and its return value is handed to scheduled(); confirm that is
            # intended rather than scheduled(parse_extract_indexhyperdata)(corpus_id).
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/project/' + str(project_id))

    data = [query_string,query,N]
    return JsonHttpResponse(data)
......@@ -72,7 +72,6 @@ def getGlobalStats( request ):
return JsonHttpResponse(data)
def doTheQuery( request , project_id ) :
# implicit global session
# do we have a valid project id?
......@@ -174,4 +173,144 @@ def doTheQuery( request , project_id ) :
return JsonHttpResponse(data)
def getGlobalStatsISTEXT(request ):
    """
    ISTEX: forward a query to the ISTEX API and hand back its raw answer
    (not reused in testISTEX).

    For a POST request, ``query`` and ``N`` are read from ``request.POST``,
    the query is URL-encoded and sent to ``api.istex.fr`` through
    ``MedlineFetcher.download``, and the downloaded file is decoded as UTF-8
    text. If the download or the read fails, the payload becomes a
    one-element list holding the error message. For any other HTTP method a
    placeholder list is returned.

    Returns:
        JsonHttpResponse wrapping the decoded text, the error list, or the
        placeholder list.

    Raises:
        ValueError: if the posted ``N`` is not an integer literal.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote_plus

    print(request.method)
    alist = ["bar","foo"]  # placeholder payload for non-POST requests

    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])  # only logged here, not sent to the API
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )

        # quote_plus still maps spaces to "+", but also escapes "&", "#",
        # quotes and non-ASCII characters that would corrupt the URL.
        query_string = quote_plus(query)
        url = "http://api.istex.fr/document/?q="+query_string+"&output=id,title,abstract,pubdate,corpusName,authors,language"

        tasks = MedlineFetcher()
        try:
            thedata_path = tasks.download( url )
            # Context manager so the file handle is always released.
            with open(thedata_path, "rb") as thedata:
                alist = thedata.read().decode('utf-8')
        except Exception as error:
            # Best effort: report the failure to the client instead of a 500.
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)
def testISTEX(request , project_id):
    """
    Scrape ISTEX for a posted query and attach the downloads to a new corpus.

    The project id is validated, the project is looked up through the
    implicit global ``session``, and the requesting user is checked. On POST
    the function builds paged ISTEX URLs (50 documents per page, up to ``N``),
    creates a CORPUS node under the project, downloads the pages with eight
    daemon worker threads, registers each successful download as a corpus
    resource, and runs the parsing workflow.

    Returns:
        - a redirect to the auth page if the user is not authenticated;
        - HttpResponseForbidden if the project belongs to another user;
        - JsonHttpResponse(["fail"]) if no download succeeded;
        - a redirect to ``/projects/<id>`` after a successful POST;
        - JsonHttpResponse([query_string, query, N]) with default values for
          any non-POST request.

    Raises:
        Http404: if ``project_id`` is not an integer or no such project exists.
        ValueError: if ``N`` is not an integer or exceeds QUERY_SIZE_N_MAX.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote_plus

    print("testISTEX:")
    print(request.method)

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.typename == 'PROJECT')
    ).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    # Bound before the method check: the fallback response at the bottom
    # reads these names for non-POST requests as well.
    query = "-"
    query_string = "-"
    N = 0

    if request.method == "POST":
        if "query" in request.POST:
            query = request.POST["query"]
            # Real URL encoding: spaces still become "+", and "&", "#",
            # quotes and non-ASCII characters are escaped too.
            query_string = quote_plus(query)

        if "N" in request.POST:
            N = int(request.POST["N"]) # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l ): ",msg)
                raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string , N))

        # One URL per page; the last page may be shorter, so the chunk's
        # own length is used as the page size.
        urlreqs = []
        pagesize = 50
        tasks = MedlineFetcher()
        for page in tasks.chunks(range(N), pagesize):
            urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(page[0])+"&size="+str(len(page)))

        # corpus node instantiation
        corpus = Node(
            name = query,
            user_id = request.user.id,
            parent_id = project_id,
            typename = 'CORPUS',
            hyperdata = { "action" : "Scraping data"
                        , "language_id" : None
                        }
        )
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        print("NEW CORPUS", corpus_id)

        ensure_dir(request.user)

        # Fresh fetcher whose queue is consumed by eight daemon workers.
        tasks = MedlineFetcher()
        for _ in range(8):
            t = threading.Thread(target=tasks.worker2) #thing to do
            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put( url ) #put a task in the queue
        tasks.q.join() # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            # Skip entries equal to False; presumably worker2 stores False
            # for a failed download, verify against MedlineFetcher.
            if filename != False:
                # add the downloaded resource to the corpus
                # NOTE(review): 3 is a hard-coded resource type, presumably
                # the position of ISTex in RESOURCETYPES; confirm.
                corpus.add_resource( type = 3
                                   , path = filename
                                   )
                dwnldsOK += 1

        if dwnldsOK == 0 :
            return JsonHttpResponse(["fail"])

        try:
            # NOTE(review): parse_extract_indexhyperdata is called right here
            # and its return value is handed to scheduled(); confirm that is
            # intended rather than scheduled(parse_extract_indexhyperdata)(corpus_id).
            scheduled(parse_extract_indexhyperdata(corpus_id,))
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = [query_string,query,N]
    return JsonHttpResponse(data)
......@@ -2,6 +2,7 @@ from django.conf.urls import url
import scrapers.pubmed as pubmed
#import scrapers.istex as istex
#import scrapers.cern as cern
#import scrapers.hal as hal
......@@ -13,8 +14,8 @@ import scrapers.pubmed as pubmed
# Routes of the scrapers app. The "search" routes capture a run of digits,
# which the views receive as their project_id argument.
urlpatterns = [
    url(r'^pubmed/query$', pubmed.getGlobalStats),
    url(r'^pubmed/search/(\d+)', pubmed.doTheQuery),
    # The ISTEX views currently live in the pubmed module.
    url(r'^istex/query$', pubmed.getGlobalStatsISTEXT),
    url(r'^istex/search/(\d+)', pubmed.testISTEX),
    # Not enabled yet:
    # url(r'^scraping$', scraping.Target.as_view()),
]
......@@ -370,10 +370,10 @@
}
if(theType=="ISTex") {
console.log(window.location.origin+"tests/istextquery")
console.log(window.location.origin+"scrapers/istex/query")
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/istextquery",
url: window.location.origin+"/scrapers/istex/query",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
......@@ -504,7 +504,7 @@
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/project/"+projectid+"/ISTEXquery/go",
url: window.location.origin+"/scrapers/istex/search/"+projectid,
data: postQuery,
type: 'POST',
beforeSend: function(xhr) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment