Commit 45a1fa9d authored by PkSM3

[UPDATE] istex|pubmed scrapper

parent 119671ab
@@ -70,6 +70,7 @@ urlpatterns = patterns('',
     url(r'^tests/mvc$', views.tests_mvc),
     url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
+    url(r'^tests/istextquery$', pubmedscrapper.getGlobalStatsISTEXT),
     url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
     url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
     url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)
...
@@ -125,8 +125,8 @@ class CustomForm(forms.Form):
         # file_.name = str(datetime.now().microsecond)
         # # raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name))
         # #File size
-        if len(file_)>104857600:
-            raise forms.ValidationError(_('File to heavy! (<100MB).'))
+        # if len(file_)>104857600:
+        #     raise forms.ValidationError(_('File to heavy! (<100MB).'))
         ## File type:
         # if file_.content_type == "application/zip":
         #     raise forms.ValidationError(_('We need a zip pls.'))
...
@@ -81,7 +81,6 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet):
         metadata_cache = {metadata.name: metadata for metadata in Metadata.objects.all()}
         data = []
         for node in self:
-            print(node.id)
             for key, value in node.metadata.items():
                 if key in metadata_cache:
                     metadata = metadata_cache[key]
@@ -249,13 +248,14 @@ class Node(CTENode):
     @current_app.task(filter=task_method)
     def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
         import time
+        total = 0
         print("LOG::TIME: In workflow() parse_resources()")
         start = time.time()
         self.metadata['Processing'] = 1
         self.save()
         self.parse_resources()
         end = time.time()
+        total += (end - start)
         print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources() [s]",(end - start))
         print("LOG::TIME: In workflow() / parse_resources()")
@@ -266,7 +266,7 @@ class Node(CTENode):
         self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
         end = time.time()
         print("- - - - - - - - - - \n")
-        print ("LOG::TIME: ",(end - start))
+        total += (end - start)
         print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams() [s]",(end - start))
         print("LOG::TIME: In workflow() / extract_ngrams()")
@@ -275,9 +275,9 @@ class Node(CTENode):
         from analysis.functions import do_tfidf
         do_tfidf(self)
         end = time.time()
+        total += (end - start)
         print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
         print("LOG::TIME: In workflow() / do_tfidf()")
         print("In workflow() END")
         self.metadata['Processing'] = 0
         self.save()
...
@@ -80,21 +80,28 @@ class PubmedFileParser(FileParser):
             if len(RealDate)>4:
                 if len(RealDate)>8:
                     try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
-                    except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
+                    except:
+                        try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
+                        except: Decision=False
                 else:
                     try: Decision = datetime.strptime(RealDate, '%Y %b').date()
-                    except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
-            else: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
+                    except:
+                        try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
+                        except: Decision=False
+            else:
+                try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
+                except: Decision=False

-            if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
-            if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
-            if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
-            if "realdate_year_" in metadata: metadata.pop("realdate_year_")
-            if "realdate_month_" in metadata: metadata.pop("realdate_month_")
-            if "realdate_day_" in metadata: metadata.pop("realdate_day_")
-            if "title2" in metadata: metadata.pop("title2")
-            # print(metadata)
-            metadata_list.append(metadata)
+            if Decision!=False:
+                if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
+                if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
+                if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
+                if "realdate_year_" in metadata: metadata.pop("realdate_year_")
+                if "realdate_month_" in metadata: metadata.pop("realdate_month_")
+                if "realdate_day_" in metadata: metadata.pop("realdate_day_")
+                if "title2" in metadata: metadata.pop("title2")
+                # print(metadata)
+                metadata_list.append(metadata)

         # return the list of metadata
         return metadata_list
@@ -105,6 +105,13 @@ class MedlineFetcher:
         print(threading.current_thread().name, filename+" OK")
         return filename

+    # generic!
+    def test_downloadFile(self, item):
+        url = item[0]
+        filename = item[1]
+        print("\tin downloadFile:")
+        data = urlopen(url)
+        return data
+
     # generic!
     def do_work(self,item):
@@ -124,7 +131,10 @@ class MedlineFetcher:
     def worker2(self):
         while True:
             item = self.q.get()
-            self.firstResults.append(self.downloadFile(item))
+            results = []
+            try: result = self.downloadFile(item)
+            except: result = False
+            self.firstResults.append(result)
             self.q.task_done()

     def chunks(self , l , n):
...
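Note: worker2 is the queue consumer that fills firstResults, which doTheQuery later filters on != False (see the hunk below). A rough sketch of how such a consumer is typically wired to a Queue with daemon threads; the constructor wiring is an assumption, since MedlineFetcher's setup is not shown in this diff:

import threading
from queue import Queue

class FetcherSketch:
    # Hypothetical reduction of the queue/worker pattern used by MedlineFetcher.
    def __init__(self, num_workers=4):
        self.q = Queue()
        self.firstResults = []
        for _ in range(num_workers):
            t = threading.Thread(target=self.worker2)
            t.daemon = True          # workers exit with the main process
            t.start()

    def downloadFile(self, item):
        raise NotImplementedError    # the real fetcher downloads item[0] into item[1]

    def worker2(self):
        while True:
            item = self.q.get()
            try:
                result = self.downloadFile(item)
            except Exception:
                result = False       # sentinel checked later with "!= False"
            self.firstResults.append(result)
            self.q.task_done()       # lets q.join() unblock once every item is processed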
@@ -43,6 +43,32 @@ def getGlobalStats(request ):
     return JsonHttpResponse(data)

+def getGlobalStatsISTEXT(request ):
+    print(request.method)
+    alist = ["bar","foo"]
+
+    if request.method == "POST":
+        N = 100
+        query = request.POST["query"]
+        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
+        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
+        query_string = query.replace(" ","+")
+        url = "http://api.istex.fr/document/?q="+query_string+"&output=*"
+        tasks = MedlineFetcher()
+
+        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
+        try:
+            thedata = tasks.test_downloadFile( [url,filename] )
+            alist = thedata.read().decode('utf-8')
+        except Exception as error:
+            alist = [str(error)]
+
+    data = alist
+    return JsonHttpResponse(data)
+
 def doTheQuery(request , project_id):
     alist = ["hola","mundo"]
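Note: getGlobalStatsISTEXT simply forwards one GET request to the ISTEX API and returns the raw body. A quick sketch of the same call outside Django, using the URL shape from the view; the query value is only an example:

from urllib.request import urlopen

query = "climate change"                    # example query, not from the commit
query_string = query.replace(" ", "+")      # same naive encoding as the view
url = "http://api.istex.fr/document/?q=" + query_string + "&output=*"
with urlopen(url) as response:
    body = response.read().decode('utf-8')  # the view returns this string as-is
print(body[:200])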
@@ -97,8 +123,13 @@ def doTheQuery(request , project_id):
             filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
             tasks.q.put( [url , filename]) #put a task in th queue
         tasks.q.join() # wait until everything is finished
+        dwnldsOK = 0
         for filename in tasks.firstResults:
-            corpus.add_resource( user=request.user, type=resource_type, file=filename )
+            if filename!=False:
+                corpus.add_resource( user=request.user, type=resource_type, file=filename )
+                dwnldsOK+=1
+        if dwnldsOK == 0: return JsonHttpResponse(["fail"])

         # do the WorkFlow
         try:
@@ -146,7 +177,8 @@ def testISTEX(request , project_id):
             urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
         print(urlreqs)

-        # urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
+        urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
+        print(urlreqs)

         resource_type = ResourceType.objects.get(name="istext" )
...
@@ -313,44 +313,77 @@
         console.log("disabling "+"#"+value.id)
         $("#"+value.id).prop('onclick',null);

-        $.ajax({
-            // contentType: "application/json",
-            url: window.location.origin+"/tests/pubmedquery",
-            data: formData,
-            type: 'POST',
-            beforeSend: function(xhr) {
-                xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
-            },
-            success: function(data) {
-                console.log("in getGlobalResults")
-                console.log(data)
-                console.log("enabling "+"#"+value.id)
-                $("#"+value.id).attr('onclick','getGlobalResults(this);');
-                // $("#submit_thing").prop('disabled' , false)
-                $("#submit_thing").html("Process a 100 sample!")
-
-                thequeries = data
-                var N=0,k=0;
-                for(var i in thequeries) N += thequeries[i].count
-                if( N>0) {
-                    $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications in the last 5 years</i><br>")
-                    $('#submit_thing').prop('disabled', false);
-                } else {
-                    $("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
-                    $('#submit_thing').prop('disabled', true);
-                }
-            },
-            error: function(result) {
-                console.log("Data not found");
-            }
-        });
+        var theType = $("#id_type option:selected").html();
+
+        if(theType=="pubmed") {
+            $.ajax({
+                // contentType: "application/json",
+                url: window.location.origin+"/tests/pubmedquery",
+                data: formData,
+                type: 'POST',
+                beforeSend: function(xhr) {
+                    xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
+                },
+                success: function(data) {
+                    console.log("in getGlobalResults")
+                    console.log(data)
+                    console.log("enabling "+"#"+value.id)
+                    $("#"+value.id).attr('onclick','getGlobalResults(this);');
+                    // $("#submit_thing").prop('disabled' , false)
+                    $("#submit_thing").html("Process a 100 sample!")
+
+                    thequeries = data
+                    var N=0,k=0;
+                    for(var i in thequeries) N += thequeries[i].count
+                    if( N>0) {
+                        $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications in the last 5 years</i><br>")
+                        $('#submit_thing').prop('disabled', false);
+                    } else {
+                        $("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
+                        $('#submit_thing').prop('disabled', true);
+                    }
+                },
+                error: function(result) {
+                    console.log("Data not found");
+                }
+            });
+        }
+
+        if(theType=="istext") {
+            console.log(window.location.origin+"tests/istextquery")
+            $.ajax({
+                // contentType: "application/json",
+                url: window.location.origin+"/tests/istextquery",
+                data: formData,
+                type: 'POST',
+                beforeSend: function(xhr) {
+                    xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
+                },
+                success: function(data) {
+                    console.log("in getGlobalResults")
+                    console.log(data)
+                    console.log("enabling "+"#"+value.id)
+                    $("#"+value.id).attr('onclick','getGlobalResults(this);');
+                    // $("#submit_thing").prop('disabled' , false)
+                    $("#submit_thing").html("Process a 100 sample!")
+                    $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+data[0]+"</i><br>")
+                    thequeries = data
+                },
+                error: function(result) {
+                    console.log("Data not found");
+                }
+            });
+        }
     }

     // CSS events for selecting one Radio-Input
     function FileOrNotFile( value ) {
         var showfile = JSON.parse(value)
+        var theType = $("#id_type option:selected").html();

         // @upload-file events
         if (showfile) {
             console.log("You've clicked the YES")
@@ -376,7 +409,7 @@
             $( "#id_name" ).on('input',function(e){
                 console.log($(this).val())
-                testAjax( $(this).val() )
+                if(theType=="pubmed") testPUBMED( $(this).val() )
             });
         }
     }
@@ -384,8 +417,8 @@
     //CSS events for changing the Select element
     function CustomForSelect( selected ) {
         // show Radio-Inputs and trigger FileOrNotFile>@upload-file events
-        if(selected=="pubmed") {
-            console.log("show the button")
+        if(selected=="pubmed" || selected=="istext") {
+            console.log("show the button for: "+selected)
             $("#pubmedcrawl").css("visibility", "visible");
             $("#pubmedcrawl").show();
             $("#file_yes").click();
@@ -414,7 +447,7 @@
         return data;
     }

-    function testAjax( query ) {
+    function testPUBMED( query ) {
         LastData = []
         if(!query || query=="") return;
         var pubmedquery = encodeURIComponent(query)
@@ -450,7 +483,7 @@
             success: function(data) {
                 console.log("ajax_success: in testISTEX()")
                 console.log(data)
-                location.reload();
+                // location.reload();
             },
             error: function(result) {
                 console.log("in testISTEX(). Data not found");
...