Commit 851fb664 authored by Administrator's avatar Administrator

Merge branch 'testing' into stable

parents f08ee762 7476e4c0
import gzip
import os
import shutil
import subprocess

# Dump the "gargandb" PostgreSQL database into a gzip-compressed file.
#
# The database password is deliberately NOT written here any more: pg_dump
# picks it up from the PGPASSWORD environment variable (inherited from the
# calling environment) or from the user's ~/.pgpass file.
# NOTE(review): the password that used to be embedded in this script has been
# committed to version control and should be rotated.
dump_path = "mysqldump.db"  # historical name; the content is a gzipped pg_dump
partial_path = dump_path + ".part"

# Run pg_dump directly (no shell) so its exit status can be checked; the
# previous "pg_dump | gzip" pipeline reported success even when pg_dump failed.
dumper = subprocess.Popen(
    ["pg_dump", "-U", "alexandre", "-h", "localhost", "gargandb"],
    stdout=subprocess.PIPE,
)
try:
    # compresslevel=6 matches the default of the gzip command-line tool.
    with gzip.open(partial_path, "wb", compresslevel=6) as archive:
        shutil.copyfileobj(dumper.stdout, archive)
finally:
    dumper.stdout.close()
    status = dumper.wait()

if status != 0:
    # Keep any previous good dump instead of replacing it with a truncated one.
    os.remove(partial_path)
    raise SystemExit(
        "pg_dump exited with status %d; %s left untouched" % (status, dump_path)
    )
os.replace(partial_path, dump_path)
...@@ -70,6 +70,7 @@ urlpatterns = patterns('', ...@@ -70,6 +70,7 @@ urlpatterns = patterns('',
url(r'^tests/mvc$', views.tests_mvc), url(r'^tests/mvc$', views.tests_mvc),
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments), url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
url(r'^tests/istextquery$', pubmedscrapper.getGlobalStatsISTEXT),
url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats), url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery), url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX) url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)
......
...@@ -125,8 +125,8 @@ class CustomForm(forms.Form): ...@@ -125,8 +125,8 @@ class CustomForm(forms.Form):
# file_.name = str(datetime.now().microsecond) # file_.name = str(datetime.now().microsecond)
# # raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name)) # # raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name))
# #File size # #File size
if len(file_)>104857600: # if len(file_)>104857600:
raise forms.ValidationError(_('File to heavy! (<100MB).')) # raise forms.ValidationError(_('File to heavy! (<100MB).'))
## File type: ## File type:
# if file_.content_type == "application/zip": # if file_.content_type == "application/zip":
# raise forms.ValidationError(_('We need a zip pls.')) # raise forms.ValidationError(_('We need a zip pls.'))
......
...@@ -81,7 +81,6 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet): ...@@ -81,7 +81,6 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet):
metadata_cache = {metadata.name: metadata for metadata in Metadata.objects.all()} metadata_cache = {metadata.name: metadata for metadata in Metadata.objects.all()}
data = [] data = []
for node in self: for node in self:
print(node.id)
for key, value in node.metadata.items(): for key, value in node.metadata.items():
if key in metadata_cache: if key in metadata_cache:
metadata = metadata_cache[key] metadata = metadata_cache[key]
...@@ -249,13 +248,14 @@ class Node(CTENode): ...@@ -249,13 +248,14 @@ class Node(CTENode):
@current_app.task(filter=task_method) @current_app.task(filter=task_method)
def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False): def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
import time import time
total = 0
print("LOG::TIME: In workflow() parse_resources()") print("LOG::TIME: In workflow() parse_resources()")
start = time.time() start = time.time()
self.metadata['Processing'] = 1 self.metadata['Processing'] = 1
self.save() self.save()
self.parse_resources() self.parse_resources()
end = time.time() end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources() [s]",(end - start)) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources() [s]",(end - start))
print("LOG::TIME: In workflow() / parse_resources()") print("LOG::TIME: In workflow() / parse_resources()")
...@@ -266,7 +266,7 @@ class Node(CTENode): ...@@ -266,7 +266,7 @@ class Node(CTENode):
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',]) self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
end = time.time() end = time.time()
print("- - - - - - - - - - \n") print("- - - - - - - - - - \n")
print ("LOG::TIME: ",(end - start)) total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams() [s]",(end - start)) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams() [s]",(end - start))
print("LOG::TIME: In workflow() / extract_ngrams()") print("LOG::TIME: In workflow() / extract_ngrams()")
...@@ -275,9 +275,9 @@ class Node(CTENode): ...@@ -275,9 +275,9 @@ class Node(CTENode):
from analysis.functions import do_tfidf from analysis.functions import do_tfidf
do_tfidf(self) do_tfidf(self)
end = time.time() end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start)) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
print("LOG::TIME: In workflow() / do_tfidf()") print("LOG::TIME: In workflow() / do_tfidf()")
print("In workflow() END") print("In workflow() END")
self.metadata['Processing'] = 0 self.metadata['Processing'] = 0
self.save() self.save()
......
...@@ -80,21 +80,28 @@ class PubmedFileParser(FileParser): ...@@ -80,21 +80,28 @@ class PubmedFileParser(FileParser):
if len(RealDate)>4: if len(RealDate)>4:
if len(RealDate)>8: if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y %b %d').date() try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date() except:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
else: else:
try: Decision = datetime.strptime(RealDate, '%Y %b').date() try: Decision = datetime.strptime(RealDate, '%Y %b').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date() except:
else: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date() try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
else:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year) if Decision!=False:
if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month) if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day) if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
if "realdate_year_" in metadata: metadata.pop("realdate_year_") if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
if "realdate_month_" in metadata: metadata.pop("realdate_month_") if "realdate_year_" in metadata: metadata.pop("realdate_year_")
if "realdate_day_" in metadata: metadata.pop("realdate_day_") if "realdate_month_" in metadata: metadata.pop("realdate_month_")
if "title2" in metadata: metadata.pop("title2") if "realdate_day_" in metadata: metadata.pop("realdate_day_")
if "title2" in metadata: metadata.pop("title2")
# print(metadata)
metadata_list.append(metadata) # print(metadata)
metadata_list.append(metadata)
# return the list of metadata # return the list of metadata
return metadata_list return metadata_list
...@@ -12,6 +12,7 @@ import time ...@@ -12,6 +12,7 @@ import time
from lxml import etree from lxml import etree
import datetime import datetime
from django.core.files import File from django.core.files import File
import codecs
import threading import threading
from queue import Queue from queue import Queue
...@@ -39,6 +40,7 @@ class MedlineFetcher: ...@@ -39,6 +40,7 @@ class MedlineFetcher:
"Get number of results for query 'query' in variable 'count'" "Get number of results for query 'query' in variable 'count'"
"Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'" "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
print(query)
origQuery = query origQuery = query
query = query.replace(' ', '%20') query = query.replace(' ', '%20')
...@@ -92,10 +94,10 @@ class MedlineFetcher: ...@@ -92,10 +94,10 @@ class MedlineFetcher:
def downloadFile(self, item): def downloadFile(self, item):
url = item[0] url = item[0]
filename = item[1] filename = item[1]
print("\tin downloadFile:") print("\tin test_downloadFile:")
print(url,filename) # print(url,filename)
data = urlopen(url) data = urlopen(url)
f = open(filename, 'w') f = codecs.open(filename, "w" ,encoding='utf-8')
myfile = File(f) myfile = File(f)
myfile.write( data.read().decode('utf-8') ) myfile.write( data.read().decode('utf-8') )
myfile.close() myfile.close()
...@@ -104,6 +106,13 @@ class MedlineFetcher: ...@@ -104,6 +106,13 @@ class MedlineFetcher:
print(threading.current_thread().name, filename+" OK") print(threading.current_thread().name, filename+" OK")
return filename return filename
# generic!
def test_downloadFile(self, item):
    """Open the URL carried by ``item`` and return the response unread.

    Nothing is written to disk.

    item: sequence whose first element is the URL to open. Any further
        element (callers pass a file name second) is ignored.

    Returns the object produced by ``urlopen(url)``; the caller is
    responsible for reading and closing it. Errors raised by ``urlopen``
    propagate to the caller.
    """
    url = item[0]
    # Label the log line with this method's own name.
    print("\tin test_downloadFile:")
    data = urlopen(url)
    return data
# generic! # generic!
def do_work(self,item): def do_work(self,item):
...@@ -123,7 +132,10 @@ class MedlineFetcher: ...@@ -123,7 +132,10 @@ class MedlineFetcher:
def worker2(self): def worker2(self):
while True: while True:
item = self.q.get() item = self.q.get()
self.firstResults.append(self.downloadFile(item)) results = []
try: result = self.downloadFile(item)
except: result = False
self.firstResults.append(result)
self.q.task_done() self.q.task_done()
def chunks(self , l , n): def chunks(self , l , n):
......
...@@ -43,6 +43,32 @@ def getGlobalStats(request ): ...@@ -43,6 +43,32 @@ def getGlobalStats(request ):
return JsonHttpResponse(data) return JsonHttpResponse(data)
def getGlobalStatsISTEXT(request ):
    """Forward a search query to the ISTEX API and relay the raw answer.

    request: the incoming HTTP request. Only POST requests trigger a lookup;
        the search text is read from ``request.POST["query"]``.

    Returns a ``JsonHttpResponse`` wrapping:
      * the ISTEX response body as a single string when the lookup succeeds,
      * a one-element list holding the error message when it fails,
      * the placeholder list ``["bar", "foo"]`` for non-POST requests.
    """
    # Local import so this function does not depend on the module's import block.
    from urllib.parse import quote_plus

    print(request.method)
    alist = ["bar","foo"]

    if request.method == "POST":
        N = 100
        query = request.POST["query"]
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )

        # Percent-encode the whole query (spaces still become "+") so that
        # characters such as "&", "#", "+" or accented letters cannot break
        # or alter the request URL.
        query_string = quote_plus(query)
        url = "http://api.istex.fr/document/?q="+query_string

        tasks = MedlineFetcher()
        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))

        try:
            thedata = tasks.test_downloadFile( [url,filename] )
            try:
                alist = thedata.read().decode('utf-8')
            finally:
                # Always release the HTTP connection, even if reading or
                # decoding fails.
                thedata.close()
        except Exception as error:
            # Report the failure to the client instead of answering with a 500.
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)
def doTheQuery(request , project_id): def doTheQuery(request , project_id):
alist = ["hola","mundo"] alist = ["hola","mundo"]
...@@ -85,36 +111,36 @@ def doTheQuery(request , project_id): ...@@ -85,36 +111,36 @@ def doTheQuery(request , project_id):
corpus.save() corpus.save()
try: tasks = MedlineFetcher()
tasks = MedlineFetcher() for i in range(8):
tasks.ensure_dir ( MEDIA_ROOT + '/corpora/'+str(request.user)+"/" ) t = threading.Thread(target=tasks.worker2) #thing to do
# configuring your queue with the event t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
for i in range(8): t.start()
t = threading.Thread(target=tasks.worker2) #thing to do for url in urlreqs:
t.daemon = True # thread dies when main thread (only non-daemon thread) exits. filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
t.start() tasks.q.put( [url , filename]) #put a task in th queue
for url in urlreqs: tasks.q.join() # wait until everything is finished
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
tasks.q.put( [url , filename]) #put a task in th queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
else:
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"]) dwnldsOK = 0
except Exception as error: for filename in tasks.firstResults:
print(error) if filename!=False:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
dwnldsOK+=1
if dwnldsOK == 0: return JsonHttpResponse(["fail"])
return JsonHttpResponse(["workflow","finished","outside the try-except"]) # do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
else:
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
except Exception as error: except Exception as error:
print("lele",error) print(error)
return JsonHttpResponse(["workflow","finished","outside the try-except"])
data = alist data = alist
return JsonHttpResponse(data) return JsonHttpResponse(data)
...@@ -146,7 +172,8 @@ def testISTEX(request , project_id): ...@@ -146,7 +172,8 @@ def testISTEX(request , project_id):
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize)) urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
print(urlreqs) print(urlreqs)
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ] urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
print(urlreqs)
resource_type = ResourceType.objects.get(name="istext" ) resource_type = ResourceType.objects.get(name="istext" )
......
...@@ -313,44 +313,87 @@ ...@@ -313,44 +313,87 @@
console.log("disabling "+"#"+value.id) console.log("disabling "+"#"+value.id)
$("#"+value.id).prop('onclick',null); $("#"+value.id).prop('onclick',null);
$.ajax({ var theType = $("#id_type option:selected").html();
// contentType: "application/json",
url: window.location.origin+"/tests/pubmedquery", if(theType=="pubmed") {
data: formData, $.ajax({
type: 'POST', // contentType: "application/json",
beforeSend: function(xhr) { url: window.location.origin+"/tests/pubmedquery",
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken")); data: formData,
}, type: 'POST',
success: function(data) { beforeSend: function(xhr) {
console.log("in getGlobalResults") xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
console.log(data) },
console.log("enabling "+"#"+value.id) success: function(data) {
$("#"+value.id).attr('onclick','getGlobalResults(this);'); console.log("in getGlobalResults")
// $("#submit_thing").prop('disabled' , false) console.log(data)
$("#submit_thing").html("Process a 100 sample!") console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
thequeries = data // $("#submit_thing").prop('disabled' , false)
var N=0,k=0; $("#submit_thing").html("Process a 100 sample!")
for(var i in thequeries) N += thequeries[i].count thequeries = data
if( N>0) { var N=0,k=0;
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications in the last 5 years</i><br>")
$('#submit_thing').prop('disabled', false); for(var i in thequeries) N += thequeries[i].count
} else { if( N>0) {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>") $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications in the last 5 years</i><br>")
$('#submit_thing').prop('disabled', true); $('#submit_thing').prop('disabled', false);
} } else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
$('#submit_thing').prop('disabled', true);
}
}, },
error: function(result) { error: function(result) {
console.log("Data not found"); console.log("Data not found");
} }
}); });
}
if(theType=="istext") {
console.log(window.location.origin+"tests/istextquery")
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/istextquery",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("in getGlobalResults")
console.log(data)
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 100 sample!")
thequeries = data
var N=data.length,k=0;
console.log("N: "+N)
// for(var i in thequeries) N += thequeries[i].count
if( N>1) {
var total = JSON.parse(data).total
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+total+" publications.</i><br>")
$('#submit_thing').prop('disabled', false);
} else {
$("#theresults").html("<i> <b>"+data[0]+"</b></i><br>")
$('#submit_thing').prop('disabled', true);
}
},
error: function(result) {
console.log("Data not found");
}
});
}
} }
// CSS events for selecting one Radio-Input // CSS events for selecting one Radio-Input
function FileOrNotFile( value ) { function FileOrNotFile( value ) {
var showfile = JSON.parse(value) var showfile = JSON.parse(value)
var theType = $("#id_type option:selected").html();
// @upload-file events // @upload-file events
if (showfile) { if (showfile) {
console.log("You've clicked the YES") console.log("You've clicked the YES")
...@@ -376,7 +419,7 @@ ...@@ -376,7 +419,7 @@
$( "#id_name" ).on('input',function(e){ $( "#id_name" ).on('input',function(e){
console.log($(this).val()) console.log($(this).val())
testAjax( $(this).val() ) if(theType=="pubmed") testPUBMED( $(this).val() )
}); });
} }
} }
...@@ -384,8 +427,8 @@ ...@@ -384,8 +427,8 @@
//CSS events for changing the Select element //CSS events for changing the Select element
function CustomForSelect( selected ) { function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events // show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if(selected=="pubmed") { if(selected=="pubmed" || selected=="istext") {
console.log("show the button") console.log("show the button for: "+selected)
$("#pubmedcrawl").css("visibility", "visible"); $("#pubmedcrawl").css("visibility", "visible");
$("#pubmedcrawl").show(); $("#pubmedcrawl").show();
$("#file_yes").click(); $("#file_yes").click();
...@@ -414,7 +457,7 @@ ...@@ -414,7 +457,7 @@
return data; return data;
} }
function testAjax( query ) { function testPUBMED( query ) {
LastData = [] LastData = []
if(!query || query=="") return; if(!query || query=="") return;
var pubmedquery = encodeURIComponent(query) var pubmedquery = encodeURIComponent(query)
...@@ -450,7 +493,7 @@ ...@@ -450,7 +493,7 @@
success: function(data) { success: function(data) {
console.log("ajax_success: in testISTEX()") console.log("ajax_success: in testISTEX()")
console.log(data) console.log(data)
location.reload(); // location.reload();
}, },
error: function(result) { error: function(result) {
console.log("in testISTEX(). Data not found"); console.log("in testISTEX(). Data not found");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment