Commit 7476e4c0 authored by Administrator

Merge branch 'samuel' into testing

parents 2ea48b86 4cfacd18
import gzip
import os
import shutil
import subprocess

# Dump the PostgreSQL database "gargandb" (user "alexandre", host "localhost")
# into a gzip-compressed file. Despite its name, the output file holds a
# pg_dump of a PostgreSQL database, not a MySQL dump; the name is kept so
# anything that already reads this file keeps working.
DUMP_PATH = "mysqldump.db"
PG_DUMP_CMD = ["pg_dump", "-U", "alexandre", "-h", "localhost", "gargandb"]

# pg_dump reads the password from PGPASSWORD. It is handed over through the
# child's environment instead of being spliced into a shell command line.
# SECURITY: the fallback below is a credential committed to version control.
# It is kept only so the script behaves as before when PGPASSWORD is not set;
# TODO: rotate this password and drop the fallback.
env = dict(os.environ)
env.setdefault("PGPASSWORD", "C8kdcUrAQy66U")

# Compress in-process rather than through "pg_dump | gzip": the exit status of
# a shell pipeline is gzip's, so a failed pg_dump used to go unnoticed and
# left a valid-looking but empty archive behind.
with gzip.open(DUMP_PATH, "wb") as archive:
    with subprocess.Popen(PG_DUMP_CMD, stdout=subprocess.PIPE, env=env) as dump:
        shutil.copyfileobj(dump.stdout, archive)

if dump.returncode != 0:
    raise SystemExit("pg_dump failed with exit status %d" % dump.returncode)
......@@ -70,6 +70,7 @@ urlpatterns = patterns('',
url(r'^tests/mvc$', views.tests_mvc),
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
url(r'^tests/istextquery$', pubmedscrapper.getGlobalStatsISTEXT),
url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)
......
......@@ -125,8 +125,8 @@ class CustomForm(forms.Form):
# file_.name = str(datetime.now().microsecond)
# # raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name))
# #File size
if len(file_)>104857600:
raise forms.ValidationError(_('File to heavy! (<100MB).'))
# if len(file_)>104857600:
# raise forms.ValidationError(_('File to heavy! (<100MB).'))
## File type:
# if file_.content_type == "application/zip":
# raise forms.ValidationError(_('We need a zip pls.'))
......
......@@ -81,7 +81,6 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet):
metadata_cache = {metadata.name: metadata for metadata in Metadata.objects.all()}
data = []
for node in self:
print(node.id)
for key, value in node.metadata.items():
if key in metadata_cache:
metadata = metadata_cache[key]
......@@ -249,13 +248,14 @@ class Node(CTENode):
@current_app.task(filter=task_method)
def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
import time
total = 0
print("LOG::TIME: In workflow() parse_resources()")
start = time.time()
self.metadata['Processing'] = 1
self.save()
self.parse_resources()
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources() [s]",(end - start))
print("LOG::TIME: In workflow() / parse_resources()")
......@@ -266,7 +266,7 @@ class Node(CTENode):
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
end = time.time()
print("- - - - - - - - - - \n")
print ("LOG::TIME: ",(end - start))
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams() [s]",(end - start))
print("LOG::TIME: In workflow() / extract_ngrams()")
......@@ -275,9 +275,9 @@ class Node(CTENode):
from analysis.functions import do_tfidf
do_tfidf(self)
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
print("LOG::TIME: In workflow() / do_tfidf()")
print("In workflow() END")
self.metadata['Processing'] = 0
self.save()
......
......@@ -80,21 +80,28 @@ class PubmedFileParser(FileParser):
if len(RealDate)>4:
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
else:
try: Decision = datetime.strptime(RealDate, '%Y %b').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
else:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
if "realdate_year_" in metadata: metadata.pop("realdate_year_")
if "realdate_month_" in metadata: metadata.pop("realdate_month_")
if "realdate_day_" in metadata: metadata.pop("realdate_day_")
if "title2" in metadata: metadata.pop("title2")
# print(metadata)
metadata_list.append(metadata)
if Decision!=False:
if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
if "realdate_year_" in metadata: metadata.pop("realdate_year_")
if "realdate_month_" in metadata: metadata.pop("realdate_month_")
if "realdate_day_" in metadata: metadata.pop("realdate_day_")
if "title2" in metadata: metadata.pop("title2")
# print(metadata)
metadata_list.append(metadata)
# return the list of metadata
return metadata_list
......@@ -12,6 +12,7 @@ import time
from lxml import etree
import datetime
from django.core.files import File
import codecs
import threading
from queue import Queue
......@@ -39,6 +40,7 @@ class MedlineFetcher:
"Get number of results for query 'query' in variable 'count'"
"Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
print(query)
origQuery = query
query = query.replace(' ', '%20')
......@@ -92,10 +94,10 @@ class MedlineFetcher:
def downloadFile(self, item):
url = item[0]
filename = item[1]
print("\tin downloadFile:")
print(url,filename)
print("\tin test_downloadFile:")
# print(url,filename)
data = urlopen(url)
f = open(filename, 'w')
f = codecs.open(filename, "w" ,encoding='utf-8')
myfile = File(f)
myfile.write( data.read().decode('utf-8') )
myfile.close()
......@@ -104,6 +106,13 @@ class MedlineFetcher:
print(threading.current_thread().name, filename+" OK")
return filename
# generic!
def test_downloadFile(self, item):
    """Open the URL in ``item[0]`` and return the response without reading it.

    Nothing is written to disk here; the caller is responsible for reading
    (and closing) the returned object.

    Args:
        item: a ``[url, filename]`` pair, the same item shape that
            ``downloadFile`` receives. Only the URL is used.

    Returns:
        Whatever ``urlopen(url)`` returns.

    Raises:
        Any exception raised by ``urlopen`` is propagated unchanged.
    """
    url = item[0]
    # The log line previously said "in downloadFile:", which pointed at the
    # wrong method when reading the output.
    print("\tin test_downloadFile:")
    return urlopen(url)
# generic!
def do_work(self,item):
......@@ -123,7 +132,10 @@ class MedlineFetcher:
def worker2(self):
while True:
item = self.q.get()
self.firstResults.append(self.downloadFile(item))
results = []
try: result = self.downloadFile(item)
except: result = False
self.firstResults.append(result)
self.q.task_done()
def chunks(self , l , n):
......
......@@ -43,6 +43,32 @@ def getGlobalStats(request ):
return JsonHttpResponse(data)
def getGlobalStatsISTEXT(request ):
    """Forward the posted query to the ISTEX search API and relay the answer.

    On a POST request the ``query`` form field is sent to
    ``http://api.istex.fr/document/`` and the raw response body (decoded as
    UTF-8 text) is returned. If anything fails while fetching or decoding,
    a one-element list holding the error message is returned instead.
    For any other HTTP method the placeholder ``["bar", "foo"]`` is returned.

    Args:
        request: the incoming request; ``request.POST["query"]`` must be
            present for POST requests.

    Returns:
        ``JsonHttpResponse`` wrapping either the response text, the
        one-element error list, or the placeholder list.
    """
    # Local import so this function does not depend on the module's import
    # block already providing quote_plus.
    from urllib.parse import quote_plus

    print(request.method)
    alist = ["bar","foo"]
    if request.method == "POST":
        N = 100
        query = request.POST["query"]
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )

        # Spaces still become "+", as before, but every other reserved
        # character ("&", "#", "+", non-ASCII, ...) is now percent-encoded so
        # it can no longer truncate or corrupt the q= parameter.
        query_string = quote_plus(query)
        url = "http://api.istex.fr/document/?q="+query_string

        tasks = MedlineFetcher()
        # The fetcher takes [url, filename] items, so a filename is still
        # supplied even though only the response body is used here.
        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
        try:
            thedata = tasks.test_downloadFile( [url,filename] )
            try:
                alist = thedata.read().decode('utf-8')
            finally:
                # Release the HTTP connection even when read/decode fails.
                thedata.close()
        except Exception as error:
            # Best-effort endpoint: report the failure to the client as data
            # instead of letting the view raise.
            alist = [str(error)]

    data = alist
    return JsonHttpResponse(data)
def doTheQuery(request , project_id):
alist = ["hola","mundo"]
......@@ -85,36 +111,36 @@ def doTheQuery(request , project_id):
corpus.save()
try:
tasks = MedlineFetcher()
tasks.ensure_dir ( MEDIA_ROOT + '/corpora/'+str(request.user)+"/" )
# configuring your queue with the event
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
tasks.q.put( [url , filename]) #put a task in th queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
else:
corpus.workflow.apply_async((), countdown=3)
tasks = MedlineFetcher()
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
tasks.q.put( [url , filename]) #put a task in th queue
tasks.q.join() # wait until everything is finished
return JsonHttpResponse(["workflow","finished"])
except Exception as error:
print(error)
dwnldsOK = 0
for filename in tasks.firstResults:
if filename!=False:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
dwnldsOK+=1
if dwnldsOK == 0: return JsonHttpResponse(["fail"])
return JsonHttpResponse(["workflow","finished","outside the try-except"])
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
else:
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
except Exception as error:
print("lele",error)
print(error)
return JsonHttpResponse(["workflow","finished","outside the try-except"])
data = alist
return JsonHttpResponse(data)
......@@ -146,7 +172,8 @@ def testISTEX(request , project_id):
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
print(urlreqs)
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
print(urlreqs)
resource_type = ResourceType.objects.get(name="istext" )
......
......@@ -313,44 +313,87 @@
console.log("disabling "+"#"+value.id)
$("#"+value.id).prop('onclick',null);
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/pubmedquery",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("in getGlobalResults")
console.log(data)
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 100 sample!")
thequeries = data
var N=0,k=0;
for(var i in thequeries) N += thequeries[i].count
if( N>0) {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications in the last 5 years</i><br>")
$('#submit_thing').prop('disabled', false);
} else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
$('#submit_thing').prop('disabled', true);
}
var theType = $("#id_type option:selected").html();
if(theType=="pubmed") {
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/pubmedquery",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("in getGlobalResults")
console.log(data)
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 100 sample!")
thequeries = data
var N=0,k=0;
for(var i in thequeries) N += thequeries[i].count
if( N>0) {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications in the last 5 years</i><br>")
$('#submit_thing').prop('disabled', false);
} else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
$('#submit_thing').prop('disabled', true);
}
},
error: function(result) {
console.log("Data not found");
}
});
},
error: function(result) {
console.log("Data not found");
}
});
}
if(theType=="istext") {
console.log(window.location.origin+"tests/istextquery")
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/istextquery",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("in getGlobalResults")
console.log(data)
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 100 sample!")
thequeries = data
var N=data.length,k=0;
console.log("N: "+N)
// for(var i in thequeries) N += thequeries[i].count
if( N>1) {
var total = JSON.parse(data).total
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+total+" publications.</i><br>")
$('#submit_thing').prop('disabled', false);
} else {
$("#theresults").html("<i> <b>"+data[0]+"</b></i><br>")
$('#submit_thing').prop('disabled', true);
}
},
error: function(result) {
console.log("Data not found");
}
});
}
}
// CSS events for selecting one Radio-Input
function FileOrNotFile( value ) {
var showfile = JSON.parse(value)
var theType = $("#id_type option:selected").html();
// @upload-file events
if (showfile) {
console.log("You've clicked the YES")
......@@ -376,7 +419,7 @@
$( "#id_name" ).on('input',function(e){
console.log($(this).val())
testAjax( $(this).val() )
if(theType=="pubmed") testPUBMED( $(this).val() )
});
}
}
......@@ -384,8 +427,8 @@
//CSS events for changing the Select element
function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if(selected=="pubmed") {
console.log("show the button")
if(selected=="pubmed" || selected=="istext") {
console.log("show the button for: "+selected)
$("#pubmedcrawl").css("visibility", "visible");
$("#pubmedcrawl").show();
$("#file_yes").click();
......@@ -414,7 +457,7 @@
return data;
}
function testAjax( query ) {
function testPUBMED( query ) {
LastData = []
if(!query || query=="") return;
var pubmedquery = encodeURIComponent(query)
......@@ -450,7 +493,7 @@
success: function(data) {
console.log("ajax_success: in testISTEX()")
console.log(data)
location.reload();
// location.reload();
},
error: function(result) {
console.log("in testISTEX(). Data not found");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment