Commit 45a1fa9d authored by PkSM3

[UPDATE] istex|pubmed scrapper

parent 119671ab
......@@ -70,6 +70,7 @@ urlpatterns = patterns('',
url(r'^tests/mvc$', views.tests_mvc),
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
url(r'^tests/istextquery$', pubmedscrapper.getGlobalStatsISTEXT),
url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)
......
......@@ -125,8 +125,8 @@ class CustomForm(forms.Form):
# file_.name = str(datetime.now().microsecond)
# # raise forms.ValidationError(_('Come on dude, name too long. Now is:'+file_.name))
# #File size
if len(file_)>104857600:
raise forms.ValidationError(_('File to heavy! (<100MB).'))
# if len(file_)>104857600:
# raise forms.ValidationError(_('File to heavy! (<100MB).'))
## File type:
# if file_.content_type == "application/zip":
# raise forms.ValidationError(_('We need a zip pls.'))
......
......@@ -81,7 +81,6 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet):
metadata_cache = {metadata.name: metadata for metadata in Metadata.objects.all()}
data = []
for node in self:
print(node.id)
for key, value in node.metadata.items():
if key in metadata_cache:
metadata = metadata_cache[key]
......@@ -249,13 +248,14 @@ class Node(CTENode):
@current_app.task(filter=task_method)
def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
import time
total = 0
print("LOG::TIME: In workflow() parse_resources()")
start = time.time()
self.metadata['Processing'] = 1
self.save()
self.parse_resources()
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources() [s]",(end - start))
print("LOG::TIME: In workflow() / parse_resources()")
......@@ -266,7 +266,7 @@ class Node(CTENode):
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
end = time.time()
print("- - - - - - - - - - \n")
print ("LOG::TIME: ",(end - start))
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams() [s]",(end - start))
print("LOG::TIME: In workflow() / extract_ngrams()")
......@@ -275,9 +275,9 @@ class Node(CTENode):
from analysis.functions import do_tfidf
do_tfidf(self)
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
print("LOG::TIME: In workflow() / do_tfidf()")
print("In workflow() END")
self.metadata['Processing'] = 0
self.save()
......
......@@ -80,12 +80,19 @@ class PubmedFileParser(FileParser):
if len(RealDate)>4:
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
else:
try: Decision = datetime.strptime(RealDate, '%Y %b').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
else:
try: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
except: Decision=False
if Decision!=False:
if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
......
......@@ -105,6 +105,13 @@ class MedlineFetcher:
print(threading.current_thread().name, filename+" OK")
return filename
# generic!
def test_downloadFile(self, item):
    """Open the URL of a ``[url, filename]`` task and return the response.

    Nothing is written to disk here: the filename slot of ``item`` is read
    but not used, and the object returned by ``urlopen`` is handed back
    unread, so the caller is the one who reads it.
    """
    # Both slots are still accessed, so a too-short item fails the same way.
    url, filename = item[0], item[1]
    print("\tin downloadFile:")
    return urlopen(url)
# generic!
def do_work(self,item):
......@@ -124,7 +131,10 @@ class MedlineFetcher:
def worker2(self):
    """Consume download tasks from ``self.q`` forever, recording each outcome.

    For every queued item, the value returned by ``self.downloadFile(item)``
    is appended to ``self.firstResults``. If the download raises, ``False``
    is appended instead as the failure marker. ``self.q.task_done()`` is
    called exactly once per item, including when something goes wrong, so a
    ``join()`` on the queue cannot hang because of a failed download.
    """
    while True:
        item = self.q.get()
        try:
            # Exactly one download attempt per item; a second, unguarded call
            # would duplicate results and kill the worker on the first error.
            try:
                result = self.downloadFile(item)
            except Exception:
                # Best effort: report the failure instead of crashing.
                result = False
            self.firstResults.append(result)
        finally:
            # Always balance the get() above.
            self.q.task_done()
def chunks(self , l , n):
......
......@@ -43,6 +43,32 @@ def getGlobalStats(request ):
return JsonHttpResponse(data)
def getGlobalStatsISTEXT(request ):
    """Probe the ISTEX API with the posted query and return the raw answer.

    On POST, the ``query`` form field is URL-encoded and sent to
    ``http://api.istex.fr/document/``. The response body, decoded as UTF-8
    text, is returned through ``JsonHttpResponse``. If fetching or decoding
    fails, a one-element list holding the error message is returned instead.
    Any other HTTP method gets the placeholder list ``["bar", "foo"]``.
    """
    # Local import so the block does not depend on a file-level import.
    from urllib.parse import quote_plus

    print(request.method)
    alist = ["bar","foo"]

    if request.method == "POST":
        N = 100
        query = request.POST["query"]
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )

        # Encode the whole query, not just spaces: characters such as '&',
        # '#', '+' or non-ASCII text would otherwise corrupt the URL.
        query_string = quote_plus(query)
        url = "http://api.istex.fr/document/?q="+query_string+"&output=*"

        tasks = MedlineFetcher()
        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))

        try:
            thedata = tasks.test_downloadFile( [url,filename] )
            try:
                alist = thedata.read().decode('utf-8')
            finally:
                # Release the connection even if reading or decoding fails.
                thedata.close()
        except Exception as error:
            alist = [str(error)]

    return JsonHttpResponse(alist)
def doTheQuery(request , project_id):
alist = ["hola","mundo"]
......@@ -97,8 +123,13 @@ def doTheQuery(request , project_id):
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
tasks.q.put( [url , filename]) #put a task in th queue
tasks.q.join() # wait until everything is finished
dwnldsOK = 0
for filename in tasks.firstResults:
if filename!=False:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
dwnldsOK+=1
if dwnldsOK == 0: return JsonHttpResponse(["fail"])
# do the WorkFlow
try:
......@@ -146,7 +177,8 @@ def testISTEX(request , project_id):
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
print(urlreqs)
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
print(urlreqs)
resource_type = ResourceType.objects.get(name="istext" )
......
......@@ -313,6 +313,9 @@
console.log("disabling "+"#"+value.id)
$("#"+value.id).prop('onclick',null);
var theType = $("#id_type option:selected").html();
if(theType=="pubmed") {
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/pubmedquery",
......@@ -348,9 +351,39 @@
});
}
if(theType=="istext") {
console.log(window.location.origin+"tests/istextquery")
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/istextquery",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("in getGlobalResults")
console.log(data)
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 100 sample!")
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+data[0]+"</i><br>")
thequeries = data
},
error: function(result) {
console.log("Data not found");
}
});
}
}
// CSS events for selecting one Radio-Input
function FileOrNotFile( value ) {
var showfile = JSON.parse(value)
var theType = $("#id_type option:selected").html();
// @upload-file events
if (showfile) {
console.log("You've clicked the YES")
......@@ -376,7 +409,7 @@
$( "#id_name" ).on('input',function(e){
console.log($(this).val())
testAjax( $(this).val() )
if(theType=="pubmed") testPUBMED( $(this).val() )
});
}
}
......@@ -384,8 +417,8 @@
//CSS events for changing the Select element
function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
if(selected=="pubmed") {
console.log("show the button")
if(selected=="pubmed" || selected=="istext") {
console.log("show the button for: "+selected)
$("#pubmedcrawl").css("visibility", "visible");
$("#pubmedcrawl").show();
$("#file_yes").click();
......@@ -414,7 +447,7 @@
return data;
}
function testAjax( query ) {
function testPUBMED( query ) {
LastData = []
if(!query || query=="") return;
var pubmedquery = encodeURIComponent(query)
......@@ -450,7 +483,7 @@
success: function(data) {
console.log("ajax_success: in testISTEX()")
console.log(data)
location.reload();
// location.reload();
},
error: function(result) {
console.log("in testISTEX(). Data not found");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment