Commit a85e4c98 authored by delanoe

[REPEC/CRAWLER] Fix form duplicates and the number of pages to crawl.

parent 9f5286ff
......@@ -61,10 +61,13 @@ class MultivacCrawler(Crawler):
, params = querystring
)
print(querystring)
#print(querystring)
# Validation : 200 if ok else raise Value
if response.status_code == 200:
charset = response.headers["Content-Type"].split("; ")[1].split("=")[1]
charset = ( response.headers["Content-Type"]
.split("; ")[1]
.split("=" )[1]
)
return (json.loads(response.content.decode(charset)))
else:
raise ValueError(response.status_code, response.reason)
......@@ -75,7 +78,12 @@ class MultivacCrawler(Crawler):
Query String -> Int
'''
self.results_nb = 0
total = self._get(query)["results"]["total"]
total = ( self._get(query)
.get("results", {})
.get("total" , 0)
)
self.results_nb = total
return self.results_nb
......@@ -89,20 +97,29 @@ class MultivacCrawler(Crawler):
corpus = []
paging = 100
self.query_max = self.scan_results(query)
print("self.query_max : %s" % self.query_max)
#print("self.query_max : %s" % self.query_max)
if self.query_max > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (self.query_max, QUERY_SIZE_N_MAX)
print("ERROR (scrap: Multivac d/l ): ",msg)
msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
, QUERY_SIZE_N_MAX
)
print("ERROR (scrap: Multivac d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX
for page in range(1, trunc(self.query_max / 100) + 1):
for page in range(1, trunc(self.query_max / 100) + 2):
print("Downloading page %s to %s results" % (page, paging))
docs = self._get(query, fromPage=page, count=paging)["results"]["hits"]
docs = (self._get(query, fromPage=page, count=paging)
.get("results", {})
.get("hits" , [])
)
for doc in docs:
corpus.append(doc)
self.path = save(json.dumps(corpus).encode("utf-8"), name='Multivac.json', basedir=UPLOAD_DIRECTORY )
self.path = save( json.dumps(corpus).encode("utf-8")
, name='Multivac.json'
, basedir=UPLOAD_DIRECTORY
)
downloaded = True
return downloaded
......@@ -656,9 +656,11 @@
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log(data)
console.log("SUCCESS")
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
......@@ -695,7 +697,7 @@
},
error: function(result) {
$("#theresults").html(theType +" connection error!</i><br>")
$("#theresults").html(theType +" connection error</i><br>")
$('#submit_thing').prop('disabled', true);
}
});
......@@ -703,8 +705,6 @@
}
}
// CSS events for selecting one Radio-Input
......@@ -887,13 +887,17 @@
}
function saveMultivac(query, N){
//alert("CERN!")
console.log("In Multivac")
if(!query || query=="") return;
//var origQuery = query
console.log(query)
//var origQuery = query
var data = { "query" : query , "N": N };
var projectid = window.location.href.split("projects")[1].replace(/\//g, '')//replace all the slashes
// Replace all the slashes
var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
console.log(data)
$.ajax({
dataType: 'json',
......@@ -911,14 +915,6 @@
function() {
$('#addcorpus').modal('hide')
$("#wait").modal("show");
// setTimeout(
// function(){
// location.reload();
//
// }, 600);
// )
//setTimeout(, 300)
//location.reload();
}, 600);
},
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment