Commit 7952a330 authored by delanoe

[REPEC/CRAWLER] Fix form duplicates and number of pages to crawl.

parent bec6acf4
...@@ -61,10 +61,13 @@ class MultivacCrawler(Crawler): ...@@ -61,10 +61,13 @@ class MultivacCrawler(Crawler):
, params = querystring , params = querystring
) )
print(querystring) #print(querystring)
# Validation : 200 if ok else raise Value # Validation : 200 if ok else raise Value
if response.status_code == 200: if response.status_code == 200:
charset = response.headers["Content-Type"].split("; ")[1].split("=")[1] charset = ( response.headers["Content-Type"]
.split("; ")[1]
.split("=" )[1]
)
return (json.loads(response.content.decode(charset))) return (json.loads(response.content.decode(charset)))
else: else:
raise ValueError(response.status_code, response.reason) raise ValueError(response.status_code, response.reason)
...@@ -75,7 +78,12 @@ class MultivacCrawler(Crawler): ...@@ -75,7 +78,12 @@ class MultivacCrawler(Crawler):
Query String -> Int Query String -> Int
''' '''
self.results_nb = 0 self.results_nb = 0
total = self._get(query)["results"]["total"]
total = ( self._get(query)
.get("results", {})
.get("total" , 0)
)
self.results_nb = total self.results_nb = total
return self.results_nb return self.results_nb
...@@ -89,20 +97,29 @@ class MultivacCrawler(Crawler): ...@@ -89,20 +97,29 @@ class MultivacCrawler(Crawler):
corpus = [] corpus = []
paging = 100 paging = 100
self.query_max = self.scan_results(query) self.query_max = self.scan_results(query)
print("self.query_max : %s" % self.query_max) #print("self.query_max : %s" % self.query_max)
if self.query_max > QUERY_SIZE_N_MAX: if self.query_max > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (self.query_max, QUERY_SIZE_N_MAX) msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
print("ERROR (scrap: Multivac d/l ): ",msg) , QUERY_SIZE_N_MAX
)
print("ERROR (scrap: Multivac d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX self.query_max = QUERY_SIZE_N_MAX
for page in range(1, trunc(self.query_max / 100) + 1): for page in range(1, trunc(self.query_max / 100) + 2):
print("Downloading page %s to %s results" % (page, paging)) print("Downloading page %s to %s results" % (page, paging))
docs = self._get(query, fromPage=page, count=paging)["results"]["hits"] docs = (self._get(query, fromPage=page, count=paging)
.get("results", {})
.get("hits" , [])
)
for doc in docs: for doc in docs:
corpus.append(doc) corpus.append(doc)
self.path = save(json.dumps(corpus).encode("utf-8"), name='Multivac.json', basedir=UPLOAD_DIRECTORY ) self.path = save( json.dumps(corpus).encode("utf-8")
, name='Multivac.json'
, basedir=UPLOAD_DIRECTORY
)
downloaded = True downloaded = True
return downloaded return downloaded
...@@ -656,9 +656,11 @@ ...@@ -656,9 +656,11 @@
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken")); xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
}, },
success: function(data) { success: function(data) {
console.log(data)
console.log("SUCCESS") console.log("SUCCESS")
console.log("enabling "+"#"+value.id) console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false) $("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!") //$("#submit_thing").html("Process a {{ query_size }} sample!")
...@@ -695,7 +697,7 @@ ...@@ -695,7 +697,7 @@
}, },
error: function(result) { error: function(result) {
$("#theresults").html(theType +" connection error!</i><br>") $("#theresults").html(theType +" connection error</i><br>")
$('#submit_thing').prop('disabled', true); $('#submit_thing').prop('disabled', true);
} }
}); });
...@@ -703,8 +705,6 @@ ...@@ -703,8 +705,6 @@
} }
} }
// CSS events for selecting one Radio-Input // CSS events for selecting one Radio-Input
...@@ -887,13 +887,17 @@ ...@@ -887,13 +887,17 @@
} }
function saveMultivac(query, N){ function saveMultivac(query, N){
//alert("CERN!")
console.log("In Multivac") console.log("In Multivac")
if(!query || query=="") return; if(!query || query=="") return;
//var origQuery = query console.log(query)
//var origQuery = query
var data = { "query" : query , "N": N }; var data = { "query" : query , "N": N };
var projectid = window.location.href.split("projects")[1].replace(/\//g, '')//replace all the slashes
// Replace all the slashes
var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
console.log(data) console.log(data)
$.ajax({ $.ajax({
dataType: 'json', dataType: 'json',
...@@ -911,14 +915,6 @@ ...@@ -911,14 +915,6 @@
function() { function() {
$('#addcorpus').modal('hide') $('#addcorpus').modal('hide')
$("#wait").modal("show"); $("#wait").modal("show");
// setTimeout(
// function(){
// location.reload();
//
// }, 600);
// )
//setTimeout(, 300)
//location.reload();
}, 600); }, 600);
}, },
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment