Commit 502c5086 authored by Romain Loth's avatar Romain Loth

Set scrappers size in conf (QUERY_SIZE_N_DEFAULT in gargantext.ini)

parent 856dc56d
...@@ -59,3 +59,15 @@ max-requests = 5000 ...@@ -59,3 +59,15 @@ max-requests = 5000
uid = 1000 uid = 1000
gid = 1000 gid = 1000
################### other gargantext constants ###################
[scrappers]
# default number of docs POSTed to scrappers.views.py
# (at page project > add a corpus > scan/process sample)
QUERY_SIZE_N_DEFAULT = 1000
# checked just before scraping to prevent running impossible workflows,
# even if somebody sets "query size N" manually in the POST data
QUERY_SIZE_N_MAX = 20000
...@@ -18,7 +18,7 @@ from gargantext_web.db import * ...@@ -18,7 +18,7 @@ from gargantext_web.db import *
from gargantext_web.db import get_or_create_node from gargantext_web.db import get_or_create_node
from gargantext_web.views import session from gargantext_web.views import session
from gargantext_web.settings import DEBUG, MEDIA_ROOT from gargantext_web.settings import DEBUG, MEDIA_ROOT, BASE_DIR
from rest_v1_0.api import JsonHttpResponse from rest_v1_0.api import JsonHttpResponse
from django.db import connection from django.db import connection
...@@ -32,6 +32,19 @@ from gargantext_web.celery import apply_workflow ...@@ -32,6 +32,19 @@ from gargantext_web.celery import apply_workflow
from admin.utils import ensure_dir from admin.utils import ensure_dir
# to read the [scrappers] section of gargantext.ini
from configparser import ConfigParser
from os import path
# --------------------------------------------------------------------
# importing constants from config file
CONF = ConfigParser()
with open(path.join(BASE_DIR,'gargantext.ini')) as inifile:
CONF.read_file(inifile)
QUERY_SIZE_N_DEFAULT = CONF['scrappers']['QUERY_SIZE_N_DEFAULT']
# --------------------------------------------------------------------
def project(request, project_id): def project(request, project_id):
# do we have a valid project id? # do we have a valid project id?
try: try:
...@@ -64,7 +77,7 @@ def project(request, project_id): ...@@ -64,7 +77,7 @@ def project(request, project_id):
if not in_group: if not in_group:
return JsonHttpResponse( {"request" : "forbidden"} ) return JsonHttpResponse( {"request" : "forbidden"} )
# Let's find out about the children nodes of the project # Let's find out about the children nodes of the corpus
ChildrenNode = aliased(Node) ChildrenNode = aliased(Node)
# This query is giving you the wrong number of docs from the pubmedquerier (x 5) # This query is giving you the wrong number of docs from the pubmedquerier (x 5)
# ... sqlalchemy.func by Resource.type_id is the guilty # ... sqlalchemy.func by Resource.type_id is the guilty
...@@ -196,6 +209,8 @@ def project(request, project_id): ...@@ -196,6 +209,8 @@ def project(request, project_id):
'blacklists' : '', 'blacklists' : '',
'cooclists' : '', 'cooclists' : '',
'number' : corpora_count, 'number' : corpora_count,
'query_size' : QUERY_SIZE_N_DEFAULT,
'user_is_admin' : user.is_superuser
}) })
def tfidf(request, corpus_id, ngram_ids): def tfidf(request, corpus_id, ngram_ids):
......
...@@ -30,7 +30,7 @@ import threading ...@@ -30,7 +30,7 @@ import threading
from node.admin import CustomForm from node.admin import CustomForm
from gargantext_web.db import * from gargantext_web.db import *
from gargantext_web.db import get_sessionmaker, session,get_session from gargantext_web.db import get_sessionmaker, session,get_session
from gargantext_web.settings import DEBUG, MEDIA_ROOT from gargantext_web.settings import DEBUG, MEDIA_ROOT, BASE_DIR
from rest_v1_0.api import JsonHttpResponse from rest_v1_0.api import JsonHttpResponse
from parsing.corpustools import add_resource, parse_resources, extract_ngrams from parsing.corpustools import add_resource, parse_resources, extract_ngrams
...@@ -41,16 +41,50 @@ from time import sleep ...@@ -41,16 +41,50 @@ from time import sleep
from admin.utils import ensure_dir from admin.utils import ensure_dir
# to read the [scrappers] section of gargantext.ini
from configparser import ConfigParser
from os import path
# --------------------------------------------------------------------
# importing constants from config file
CONF = ConfigParser()
with open(path.join(BASE_DIR,'gargantext.ini')) as inifile:
CONF.read_file(inifile)
QUERY_SIZE_N_MAX = int(CONF['scrappers']['QUERY_SIZE_N_MAX'])
# QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])
# --------------------------------------------------------------------
def getGlobalStats(request ): def getGlobalStats(request ):
"""
Pubmed year by year results
# alist = [
# {'string': '2011[dp] serendipity', 'queryKey': '1',
# 'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
# {'string': '2012[dp] serendipity', 'queryKey': '1',
# 'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
# ... ]
(reused as thequeries in doTheQuery)
"""
print(request.method) print(request.method)
alist = ["bar","foo"] alist = []
if request.method == "POST": if request.method == "POST":
N = 1000
query = request.POST["query"] query = request.POST["query"]
N = int(request.POST["N"])
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR(scrap: pubmed stats): ",msg)
raise ValueError(msg)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
instancia = MedlineFetcher() instancia = MedlineFetcher()
# serialFetcher (n_last_years, query, query_size)
alist = instancia.serialFetcher( 5, query , N ) alist = instancia.serialFetcher( 5, query , N )
data = alist data = alist
...@@ -59,12 +93,17 @@ def getGlobalStats(request ): ...@@ -59,12 +93,17 @@ def getGlobalStats(request ):
def getGlobalStatsISTEXT(request ): def getGlobalStatsISTEXT(request ):
"""
ISTEX simply the total of hits for a query
(not reused in testISTEX)
"""
print(request.method) print(request.method)
alist = ["bar","foo"] alist = ["bar","foo"]
if request.method == "POST": if request.method == "POST":
N = 1000
query = request.POST["query"] query = request.POST["query"]
N = int(request.POST["N"])
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
query_string = query.replace(" ","+") query_string = query.replace(" ","+")
...@@ -109,11 +148,18 @@ def doTheQuery(request , project_id): ...@@ -109,11 +148,18 @@ def doTheQuery(request , project_id):
if request.method == "POST": if request.method == "POST":
query = request.POST["query"] queries = request.POST["query"]
name = request.POST["string"] name = request.POST["string"]
# here we just realize queries already prepared by getGlobalStats
# ===> no need to repeat N parameter like in testISTEX <===
instancia = MedlineFetcher() instancia = MedlineFetcher()
thequeries = json.loads(query) thequeries = json.loads(queries)
# fyi the sum of our prepared yearly proportional quotas
sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
print("Scrapping Pubmed: '%s' (N=%i)" % (name,sampled_sum))
urlreqs = [] urlreqs = []
for yearquery in thequeries: for yearquery in thequeries:
...@@ -214,15 +260,22 @@ def testISTEX(request , project_id): ...@@ -214,15 +260,22 @@ def testISTEX(request , project_id):
if request.method == "POST": if request.method == "POST":
# print(alist)
query = "-" query = "-"
query_string = "-" query_string = "-"
N = 1000 N = 0
if "query" in request.POST: query = request.POST["query"]
if "string" in request.POST: query_string = request.POST["string"].replace(" ","+") if "query" in request.POST:
# if "N" in request.POST: N = request.POST["N"] query = request.POST["query"]
print(query_string , query , N) query_string = query.replace(" ","+") # url encoded q
if "N" in request.POST:
N = int(request.POST["N"]) # query_size from views_opti
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR (scrap: istex d/l ): ",msg)
raise ValueError(msg)
print("Scrapping Istex: '%s' (%i)" % (query_string , N))
urlreqs = [] urlreqs = []
pagesize = 50 pagesize = 50
...@@ -247,7 +300,8 @@ def testISTEX(request , project_id): ...@@ -247,7 +300,8 @@ def testISTEX(request , project_id):
session.add(corpus) session.add(corpus)
session.commit() session.commit()
corpus_id = corpus.id corpus_id = corpus.id
print("NEW CORPUS", corpus_id)
ensure_dir(request.user) ensure_dir(request.user)
tasks = MedlineFetcher() tasks = MedlineFetcher()
......
...@@ -249,7 +249,12 @@ ...@@ -249,7 +249,12 @@
return cookieValue; return cookieValue;
} }
var thequeries = [] var thequeries = [] ;
// load the template's value for N scan size
var querySize = parseInt({{query_size}}) ;
// TODO if is_admin
function doTheQuery() { function doTheQuery() {
if ( $('#submit_thing').prop('disabled') ) return; if ( $('#submit_thing').prop('disabled') ) return;
...@@ -257,7 +262,11 @@ ...@@ -257,7 +262,11 @@
var origQuery = $("#id_name").val() var origQuery = $("#id_name").val()
var pubmedifiedQuery = { query : JSON.stringify(thequeries) , string: origQuery } ; var pubmedifiedQuery = {
query : JSON.stringify(thequeries) ,
string: origQuery ,
N : querySize
} ;
console.log(pubmedifiedQuery) console.log(pubmedifiedQuery)
var projectid = window.location.href.split("project")[1].replace(/\//g, '')//replace all the slashes var projectid = window.location.href.split("project")[1].replace(/\//g, '')//replace all the slashes
...@@ -299,7 +308,7 @@ ...@@ -299,7 +308,7 @@
var origQuery = $("#id_name").val() var origQuery = $("#id_name").val()
console.log("printing the results:") console.log("printing the results:")
console.log(origQuery) console.log(origQuery)
testISTEX(origQuery.replace(" ","+"),1000) testISTEX(origQuery.replace(" ","+"), querySize)
} }
} }
else { else {
...@@ -324,9 +333,9 @@ ...@@ -324,9 +333,9 @@
console.log("in getGlobalResults()") console.log("in getGlobalResults()")
// AJAX to django // AJAX to django
var pubmedquery = $("#id_name").val() var pubmedquery = $("#id_name").val()
var Npubs = $("#id_N").val(); // var Npubs = $("#id_N").val();
if(pubmedquery=="") return; if(pubmedquery=="") return;
var formData = {query:pubmedquery , N:Npubs} var formData = {query:pubmedquery , N:querySize}
$("#theresults").html('<img width="30px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img>') $("#theresults").html('<img width="30px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img>')
console.log("disabling "+"#"+value.id) console.log("disabling "+"#"+value.id)
$("#"+value.id).prop('onclick',null); $("#"+value.id).prop('onclick',null);
...@@ -349,7 +358,7 @@ ...@@ -349,7 +358,7 @@
console.log("enabling "+"#"+value.id) console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);'); $("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false) // $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 1000 sample!") $("#submit_thing").html("Process a {{ query_size }} sample!")
thequeries = data thequeries = data
var N=0,k=0; var N=0,k=0;
...@@ -388,7 +397,7 @@ ...@@ -388,7 +397,7 @@
console.log("enabling "+"#"+value.id) console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);'); $("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false) // $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 1000 sample!") $("#submit_thing").html("Process a {{ query_size }} sample!")
thequeries = data thequeries = data
var N=data.length,k=0; var N=data.length,k=0;
...@@ -494,20 +503,19 @@ ...@@ -494,20 +503,19 @@
return false; return false;
} }
function testISTEX(query,Npubs) { function testISTEX(query,N) {
console.log("in testISTEX:"); console.log("in testISTEX:");
if(!query || query=="") return; if(!query || query=="") return;
var origQuery = query var origQuery = query
var postQuery = { query : query , N: N }
var pubmedifiedQuery = { query : query , string: query }
var projectid = window.location.href.split("project")[1].replace(/\//g, '')//replace all the slashes var projectid = window.location.href.split("project")[1].replace(/\//g, '')//replace all the slashes
$.ajax({ $.ajax({
// contentType: "application/json", // contentType: "application/json",
url: window.location.origin+"/tests/project/"+projectid+"/ISTEXquery/go", url: window.location.origin+"/tests/project/"+projectid+"/ISTEXquery/go",
data: pubmedifiedQuery, data: postQuery,
type: 'POST', type: 'POST',
beforeSend: function(xhr) { beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken")); xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
......
...@@ -14,6 +14,8 @@ urlpatterns = patterns('', ...@@ -14,6 +14,8 @@ urlpatterns = patterns('',
url(r'paginator/corpus/(\d+)/$', views.newpaginatorJSON), url(r'paginator/corpus/(\d+)/$', views.newpaginatorJSON),
url(r'move2trash/$' , views.move_to_trash_multiple ), url(r'move2trash/$' , views.move_to_trash_multiple ),
# TODO correct and move to scappers
url(r'istextquery$', pubmedscrapper.getGlobalStatsISTEXT), # api/query?type=istext ? url(r'istextquery$', pubmedscrapper.getGlobalStatsISTEXT), # api/query?type=istext ?
url(r'pubmedquery$', pubmedscrapper.getGlobalStats), url(r'pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery), url(r'project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment