Commit 502c5086 authored by Romain Loth's avatar Romain Loth

Set scrappers size in conf (QUERY_SIZE_N_DEFAULT in gargantext.ini)

parent 856dc56d
...@@ -59,3 +59,15 @@ max-requests = 5000 ...@@ -59,3 +59,15 @@ max-requests = 5000
uid = 1000 uid = 1000
gid = 1000 gid = 1000
################### other gargantext constants ###################
[scrappers]
# default number of docs POSTed to scrappers.views.py
# (at page project > add a corpus > scan/process sample)
QUERY_SIZE_N_DEFAULT = 1000
# checked just before scraping to prevent running impossible workflows,
# even if somebody sets "query size N" manually in the POST data
QUERY_SIZE_N_MAX = 20000
...@@ -18,7 +18,7 @@ from gargantext_web.db import * ...@@ -18,7 +18,7 @@ from gargantext_web.db import *
from gargantext_web.db import get_or_create_node from gargantext_web.db import get_or_create_node
from gargantext_web.views import session from gargantext_web.views import session
from gargantext_web.settings import DEBUG, MEDIA_ROOT from gargantext_web.settings import DEBUG, MEDIA_ROOT, BASE_DIR
from rest_v1_0.api import JsonHttpResponse from rest_v1_0.api import JsonHttpResponse
from django.db import connection from django.db import connection
...@@ -32,6 +32,19 @@ from gargantext_web.celery import apply_workflow ...@@ -32,6 +32,19 @@ from gargantext_web.celery import apply_workflow
from admin.utils import ensure_dir from admin.utils import ensure_dir
# to read the [scrappers] section of gargantext.ini
from configparser import ConfigParser
from os import path
# --------------------------------------------------------------------
# importing constants from config file
CONF = ConfigParser()
with open(path.join(BASE_DIR,'gargantext.ini')) as inifile:
CONF.read_file(inifile)
QUERY_SIZE_N_DEFAULT = CONF['scrappers']['QUERY_SIZE_N_DEFAULT']
# --------------------------------------------------------------------
def project(request, project_id): def project(request, project_id):
# do we have a valid project id? # do we have a valid project id?
try: try:
...@@ -64,7 +77,7 @@ def project(request, project_id): ...@@ -64,7 +77,7 @@ def project(request, project_id):
if not in_group: if not in_group:
return JsonHttpResponse( {"request" : "forbidden"} ) return JsonHttpResponse( {"request" : "forbidden"} )
# Let's find out about the children nodes of the project # Let's find out about the children nodes of the corpus
ChildrenNode = aliased(Node) ChildrenNode = aliased(Node)
# This query is giving you the wrong number of docs from the pubmedquerier (x 5) # This query is giving you the wrong number of docs from the pubmedquerier (x 5)
# ... sqlalchemy.func by Resource.type_id is the guilty # ... sqlalchemy.func by Resource.type_id is the guilty
...@@ -196,6 +209,8 @@ def project(request, project_id): ...@@ -196,6 +209,8 @@ def project(request, project_id):
'blacklists' : '', 'blacklists' : '',
'cooclists' : '', 'cooclists' : '',
'number' : corpora_count, 'number' : corpora_count,
'query_size' : QUERY_SIZE_N_DEFAULT,
'user_is_admin' : user.is_superuser
}) })
def tfidf(request, corpus_id, ngram_ids): def tfidf(request, corpus_id, ngram_ids):
......
...@@ -30,7 +30,7 @@ import threading ...@@ -30,7 +30,7 @@ import threading
from node.admin import CustomForm from node.admin import CustomForm
from gargantext_web.db import * from gargantext_web.db import *
from gargantext_web.db import get_sessionmaker, session,get_session from gargantext_web.db import get_sessionmaker, session,get_session
from gargantext_web.settings import DEBUG, MEDIA_ROOT from gargantext_web.settings import DEBUG, MEDIA_ROOT, BASE_DIR
from rest_v1_0.api import JsonHttpResponse from rest_v1_0.api import JsonHttpResponse
from parsing.corpustools import add_resource, parse_resources, extract_ngrams from parsing.corpustools import add_resource, parse_resources, extract_ngrams
...@@ -41,16 +41,50 @@ from time import sleep ...@@ -41,16 +41,50 @@ from time import sleep
from admin.utils import ensure_dir from admin.utils import ensure_dir
# to read the [scrappers] section of gargantext.ini
from configparser import ConfigParser
from os import path
# --------------------------------------------------------------------
# importing constants from config file
CONF = ConfigParser()
with open(path.join(BASE_DIR,'gargantext.ini')) as inifile:
CONF.read_file(inifile)
QUERY_SIZE_N_MAX = int(CONF['scrappers']['QUERY_SIZE_N_MAX'])
# QUERY_SIZE_N_DEFAULT = int(CONF['scrappers']['QUERY_SIZE_N_DEFAULT'])
# --------------------------------------------------------------------
def getGlobalStats(request ): def getGlobalStats(request ):
"""
Pubmed year by year results
# alist = [
# {'string': '2011[dp] serendipity', 'queryKey': '1',
# 'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
# {'string': '2012[dp] serendipity', 'queryKey': '1',
# 'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
# ... ]
(reused as thequeries in doTheQuery)
"""
print(request.method) print(request.method)
alist = ["bar","foo"] alist = []
if request.method == "POST": if request.method == "POST":
N = 1000
query = request.POST["query"] query = request.POST["query"]
N = int(request.POST["N"])
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR(scrap: pubmed stats): ",msg)
raise ValueError(msg)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
instancia = MedlineFetcher() instancia = MedlineFetcher()
# serialFetcher (n_last_years, query, query_size)
alist = instancia.serialFetcher( 5, query , N ) alist = instancia.serialFetcher( 5, query , N )
data = alist data = alist
...@@ -59,12 +93,17 @@ def getGlobalStats(request ): ...@@ -59,12 +93,17 @@ def getGlobalStats(request ):
def getGlobalStatsISTEXT(request ): def getGlobalStatsISTEXT(request ):
"""
ISTEX simply the total of hits for a query
(not reused in testISTEX)
"""
print(request.method) print(request.method)
alist = ["bar","foo"] alist = ["bar","foo"]
if request.method == "POST": if request.method == "POST":
N = 1000
query = request.POST["query"] query = request.POST["query"]
N = int(request.POST["N"])
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N ) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
query_string = query.replace(" ","+") query_string = query.replace(" ","+")
...@@ -109,11 +148,18 @@ def doTheQuery(request , project_id): ...@@ -109,11 +148,18 @@ def doTheQuery(request , project_id):
if request.method == "POST": if request.method == "POST":
query = request.POST["query"] queries = request.POST["query"]
name = request.POST["string"] name = request.POST["string"]
# here we just realize queries already prepared by getGlobalStats
# ===> no need to repeat N parameter like in testISTEX <===
instancia = MedlineFetcher() instancia = MedlineFetcher()
thequeries = json.loads(query) thequeries = json.loads(queries)
# fyi the sum of our prepared yearly proportional quotas
sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
print("Scrapping Pubmed: '%s' (N=%i)" % (name,sampled_sum))
urlreqs = [] urlreqs = []
for yearquery in thequeries: for yearquery in thequeries:
...@@ -214,15 +260,22 @@ def testISTEX(request , project_id): ...@@ -214,15 +260,22 @@ def testISTEX(request , project_id):
if request.method == "POST": if request.method == "POST":
# print(alist)
query = "-" query = "-"
query_string = "-" query_string = "-"
N = 1000 N = 0
if "query" in request.POST: query = request.POST["query"]
if "string" in request.POST: query_string = request.POST["string"].replace(" ","+") if "query" in request.POST:
# if "N" in request.POST: N = request.POST["N"] query = request.POST["query"]
print(query_string , query , N) query_string = query.replace(" ","+") # url encoded q
if "N" in request.POST:
N = int(request.POST["N"]) # query_size from views_opti
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR (scrap: istex d/l ): ",msg)
raise ValueError(msg)
print("Scrapping Istex: '%s' (%i)" % (query_string , N))
urlreqs = [] urlreqs = []
pagesize = 50 pagesize = 50
...@@ -247,7 +300,8 @@ def testISTEX(request , project_id): ...@@ -247,7 +300,8 @@ def testISTEX(request , project_id):
session.add(corpus) session.add(corpus)
session.commit() session.commit()
corpus_id = corpus.id corpus_id = corpus.id
print("NEW CORPUS", corpus_id)
ensure_dir(request.user) ensure_dir(request.user)
tasks = MedlineFetcher() tasks = MedlineFetcher()
......
...@@ -249,7 +249,12 @@ ...@@ -249,7 +249,12 @@
return cookieValue; return cookieValue;
} }
var thequeries = [] var thequeries = [] ;
// load the template's value for N scan size
var querySize = parseInt({{query_size}}) ;
// TODO if is_admin
function doTheQuery() { function doTheQuery() {
if ( $('#submit_thing').prop('disabled') ) return; if ( $('#submit_thing').prop('disabled') ) return;
...@@ -257,7 +262,11 @@ ...@@ -257,7 +262,11 @@
var origQuery = $("#id_name").val() var origQuery = $("#id_name").val()
var pubmedifiedQuery = { query : JSON.stringify(thequeries) , string: origQuery } ; var pubmedifiedQuery = {
query : JSON.stringify(thequeries) ,
string: origQuery ,
N : querySize
} ;
console.log(pubmedifiedQuery) console.log(pubmedifiedQuery)
var projectid = window.location.href.split("project")[1].replace(/\//g, '')//replace all the slashes var projectid = window.location.href.split("project")[1].replace(/\//g, '')//replace all the slashes
...@@ -299,7 +308,7 @@ ...@@ -299,7 +308,7 @@
var origQuery = $("#id_name").val() var origQuery = $("#id_name").val()
console.log("printing the results:") console.log("printing the results:")
console.log(origQuery) console.log(origQuery)
testISTEX(origQuery.replace(" ","+"),1000) testISTEX(origQuery.replace(" ","+"), querySize)
} }
} }
else { else {
...@@ -324,9 +333,9 @@ ...@@ -324,9 +333,9 @@
console.log("in getGlobalResults()") console.log("in getGlobalResults()")
// AJAX to django // AJAX to django
var pubmedquery = $("#id_name").val() var pubmedquery = $("#id_name").val()
var Npubs = $("#id_N").val(); // var Npubs = $("#id_N").val();
if(pubmedquery=="") return; if(pubmedquery=="") return;
var formData = {query:pubmedquery , N:Npubs} var formData = {query:pubmedquery , N:querySize}
$("#theresults").html('<img width="30px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img>') $("#theresults").html('<img width="30px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img>')
console.log("disabling "+"#"+value.id) console.log("disabling "+"#"+value.id)
$("#"+value.id).prop('onclick',null); $("#"+value.id).prop('onclick',null);
...@@ -349,7 +358,7 @@ ...@@ -349,7 +358,7 @@
console.log("enabling "+"#"+value.id) console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);'); $("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false) // $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 1000 sample!") $("#submit_thing").html("Process a {{ query_size }} sample!")
thequeries = data thequeries = data
var N=0,k=0; var N=0,k=0;
...@@ -388,7 +397,7 @@ ...@@ -388,7 +397,7 @@
console.log("enabling "+"#"+value.id) console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);'); $("#"+value.id).attr('onclick','getGlobalResults(this);');
// $("#submit_thing").prop('disabled' , false) // $("#submit_thing").prop('disabled' , false)
$("#submit_thing").html("Process a 1000 sample!") $("#submit_thing").html("Process a {{ query_size }} sample!")
thequeries = data thequeries = data
var N=data.length,k=0; var N=data.length,k=0;
...@@ -494,20 +503,19 @@ ...@@ -494,20 +503,19 @@
return false; return false;
} }
function testISTEX(query,Npubs) { function testISTEX(query,N) {
console.log("in testISTEX:"); console.log("in testISTEX:");
if(!query || query=="") return; if(!query || query=="") return;
var origQuery = query var origQuery = query
var postQuery = { query : query , N: N }
var pubmedifiedQuery = { query : query , string: query }
var projectid = window.location.href.split("project")[1].replace(/\//g, '')//replace all the slashes var projectid = window.location.href.split("project")[1].replace(/\//g, '')//replace all the slashes
$.ajax({ $.ajax({
// contentType: "application/json", // contentType: "application/json",
url: window.location.origin+"/tests/project/"+projectid+"/ISTEXquery/go", url: window.location.origin+"/tests/project/"+projectid+"/ISTEXquery/go",
data: pubmedifiedQuery, data: postQuery,
type: 'POST', type: 'POST',
beforeSend: function(xhr) { beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken")); xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
......
...@@ -14,6 +14,8 @@ urlpatterns = patterns('', ...@@ -14,6 +14,8 @@ urlpatterns = patterns('',
url(r'paginator/corpus/(\d+)/$', views.newpaginatorJSON), url(r'paginator/corpus/(\d+)/$', views.newpaginatorJSON),
url(r'move2trash/$' , views.move_to_trash_multiple ), url(r'move2trash/$' , views.move_to_trash_multiple ),
# TODO correct and move to scappers
url(r'istextquery$', pubmedscrapper.getGlobalStatsISTEXT), # api/query?type=istext ? url(r'istextquery$', pubmedscrapper.getGlobalStatsISTEXT), # api/query?type=istext ?
url(r'pubmedquery$', pubmedscrapper.getGlobalStats), url(r'pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery), url(r'project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment