diff --git a/gargantext/util/crawlers/CERN.py b/gargantext/util/crawlers/CERN.py
index eeda843b21c8f404954720a7c9f758fae6338cec..4820b2bf107d73f2739a81a661faee61c085908a 100644
--- a/gargantext/util/crawlers/CERN.py
+++ b/gargantext/util/crawlers/CERN.py
@@ -5,28 +5,31 @@
 # ****************************
 # Author:c24b
 # Date: 27/05/2015
-
-from ._Crawler import Crawler
-
 import hmac, hashlib
 import requests
 import os
 import random
+
 import urllib.parse as uparse
 from lxml import etree
 from gargantext.settings import API_TOKENS
-#from gargantext.util.files import build_corpus_path
-from gargantext.util.db import session
-from gargantext.models import Node
+from ._Crawler import Crawler
+from gargantext.util.timeit_damnit import timing
+
 class CernCrawler(Crawler):
     '''CERN SCOAP3 API Interaction'''
+    def __init__(self):
+        API = API_TOKENS["CERN"]
+        self.apikey = API["APIKEY"].encode("utf-8")
+        self.secret = bytearray(API["APISECRET"].encode("utf-8"))
+        self.BASE_URL = u"http://api.scoap3.org/search?"
 
     def __generate_signature__(self, url):
-        '''creation de la signature'''
-        #hmac-sha1 salted with secret
-        return hmac.new(self.secret,url, hashlib.sha1).hexdigest()
+        '''create the signature'''
+        #hmac-sha1 keyed with the api secret
+        return hmac.new(self.secret, url.encode("utf-8"), hashlib.sha1).hexdigest()
 
     def __format_query__(self, query, of="xm", fields= None):
         ''' for query filters params
@@ -45,89 +48,71 @@ class CernCrawler(Crawler):
 
     def __format_url__(self, dict_q):
         '''format the url with encoded query'''
-        #add the apikey
-        dict_q["apikey"] = [self.apikey]
-        params = "&".join([(str(k)+"="+str(uparse.quote(v[0]))) for k,v in sorted(dict_q.items())])
+        #add the apikey to the query before signing
+        dict_q["apikey"] = self.apikey
+        #dict_q["p"] = dict_q["p"].replace(" ", "+") >> quote_plus
+        params = "&".join(str(k)+"="+uparse.quote_plus(v) for k, v in sorted(dict_q.items()))
         return self.BASE_URL+params
 
     def sign_url(self, dict_q):
         '''add signature'''
-        API = API_TOKENS["CERN"]
-        self.apikey = API["APIKEY"]
-        self.secret = API["APISECRET"].encode("utf-8")
-        self.BASE_URL = u"http://api.scoap3.org/search?"
         url = self.__format_url__(dict_q)
-        return url+"&signature="+self.__generate_signature__(url.encode("utf-8"))
+        return url+"&signature="+self.__generate_signature__(url)
-
-    def create_corpus(self):
-        #create a corpus
-        corpus = Node(
-            name = self.query,
-            #user_id = self.user_id,
-            parent_id = self.project_id,
-            typename = 'CORPUS',
-            hyperdata = { "action" : "Scrapping data"
-                        , "language_id" : self.type["default_language"]
-                        }
-            )
-        #add the resource
-        corpus.add_resource(
-            type = self.type["type"],
-            name = self.type["name"],
-            path = self.path)
-
-        try:
-            print("PARSING")
-            # p = eval(self.type["parser"])()
-            session.add(corpus)
-            session.commit()
-            self.corpus_id = corpus.id
-            parse_extract_indexhyperdata(corpus.id)
-            return self
-        except Exception as error:
-            print('WORKFLOW ERROR')
-            print(error)
-            session.rollback()
-            return self
-
-    def download(self):
-        import time
+    @timing
+    def download(self, query):
         self.path = "/tmp/results.xml"
-        query = self.__format_query__(self.query)
+        query = self.__format_query__(query)
         url = self.sign_url(query)
-        start = time.time()
         r = requests.get(url, stream=True)
         downloaded = False
         #the long part
         with open(self.path, 'wb') as f:
             print("Downloading file")
             for chunk in r.iter_content(chunk_size=1024):
-                if chunk: # filter out keep-alive new chunks
                     #print("===")
                     f.write(chunk)
                     downloaded = True
-        end = time.time()
-        #print (">>>>>>>>>>LOAD results", end-start)
         return downloaded
 
-    def scan_results(self):
-        '''scanner le nombre de resultat en récupérant 1 seul résultat
-        qui affiche uniquement l'auteur de la page 1
-        on récupère le commentaire en haut de la page
-        '''
-        import time
-
-        self.results_nb = 0
-        query = self.__format_query__(self.query, of="hb")
+    def get_ids(self, query):
+        '''get the result count and the individual ids of a search query'''
+        dict_q = uparse.parse_qs(query)
+        #parameters for a global request
+        dict_q["p"] = query
+        dict_q["of"] = "id"
+        dict_q["rg"] = "10000"
+        #api key is added when formatting url
+        url = self.__format_url__(dict_q)
+        signed_url = url+"&signature="+self.__generate_signature__(url)
+        r = requests.get(signed_url)
+        print(signed_url)
+        self.ids = r.json()
+        #self.results_nb = len(self.ids)
+        #self.generate_urls()
+        return self.ids
+
+    def generate_urls(self):
+        '''generate the raw export url of each record id'''
+        self.urls = ["http://repo.scoap3.org/record/%i/export/xm?ln=en" % rid for rid in self.ids]
+        return self.urls
+
+    def fetch_records(self, ids):
+        '''for NEXT time: fetch the full records by id'''
+        raise NotImplementedError
+
+    def scan_results(self, query):
+        '''[OLD] scan the number of results by fetching a single record
+        that displays only the author of page 1;
+        the total is read from the comment at the top of the page
+        '''
+        query = self.__format_query__(query, of="hb")
         query["ot"] = "100"
         query["jrec"]='1'
         query["rg"]='1'
         url = self.sign_url(query)
-        print(url)
+        #print(url)
         #start = time.time()
         r = requests.get(url)
         #end = time.time()
diff --git a/gargantext/util/crawlers/_Crawler.py b/gargantext/util/crawlers/_Crawler.py
index 0143b18a663521628ee1ff33926efeb5b74919e4..d3efa290688cd327d186538cabd5cf9c1254c4a9 100644
--- a/gargantext/util/crawlers/_Crawler.py
+++ b/gargantext/util/crawlers/_Crawler.py
@@ -101,7 +101,8 @@ class Crawler:
             )
             session.add(corpus)
             session.commit()
-            scheduled(parse_extract_indexhyperdata)(corpus.id)
+            #scheduled() wraps the task function: pass the args to the wrapper
+            scheduled(parse_extract_indexhyperdata)(corpus.id)
         else:
             #add the resource
             corpus.add_resource(
diff --git a/moissonneurs/cern.py b/moissonneurs/cern.py
index 5b7a44877647f5e7dc568a0794eeda0a170c44eb..47acffc0918568a43070ddf5ac020a30cfa954c5 100644
--- a/moissonneurs/cern.py
+++ b/moissonneurs/cern.py
@@ -1,179 +1,116 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # ****************************
-# *****  CERN Scrapper   *****
+# *****   CERN Crawler   *****
 # ****************************
+RESOURCE_TYPE_SCOAP = 9
-import logging
-
-from logging.handlers import RotatingFileHandler
-
-# création de l'objet logger qui va nous servir à écrire dans les logs
-logger = logging.getLogger()
-# on met le niveau du logger à DEBUG, comme ça il écrit tout
-logger.setLevel(logging.DEBUG)
-
-# création d'un formateur qui va ajouter le temps, le niveau
-# de chaque message quand on écrira un message dans le log
-formatter = logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s')
-
-# création d'un handler qui va rediriger une écriture du log vers
-# un fichier en mode 'append', avec 1 backup et une taille max de 1Mo
-#>>> Permission denied entre en conflit avec les los django
-#file_handler = RotatingFileHandler('.activity.log', 'a', 1000000, 1)
-# on lui met le niveau sur DEBUG, on lui dit qu'il doit utiliser le formateur
-# créé précédement et on ajoute ce handler au logger
-#~ file_handler.setLevel(logging.DEBUG)
-#~ file_handler.setFormatter(formatter)
-#~ logger.addHandler(file_handler)
-
-# création d'un second handler qui va rediriger chaque écriture de log
-# sur la console
-steam_handler = logging.StreamHandler()
-steam_handler.setLevel(logging.DEBUG)
-logger.addHandler(steam_handler)
-
-import json
-import datetime
-from os import path
-import threading
-import hmac, hashlib
-import requests
-import lxml
-import subprocess
-import urllib.parse as uparse
-from lxml import etree
-from bs4 import BeautifulSoup, Comment
-from collections import defaultdict
-
-
-
-#from gargantext.util.files import download
-
-from gargantext.settings import API_TOKENS as API
-#from private import API_PERMISSIONS
-
-def save( request , project_id ) :
-    try:
-        project_id = int(project_id)
-    except ValueError:
-        raise Http404()
-    # do we have a valid project?
-    project = session.query( Node ).filter(Node.id == project_id).first()
-    if project is None:
-        raise Http404()
-    user = cache.User[request.user.id]
-    if not user.owns(project):
-        raise HttpResponseForbidden()
+from traceback import print_tb
+
+from django.shortcuts import redirect, render
+from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
+from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
+from gargantext.models.nodes import Node
+from gargantext.util.db import session
+from gargantext.util.db_cache import cache
+from gargantext.util.http import JsonHttpResponse
+from gargantext.util.scheduling import scheduled
+from gargantext.util.toolchain import parse_extract_indexhyperdata
 
-    if request.method == "POST":
-        query = request.POST["query"]
-
-        name = request.POST["string"]
-        corpus = project.add_child( name=name
-                                  , typename = "CORPUS"
-                                  )
-        corpus.add_resource( type = resourcetype('Cern (MARC21 XML)')
-                           , path = filename
-                           , url = None
-                           )
-        print("Adding the resource")
-def query( request ):
-    print(request.method)
-    alist = []
+def query(request):
+    '''get the result count and the matching ids for a query (GlobalResults)'''
     if request.method == "POST":
         query = request.POST["query"]
-        N = int(request.POST["N"])
+        source = get_resource(RESOURCE_TYPE_SCOAP)
+        if source["crawler"] is not None:
+            crawlerbot = load_crawler(source)()
+            #old raw way to get results_nb
+            #results = crawlerbot.scan_results(query)
+            ids = crawlerbot.get_ids(query)
+            return JsonHttpResponse({"results_nb": len(ids), "ids": ids})
+    raise Http404()
+
+def save(request, project_id):
+    '''save the query results as a new corpus'''
+    if request.method == "POST":
+        query = request.POST.get("query")
+        try:
+            N = int(request.POST.get("N"))
+        except (TypeError, ValueError):
+            N = 0
+        print(query, N)
+        #for next time
+        #ids = request.POST["ids"]
+        source = get_resource(RESOURCE_TYPE_SCOAP)
+        if N == 0:
+            raise Http404()
         if N > QUERY_SIZE_N_MAX:
-            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
-            print("ERROR(scrap: pubmed stats): ",msg)
-            raise ValueError(msg)
-
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
-        #Here Requests API
-        #
-        #API_TOKEN = API["CERN"]
-
-        #instancia = Scraper()
-
-        # serialFetcher (n_last_years, query, query_size)
-        #alist = instancia.serialFetcher( 5, query , N )
-
-        data = alist
+            N = QUERY_SIZE_N_MAX
+
+        try:
+            project_id = int(project_id)
+        except ValueError:
+            raise Http404()
+        # do we have a valid project?
+        project = session.query( Node ).filter(Node.id == project_id).first()
+        if project is None:
+            raise Http404()
+        user = cache.User[request.user.id]
+        if not user.owns(project):
+            return HttpResponseForbidden()
+
+        # corpus node instantiation as a Django model
+        corpus = Node(
+            name = query,
+            user_id = request.user.id,
+            parent_id = project_id,
+            typename = 'CORPUS',
+            hyperdata = { "action"      : "Scraping data"
+                        , "language_id" : "en"
+                        }
+        )
+
+        #download_file
+        crawler_bot = load_crawler(source)()
+        #for now no way to force downloading X records
+
+        #the long running command (download() returns a boolean)
+        downloaded = crawler_bot.download(query)
+        corpus.add_resource(
+              type = source["type"]
+            #, name = source["name"]
+            , path = crawler_bot.path
+        )
+
+        session.add(corpus)
+        session.commit()
+        #corpus_id = corpus.id
+
+        try:
+            scheduled(parse_extract_indexhyperdata)(corpus.id)
+        except Exception as error:
+            print('WORKFLOW ERROR')
+            print(error)
+            try:
+                print_tb(error.__traceback__)
+            except Exception:
+                pass
+            # IMPORTANT ---------------------------------
+            # sanitize session after interrupted transact
+            session.rollback()
+            # --------------------------------------------
+
+        return render(
+            template_name = 'pages/projects/wait.html',
+            request = request,
+            context = {
+                'user'   : request.user,
+                'project': project,
+            },
+        )
+
-    return JsonHttpResponse(data)
+    raise Http404()
-
-
-class CERN_API(object):
-    '''CERN SCOAP3 Interaction'''
-    def __init__(self,query, filename= "./results.xml"):
-        self.query = query
-        self.apikey = API["TOKEN"]
-        self.secret = API["SECRET"].encode("utf-8")
-        self.results = self.get_results(filename)
-        self.BASE_URL= u"http://api.scoap3.org/search?"
-    def __generate_signature__(self, url):
-        '''creation de la signature'''
-        #hmac-sha1 salted with secret
-        return hmac.new(self.secret,url, hashlib.sha1).hexdigest()
-
-    def __format_url__(self):
-        '''format the url with encoded query'''
-        dict_q = uparse.parse_qs(self.query)
-        #add the apikey
-        dict_q["apikey"] = [self.apikey]
-        params = "&".join([(str(k)+"="+str(uparse.quote(v[0]))) for k,v in sorted(dict_q.items())])
-        return self.BASE_URL+params
-
-    def sign_url(self):
-        '''add signature'''
-        url = self.__format_url__()
-        return url+"&signature="+self.__generate_signature__(url.encode("utf-8"))
-
-    def get_results(self, filename):
-        url = self.sign_url()
-        r = requests.get(url, stream=True)
-        with open(filename, 'wb') as f:
-            for chunk in r.iter_content(chunk_size=1024):
-                if chunk: # filter out keep-alive new chunks
-                    f.write(chunk)
-        return filename
-
-    def parse_xml(filename,MARCXML):
-        parser = etree.XMLParser()
-        with open(self.filename, 'r') as f:
-            root = etree.tostring(f.read())
-            data = f.read()
-        records = []
-        for record in data.split("<record>")[1:]:
-            soup = BeautifulSoup("<record>"+record, "lxml")
-            r = {v:[] for v in self.MARC21["700"].values()}
-            r["uid"] = soup.find("controlfield").text
-
-            for data in soup.find_all("datafield"):
-                tag = data.get("tag")
-                if tag in self.MARC21.keys():
-                    for sub in data.find_all("subfield"):
-                        code = sub.get("code")
-                        if code in self.MARC21[tag].keys():
-                            if tag == "700":
-                                r[self.MARC21[tag][code]].append(sub.text)
-                            else:
-                                r[self.MARC21[tag][code]] = sub.text
-            records.append(r.decode('utf-8'))
-    return JsonHttpResponse(records)
-
-
-#query="of=xm"
-#a = CERN_API(query, "./full.xml")
-#p = CERNParser("./full.xml")
-#print(p.MARC21.keys())
-#~ #p.parse()
-#~ with open("./results_full.json", "r") as f:
-    #~ data = json.load(f)
-    #~ for record in data["records"]:
-        #~ print(record.keys())
diff --git a/moissonneurs/urls.py b/moissonneurs/urls.py
index 160d3de0a077f3c51d454dca4eaeee2ee2692c56..567a143ad0985c0d687c039cacc6c152f1b81fc7 100644
--- a/moissonneurs/urls.py
+++ b/moissonneurs/urls.py
@@ -13,15 +13,13 @@
 # Available databases :
 ## Pubmed
 ## IsTex,
-## En cours CERN
+## CERN
 
 from django.conf.urls import url
 
 import moissonneurs.pubmed   as pubmed
 import moissonneurs.istex    as istex
-
-# TODO
 import moissonneurs.cern  as cern
 
 # TODO
@@ -38,8 +36,6 @@ urlpatterns = [ url(r'^pubmed/query$'     , pubmed.query   )
               , url(r'^istex/query$'      , istex.query    )
               , url(r'^istex/save/(\d+)'  , istex.save     )
-
-              # TODO
-              , url(r'^scoap3/query$'     , cern.query     )
-              , url(r'^scoap3/save/(\d+)' , cern.save      )
+              , url(r'^cern/query$'       , cern.query     )
+              , url(r'^cern/save/(\d+)'   , cern.save      )
 ]
diff --git a/templates/pages/projects/project.html b/templates/pages/projects/project.html
index edeaa7418e16e263faf95631254243efedce1c3a..e493f015a3b923c65cd0de669504dff40842644f 100644
--- a/templates/pages/projects/project.html
+++ b/templates/pages/projects/project.html
@@ -61,6 +61,23 @@
                 </p>
             </div>
         </div>
+        <!-- Modal -->
+        <div id="wait" class="modal" style="top:8%;">
+            <div class="jumbotron">
+                <div class="container theme-showcase jumbotron" role="main">
+                    <div>
+                        <div class="col-md-3"></div>
+                        <div class="col-md-6">
+                            <h2>Your file has been uploaded!</h2>
+                            <h2>Gargantext needs some time to eat it.</h2>
+                            <h2>Duration depends on the size of the dish.</h2>
+                            <a class="btn btn-primary btn-lg" href="/projects/{{ project.id }}" title="Click and test by yourself">Continue on Gargantext</a>
+                        </div>
+                        <div class="col-md-3"></div>
+                    </div>
+                </div>
+            </div>
+        </div>
     </div>
@@ -71,16 +88,7 @@
 
     <div class="container">
 
-        <!-- Modal -->
-        <div id="wait" class="modal row col-md-6">
-            <div class="modal-dialog ">
-                <h2>Your file has been uploaded ! </h2>
-                <h2>Gargantext need some time to eat it.</h2>
-                <h2>Duration depends on the size of the dish.</h2>
-                <a class="btn btn-primary btn-lg" href="/projects/{{ project.id }}" title="Click and test by yourself">Continue on Gargantext</a>
-            </div>
-        </div>
 
     {% if list_corpora %}
         {% for key, corpora in list_corpora.items %}
@@ -258,8 +266,8 @@
                 <td>
                     <div id="pubmedcrawl" style="visibility: hidden;">
                         Do you have a file already?
-                        <input type="radio" id="file_yes" name="file1" onclick="FileOrNotFile(this.value);" class="file1" value="true" checked>Yes </input>
-                        <input type="radio" id="file_no" name="file1" onclick="FileOrNotFile(this.value);" class="file1" value="false">No </input>
+                        <input type="radio" id="file_yes" name="file1" onclick="FileOrNotFile(this.value);" class="file1" value="true" checked> Yes
+                        <input type="radio" id="file_no" name="file1" onclick="FileOrNotFile(this.value);" class="file1" value="false"> No
                     </div>
                 </td>
             </tr>
@@ -336,7 +344,7 @@
                     $('#addcorpus').modal('hide');
                     $("#wait").modal("show");
-                }, 3000);
+                }, 600);
             },
             error: function(result) {
                 console.log("in doTheQuery(). Data not found");
Data not found"); @@ -391,10 +399,11 @@ $("#theresults").html('<img width="30px" src="{% static "img/loading-bar.gif" %}"></img>') console.log("disabling "+"#"+value.id) $("#"+value.id).prop('onclick',null); - var theType = $("#id_type option:selected").html(); - - if(theType=="Pubmed [XML]") { + var SourceTypeId = $("#id_type").val() + //alert(SourceTypeId); + //PUBMED = 3 + if(SourceTypeId == "3") { $.ajax({ // contentType: "application/json", url: window.location.origin+"/moissonneurs/pubmed/query", @@ -434,7 +443,7 @@ }); } - if(theType=="ISTex") { + if(SourceTypeId == "8") { console.log(window.location.origin+"moissonneurs/istex/query") $.ajax({ // contentType: "application/json", @@ -470,6 +479,64 @@ } }); } + //<option value="9">SCOAP [XML]</option> + if (SourceTypeId == "9"){ + $.ajax({ + // contentType: "application/json", + url: window.location.origin+"/moissonneurs/cern/query", + data: formData, + type: 'POST', + beforeSend: function(xhr) { + xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken")); + }, + success: function(data) { + console.log("SUCCESS") + console.log("enabling "+"#"+value.id) + $("#"+value.id).attr('onclick','getGlobalResults(this);'); + $("#submit_thing").prop('disabled' , false) + //$("#submit_thing").html("Process a {{ query_size }} sample!") + + N = data["results_nb"] + + + + if(N > 0) { + if (N <= {{query_size}}){ + $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>") + $("#submit_thing").html("Download!") + $("#submit_thing").prop('disabled' , false) + //$("#submit_thing").attr('onclick', testCERN(query, N)); + $("#submit_thing").on("click", function(){ + testCERN(pubmedquery, N); + //$("#submit_thing").onclick() + })} + //(N > {{query_size}}) + else { + $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>") + $('#submit_thing').prop('disabled', false); + $("#submit_thing").html("Processing a sample file") + $("#submit_thing").on("click", function(){ + testCERN(pubmedquery, N); + //$("#submit_thing").onclick() + })} + } + + + else { + $("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>") + if(data[0]==false) + $("#theresults").html(theType +" connection error!</i><br>") + $('#submit_thing').prop('disabled', false); + } + + }, + error: function(result) { + $("#theresults").html(theType +" connection error!</i><br>") + $('#submit_thing').prop('disabled', true); + } + }); + + } } // CSS events for selecting one Radio-Input @@ -508,17 +575,20 @@ } //CSS events for changing the Select element + function CustomForSelect( selected ) { + // show Radio-Inputs and trigger FileOrNotFile>@upload-file events - selected = selected.toLowerCase() - var is_pubmed = (selected.indexOf('pubmed') != -1); - var is_istex = (selected.indexOf('istex') != -1); - if (is_pubmed || is_istex) { - // if(selected=="pubmed") { + + selectedId = $("#id_type").val() + //alert(selectedId); + //by typeID 3 = PUBMED 8 = ISTEX 9 = CERN + if(selectedId =="3" || selectedId == "8" || selectedId == "9") { console.log("show the button for: " + selected) $("#pubmedcrawl").css("visibility", "visible"); $("#pubmedcrawl").show(); - $("#file_yes").click(); + + //$("#file_yes").click(); $("#submit_thing").html("Process this!") } // hide Radio-Inputs and trigger @upload-file events @@ -557,6 +627,51 @@ }); return false; } + function testCERN(query, N){ + //alert("CERN!") + console.log("In testCern") + if(!query || query=="") return; + //var origQuery = query + + var data = { "query" : query , "N": N }; + var projectid = 
+        console.log(data)
+        $.ajax({
+            dataType: 'json',
+            url: window.location.origin+"/moissonneurs/cern/save/"+projectid,
+            data: data,
+            type: 'POST',
+            beforeSend: function(xhr) {
+                xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
+            },
+            success: function(data) {
+                console.log("ajax_success: in testCERN()")
+                console.log(data)
+                setTimeout(
+                    function() {
+                        $('#addcorpus').modal('hide')
+                        $("#wait").modal("show")
+                        //location.reload();
+                    }, 600);
+            },
+            error: function(data) {
+                console.log(data)
+                setTimeout(
+                    function() {
+                        $('#addcorpus').modal('hide')
+                        $("#wait").modal("show")
+                        //location.reload();
+                    }, 600);
+            }
+        });
+    }
 
     function testISTEX(query,N) {
         console.log("in testISTEX:");
@@ -584,10 +699,10 @@
                     $("#wait").modal("show");
                     //location.reload();
-                }, 3000);
+                }, 600);
             },
-        error: function(result) {
-            console.log("in testISTEX(). Data not found");
+            error: function(result) {
+                console.log("in testISTEX(). Data not found");
             }
         });
     }
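
A note on the request signing implemented by `CernCrawler.__format_url__` and `sign_url` above: the query parameters, `apikey` included, are sorted, url-encoded with `quote_plus`, appended to `BASE_URL`, and the resulting URL is authenticated with HMAC-SHA1 under the API secret, the hex digest being appended as a final `signature` parameter. A minimal standalone sketch of that flow, with dummy placeholder credentials (`APIKEY`/`APISECRET` are not real tokens):

```python
import hmac, hashlib
import urllib.parse as uparse

BASE_URL = "http://api.scoap3.org/search?"

def sign(dict_q, apikey=b"APIKEY", secret=b"APISECRET"):
    # the apikey travels inside the signed query string
    dict_q["apikey"] = apikey
    # sorting the parameters makes the string-to-sign deterministic,
    # so client and server MAC the same bytes
    params = "&".join(str(k) + "=" + uparse.quote_plus(v)
                      for k, v in sorted(dict_q.items()))
    url = BASE_URL + params
    # hmac-sha1 keyed with the secret, hex-encoded, appended last
    signature = hmac.new(secret, url.encode("utf-8"), hashlib.sha1).hexdigest()
    return url + "&signature=" + signature

print(sign({"p": "higgs boson", "of": "id", "rg": "10000"}))
```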
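
`get_ids` pipes the raw user query through `uparse.parse_qs` before forcing the `p`/`of`/`rg` parameters. Two behaviours of `parse_qs` matter here: a bare text query contains no `=` pairs and parses to an empty dict, while an actual query string parses to *list* values, which `__format_url__`'s `quote_plus(v)` would reject with a TypeError. So the method is effectively safe for plain-text queries only:

```python
import urllib.parse as uparse

# a bare query has no '=' pairs: parse_qs drops everything
print(uparse.parse_qs("higgs boson"))      # {}

# a real query string parses to lists, not scalars
print(uparse.parse_qs("p=higgs&of=xm"))    # {'p': ['higgs'], 'of': ['xm']}

# get_ids then sets the (Invenio-style) parameters it needs:
dict_q = uparse.parse_qs("higgs boson")
dict_q["p"]  = "higgs boson"   # p  = search pattern
dict_q["of"] = "id"            # of = output format (record ids only)
dict_q["rg"] = "10000"         # rg = records per page
print(sorted(dict_q.items()))
```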
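
On the `_Crawler.py` hunk: `scheduled` (from `gargantext.util.scheduling`, not shown in this diff) receives the task *function* and returns a wrapper that defers the call, which is why the arguments belong to the wrapper, `scheduled(parse_extract_indexhyperdata)(corpus.id)`, exactly as `moissonneurs/cern.py` calls it. Writing `scheduled(task(corpus.id))` would run the task synchronously and schedule its return value instead. A stand-in sketch, under that assumption about `scheduled`'s shape:

```python
def scheduled(task):
    '''Stand-in for gargantext's scheduler: defer a task instead of running it.'''
    def enqueue(*args, **kwargs):
        # a real implementation would push (task, args) onto a worker queue
        print("enqueueing %s%r" % (task.__name__, args))
        return task(*args, **kwargs)
    return enqueue

def parse_extract_indexhyperdata(corpus_id):
    print("parsing corpus", corpus_id)

scheduled(parse_extract_indexhyperdata)(42)   # correct: the wrapper gets the args
```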
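
Lastly, `CernCrawler.download` streams the signed URL straight to disk, so a large MARCXML result set never has to fit in memory. The same pattern in isolation (the URL and path here are placeholders):

```python
import requests

def stream_to_file(url, path="/tmp/results.xml", chunk_size=1024):
    downloaded = False
    r = requests.get(url, stream=True)      # body is fetched lazily
    with open(path, "wb") as f:
        for chunk in r.iter_content(chunk_size=chunk_size):
            f.write(chunk)                  # empty keep-alive chunks are no-op writes
            downloaded = True
    return downloaded

# stream_to_file(signed_url) with a URL built by CernCrawler.sign_url(...)
```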