Commit 470f2841 authored by delanoe

[FEAT] Multivac/REPEC scan is OK. The parser still needs fixing.

parent 9cc5a609
......@@ -248,13 +248,22 @@ RESOURCETYPES = [
        'file_formats':["zip","xml"],
        "crawler": "CernCrawler",
    },
# { "type": 10,
# "name": 'REPEC [RIS]',
# "parser": "RISParser",
# "format": 'RIS',
# 'file_formats':["zip","ris", "txt"],
# "crawler": None,
# },
#
{ "type": 10,
"name": 'REPEC [RIS]',
"parser": "RISParser",
"format": 'RIS',
'file_formats':["zip","ris", "txt"],
"crawler": None,
"name": 'REPEC [MULTIVAC]',
"parser": "MultivacParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "MultivacCrawler",
},
]
#shortcut for resources declaration in template
PARSERS = [(n["type"],n["name"]) for n in RESOURCETYPES if n["parser"] is not None]
......
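Note: entries in RESOURCETYPES are resolved at runtime with get_resource and load_crawler, which is how the moissonneurs views below reach MultivacCrawler. A minimal sketch of that lookup, assuming only the constants shown in this diff:

    # hypothetical sketch, mirroring moissonneurs/multivac.py below
    from gargantext.constants import get_resource, load_crawler

    source = get_resource(10)                # the 'REPEC [MULTIVAC]' entry above
    if source["crawler"] is not None:        # "MultivacCrawler"
        crawlerbot = load_crawler(source)()  # resolve and instantiate the crawler class
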
......@@ -161,6 +161,9 @@ API_TOKENS ={
"APIKEY":'b8514451-82d1-408e-a855-56d342a0b5f8',
"APISECRET":'6680b13e-2b5a-4fba-8c0e-408884d5b904',
},
"MULTIVAC" :{
"APIKEY": "3a8ca010-1dff-11e7-97ef-a1a6aa4c2352"
}
}
# Internationalization
......
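Note: the crawler below does not hard-code the key; it reads it from settings on each request:

    from gargantext.settings import API_TOKENS
    api_key = API_TOKENS["MULTIVAC"]["APIKEY"]   # sent as the api_key query parameter
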
......@@ -4,7 +4,7 @@
# ***** CERN Scraper *****
# ****************************
# Author: c24b
# Date: 27/05/2016
import hmac, hashlib
import requests
import os
......@@ -96,10 +96,12 @@ class CernCrawler(Crawler):
        print(self.results_nb, "res")
        #self.generate_urls()
        return(self.ids)

    def generate_urls(self):
        '''generate raw urls of ONE record'''
        self.urls = ["http://repo.scoap3.org/record/%i/export/xm?ln=en" % rid for rid in self.ids]
        return self.urls

    def fetch_records(self, ids):
        '''for NEXT time'''
        raise NotImplementedError
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** MULTIVAC Scraper ****
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Crawler import *
import json
import requests

from gargantext.settings import API_TOKENS

class MultivacCrawler(Crawler):
    '''Multivac API client'''

    def __init__(self):
        self.apikey = API_TOKENS["MULTIVAC"]
        self.status = []   # status log appended to by download()
        # NB: self.path must be provided (e.g. by the Crawler base class)
        # before download() is called
        # Main EndPoints
        self.BASE_URL = "https://api.iscpif.fr/v2"
        self.API_URL = "pvt/economy/repec/search"
        # Final EndPoints
        # TODO: change the endpoint according to the type of database
        self.URL = self.BASE_URL + "/" + self.API_URL

    def __format_query__(self, query=None):
        '''format the query'''
        if query is not None:
            self.query = query
        else:
            self.query = ""
        return self.query
    def _get(self, query, fromPage=1, count=10, lang=None):
        # Parameters
        querystring = { "q"       : query
                      , "count"   : count
                      , "from"    : fromPage
                      , "api_key" : API_TOKENS["MULTIVAC"]["APIKEY"]
                      }
        if lang is not None:
            querystring["lang"] = lang
        # Specify Headers
        headers = {"cache-control": "no-cache"}
        # Do Request and get response
        response = requests.request( "GET"
                                   , self.URL
                                   , headers = headers
                                   , params  = querystring
                                   )
        # Validation: 200 if OK, else raise ValueError
        if response.status_code == 200:
            # assumes a header like "application/json; charset=UTF-8"
            charset = response.headers["Content-Type"].split("; ")[1].split("=")[1]
            return json.loads(response.content.decode(charset))
        else:
            raise ValueError(response.status_code, response.reason)
    def scan_results(self, query):
        '''
        scan_results : Returns the number of results
        Query String -> Int
        '''
        self.results_nb = 0
        total = self._get(query)["results"]["total"]
        self.results_nb = total
        return self.results_nb
    def download(self, query):
        downloaded = False
        self.status.append("fetching results")
        corpus = []
        paging = 100
        # make sure self.query is set for the paging requests below
        self.query = self.__format_query__(query)
        self.query_max = self.results_nb
        if self.query_max > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (self.query_max, QUERY_SIZE_N_MAX)
            print("ERROR (scrap: multivac d/l):", msg)
            self.query_max = QUERY_SIZE_N_MAX
        with open(self.path, 'wb') as f:
            for page in range(0, self.query_max, paging):
                corpus.append(self._get(self.query, fromPage=page, count=paging)["hits"])
            # NB: writes the Python repr of the hits, not strict JSON
            f.write(str(corpus).encode("utf-8"))
            downloaded = True
        return downloaded
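
Taken together, the intended flow is scan first, then download, which is what the moissonneurs views below do. A hedged usage sketch, assuming the module lives at gargantext.util.crawlers.MULTIVAC (an assumed path) and that self.path is set before download() is called (it is not set in this diff):

    # hypothetical usage sketch, not part of the commit
    from gargantext.util.crawlers.MULTIVAC import MultivacCrawler

    bot = MultivacCrawler()
    total = bot.scan_results("economy")   # GET https://api.iscpif.fr/v2/pvt/economy/repec/search
    print(total)                          # the query view returns this as {"results_nb": N}
    if total > 0:
        bot.download("economy")           # pages through "hits" 100 at a time into bot.path
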
# Scrapers config
from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.util.scheduling import scheduled
from gargantext.util.db import session
from requests_futures.sessions import FuturesSession
......@@ -31,9 +31,12 @@ class Crawler:
        # not great, but the easy version
        self.MONTH = str(date.today().month)
        if len(self.MONTH) == 1:
            self.MONTH = "0" + self.MONTH
        self.MAX_RESULTS = QUERY_SIZE_N_MAX
        try:
            self.results_nb = int(record["count"])
        except KeyError:
......@@ -67,6 +70,7 @@ class Crawler:
        if self.download():
            self.create_corpus()
            return self.corpus_id

    def get_sampling_dates(self):
        '''Create a sample list of min and max dates based on Y and M for N_LAST_YEARS results'''
......
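The zero-padding added to self.MONTH is equivalent to the usual format idiom; a one-line alternative, shown only for comparison:

    from datetime import date
    month = "%02d" % date.today().month   # e.g. "05" for May, same result as the if-branch above
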
......@@ -13,20 +13,21 @@ class ISTexParser(Parser):
        hyperdata_list = []
        hyperdata_path = {
            "id"               : "id",
            "source"           : "corpusName",
            "title"            : "title",
            "genre"            : "genre",
            "language_iso3"    : "language",
            "doi"              : "doi",
            "host"             : "host",
            "publication_date" : "publicationDate",
            "abstract"         : "abstract",
            # "authors"        : "author",
            "authorsRAW"       : "author",
            # "keywords"       : "keywords"
        }
        suma = 0

        for json_doc in json_docs:
            hyperdata = {}
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** MULTIVAC Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence
RESOURCE_TYPE_MULTIVAC = 10
from traceback import print_tb   # used in the error handler of save()

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata

def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_MULTIVAC)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            # old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)
            print(results)
            return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
def save(request, project_id):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except (TypeError, ValueError):
            N = 0
        print(query, N)
        # for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_MULTIVAC)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instantiation as a Django model
        corpus = Node(
            name      = query,
            user_id   = request.user.id,
            parent_id = project_id,
            typename  = 'CORPUS',
            hyperdata = { "action"      : "Scraping data"
                        , "language_id" : "en"
                        }
        )

        # download_file
        crawler_bot = load_crawler(source)()
        # for now no way to force downloading X records
        # the long-running command; download() returns a boolean,
        # the file itself lands in crawler_bot.path
        downloaded = crawler_bot.download(query)
        corpus.add_resource(
              type = source["type"]
            #, name = source["name"]
            , path = crawler_bot.path
        )

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        return render(
            template_name = 'pages/projects/wait.html',
            request       = request,
            context       = {
                'user'   : request.user,
                'project': project,
            },
        )

    data = [query, N]
    print(data)
    return JsonHttpResponse(data)
......@@ -21,6 +21,7 @@ from django.conf.urls import url
import moissonneurs.pubmed as pubmed
import moissonneurs.istex as istex
import moissonneurs.cern as cern
import moissonneurs.multivac as multivac
# TODO
#import moissonneurs.hal as hal
......@@ -36,6 +37,10 @@ urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
, url(r'^istex/query$' , istex.query )
, url(r'^istex/save/(\d+)' , istex.save )
, url(r'^cern/query$' , cern.query )
, url(r'^cern/save/(\d+)' , cern.save )
, url(r'^multivac/query$' , multivac.query )
, url(r'^multivac/save/(\d+)' , multivac.save )
]
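
With these routes registered, the template below talks to the scanner over plain POST. A hedged sketch of the request/response shape, assuming a local dev server and an authenticated session holding the csrftoken cookie that the JavaScript forwards:

    # hypothetical client sketch, not part of the commit
    import requests

    s = requests.Session()
    # ... log in first so the session carries csrftoken and sessionid (not shown) ...
    r = s.post("http://localhost:8000/moissonneurs/multivac/query",
               data    = {"query": "economy"},
               headers = {"X-CSRFToken": s.cookies.get("csrftoken", "")})
    print(r.json())   # e.g. {"results_nb": 1234}
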
......@@ -210,8 +210,10 @@
        // show Radio-Inputs and trigger FileOrNotFile>@upload-file events
        selected = selected.toLowerCase()
        var is_pubmed = (selected.indexOf('pubmed') != -1);
        var is_istex  = (selected.indexOf('istex' ) != -1);
        var is_repec  = (selected.indexOf('repec' ) != -1);
        if (is_pubmed || is_istex || is_repec) {
            // if(selected=="pubmed") {
            console.log("show the button for: " + selected)
            $("#pubmedcrawl").css("visibility", "visible");
......
......@@ -545,7 +545,7 @@
            },
            error: function(result) {
                $("#theresults").html("Pubmed connection error.</i><br>")
                $('#submit_thing').prop('disabled', true);
            }
        });
......@@ -643,6 +643,68 @@
});
}
        //MULTIVAC = 10
        if (SourceTypeId == "10"){
            $.ajax({
                // contentType: "application/json",
                url: window.location.origin+"/moissonneurs/multivac/query",
                data: formData,
                type: 'POST',
                beforeSend: function(xhr) {
                    xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
                },
                success: function(data) {
                    console.log("SUCCESS")
                    console.log("enabling "+"#"+value.id)
                    $("#"+value.id).attr('onclick','getGlobalResults(this);');
                    $("#submit_thing").prop('disabled' , false)
                    //$("#submit_thing").html("Process a {{ query_size }} sample!")

                    N = data["results_nb"]
                    if (N > 0) {
                        if (N <= {{query_size}}) {
                            $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
                            $("#submit_thing").html("Download!")
                            $("#submit_thing").prop('disabled' , false)
                            //$("#submit_thing").attr('onclick', testCERN(query, N));
                            $("#submit_thing").on("click", function(){
                                testCERN(pubmedquery, N);
                                //$("#submit_thing").onclick()
                            })
                        }
                        // (N > {{query_size}})
                        else {
                            $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
                            $('#submit_thing').prop('disabled', false);
                            $("#submit_thing").html("Processing a sample file")
                            $("#submit_thing").on("click", function(){
                                testCERN(pubmedquery, N);
                                //$("#submit_thing").onclick()
                            })
                        }
                    }
                    else {
                        $("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!</i><br>")
                        if (data[0] == false)
                            $("#theresults").html(theType + " connection error!</i><br>")
                        $('#submit_thing').prop('disabled', true);
                    }
                },
                error: function(result) {
                    $("#theresults").html(theType + " connection error!</i><br>")
                    $('#submit_thing').prop('disabled', true);
                }
            });
        }
    }
// CSS events for selecting one Radio-Input
......@@ -689,7 +751,7 @@
console.log("selected:", selectedId);
// by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN
if(selectedId =="3" || selectedId == "8" || selectedId == "9") {
if(selectedId =="3" || selectedId == "8" || selectedId == "9" || selectedId == "10") {
console.log("show the button for: " + selectedId)
$("#div-fileornot").css("visibility", "visible");
$("#div-fileornot").show();
......