Commit 470f2841 authored by delanoe

[FEAT] Multivac/REPEC scan is ok. Needs to fix parser.

parent 9cc5a609
@@ -248,13 +248,22 @@ RESOURCETYPES = [
       'file_formats':["zip","xml"],
       "crawler": "CernCrawler",
     },
+    # { "type": 10,
+    #   "name": 'REPEC [RIS]',
+    #   "parser": "RISParser",
+    #   "format": 'RIS',
+    #   'file_formats':["zip","ris", "txt"],
+    #   "crawler": None,
+    # },
+    #
     { "type": 10,
-      "name": 'REPEC [RIS]',
-      "parser": "RISParser",
-      "format": 'RIS',
-      'file_formats':["zip","ris", "txt"],
-      "crawler": None,
+      "name": 'REPEC [MULTIVAC]',
+      "parser": "MultivacParser",
+      "format": 'JSON',
+      'file_formats':["zip","json"],
+      "crawler": "MultivacCrawler",
     },
 ]
 #shortcut for resources declaration in template
 PARSERS = [(n["type"],n["name"]) for n in RESOURCETYPES if n["parser"] is not None]
...
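The new entry swaps REPEC's RIS import for the Multivac JSON API while keeping type id 10. As a minimal sketch of how this registry is consumed, assuming a simple lookup by type id (the lookup body is an assumption; only the entry shape above comes from the diff):

    # Hypothetical illustration; the real get_resource lives in gargantext.constants.
    RESOURCETYPES = [
        { "type": 10,
          "name": 'REPEC [MULTIVAC]',
          "parser": "MultivacParser",
          "format": 'JSON',
          'file_formats': ["zip", "json"],
          "crawler": "MultivacCrawler",
        },
    ]

    def get_resource(sourcetype):
        '''Assumed behavior: return the declaration matching a type id.'''
        for n in RESOURCETYPES:
            if n["type"] == sourcetype:
                return n
        return None

    source = get_resource(10)
    print(source["crawler"])   # -> "MultivacCrawler"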
@@ -161,6 +161,9 @@ API_TOKENS ={
         "APIKEY":'b8514451-82d1-408e-a855-56d342a0b5f8',
         "APISECRET":'6680b13e-2b5a-4fba-8c0e-408884d5b904',
     },
+    "MULTIVAC" : {
+        "APIKEY": "3a8ca010-1dff-11e7-97ef-a1a6aa4c2352"
+    }
 }
 # Internationalization
...
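The crawler below reads this block directly; a one-line sketch of the assumed access pattern, mirroring MultivacCrawler.__init__ further down (key shortened here):

    # Assumed usage; the key value is abbreviated for illustration.
    API_TOKENS = {"MULTIVAC": {"APIKEY": "3a8ca010-..."}}
    apikey = API_TOKENS["MULTIVAC"]["APIKEY"]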
@@ -4,7 +4,7 @@
 # ****************************
 # ***** CERN Scraper ********
 # ****************************
 # Author: c24b
-# Date: 27/05/2015
+# Date: 27/05/2016
 import hmac, hashlib
 import requests
 import os
@@ -96,10 +96,12 @@ class CernCrawler(Crawler):
         print(self.results_nb, "res")
         #self.generate_urls()
         return(self.ids)
+
     def generate_urls(self):
         ''' generate raw urls of ONE record'''
         self.urls = ["http://repo.scoap3.org/record/%i/export/xm?ln=en" %rid for rid in self.ids]
         return self.urls
+
     def fetch_records(self, ids):
         ''' for NEXT time'''
         raise NotImplementedError
...
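As a quick sanity check of the URL template in generate_urls above, a sketch with a made-up record id:

    # Hypothetical record id, for illustration only.
    ids = [12345]
    urls = ["http://repo.scoap3.org/record/%i/export/xm?ln=en" % rid for rid in ids]
    print(urls[0])   # -> http://repo.scoap3.org/record/12345/export/xm?ln=en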
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# ****************************
+# **** MULTIVAC Scraper ****
+# ****************************
+# CNRS COPYRIGHTS
+# SEE LEGAL LICENCE OF GARGANTEXT.ORG
 from ._Crawler import *
 import json
...
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** MULTIVAC Scraper ****
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG

from ._Crawler import *
import json
from gargantext.settings import API_TOKENS


class MultivacCrawler(Crawler):
    ''' Multivac API CLIENT'''

    def __init__(self):
        self.apikey = API_TOKENS["MULTIVAC"]

        # Main EndPoints
        self.BASE_URL = "https://api.iscpif.fr/v2"
        self.API_URL  = "pvt/economy/repec/search"

        # Final EndPoints
        # TODO: change endpoint according to the type of database
        self.URL = self.BASE_URL + "/" + self.API_URL

    def __format_query__(self, query=None):
        '''format the query'''
        if query is not None:
            self.query = query
        else:
            self.query = ""
        return self.query
    def _get(self, query, fromPage=1, count=10, lang=None):
        # Parameters
        querystring = { "q"       : query
                      , "count"   : count
                      , "from"    : fromPage
                      , "api_key" : API_TOKENS["MULTIVAC"]["APIKEY"]
                      }
        if lang is not None:
            querystring["lang"] = lang

        # Specify headers
        headers = { "cache-control" : "no-cache" }

        # Do the request and get the response
        response = requests.request( "GET"
                                   , self.URL
                                   , headers = headers
                                   , params  = querystring
                                   )

        # Validation: 200 if ok, else raise ValueError
        if response.status_code == 200:
            charset = response.headers["Content-Type"].split("; ")[1].split("=")[1]
            return json.loads(response.content.decode(charset))
        else:
            raise ValueError(response.status_code, response.reason)
    def scan_results(self, query):
        '''
        scan_results : returns the total number of results
        Query String -> Int
        '''
        self.results_nb = 0
        total = self._get(query)["results"]["total"]
        self.results_nb = total
        return self.results_nb
    def download(self, query):
        downloaded = False
        self.status.append("fetching results")
        corpus = []
        paging = 100
        self.query_max = self.results_nb
        if self.query_max > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (self.query_max, QUERY_SIZE_N_MAX)
            print("ERROR (scrap: multivac d/l ): ", msg)
            self.query_max = QUERY_SIZE_N_MAX

        with open(self.path, 'wb') as f:
            for page in range(0, self.query_max, paging):
                corpus.append(self._get(query, fromPage=page, count=paging)["hits"])
            f.write(str(corpus).encode("utf-8"))
            downloaded = True
        return downloaded
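A hedged usage sketch of the class above; `path` and `status` normally come from the Crawler base class and the moissonneurs view, so they are stubbed here:

    # Sketch only: scan first (fills results_nb), then download in pages of 100.
    crawler = MultivacCrawler()
    crawler.path = "/tmp/multivac_sample.json"   # hypothetical dump location
    crawler.status = []

    total = crawler.scan_results("economy")      # hits the repec/search endpoint
    if total > 0:
        crawler.download("economy")              # sample capped at QUERY_SIZE_N_MAX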
 # Scrapers config
 QUERY_SIZE_N_MAX = 1000

-from gargantext.constants import get_resource
+from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
 from gargantext.util.scheduling import scheduled
 from gargantext.util.db import session
 from requests_futures.sessions import FuturesSession
@@ -18,31 +18,34 @@ class Crawler:
         #the name of corpus
         #that will be built in case of internal fileparsing
         self.record = record
         self.name = record["corpus_name"]
         self.project_id = record["project_id"]
         self.user_id = record["user_id"]
         self.resource = record["source"]
         self.type = get_resource(self.resource)
         self.query = record["query"]
         #format the sampling
         self.n_last_years = 5
         self.YEAR = date.today().year
         #not pretty
         # but the easy version
         self.MONTH = str(date.today().month)
         if len(self.MONTH) == 1:
             self.MONTH = "0"+self.MONTH
-        self.MAX_RESULTS = 1000
+        self.MAX_RESULTS = QUERY_SIZE_N_MAX
         try:
             self.results_nb = int(record["count"])
         except KeyError:
             #does not exist yet
             self.results_nb = 0
         try:
             self.webEnv = record["webEnv"]
             self.queryKey = record["queryKey"]
             self.retMax = record["retMax"]
         except KeyError:
             #does not exist yet
             self.queryKey = None
...
@@ -67,6 +70,7 @@ class Crawler:
         if self.download():
             self.create_corpus()
             return self.corpus_id

     def get_sampling_dates():
         '''Create a sample list of min and max date based on Y and M
         for N_LAST_YEARS results'''
...
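The constructor above pulls everything from a `record` mapping supplied by the views; a sketch of the minimal shape, inferred from the keys read here (all values are placeholders):

    # Placeholder values; the keys are the ones __init__ reads above.
    record = {
        "corpus_name" : "my repec corpus",
        "project_id"  : 1,
        "user_id"     : 1,
        "source"      : 10,        # resource type id, resolved via get_resource
        "query"       : "economy",
        "count"       : 500,       # optional; results_nb falls back to 0
    }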
@@ -13,20 +13,21 @@ class ISTexParser(Parser):
         hyperdata_list = []
         hyperdata_path = {
             "id"               : "id",
-            "source"           : 'corpusName',
-            "title"            : 'title',
+            "source"           : "corpusName",
+            "title"            : "title",
             "genre"            : "genre",
-            "language_iso3"    : 'language',
-            "doi"              : 'doi',
-            "host"             : 'host',
-            "publication_date" : 'publicationDate',
-            "abstract"         : 'abstract',
+            "language_iso3"    : "language",
+            "doi"              : "doi",
+            "host"             : "host",
+            "publication_date" : "publicationDate",
+            "abstract"         : "abstract",
             # "authors" : 'author',
-            "authorsRAW"       : 'author',
+            "authorsRAW"       : "author",
             #"keywords" : "keywords"
         }
         suma = 0
         for json_doc in json_docs:
             hyperdata = {}
...
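For reference, the mapping table above is applied field by field to each ISTex JSON document; the `.get` traversal below is an assumption, since the loop body falls outside this hunk:

    # Assumed application of hyperdata_path to one document (table shortened).
    hyperdata_path = {"id": "id", "source": "corpusName", "title": "title"}
    json_doc = {"id": "X1", "corpusName": "istex", "title": "A title"}
    hyperdata = {}
    for key, path in hyperdata_path.items():
        value = json_doc.get(path)
        if value is not None:
            hyperdata[key] = value
    print(hyperdata)   # {'id': 'X1', 'source': 'istex', 'title': 'A title'}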
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** MULTIVAC Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence

RESOURCE_TYPE_MULTIVAC = 10

from traceback import print_tb
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_MULTIVAC)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)
            print(results)
            return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
def save(request, project_id):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)
        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_MULTIVAC)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()

        # do we have a valid project?
        project = session.query( Node ).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()
        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instantiation as a Django model
        corpus = Node(
            name = query,
            user_id = request.user.id,
            parent_id = project_id,
            typename = 'CORPUS',
            hyperdata = { "action"      : "Scraping data"
                        , "language_id" : "en"
                        }
        )

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource(
              type = source["type"]
            #, name = source["name"]
            , path = crawler_bot.path
        )

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        return render(
            template_name = 'pages/projects/wait.html',
            request = request,
            context = {
                'user'   : request.user,
                'project': project,
            },
        )

    data = [query, N]
    print(data)
    return JsonHttpResponse(data)
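Once the routes below are wired up, a client can exercise the two views like this; the host, session auth, and CSRF handling are assumptions:

    # Hypothetical client calls against the /moissonneurs/multivac/ routes.
    import requests

    BASE = "http://localhost:8000/moissonneurs/multivac"
    s = requests.Session()
    # ... authenticate and obtain a CSRF token first (omitted) ...

    r = s.post(BASE + "/query", data={"query": "economy"})
    print(r.json())                          # {"results_nb": <total>}

    s.post(BASE + "/save/1",                 # 1 = hypothetical project id
           data={"query": "economy", "N": 100})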
@@ -18,9 +18,10 @@
 from django.conf.urls import url

 import moissonneurs.pubmed as pubmed
 import moissonneurs.istex as istex
 import moissonneurs.cern as cern
+import moissonneurs.multivac as multivac

 # TODO
 #import moissonneurs.hal as hal
...
@@ -31,11 +32,15 @@ import moissonneurs.cern as cern
 # REST API for the moissonneurs
 # /!\ urls patterns here are *without* the trailing slash
 urlpatterns = [ url(r'^pubmed/query$'      , pubmed.query   )
               , url(r'^pubmed/save/(\d+)'  , pubmed.save    )
               , url(r'^istex/query$'       , istex.query    )
               , url(r'^istex/save/(\d+)'   , istex.save     )
-              , url(r'^cern/query$'        , cern.query     )
-              , url(r'^cern/save/(\d+)'    , cern.save      )
+              , url(r'^cern/query$'        , cern.query     )
+              , url(r'^cern/save/(\d+)'    , cern.save      )
+              , url(r'^multivac/query$'    , multivac.query )
+              , url(r'^multivac/save/(\d+)', multivac.save  )
               ]
@@ -209,9 +209,11 @@
 function CustomForSelect( selected ) {
     // show Radio-Inputs and trigger FileOrNotFile>@upload-file events
     selected = selected.toLowerCase()
     var is_pubmed = (selected.indexOf('pubmed') != -1);
-    var is_istex  = (selected.indexOf('istex') != -1);
-    if (is_pubmed || is_istex) {
+    var is_istex  = (selected.indexOf('istex' ) != -1);
+    var is_repec  = (selected.indexOf('repec' ) != -1);
+    if (is_pubmed || is_istex || is_repec) {
         // if(selected=="pubmed") {
         console.log("show the button for: " + selected)
         $("#pubmedcrawl").css("visibility", "visible");
...
@@ -545,7 +545,7 @@
     },
     error: function(result) {
-        $("#theresults").html("Pubmed connection error!</i><br>")
+        $("#theresults").html("Pubmed connection error.</i><br>")
         $('#submit_thing').prop('disabled', true);
     }
 });
@@ -643,6 +643,68 @@
         });
     }

+    //MULTIVAC = 10
+    if (SourceTypeId == "10") {
+        $.ajax({
+            // contentType: "application/json",
+            url: window.location.origin + "/moissonneurs/multivac/query",
+            data: formData,
+            type: 'POST',
+            beforeSend: function(xhr) {
+                xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
+            },
+            success: function(data) {
+                console.log("SUCCESS")
+                console.log("enabling " + "#" + value.id)
+                $("#" + value.id).attr('onclick', 'getGlobalResults(this);');
+                $("#submit_thing").prop('disabled', false)
+                //$("#submit_thing").html("Process a {{ query_size }} sample!")
+
+                N = data["results_nb"]
+                if (N > 0) {
+                    if (N <= {{query_size}}) {
+                        $("#theresults").html("<i> <b>" + pubmedquery + "</b>: " + N + " publications </i><br>")
+                        $("#submit_thing").html("Download!")
+                        $("#submit_thing").prop('disabled', false)
+                        //$("#submit_thing").attr('onclick', testCERN(query, N));
+                        $("#submit_thing").on("click", function() {
+                            testCERN(pubmedquery, N);
+                            //$("#submit_thing").onclick()
+                        })
+                    }
+                    // (N > {{query_size}})
+                    else {
+                        $("#theresults").html("<i> <b>" + pubmedquery + "</b>: " + N + " publications </i><br>")
+                        $('#submit_thing').prop('disabled', false);
+                        $("#submit_thing").html("Processing a sample file")
+                        $("#submit_thing").on("click", function() {
+                            testCERN(pubmedquery, N);
+                            //$("#submit_thing").onclick()
+                        })
+                    }
+                }
+                else {
+                    $("#theresults").html("<i> <b>" + pubmedquery + "</b>: No results!</i><br>")
+                    if (data[0] == false)
+                        $("#theresults").html(theType + " connection error!</i><br>")
+                    $('#submit_thing').prop('disabled', true);
+                }
+            },
+            error: function(result) {
+                $("#theresults").html(theType + " connection error!</i><br>")
+                $('#submit_thing').prop('disabled', true);
+            }
+        });
+    }
 }

 // CSS events for selecting one Radio-Input
...
@@ -689,7 +751,7 @@
     console.log("selected:", selectedId);
     // by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN
-    if(selectedId =="3" || selectedId == "8" || selectedId == "9") {
+    if(selectedId =="3" || selectedId == "8" || selectedId == "9" || selectedId == "10") {
         console.log("show the button for: " + selectedId)
         $("#div-fileornot").css("visibility", "visible");
         $("#div-fileornot").show();
...