Commit ba042fa0 authored by delanoe

[FEAT] Multivac/REPEC scan is OK; the parser still needs fixing.

parent 5a6c8acd
......@@ -248,13 +248,22 @@ RESOURCETYPES = [
'file_formats':["zip","xml"],
"crawler": "CernCrawler",
},
# { "type": 10,
# "name": 'REPEC [RIS]',
# "parser": "RISParser",
# "format": 'RIS',
# 'file_formats':["zip","ris", "txt"],
# "crawler": None,
# },
#
{ "type": 10,
"name": 'REPEC [RIS]',
"parser": "RISParser",
"format": 'RIS',
'file_formats':["zip","ris", "txt"],
"crawler": None,
"name": 'REPEC [MULTIVAC]',
"parser": "MultivacParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "MultivacCrawler",
},
]
#shortcut for resources declaration in template
PARSERS = [(n["type"],n["name"]) for n in RESOURCETYPES if n["parser"] is not None]
......
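For context: the PARSERS shortcut above keeps only the resources that declare a parser, and get_resource (imported from gargantext.constants in the hunks below) presumably resolves a numeric type to its declaration. A minimal sketch of that lookup, under that assumption:

# Sketch (assumption) of the lookup get_resource performs over the
# RESOURCETYPES table declared above.
def get_resource(resource_type):
    for resource in RESOURCETYPES:
        if resource["type"] == resource_type:
            return resource
    return None

source = get_resource(10)                    # the 'REPEC [MULTIVAC]' entry
has_crawler = source["crawler"] is not None  # True after this commit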
......@@ -4,7 +4,7 @@
# ***** CERN Scraper *****
# ****************************
# Author:c24b
# Date: 27/05/2016
import hmac, hashlib
import requests
import os
......@@ -96,10 +96,12 @@ class CernCrawler(Crawler):
print(self.results_nb, "res")
#self.generate_urls()
return(self.ids)
def generate_urls(self):
''' generate one raw export URL per record id'''
self.urls = ["http://repo.scoap3.org/record/%i/export/xm?ln=en" %rid for rid in self.ids]
return self.urls
def fetch_records(self, ids):
''' for NEXT time'''
raise NotImplementedError
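A quick usage sketch of generate_urls() above; the ids are hypothetical and would normally be collected by the scan step:

# Hypothetical usage (assumes the crawler is already instantiated by the
# calling view); self.ids is normally filled when scanning a query.
crawler.ids = [11178, 11179]
print(crawler.generate_urls())
# -> ['http://repo.scoap3.org/record/11178/export/xm?ln=en',
#     'http://repo.scoap3.org/record/11179/export/xm?ln=en']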
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** MULTIVAC Scraper ****
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Crawler import *
import json
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** MULTIVAC Scraper ****
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Crawler import *
import json
from gargantext.settings import API_TOKENS
class MultivacCrawler(Crawler):
''' Multivac API CLIENT'''
def __init__(self):
self.apikey = API_TOKENS["MULTIVAC"]
# Main EndPoints
self.BASE_URL = "https://api.iscpif.fr/v2"
self.API_URL = "pvt/economy/repec/search"
# Final EndPoints
# TODO: change the endpoint according to the type of database
self.URL = self.BASE_URL + "/" + self.API_URL
def __format_query__(self, query=None):
'''format the query'''
if query is not None:
self.query = query
return self.query
else:
self.query = ""
return self.query
def _get(self, query, fromPage=1, count=10, lang=None):
# Parameters
querystring = { "q" : query
, "count" : count
, "from" : fromPage
, "api_key" : API_TOKENS["MULTIVAC"]["APIKEY"]
}
if lang is not None:
querystring["lang"] = lang
# Specify Headers
headers = { "cache-control" : "no-cache" }
# Do Request and get response
response = requests.request( "GET"
, self.URL
, headers = headers
, params = querystring
)
# Validation: 200 if OK, else raise ValueError
if response.status_code == 200:
charset = response.headers["Content-Type"].split("; ")[1].split("=")[1]
return (json.loads(response.content.decode(charset)))
else:
raise ValueError(response.status_code, response.reason)
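The Content-Type parsing above assumes a header shaped exactly like 'application/json; charset=utf-8'. A more defensive variant (a sketch, not part of this commit) would tolerate a missing charset parameter:

# Defensive charset extraction (sketch): fall back to utf-8 when the
# header carries no explicit "charset=" parameter.
content_type = response.headers.get("Content-Type", "")
charset = "utf-8"
for part in content_type.split(";"):
    part = part.strip()
    if part.lower().startswith("charset="):
        charset = part.split("=", 1)[1]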
def scan_results(self, query):
'''
scan_results : Returns the number of results
Query String -> Int
'''
self.results_nb = 0
total = self._get(query)["results"]["total"]
self.results_nb = total
return self.results_nb
def download(self, query):
downloaded = False
self.status.append("fetching results")
corpus = []
paging = 100
self.query_max = self.results_nb
if self.query_max > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (self.query_max, QUERY_SIZE_N_MAX)
print("ERROR (scrap: multivac d/l ): ",msg)
self.query_max = QUERY_SIZE_N_MAX
with open(self.path, 'wb') as f:
for page in range(0, self.query_max, paging):
corpus.append(self._get(self.query, fromPage=page, count=paging)["hits"])
# NB: str(corpus) writes a Python repr, not strict JSON,
# hence the parser fix still needed per the commit message
f.write(str(corpus).encode("utf-8"))
downloaded = True
return downloaded
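For reference, the intended call order is scan_results() first (it sets results_nb, which download() uses as its upper bound), then download(). A minimal usage sketch, with hypothetical values for the attributes normally provided by the Crawler base class or the calling view:

# Usage sketch; status, path and query are assumptions here, normally
# set up by the Crawler base class / the saving view.
bot = MultivacCrawler()
bot.status = []
bot.path = "/tmp/multivac.json"
bot.query = "financial crisis"
if bot.scan_results(bot.query) > 0:   # sets bot.results_nb
    bot.download(bot.query)           # writes the hits to bot.path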
......
# Scrapers config
QUERY_SIZE_N_MAX = 1000
from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.util.scheduling import scheduled
from gargantext.util.db import session
from requests_futures.sessions import FuturesSession
......@@ -18,31 +18,34 @@ class Crawler:
#the name of corpus
#that will be built in case of internal fileparsing
self.record = record
self.name = record["corpus_name"]
self.project_id = record["project_id"]
self.user_id = record["user_id"]
self.resource = record["source"]
self.type = get_resource(self.resource)
self.query = record["query"]
#format the sampling
self.n_last_years = 5
self.YEAR = date.today().year
# not pretty, but the easy version
self.MONTH = str(date.today().month)
if len(self.MONTH) == 1:
self.MONTH = "0"+self.MONTH
self.MAX_RESULTS = QUERY_SIZE_N_MAX
try:
self.results_nb = int(record["count"])
except KeyError:
# does not exist yet
self.results_nb = 0
try:
self.webEnv = record["webEnv"]
self.webEnv = record["webEnv"]
self.queryKey = record["queryKey"]
self.retMax = record["retMax"]
self.retMax = record["retMax"]
except KeyError:
# does not exist yet
self.queryKey = None
......@@ -67,6 +70,7 @@ class Crawler:
if self.download():
self.create_corpus()
return self.corpus_id
def get_sampling_dates():
'''Create a sample list of min and max dates based on Y and M
for N_LAST_YEARS results'''
......
......@@ -13,20 +13,21 @@ class ISTexParser(Parser):
hyperdata_list = []
hyperdata_path = {
"id" : "id",
"source" : 'corpusName',
"title" : 'title',
"source" : "corpusName",
"title" : "title",
"genre" : "genre",
"language_iso3" : 'language',
"doi" : 'doi',
"host" : 'host',
"publication_date" : 'publicationDate',
"abstract" : 'abstract',
"language_iso3" : "language",
"doi" : "doi",
"host" : "host",
"publication_date" : "publicationDate",
"abstract" : "abstract",
# "authors" : 'author',
"authorsRAW" : 'author',
"authorsRAW" : "author",
#"keywords" : "keywords"
}
suma = 0
for json_doc in json_docs:
hyperdata = {}
......
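The hyperdata_path dict above maps hyperdata field names to keys of the ISTEX JSON documents. The remapping loop is truncated above; a sketch of its assumed shape:

# Assumed shape (sketch) of the remapping that consumes hyperdata_path:
# copy each ISTEX JSON key into the corresponding hyperdata field.
for json_doc in json_docs:
    hyperdata = {}
    for out_field, json_key in hyperdata_path.items():
        if json_key in json_doc:
            hyperdata[out_field] = json_doc[json_key]
    hyperdata_list.append(hyperdata)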
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** MULTIVAC Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence
RESOURCE_TYPE_MULTIVAC = 10
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from traceback import print_tb
def query(request):
'''return the global number of results for a query'''
if request.method == "POST":
query = request.POST["query"]
source = get_resource(RESOURCE_TYPE_MULTIVAC)
if source["crawler"] is not None:
crawlerbot = load_crawler(source)()
#old raw way to get results_nb
results = crawlerbot.scan_results(query)
#ids = crawlerbot.get_ids(query)
print(results)
return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
'''save the query results as a new corpus'''
if request.method == "POST":
query = request.POST.get("query")
try:
N = int(request.POST.get("N"))
except (TypeError, ValueError):
N = 0
print(query, N)
#for next time
#ids = request.POST["ids"]
source = get_resource(RESOURCE_TYPE_MULTIVAC)
if N == 0:
raise Http404()
if N > QUERY_SIZE_N_MAX:
N = QUERY_SIZE_N_MAX
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# do we have a valid project?
project = session.query( Node ).filter(Node.id == project_id).first()
if project is None:
raise Http404()
user = cache.User[request.user.id]
if not user.owns(project):
return HttpResponseForbidden()
# corpus node instantiation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : "en"
}
)
#download_file
crawler_bot = load_crawler(source)()
#for now no way to force downloading X records
#the long-running command: download() returns a boolean,
#the file itself lands at crawler_bot.path
downloaded = crawler_bot.download(query)
corpus.add_resource(
type = source["type"]
#, name = source["name"]
, path = crawler_bot.path
)
session.add(corpus)
session.commit()
#corpus_id = corpus.id
try:
scheduled(parse_extract_indexhyperdata)(corpus.id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
data = [query, N]
print(data)
return JsonHttpResponse(data)
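A hedged way to exercise the new query endpoint by hand; the host and CSRF values are placeholders, and a real call needs an authenticated Django session (the route itself comes from the urls.py hunk below):

# Hypothetical manual check of /moissonneurs/multivac/query.
import requests

resp = requests.post(
    "http://localhost:8000/moissonneurs/multivac/query",  # host is an assumption
    data    = {"query": "financial crisis"},
    headers = {"X-CSRFToken": "<token>"},                 # placeholder
    cookies = {"csrftoken": "<token>"},                   # placeholder
)
print(resp.json())  # -> {"results_nb": <int>}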
......@@ -18,9 +18,10 @@
from django.conf.urls import url
import moissonneurs.pubmed as pubmed
import moissonneurs.istex as istex
import moissonneurs.cern as cern
import moissonneurs.multivac as multivac
# TODO
#import moissonneurs.hal as hal
......@@ -31,11 +32,15 @@ import moissonneurs.cern as cern
# REST API for the moissonneurs
# /!\ urls patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
, url(r'^pubmed/save/(\d+)' , pubmed.save )
, url(r'^istex/query$' , istex.query )
, url(r'^istex/save/(\d+)' , istex.save )
, url(r'^cern/query$' , cern.query )
, url(r'^cern/save/(\d+)' , cern.save )
, url(r'^multivac/query$' , multivac.query )
, url(r'^multivac/save/(\d+)' , multivac.save )
]
......@@ -209,9 +209,11 @@
function CustomForSelect( selected ) {
// show Radio-Inputs and trigger FileOrNotFile>@upload-file events
selected = selected.toLowerCase()
var is_pubmed = (selected.indexOf('pubmed') != -1);
var is_istex = (selected.indexOf('istex' ) != -1);
var is_repec = (selected.indexOf('repec' ) != -1);
if (is_pubmed || is_istex || is_repec) {
// if(selected=="pubmed") {
console.log("show the button for: " + selected)
$("#pubmedcrawl").css("visibility", "visible");
......
......@@ -545,7 +545,7 @@
},
error: function(result) {
$("#theresults").html("Pubmed connection error!</i><br>")
$("#theresults").html("Pubmed connection error.</i><br>")
$('#submit_thing').prop('disabled', true);
}
});
......@@ -643,6 +643,68 @@
});
}
//MULTIVAC = 10
if (SourceTypeId == "10"){
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/moissonneurs/multivac/query",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("SUCCESS")
console.log("enabling "+"#"+value.id)
$("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
N = data["results_nb"]
if(N > 0) {
if (N <= {{query_size}}){
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
$("#submit_thing").html("Download!")
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").attr('onclick', testCERN(query, N));
$("#submit_thing").on("click", function(){
testCERN(pubmedquery, N);
//$("#submit_thing").onclick()
})}
//(N > {{query_size}})
else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
$('#submit_thing').prop('disabled', false);
$("#submit_thing").html("Processing a sample file")
$("#submit_thing").on("click", function(){
testCERN(pubmedquery, N);
//$("#submit_thing").onclick()
})}
}
else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
if(data[0]==false)
$("#theresults").html(theType +" connection error!</i><br>")
$('#submit_thing').prop('disabled', true);
}
},
error: function(result) {
$("#theresults").html(theType +" connection error!</i><br>")
$('#submit_thing').prop('disabled', true);
}
});
}
}
// CSS events for selecting one Radio-Input
......@@ -689,7 +751,7 @@
console.log("selected:", selectedId);
// by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN, 10 = MULTIVAC
if(selectedId =="3" || selectedId == "8" || selectedId == "9" || selectedId == "10") {
console.log("show the button for: " + selectedId)
$("#div-fileornot").css("visibility", "visible");
$("#div-fileornot").show();
......