Commit 5de00550 authored by delanoe

[FEAT] HAL Parser almost ok (some duplicates, check pages).

parent a85e4c98
......@@ -181,8 +181,6 @@ def get_tagger(lang):
return tagger()
RESOURCETYPES = [
{ "type": 1,
'name': 'Europresse',
......@@ -264,6 +262,14 @@ RESOURCETYPES = [
"crawler": "MultivacCrawler",
},
{ "type": 11,
"name": 'HAL [CRAWLER]',
"parser": "HalParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "HalCrawler",
},
]
# shortcut for resource declarations in templates
PARSERS = [(n["type"],n["name"]) for n in RESOURCETYPES if n["parser"] is not None]
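# Illustration (hypothetical evaluation, not part of this commit): PARSERS
# yields a (type, name) pair for every declared resource that has a parser, e.g.
#
#   >>> PARSERS
#   [(1, 'Europresse'), ..., (11, 'HAL [CRAWLER]')]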
......
......@@ -28,19 +28,20 @@ import graph.urls
import moissonneurs.urls
urlpatterns = [ url(r'^admin/' , admin.site.urls )
              , url(r'^api/'   , include( gargantext.views.api.urls ) )
              , url(r'^'       , include( gargantext.views.pages.urls ) )
              , url(r'^favicon.ico$', Redirect.as_view( url=static.url('favicon.ico')
                                                      , permanent=False), name="favicon" )
              # Module Graph
              , url(r'^'       , include( graph.urls ) )
              # Module Annotation
              # tempo: unchanged doc-annotations routes --
              , url(r'^annotations/', include( annotations_urls ) )
              , url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/(focus=[0-9,]+)?$'
                  , annotations_main_view)
# Module Scrapers (Moissonneurs in French)
, url(r'^moissonneurs/' , include( moissonneurs.urls ) )
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ****    HAL Crawler     ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Crawler import *
import json
from gargantext.constants import UPLOAD_DIRECTORY, QUERY_SIZE_N_MAX
from math import trunc
from gargantext.util.files import save
class HalCrawler(Crawler):
    ''' HAL API CLIENT'''

    def __init__(self):
        # Main EndPoints
        self.BASE_URL = "https://api.archives-ouvertes.fr"
        self.API_URL  = "search"

        # Final EndPoints
        # TODO : Change endpoint according to the type of database
        self.URL    = self.BASE_URL + "/" + self.API_URL
        self.status = []

    def __format_query__(self, query=None):
        '''Formats the query as a Solr field query'''
        #search_field="title_t"
        search_field = "abstract_t"
        return (search_field + ":" + "(" + query + ")")
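        # Illustration (hypothetical query, not part of this commit):
        #   __format_query__("graph theory")  ==  'abstract_t:(graph theory)'
        # i.e. a Solr field query restricted to HAL abstracts.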
    def _get(self, query, fromPage=0, count=10, lang=None):
        # Parameters
        # NB: `start` is a 0-based record offset in the underlying Solr API,
        #     not a page number
        fl = """ title_s
               , abstract_s
               , submittedDate_s
               , journalDate_s
               , authFullName_s
               , uri_s
               , isbn_s
               , issue_s
               , journalPublisher_s
             """
        #, authUrl_s
        #, type_s

        wt = "json"

        querystring = { "q"     : query
                      , "rows"  : count
                      , "start" : fromPage
                      , "fl"    : fl
                      , "wt"    : wt
                      }

        # Specify Headers
        headers = { "cache-control" : "no-cache" }

        # Do Request and get response
        response = requests.request( "GET"
                                   , self.URL
                                   , headers = headers
                                   , params  = querystring
                                   )

        # Validation : 200 if ok else raise ValueError
        if response.status_code == 200:
            charset = ( response.headers["Content-Type"]
                        .split("; ")[1]
                        .split("=" )[1]
                      )
            return (json.loads(response.content.decode(charset)))
        else:
            raise ValueError(response.status_code, response.reason)
    def scan_results(self, query):
        '''
        scan_results : Returns the number of results
        Query String -> Int
        '''
        self.results_nb = 0

        total = ( self._get(query)
                  .get("response", {})
                  .get("numFound" , 0)
                )

        self.results_nb = total
        return self.results_nb
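        # e.g. (hypothetical count, not part of this commit):
        #   scan_results('abstract_t:(graph theory)')  ->  5432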
    def download(self, query):
        downloaded = False

        self.status.append("fetching results")
        corpus = []
        paging = 100
        self.query_max = self.scan_results(query)

        if self.query_max > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
                                                            , QUERY_SIZE_N_MAX
                                                            )
            print("ERROR (scrap: HAL d/l ): " , msg)
            self.query_max = QUERY_SIZE_N_MAX

        # Pass a record offset (page * paging) to the API, not the raw page
        # number: passing the page number shifted the window by one record per
        # request and produced the duplicates noted in the commit message.
        for page in range(0, trunc(self.query_max / paging) + 1):
            print("Downloading records %s to %s" % ( page * paging
                                                   , min((page + 1) * paging, self.query_max)
                                                   ))

            docs = (self._get(query, fromPage=page * paging, count=paging)
                    .get("response", {})
                    .get("docs"    , [])
                   )

            for doc in docs:
                corpus.append(doc)

        self.path = save( json.dumps(corpus).encode("utf-8")
                        , name='HAL.json'
                        , basedir=UPLOAD_DIRECTORY
                        )
        downloaded = True

        return downloaded
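# Usage sketch (hypothetical, not part of this commit):
#
#   crawler = HalCrawler()
#   query   = crawler.__format_query__("network science")
#   total   = crawler.scan_results(query)    # number of matching records
#   if crawler.download(query):              # writes HAL.json under UPLOAD_DIRECTORY
#       print(crawler.path)                  # path of the saved corpus file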
......@@ -8,9 +8,9 @@
from ._Crawler import *
import json
from gargantext.settings import API_TOKENS
from gargantext.constants import UPLOAD_DIRECTORY
from math import trunc
from gargantext.util.files import save
class MultivacCrawler(Crawler):
......@@ -30,14 +30,7 @@ class MultivacCrawler(Crawler):
    def __format_query__(self, query=None):
        '''Formats the query'''
        if query is not None:
            self.query = query
        else:
            self.query = ""
        return self.query
def _get(self, query, fromPage=1, count=10, lang=None):
# Parameters
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** HAL Parser ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
from datetime import datetime
import json
class HalParser(Parser):

    def parse(self, filebuf):
        '''
        parse :: FileBuff -> [Hyperdata]
        '''
        contents = filebuf.read().decode("UTF-8")
        data = json.loads(contents)
        filebuf.close()

        json_docs = data
        hyperdata_list = []

        hyperdata_path = { "id"       : "isbn_s"
                         , "title"    : "title_s"
                         , "abstract" : "abstract_s"
                         , "source"   : "journalPublisher_s"
                         , "url"      : "uri_s"
                         , "authors"  : "authFullName_s"
                         }

        for doc in json_docs:
            hyperdata = {}

            for key, path in hyperdata_path.items():
                field = doc.get(path, "NOT FOUND")
                if isinstance(field, list):
                    hyperdata[key] = ", ".join(field)
                else:
                    hyperdata[key] = field

            maybeDate = doc.get("submittedDate_s", None)
            if maybeDate is not None:
                date = datetime.strptime(maybeDate, "%Y-%m-%d %H:%M:%S")
            else:
                date = datetime.now()

            hyperdata["publication_date"]  = date
            hyperdata["publication_year"]  = str(date.year)
            hyperdata["publication_month"] = str(date.month)
            hyperdata["publication_day"]   = str(date.day)

            hyperdata_list.append(hyperdata)

        return hyperdata_list
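# Illustration (hypothetical HAL record, not part of this commit): a doc like
#
#   { "title_s"            : ["On Graphs"]
#   , "abstract_s"         : ["We study ..."]
#   , "authFullName_s"     : ["A. Author", "B. Author"]
#   , "uri_s"              : "https://hal.archives-ouvertes.fr/hal-00000000"
#   , "journalPublisher_s" : "Some Publisher"
#   , "submittedDate_s"    : "2016-11-02 10:20:30"
#   }
#
# yields a flat hyperdata dict: list fields are joined with ", ", missing
# fields default to "NOT FOUND", and the submission date is split into
# publication_date / _year / _month / _day.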
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** MULTIVAC Parser ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
from datetime import datetime
import json
......@@ -16,14 +24,11 @@ class MultivacParser(Parser):
json_docs = data
hyperdata_list = []
        hyperdata_path = { "id"       : "id"
                         , "title"    : "title"
                         , "abstract" : "abstract"
                         , "type"     : "type"
                         }
for json_doc in json_docs:
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** HAL Crawler *****
# ****************************
# LICENCE: GARGANTEXT.org Licence
from traceback import print_tb

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata

RESOURCE_TYPE_HAL = 11
def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_HAL)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)
            print(results)
            return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
def save(request, project_id):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except (TypeError, ValueError):
            N = 0
        print(query, N)

        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_HAL)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()

        # do we have a valid project?
        project = session.query( Node ).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()

        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instantiation as a Django model
        corpus = Node( name      = query
                     , user_id   = request.user.id
                     , parent_id = project_id
                     , typename  = 'CORPUS'
                     , hyperdata = { "action" : "Scraping data" }
                     )
        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        # download() returns a boolean; the saved file path is kept on
        # crawler_bot.path
        downloaded = crawler_bot.download(query)

        corpus.add_resource( type = source["type"]
                           #, name = source["name"]
                           , path = crawler_bot.path
                           )

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        return render( template_name = 'pages/projects/wait.html'
                     , request       = request
                     , context       = { 'user'    : request.user
                                       , 'project' : project
                                       }
                     )

    # non-POST requests carry nothing to save; the previous fall-through
    # referenced undefined variables (query_string, query, N)
    raise Http404()
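# Request sketch (hypothetical values, assuming an authenticated session and a
# CSRF token):
#
#   POST /moissonneurs/hal/query
#        form data: {"query": "network science"}
#        -> {"results_nb": <int>}
#
#   POST /moissonneurs/hal/save/<project_id>
#        form data: {"query": "network science", "N": 100}
#        -> renders pages/projects/wait.html while indexing is scheduled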
......@@ -22,25 +22,27 @@ import moissonneurs.pubmed as pubmed
import moissonneurs.istex as istex
import moissonneurs.cern as cern
import moissonneurs.multivac as multivac
import moissonneurs.hal as hal
# TODO
#import moissonneurs.revuesOrg as revuesOrg
# TODO ?
# REST API for the moissonneurs
# TODO : ISIDORE
# /!\ urls patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$'       , pubmed.query   )
              , url(r'^pubmed/save/(\d+)'   , pubmed.save    )

              , url(r'^istex/query$'        , istex.query    )
              , url(r'^istex/save/(\d+)'    , istex.save     )

              , url(r'^cern/query$'         , cern.query     )
              , url(r'^cern/save/(\d+)'     , cern.save      )

              , url(r'^multivac/query$'     , multivac.query )
              , url(r'^multivac/save/(\d+)' , multivac.save  )

              , url(r'^hal/query$'          , hal.query      )
              , url(r'^hal/save/(\d+)'      , hal.save       )

              #, url(r'^isidore/query$'     , isidore.query  )
              #, url(r'^isidore/save/(\d+)' , isidore.save   )
              ]
......@@ -325,11 +325,13 @@
<h2 class="modal-title"><h2><span class="glyphicon glyphicon-info-sign" aria-hidden="true"></span> Building the corpus...</h2>
</div>
<div class="modal-body">
<center>
  <p>
    Gargantext is gathering your texts <br>
    and needs some time to eat them. <br>
    Duration depends on the size of the dish.
  </p>
</center>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-secondary" data-dismiss="modal">Continue on Gargantext</button>
......@@ -440,9 +442,9 @@
var type = $("#id_type").val()
// 5 booleans
var nameField = $("#id_name").val()!=""
var typeField = (type!="") && (type!="0")
var fileField = $("#id_file").val()!=""
var nameField = $("#id_name").val() != ""
var typeField = (type != "") && (type != "0")
var fileField = $("#id_file").val() != ""
var wantfileField = $("#file_yes").prop("checked")
var crawling = ((type==3)||(type==8)||(type==9)) && ! wantfileField
......@@ -705,6 +707,67 @@
}
//HAL = 11
if (SourceTypeId == "11"){
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/moissonneurs/hal/query",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log(data)
console.log("SUCCESS")
console.log("enabling "+"#"+value.id)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
N = data["results_nb"]
if(N > 0) {
if (N <= {{query_size}}){
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
$("#submit_thing").html("Download!")
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").attr('onclick', testCERN(query, N));
$("#submit_thing").on("click", function(){
saveALL(pubmedquery, N);
//$("#submit_thing").onclick()
})}
//(N > {{query_size}})
else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
$('#submit_thing').prop('disabled', false);
$("#submit_thing").html("Processing a sample file")
$("#submit_thing").on("click", function(){
saveALL(pubmedquery, N);
//$("#submit_thing").onclick()
})}
}
else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
if(data[0]==false)
$("#theresults").html(theType +" connection error!</i><br>")
$('#submit_thing').prop('disabled', true);
}
},
error: function(result) {
$("#theresults").html(theType +" connection error</i><br>")
$('#submit_thing').prop('disabled', true);
}
});
}
}
// CSS events for selecting one Radio-Input
......@@ -751,7 +814,12 @@
console.log("selected:", selectedId);
// by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN, 10 = MULTIVAC, 11 = HAL
if (   selectedId == "3"
    || selectedId == "8"
    || selectedId == "9"
    || selectedId == "10"
    || selectedId == "11"
   ) {
console.log("show the button for: " + selectedId)
$("#div-fileornot").css("visibility", "visible");
$("#div-fileornot").show();
......@@ -933,6 +1001,55 @@
});
}
function saveALL(query, N){
console.log("In Gargantext")
if(!query || query=="") return;
console.log(query)
//var origQuery = query
var data = { "query" : query , "N": N };
// Replace all the slashes
var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
console.log(data)
$.ajax({
dataType: 'json',
url: window.location.origin+"/moissonneurs/hal/save/"+projectid,
data: data,
type: 'POST',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("ajax_success: in Gargantext()")
console.log(data)
alert("OK")
setTimeout(
function() {
$('#addcorpus').modal('hide')
$("#wait").modal("show");
}, 600);
},
error: function(data) {
console.log(data)
setTimeout(
function() {
$('#addcorpus').modal('hide')
$("#wait").modal("show")
//setTimeout(, 300)
//location.reload();
}, 600);
},
});
}
......