Commit 5de00550 authored by delanoe

[FEAT] HAL Parser almost ok (some duplicates, check pages).

parent a85e4c98
@@ -181,8 +181,6 @@ def get_tagger(lang):
    return tagger()

RESOURCETYPES = [
    { "type": 1,
      'name': 'Europresse',
@@ -264,6 +262,14 @@ RESOURCETYPES = [
      "crawler": "MultivacCrawler",
    },
{ "type": 11,
"name": 'HAL [CRAWLER]',
"parser": "HalParser",
"format": 'JSON',
'file_formats':["zip","json"],
"crawler": "HalCrawler",
},
]

#shortcut for resources declaration in template
PARSERS = [(n["type"], n["name"]) for n in RESOURCETYPES if n["parser"] is not None]
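For orientation, a hedged sketch of how such a declaration is resolved at runtime; get_resource and load_crawler are the helpers used in moissonneurs/hal.py further down, and the call shapes are inferred from that file.

# Sketch: resolving the HAL entry declared above (calls as used in moissonneurs/hal.py)
from gargantext.constants import get_resource, load_crawler

source = get_resource(11)              # -> the {"type": 11, ...} dict above
crawler_bot = load_crawler(source)()   # -> instantiates HalCrawler by its "crawler" name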
......
@@ -28,19 +28,20 @@ import graph.urls
import moissonneurs.urls

urlpatterns = [ url(r'^admin/' , admin.site.urls )
              , url(r'^api/' , include( gargantext.views.api.urls ) )
              , url(r'^' , include( gargantext.views.pages.urls ) )
              , url(r'^favicon.ico$', Redirect.as_view( url=static.url('favicon.ico')
                                                      , permanent=False), name="favicon" )

              # Module Graph
              , url(r'^' , include( graph.urls ) )

              # Module Annotation
              # tempo: unchanged doc-annotations routes --
              , url(r'^annotations/', include( annotations_urls ) )
              , url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/(focus=[0-9,]+)?$'
                  , annotations_main_view)

              # Module Scrapers (Moissonneurs in French)
              , url(r'^moissonneurs/' , include( moissonneurs.urls ) )
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ****   HAL Scraper      ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG

from ._Crawler import *
import json
import requests

from gargantext.constants import UPLOAD_DIRECTORY, QUERY_SIZE_N_MAX
from math import trunc
from gargantext.util.files import save
class HalCrawler(Crawler):
    ''' HAL API CLIENT'''

    def __init__(self):
        # Main EndPoints
        self.BASE_URL = "https://api.archives-ouvertes.fr"
        self.API_URL  = "search"

        # Final EndPoints
        # TODO : Change endpoint according to the type of database
        self.URL = self.BASE_URL + "/" + self.API_URL
        self.status = []

    def __format_query__(self, query=None):
        '''formatting the query'''
        #search_field="title_t"
        search_field = "abstract_t"
        return (search_field + ":" + "(" + query + ")")
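    # Example (hedged): __format_query__("climate") yields 'abstract_t:(climate)',
    # i.e. a Solr field query restricted to HAL's abstract field.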
    def _get(self, query, fromPage=0, count=10, lang=None):
        # Parameters
        fl = """ title_s
               , abstract_s
               , submittedDate_s
               , journalDate_s
               , authFullName_s
               , uri_s
               , isbn_s
               , issue_s
               , journalPublisher_s
             """
             #, authUrl_s
             #, type_s

        wt = "json"

        # NB: HAL's "start" parameter is a record offset, not a page number
        # (passing page numbers here is what produced the duplicates noted
        # in the commit message).
        querystring = { "q"     : query
                      , "rows"  : count
                      , "start" : fromPage
                      , "fl"    : fl
                      , "wt"    : wt
                      }

        # Specify Headers
        headers = { "cache-control" : "no-cache" }

        # Do Request and get response
        response = requests.request( "GET"
                                   , self.URL
                                   , headers = headers
                                   , params  = querystring
                                   )
        #print(querystring)

        # Validation : 200 if ok else raise Value
        if response.status_code == 200:
            charset = ( response.headers["Content-Type"]
                      .split("; ")[1]
                      .split("=" )[1]
                      )
            return (json.loads(response.content.decode(charset)))
        else:
            raise ValueError(response.status_code, response.reason)
    def scan_results(self, query):
        '''
        scan_results : Returns the number of results
        Query String -> Int
        '''
        self.results_nb = 0

        total = ( self._get(query)
                .get("response", {})
                .get("numFound" , 0)
                )

        self.results_nb = total
        return self.results_nb
    def download(self, query):
        downloaded = False

        self.status.append("fetching results")
        corpus = []
        paging = 100
        self.query_max = self.scan_results(query)
        #print("self.query_max : %s" % self.query_max)

        if self.query_max > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
                                                            , QUERY_SIZE_N_MAX
                                                            )
            print("ERROR (scrap: HAL d/l ): " , msg)
            self.query_max = QUERY_SIZE_N_MAX

        # Walk the result set by record offset, `paging` records at a time
        # (see the offset note in _get above).
        for offset in range(0, self.query_max, paging):
            print("Downloading results %i to %i" % (offset, offset + paging))

            docs = (self._get(query, fromPage=offset, count=paging)
                   .get("response", {})
                   .get("docs"    , [])
                   )

            for doc in docs:
                corpus.append(doc)

        self.path = save( json.dumps(corpus).encode("utf-8")
                        , name='HAL.json'
                        , basedir=UPLOAD_DIRECTORY
                        )
        downloaded = True

        return downloaded
@@ -8,9 +8,9 @@
from ._Crawler import *
import json

from gargantext.settings import API_TOKENS
from gargantext.constants import UPLOAD_DIRECTORY
from math import trunc
from gargantext.util.files import save

class MultivacCrawler(Crawler):
@@ -30,14 +30,7 @@ class MultivacCrawler(Crawler):
    def __format_query__(self, query=None):
        '''formatting the query'''
        if query is not None:
            self.query = query
            return self.query
        else:
            self.query = ""
            return self.query
    def _get(self, query, fromPage=1, count=10, lang=None):
        # Parameters
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** HAL Parser ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
from datetime import datetime
import json
class HalParser(Parser):

    def parse(self, filebuf):
        '''
        parse :: FileBuff -> [Hyperdata]
        '''
        contents = filebuf.read().decode("UTF-8")
        data = json.loads(contents)
        filebuf.close()

        json_docs = data
        hyperdata_list = []

        hyperdata_path = { "id"       : "isbn_s"
                         , "title"    : "title_s"
                         , "abstract" : "abstract_s"
                         , "source"   : "journalPublisher_s"
                         , "url"      : "uri_s"
                         , "authors"  : "authFullName_s"
                         }

        for doc in json_docs:
            hyperdata = {}

            for key, path in hyperdata_path.items():
                field = doc.get(path, "NOT FOUND")
                if isinstance(field, list):
                    hyperdata[key] = ", ".join(field)
                else:
                    hyperdata[key] = field

            # hyperdata["authors"] = ", ".join(
            #     [ p.get("person", {})
            #        .get("name"  , "")
            #       for p in doc.get("hasauthor", [])
            #     ]
            # )

            maybeDate = doc.get("submittedDate_s", None)
            if maybeDate is not None:
                date = datetime.strptime(maybeDate, "%Y-%m-%d %H:%M:%S")
            else:
                date = datetime.now()

            hyperdata["publication_date"]  = date
            hyperdata["publication_year"]  = str(date.year)
            hyperdata["publication_month"] = str(date.month)
            hyperdata["publication_day"]   = str(date.day)

            hyperdata_list.append(hyperdata)

        return hyperdata_list
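A minimal sketch of the mapping above, run on an invented one-record HAL payload (field names follow hyperdata_path; the values are made up):

# Hedged sketch: one invented HAL record fed through HalParser.parse.
import io, json
sample = [ { "title_s"         : ["A title"]
           , "abstract_s"      : ["An abstract"]
           , "authFullName_s"  : ["A. Author", "B. Author"]
           , "uri_s"           : "https://hal.example/hal-00000000"
           , "submittedDate_s" : "2017-03-09 10:00:00"
           } ]
docs = HalParser().parse(io.BytesIO(json.dumps(sample).encode("UTF-8")))
assert docs[0]["authors"] == "A. Author, B. Author"   # lists are comma-joined
assert docs[0]["source"]  == "NOT FOUND"              # default for missing fields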
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** MULTIVAC Parser ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
from datetime import datetime
import json
@@ -16,14 +24,11 @@ class MultivacParser(Parser):
        json_docs = data
        hyperdata_list = []
        hyperdata_path = { "id"       : "id"
                         , "title"    : "title"
                         , "abstract" : "abstract"
                         , "type"     : "type"
                         }
        for json_doc in json_docs:
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# *****   HAL Crawler   *****
# ****************************
# LICENCE: GARGANTEXT.org Licence

from traceback import print_tb

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect \
                      , HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata

RESOURCE_TYPE_HAL = 11
def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query  = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_HAL)

        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()

            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)

            print(results)
            return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
def save(request, project_id):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except (TypeError, ValueError):
            N = 0
        print(query, N)

        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_HAL)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()

        # do we have a valid project?
        project = session.query( Node ).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()

        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instantiation as a Django model
        corpus = Node(
            name      = query,
            user_id   = request.user.id,
            parent_id = project_id,
            typename  = 'CORPUS',
            hyperdata = { "action" : "Scraping data"
                        }
        )

        #download_file
        crawler_bot = load_crawler(source)()

        #for now no way to force downloading X records
        #the long running command
        downloaded = crawler_bot.download(query)

        corpus.add_resource(
              type = source["type"]
            #, name = source["name"]
            , path = crawler_bot.path
            )

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        return render(
            template_name = 'pages/projects/wait.html',
            request = request,
            context = {
                'user'   : request.user,
                'project': project,
            },
        )

    # Fallback for non-POST requests
    raise Http404()
@@ -22,25 +22,27 @@ import moissonneurs.pubmed as pubmed
import moissonneurs.istex as istex
import moissonneurs.cern as cern
import moissonneurs.multivac as multivac
import moissonneurs.hal as hal

# TODO : ISIDORE

# REST API for the moissonneurs
# /!\ urls patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$'       , pubmed.query   )
              , url(r'^pubmed/save/(\d+)'   , pubmed.save    )
              , url(r'^istex/query$'        , istex.query    )
              , url(r'^istex/save/(\d+)'    , istex.save     )
              , url(r'^cern/query$'         , cern.query     )
              , url(r'^cern/save/(\d+)'     , cern.save      )
              , url(r'^multivac/query$'     , multivac.query )
              , url(r'^multivac/save/(\d+)' , multivac.save  )
              , url(r'^hal/query$'          , hal.query      )
              , url(r'^hal/save/(\d+)'      , hal.save       )

              #, url(r'^isidore/query$'     , isidore.query  )
              #, url(r'^isidore/save/(\d+)' , isidore.save   )
              ]
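For reference, a short client-side summary of the two new HAL routes (payloads and responses as read from moissonneurs/hal.py above):

# POST /moissonneurs/hal/query              data: {"query": ...}
#   -> JSON {"results_nb": <int>}           (hal.query)
# POST /moissonneurs/hal/save/<project_id>  data: {"query": ..., "N": <int>}
#   -> downloads up to N records, attaches them to a new CORPUS node,
#      then returns the "wait" page (hal.save)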
@@ -325,11 +325,13 @@
<h2 class="modal-title"><span class="glyphicon glyphicon-info-sign" aria-hidden="true"></span> Building the corpus...</h2>
</div>

<div class="modal-body">
<center>
<p>
Gargantext is gathering your texts <br>
and needs some time to eat them. <br>
Duration depends on the size of the dish.
</p>
</center>
</div>

<div class="modal-footer">
<button type="button" class="btn btn-secondary" data-dismiss="modal">Continue on Gargantext</button>
@@ -440,9 +442,9 @@
var type = $("#id_type").val()

// 5 booleans
var nameField = $("#id_name").val() != ""
var typeField = (type != "") && (type != "0")
var fileField = $("#id_file").val() != ""
var wantfileField = $("#file_yes").prop("checked")
var crawling = ((type==3)||(type==8)||(type==9)) && ! wantfileField
@@ -705,6 +707,67 @@
}
//HAL = 11
if (SourceTypeId == "11"){
    $.ajax({
        // contentType: "application/json",
        url: window.location.origin+"/moissonneurs/hal/query",
        data: formData,
        type: 'POST',
        beforeSend: function(xhr) {
            xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
        },
        success: function(data) {
            console.log(data)
            console.log("SUCCESS")
            console.log("enabling "+"#"+value.id)
            // $("#"+value.id).attr('onclick','getGlobalResults(this);');
            $("#submit_thing").prop('disabled' , false)
            //$("#submit_thing").html("Process a {{ query_size }} sample!")

            N = data["results_nb"]
            if (N > 0) {
                if (N <= {{query_size}}) {
                    $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
                    $("#submit_thing").html("Download!")
                    $("#submit_thing").prop('disabled' , false)
                    //$("#submit_thing").attr('onclick', testCERN(query, N));
                    $("#submit_thing").on("click", function(){
                        saveALL(pubmedquery, N);
                        //$("#submit_thing").onclick()
                    })
                }
                // (N > {{query_size}})
                else {
                    $("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
                    $('#submit_thing').prop('disabled', false);
                    $("#submit_thing").html("Processing a sample file")
                    $("#submit_thing").on("click", function(){
                        saveALL(pubmedquery, N);
                        //$("#submit_thing").onclick()
                    })
                }
            }
            else {
                $("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!</i><br>")
                if (data[0] == false)
                    $("#theresults").html(theType + " connection error!</i><br>")
                $('#submit_thing').prop('disabled', true);
            }
        },
        error: function(result) {
            $("#theresults").html(theType + " connection error</i><br>")
            $('#submit_thing').prop('disabled', true);
        }
    });
}
}

// CSS events for selecting one Radio-Input
@@ -751,7 +814,12 @@
console.log("selected:", selectedId); console.log("selected:", selectedId);
// by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN // by typeID: 3 = PUBMED, 8 = ISTEX, 9 = CERN
if(selectedId =="3" || selectedId == "8" || selectedId == "9" || selectedId == "10") { if ( selectedId == "3"
|| selectedId == "8"
|| selectedId == "9"
|| selectedId == "10"
|| selectedId == "11"
) {
console.log("show the button for: " + selectedId) console.log("show the button for: " + selectedId)
$("#div-fileornot").css("visibility", "visible"); $("#div-fileornot").css("visibility", "visible");
$("#div-fileornot").show(); $("#div-fileornot").show();
...@@ -933,6 +1001,55 @@ ...@@ -933,6 +1001,55 @@
}); });
} }
function saveALL(query, N){
    console.log("In Gargantext")
    if (!query || query == "") return;
    console.log(query)

    //var origQuery = query
    var data = { "query": query, "N": N };

    // Replace all the slashes
    var projectid = window.location.href.split("projects")[1].replace(/\//g, '')
    console.log(data)

    $.ajax({
        dataType: 'json',
        url: window.location.origin+"/moissonneurs/hal/save/"+projectid,
        data: data,
        type: 'POST',
        beforeSend: function(xhr) {
            xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
        },
        success: function(data) {
            console.log("ajax_success: in Gargantext()")
            console.log(data)
            alert("OK")
            setTimeout(
                function() {
                    $('#addcorpus').modal('hide')
                    $("#wait").modal("show");
                }, 600);
        },
        error: function(data) {
            console.log(data)
            setTimeout(
                function() {
                    $('#addcorpus').modal('hide')
                    $("#wait").modal("show")
                    //setTimeout(, 300)
                    //location.reload();
                }, 600);
        },
    });
}
......