Commit 89ddcc0f authored by delanoe

[FEAT] ISIDORE OK for simple boolean requests.

parent cef6161f
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** ISIDORE Crawler *****
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Crawler import *
import json

from gargantext.constants import UPLOAD_DIRECTORY, QUERY_SIZE_N_MAX
from math import trunc
from gargantext.util.files import save
from gargantext.util.crawlers.sparql.bool2sparql import bool2sparql, isidore
class IsidoreCrawler(Crawler):
    ''' ISIDORE SPARQL API CLIENT'''

    def __init__(self):
        # Main EndPoints
        self.BASE_URL = "https://www.rechercheisidore.fr"
        self.API_URL  = "sparql"

        # Final EndPoints
        # TODO : change the endpoint according to the type of database
        self.URL    = self.BASE_URL + "/" + self.API_URL
        self.status = []
    def __format_query__(self, query=None, count=False, offset=None, limit=None):
        '''formatting the query'''
        return bool2sparql(query, count=count, offset=offset, limit=limit)

    def _get(self, query, offset=0, limit=100, lang=None):
        '''Parameters to download data'''
        return isidore(query, count=False, offset=offset, limit=limit)
    def scan_results(self, query):
        '''
        scan_results : Returns the number of results
        Query String -> Int
        '''
        self.results_nb = [n for n in isidore(query, count=True)][0]
        return self.results_nb
    def download(self, query):
        downloaded = False

        self.status.append("fetching results")
        corpus = []
        limit  = 100

        self.query_max = self.scan_results(query)
        #print("self.query_max : %s" % self.query_max)

        if self.query_max > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
                                                            , QUERY_SIZE_N_MAX
                                                            )
            print("WARNING (scrap: ISIDORE d/l ): " , msg)
            self.query_max = QUERY_SIZE_N_MAX

        #for page in range(1, trunc(self.query_max / 100) + 2):
        for offset in range(0, self.query_max, limit):
            print("Downloading result %s to %s" % (offset, self.query_max))

            #for doc in self._get(query, count=False, offset=offset, limit=limit) :
            for doc in isidore(query, offset=offset, limit=limit) :
                corpus.append(doc)

        self.path = save( json.dumps(corpus).encode("utf-8")
                        , name='ISIDORE.json'
                        , basedir=UPLOAD_DIRECTORY
                        )
        downloaded = True

        return downloaded
/home/alexandre/local/logiciels/haskell/myCode/bool2sparql/.stack-work/dist/x86_64-linux-nopie/Cabal-1.24.2.0/build/bool2sparql-exe/bool2sparql-exe
\ No newline at end of file
import subprocess

from . import sparql as s       # vendored sparql client
from .sparql import Service


def bool2sparql(query, count=False, offset=None, limit=None):
    """
    bool2sparql :: String -> Bool -> Maybe Int -> Maybe Int -> String
    Translate a boolean query into a SPARQL request
@@ -11,13 +12,19 @@ def bool2sparql(query, count=False, limit=None):
    See: https://github.com/delanoe/bool2sparql
    """

    bashCommand = ["/srv/gargantext/gargantext/util/crawlers/sparql/bool2sparql-exe", "-q", query]
    if count is True :
        bashCommand.append("-c")

    else :
        if offset is not None :
            for command in ["--offset", str(offset)] :
                bashCommand.append(command)

        if limit is not None :
            for command in ["--limit", str(limit)] :
                bashCommand.append(command)
    process = subprocess.Popen(bashCommand, stdout=subprocess.PIPE)
    output, error = process.communicate()
@@ -27,27 +34,29 @@ def bool2sparql(query, count=False, limit=None):
    else :
        return output.decode("utf-8")
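

# A minimal usage sketch (not part of the commit): assuming the bool2sparql-exe
# binary exists at the path hard-coded above, this prints the SPARQL translation
# of an illustrative boolean query (the subprocess call above composes, e.g.,
# "bool2sparql-exe -q ricoeur --offset 0 --limit 5").
def example_bool2sparql():
    sparql_query = bool2sparql("ricoeur", offset=0, limit=5)
    print(sparql_query)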


def isidore(query, count=False, offset=None, limit=None):
    """
    isidore :: String -> Bool -> Maybe Int -> Maybe Int -> Either (Dict String) Int
    use sparql-client either to search or to scan
    """
    query = bool2sparql(query, count=count, offset=offset, limit=limit)
    print(query)

    go = Service("https://www.rechercheisidore.fr/sparql/", "utf-8", "GET")
    results = go.query(query)
    if count is False:
        for r in results:
            doc        = dict()
            doc_values = dict()
            doc["url"], doc["id"], doc["title"], doc["date"], doc["abstract"], doc["source"] = r
            print(doc)

            for k in doc.keys():
                doc_values[k] = doc[k].value

            print(doc_values)
            yield(doc_values)
@@ -60,12 +69,13 @@ def isidore(query, count=False, limit=None):
def test():
    query  = "delanoe"
    limit  = 100
    offset = 10

    for d in isidore(query, offset=offset, limit=limit):
        print(d["date"])

    #print([n for n in isidore(query, count=True)])

test()
File mode changed from 100755 to 100644
@@ -3,7 +3,7 @@
# ****************************
# **** HAL Parser ***
# ****************************
# CNRS COPYRIGHTS 2017
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** ISIDORE Parser ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
from datetime import datetime
import json
class IsidoreParser(Parser):
    def parse(self, filebuf):
        '''
        parse :: FileBuff -> [Hyperdata]
        '''
        contents = filebuf.read().decode("UTF-8")
        data = json.loads(contents)
        filebuf.close()

        json_docs      = data
        hyperdata_list = []

        hyperdata_path = { "title"    : "title"
                         , "abstract" : "abstract"
                         , "authors"  : "authors"
                         , "url"      : "url"
                         , "source"   : "source"
                         }

        for doc in json_docs:
            hyperdata = {}

            for key, path in hyperdata_path.items():
                hyperdata[key] = doc.get(path, "")

            # Source is the journal name; the crawler yields it under "source",
            # with a database-level default when it is missing
            hyperdata["source"] = doc.get("source", "ISIDORE Database")

            # Working on the date
            maybeDate = doc.get("date", None)
            if maybeDate is None:
                date = datetime.now()
            else:
                try:
                    # Model of date: 1958-01-01T00:00:00
                    date = datetime.strptime(maybeDate, '%Y-%m-%dT%H:%M:%S')
                except ValueError:
                    print("FIX DATE ISIDORE please >%s<" % maybeDate)
                    date = datetime.now()

            hyperdata["publication_date"]  = date
            hyperdata["publication_year"]  = str(date.year)
            hyperdata["publication_month"] = str(date.month)
            hyperdata["publication_day"]   = str(date.day)

            hyperdata_list.append(hyperdata)

        return hyperdata_list
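

# A minimal usage sketch (not part of the commit), assuming IsidoreParser can be
# instantiated without arguments; the sample document is illustrative only.
def example_isidore_parse():
    import io
    sample  = [ { "title"    : "An illustrative title"
                , "abstract" : "An illustrative abstract"
                , "url"      : "https://example.org/doc"
                , "source"   : "Some Journal"
                , "date"     : "1958-01-01T00:00:00"
                } ]
    filebuf = io.BytesIO(json.dumps(sample).encode("UTF-8"))
    for h in IsidoreParser().parse(filebuf):
        print(h["source"], h["publication_year"])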
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** ISIDORE Crawler *****
# ****************************
RESOURCE_TYPE_ISIDORE = 12

from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden

from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata

from traceback import print_tb   # used in save() below
def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query  = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_ISIDORE)

        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()

            #old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            #ids = crawlerbot.get_ids(query)

            return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
def save(request, project_id):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)

        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_ISIDORE)

        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()

        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()

        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instantiation as a Django model
        corpus = Node(
            name      = query,
            user_id   = request.user.id,
            parent_id = project_id,
            typename  = 'CORPUS',
            hyperdata = { "action"      : "Scraping data"
                        , "language_id" : "fr"
                        }
        )

        #download_file
        crawler_bot = load_crawler(source)()

        #for now no way to force downloading X records
        #the long running command; download() returns a boolean and stores
        #the file path on crawler_bot.path
        downloaded = crawler_bot.download(query)

        corpus.add_resource(
              type = source["type"]
            #, name = source["name"]
            , path = crawler_bot.path
            )

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        return render(
            template_name = 'pages/projects/wait.html',
            request       = request,
            context       = {
                'user'   : request.user,
                'project': project,
            },
        )

    # Non-POST requests have nothing to save
    raise Http404()
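

# A minimal round-trip sketch (not part of the commit) using Django's test client,
# assuming the URL wiring added below and an authenticated user owning project 1;
# the query and N values are illustrative only.
def example_isidore_endpoints():
    from django.test import Client
    client = Client()
    # client.login(username="...", password="...")   # credentials are hypothetical
    resp = client.post("/moissonneurs/isidore/query", {"query": "ricoeur"})
    print(resp.content)                              # e.g. {"results_nb": ...}
    client.post("/moissonneurs/isidore/save/1", {"query": "ricoeur", "N": 100})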
@@ -10,19 +10,15 @@
# moissonneurs == getting data from external databases
# Available databases :
## Pubmed
## IsTex,
## CERN
from django.conf.urls import url
import moissonneurs.pubmed as pubmed
import moissonneurs.istex as istex
import moissonneurs.cern as cern
import moissonneurs.multivac as multivac
import moissonneurs.hal as hal
import moissonneurs.isidore as isidore
@@ -42,7 +38,7 @@ urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
              , url(r'^hal/query$'        , hal.query     )
              , url(r'^hal/save/(\d+)'    , hal.save      )

              , url(r'^isidore/query$'    , isidore.query )
              , url(r'^isidore/save/(\d+)', isidore.save  )
              ]
@@ -675,7 +675,7 @@
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").attr('onclick', testCERN(query, N));
$("#submit_thing").on("click", function(){
saveMultivac(pubmedquery, N, "/moissonneurs/multivac/save/");
//$("#submit_thing").onclick()
})}
//(N > {{query_size}})
@@ -684,7 +684,7 @@
$('#submit_thing').prop('disabled', false);
$("#submit_thing").html("Processing a sample file")
$("#submit_thing").on("click", function(){
saveMultivac(pubmedquery, N, "/moissonneurs/multivac/save/");
//$("#submit_thing").onclick()
})}
}
@@ -708,7 +708,6 @@
//HAL = 11
if (SourceTypeId == "11"){
$.ajax({
// contentType: "application/json",
@@ -736,7 +735,7 @@
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").attr('onclick', testCERN(query, N));
$("#submit_thing").on("click", function(){
save(pubmedquery, N, "/moissonneurs/hal/save/");
//$("#submit_thing").onclick()
})}
//(N > {{query_size}})
@@ -745,7 +744,7 @@
$('#submit_thing').prop('disabled', false);
$("#submit_thing").html("Processing a sample file")
$("#submit_thing").on("click", function(){
save(pubmedquery, N, "/moissonneurs/hal/save/");
//$("#submit_thing").onclick()
})}
}
@@ -768,6 +767,69 @@
}
//ISIDORE = 12
if (SourceTypeId == "12"){
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/moissonneurs/isidore/query",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log(data)
console.log("SUCCESS")
console.log("enabling "+"#"+value.id)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
N = data["results_nb"]
if(N > 0) {
if (N <= {{query_size}}){
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
$("#submit_thing").html("Download!")
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").attr('onclick', testCERN(query, N));
$("#submit_thing").on("click", function(){
save(pubmedquery, N, "/moissonneurs/isidore/save/");
//$("#submit_thing").onclick()
})}
//(N > {{query_size}})
else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
$('#submit_thing').prop('disabled', false);
$("#submit_thing").html("Processing a sample file")
$("#submit_thing").on("click", function(){
save(pubmedquery, N, "/moissonneurs/isidore/save/");
//$("#submit_thing").onclick()
})}
}
else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
if(data[0]==false)
$("#theresults").html(theType +" connection error!</i><br>")
$('#submit_thing').prop('disabled', true);
}
},
error: function(result) {
$("#theresults").html(theType +" connection error</i><br>")
$('#submit_thing').prop('disabled', true);
}
});
}
}
// CSS events for selecting one Radio-Input
@@ -819,6 +881,7 @@
|| selectedId == "9"
|| selectedId == "10"
|| selectedId == "11"
|| selectedId == "12"
) {
console.log("show the button for: " + selectedId)
$("#div-fileornot").css("visibility", "visible");
@@ -1001,7 +1064,7 @@
});
}
function save(query, N, urlGarg){
console.log("In Gargantext")
if(!query || query=="") return;
@@ -1016,7 +1079,7 @@
console.log(data)
$.ajax({
dataType: 'json',
url: window.location.origin+"/moissonneurs/hal/save/"+projectid,
url: window.location.origin + urlGarg + projectid,
data: data,
type: 'POST',
beforeSend: function(xhr) {