Commit 89ddcc0f authored by delanoe's avatar delanoe

[FEAT] ISIDORE OK for simple boolean requests.

parent cef6161f
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** ISIDORE Scraper ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Crawler import *
import json
from gargantext.constants import UPLOAD_DIRECTORY
from math import trunc
from gargantext.util.files import save
from gargantext.util.crawlers.sparql.bool2sparql import bool2sparql, isidore
class IsidoreCrawler(Crawler):
    '''ISIDORE SPARQL API client (https://www.rechercheisidore.fr/sparql).'''

    def __init__(self):
        # Main endpoints
        self.BASE_URL = "https://www.rechercheisidore.fr"
        self.API_URL  = "sparql"
        # Final endpoint
        # TODO : change the endpoint according to the type of database
        self.URL = self.BASE_URL + "/" + self.API_URL
        # Human-readable progress messages appended while downloading.
        self.status = []

    def __format_query__(self, query=None, count=False, offset=None, limit=None):
        '''Translate a boolean query string into a SPARQL request string
        (delegates to bool2sparql).'''
        return bool2sparql(query, count=count, offset=offset, limit=limit)

    def _get(self, query, offset=0, limit=100, lang=None):
        '''Run `query` against the ISIDORE endpoint.

        Returns the generator of document dicts yielded by isidore().
        BUG FIX: the original called isidore() and discarded its return
        value, so _get always returned None.
        '''
        # `lang` is kept for interface compatibility but is not forwarded:
        # the SPARQL layer has no language filter yet.
        return isidore(query, count=False, offset=offset, limit=limit)

    def scan_results(self, query):
        '''
        scan_results : Returns the number of results
        Query String -> Int
        '''
        # With count=True, isidore() yields a single count value.
        self.results_nb = [n for n in isidore(query, count=True)][0]
        return self.results_nb

    def download(self, query):
        '''Fetch all results for `query` (capped at QUERY_SIZE_N_MAX), save
        them as ISIDORE.json under UPLOAD_DIRECTORY and return True.

        Side effect: sets self.path to the saved file's path.
        '''
        downloaded = False
        self.status.append("fetching results")
        corpus = []
        limit  = 100
        self.query_max = self.scan_results(query)
        #print("self.query_max : %s" % self.query_max)
        if self.query_max > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % ( self.query_max
                                                            , QUERY_SIZE_N_MAX
                                                            )
            print("WARNING (scrap: ISIDORE d/l ): " , msg)
            self.query_max = QUERY_SIZE_N_MAX
        # Page through the results `limit` documents at a time.
        for offset in range(0, self.query_max, limit):
            print("Downloading result %s to %s" % (offset, self.query_max))
            for doc in self._get(query, offset=offset, limit=limit):
                corpus.append(doc)
        self.path = save( json.dumps(corpus).encode("utf-8")
                        , name='ISIDORE.json'
                        , basedir=UPLOAD_DIRECTORY
                        )
        downloaded = True
        return downloaded
/home/alexandre/local/logiciels/haskell/myCode/bool2sparql/.stack-work/dist/x86_64-linux-nopie/Cabal-1.24.2.0/build/bool2sparql-exe/bool2sparql-exe
\ No newline at end of file
import subprocess import subprocess
import sparql as s from .sparql import Service
#from sparql import Service
def bool2sparql(query, count=False, limit=None): def bool2sparql(query, count=False, offset=None, limit=None):
""" """
bool2sparql :: String -> Bool -> Int -> String bool2sparql :: String -> Bool -> Int -> String
Translate a boolean query into a Sparql request Translate a boolean query into a Sparql request
...@@ -11,14 +12,20 @@ def bool2sparql(query, count=False, limit=None): ...@@ -11,14 +12,20 @@ def bool2sparql(query, count=False, limit=None):
See: https://github.com/delanoe/bool2sparql See: https://github.com/delanoe/bool2sparql
""" """
bashCommand = ["./bool2sparql-exe","-q",query] bashCommand = ["/srv/gargantext/gargantext/util/crawlers/sparql/bool2sparql-exe","-q",query]
if count is True : if count is True :
bashCommand.append("-c") bashCommand.append("-c")
else : else :
for command in ["-l", str(limit)] : if offset is not None :
for command in ["--offset", str(offset)] :
bashCommand.append(command) bashCommand.append(command)
if limit is not None :
for command in ["--limit", str(limit)] :
bashCommand.append(command)
process = subprocess.Popen(bashCommand, stdout=subprocess.PIPE) process = subprocess.Popen(bashCommand, stdout=subprocess.PIPE)
output, error = process.communicate() output, error = process.communicate()
...@@ -27,26 +34,28 @@ def bool2sparql(query, count=False, limit=None): ...@@ -27,26 +34,28 @@ def bool2sparql(query, count=False, limit=None):
else : else :
return(output.decode("utf-8")) return(output.decode("utf-8"))
def isidore(query, count=False, limit=None): def isidore(query, count=False, offset=None, limit=None):
""" """
isidore :: String -> Bool -> Int -> Either (Dict String) Int isidore :: String -> Bool -> Int -> Either (Dict String) Int
use sparql-client either to search or to scan use sparql-client either to search or to scan
""" """
query = bool2sparql(query, count, limit) query = bool2sparql(query, count=count, offset=offset, limit=limit)
print(query) print(query)
go = s.Service("https://www.rechercheisidore.fr/sparql/", "utf-8", "GET")
go = Service("https://www.rechercheisidore.fr/sparql/", "utf-8", "GET")
results = go.query(query) results = go.query(query)
if count is False: if count is False:
for r in results: for r in results:
doc = dict() doc = dict()
doc_values = dict() doc_values = dict()
doc["url"], doc["id"], doc["title"], doc["date"], doc["abstract"], doc["journal"] = r doc["url"], doc["id"], doc["title"], doc["date"], doc["abstract"], doc["source"] = r
print(doc)
for k in doc.keys(): for k in doc.keys():
doc_values[k] = doc[k].value doc_values[k] = doc[k].value
print(doc_values)
yield(doc_values) yield(doc_values)
...@@ -60,12 +69,13 @@ def isidore(query, count=False, limit=None): ...@@ -60,12 +69,13 @@ def isidore(query, count=False, limit=None):
def test(): def test():
query = "ricoeur" query = "delanoe"
limit = 2 limit = 100
offset = 10
for d in isidore(query, limit=limit): for d in isidore(query, offset=offset, limit=limit):
print(d["abstract"]) print(d["date"])
print([n for n in isidore(query, count=True)]) #print([n for n in isidore(query, count=True)])
test() test()
File mode changed from 100755 to 100644
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# **************************** # ****************************
# **** HAL Parser *** # **** HAL Parser ***
# **************************** # ****************************
# CNRS COPYRIGHTS # CNRS COPYRIGHTS 2017
# SEE LEGAL LICENCE OF GARGANTEXT.ORG # SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser from ._Parser import Parser
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# **** ISIDORE Parser ***
# ****************************
# CNRS COPYRIGHTS
# SEE LEGAL LICENCE OF GARGANTEXT.ORG
from ._Parser import Parser
from datetime import datetime
import json
class IsidoreParser(Parser):
    '''Parse the JSON corpus produced by IsidoreCrawler into hyperdata dicts.'''

    def parse(self, filebuf):
        '''
        parse :: FileBuff -> [Hyperdata]

        Reads a JSON list of documents from `filebuf` (closed afterwards)
        and returns one hyperdata dict per document.
        '''
        contents = filebuf.read().decode("UTF-8")
        data = json.loads(contents)
        filebuf.close()
        json_docs = data
        hyperdata_list = []
        # Mapping: hyperdata key -> key in the crawled JSON document.
        hyperdata_path = { "title"    : "title"
                         , "abstract" : "abstract"
                         , "authors"  : "authors"
                         , "url"      : "url"
                         , "source"   : "source"
                         }
        for doc in json_docs:
            hyperdata = {}
            for key, path in hyperdata_path.items():
                hyperdata[key] = doc.get(path, "")
            # Source is the Journal Name; fall back to a generic label.
            hyperdata["source"] = doc.get("journal", "ISIDORE Database")
            # Working on the date (model: 1958-01-01T00:00:00);
            # fall back to "now" when missing or malformed.
            maybeDate = doc.get("date" , None)
            if maybeDate is None:
                date = datetime.now()
            else:
                try :
                    date = datetime.strptime(maybeDate, '%Y-%m-%dT%H:%M:%S')
                except (ValueError, TypeError):
                    # Narrowed from a bare `except:`: strptime raises only
                    # ValueError (bad format) or TypeError (non-string).
                    print("FIX DATE ISIDORE please >%s<" % maybeDate)
                    date = datetime.now()
            hyperdata["publication_date"]  = date
            hyperdata["publication_year"]  = str(date.year)
            hyperdata["publication_month"] = str(date.month)
            hyperdata["publication_day"]   = str(date.day)
            hyperdata_list.append(hyperdata)
        return hyperdata_list
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** ISIDORE Crawler *****
# ****************************
RESOURCE_TYPE_ISIDORE = 12
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.util.http import JsonHttpResponse
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
def query( request):
    '''Scan ISIDORE for the POSTed query and return the result count as JSON.

    Raises Http404 for non-POST requests or when no crawler is registered,
    instead of implicitly returning None (which made Django raise a 500).
    '''
    if request.method != "POST":
        raise Http404()
    query = request.POST["query"]
    source = get_resource(RESOURCE_TYPE_ISIDORE)
    if source["crawler"] is None:
        # No crawler registered for ISIDORE: nothing to scan.
        raise Http404()
    crawlerbot = load_crawler(source)()
    # old raw way to get results_nb
    results = crawlerbot.scan_results(query)
    #ids = crawlerbot.get_ids(query)
    return JsonHttpResponse({"results_nb":crawlerbot.results_nb})
def save(request, project_id):
    '''Download the ISIDORE corpus for the POSTed query into project
    `project_id`, schedule its parsing/indexing, and render the wait page.

    Raises Http404 on non-POST, N == 0, bad project_id or unknown project;
    returns HttpResponseForbidden when the user does not own the project.
    '''
    if request.method != "POST":
        # BUG FIX: the original fell through to dead code referencing an
        # undefined `query_string` (NameError); reject non-POST explicitly.
        raise Http404()
    query = request.POST.get("query")
    try:
        N = int(request.POST.get("N"))
    except (TypeError, ValueError):
        # Missing or non-numeric N; treated as "no sample requested".
        N = 0
    print(query, N)
    #for next time
    #ids = request.POST["ids"]
    source = get_resource(RESOURCE_TYPE_ISIDORE)
    if N == 0:
        raise Http404()
    if N > QUERY_SIZE_N_MAX:
        N = QUERY_SIZE_N_MAX
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = session.query( Node ).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()
    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()
    # corpus node instanciation as a Django model
    corpus = Node(
        name = query,
        user_id = request.user.id,
        parent_id = project_id,
        typename = 'CORPUS',
        hyperdata = { "action"      : "Scrapping data"
                    , "language_id" : "fr"
                    }
    )
    # download_file: for now no way to force downloading X records;
    # this is the long running command.
    crawler_bot = load_crawler(source)()
    downloaded = crawler_bot.download(query)  # returns a bool, not a filename
    corpus.add_resource(
        type = source["type"]
        #, name = source["name"]
        , path = crawler_bot.path
    )
    session.add(corpus)
    session.commit()
    #corpus_id = corpus.id
    try:
        scheduled(parse_extract_indexhyperdata)(corpus.id)
    except Exception as error:
        print('WORKFLOW ERROR')
        print(error)
        try:
            # BUG FIX: print_tb was used without being imported (NameError
            # whenever this branch ran); import it locally.
            from traceback import print_tb
            print_tb(error.__traceback__)
        except Exception:
            pass
        # IMPORTANT ---------------------------------
        # sanitize session after interrupted transact
        session.rollback()
        # --------------------------------------------
    return render(
        template_name = 'pages/projects/wait.html',
        request = request,
        context = {
            'user' : request.user,
            'project': project,
        },
    )
...@@ -10,19 +10,15 @@ ...@@ -10,19 +10,15 @@
# moissonneurs == getting data from external databases # moissonneurs == getting data from external databases
# Available databases :
## Pubmed
## IsTex,
## CERN
from django.conf.urls import url from django.conf.urls import url
# Available databases :
import moissonneurs.pubmed as pubmed import moissonneurs.pubmed as pubmed
import moissonneurs.istex as istex import moissonneurs.istex as istex
import moissonneurs.cern as cern import moissonneurs.cern as cern
import moissonneurs.multivac as multivac import moissonneurs.multivac as multivac
import moissonneurs.hal as hal import moissonneurs.hal as hal
import moissonneurs.isidore as isidore
# TODO : ISIDORE # TODO : ISIDORE
...@@ -42,7 +38,7 @@ urlpatterns = [ url(r'^pubmed/query$' , pubmed.query ) ...@@ -42,7 +38,7 @@ urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
, url(r'^hal/query$' , hal.query ) , url(r'^hal/query$' , hal.query )
, url(r'^hal/save/(\d+)' , hal.save ) , url(r'^hal/save/(\d+)' , hal.save )
#, url(r'^isidore/query$' , isidore.query ) , url(r'^isidore/query$' , isidore.query )
#, url(r'^isidore/save/(\d+)' , isidore.save ) , url(r'^isidore/save/(\d+)' , isidore.save )
] ]
...@@ -675,7 +675,7 @@ ...@@ -675,7 +675,7 @@
$("#submit_thing").prop('disabled' , false) $("#submit_thing").prop('disabled' , false)
//$("#submit_thing").attr('onclick', testCERN(query, N)); //$("#submit_thing").attr('onclick', testCERN(query, N));
$("#submit_thing").on("click", function(){ $("#submit_thing").on("click", function(){
saveMultivac(pubmedquery, N); saveMultivac(pubmedquery, N, "/moissonneurs/multivac/save/");
//$("#submit_thing").onclick() //$("#submit_thing").onclick()
})} })}
//(N > {{query_size}}) //(N > {{query_size}})
...@@ -684,7 +684,7 @@ ...@@ -684,7 +684,7 @@
$('#submit_thing').prop('disabled', false); $('#submit_thing').prop('disabled', false);
$("#submit_thing").html("Processing a sample file") $("#submit_thing").html("Processing a sample file")
$("#submit_thing").on("click", function(){ $("#submit_thing").on("click", function(){
saveMultivac(pubmedquery, N); saveMultivac(pubmedquery, N,"/moissonneurs/multivac/save/" );
//$("#submit_thing").onclick() //$("#submit_thing").onclick()
})} })}
} }
...@@ -708,7 +708,6 @@ ...@@ -708,7 +708,6 @@
//HAL = 11 //HAL = 11
if (SourceTypeId == "11"){ if (SourceTypeId == "11"){
$.ajax({ $.ajax({
// contentType: "application/json", // contentType: "application/json",
...@@ -736,7 +735,7 @@ ...@@ -736,7 +735,7 @@
$("#submit_thing").prop('disabled' , false) $("#submit_thing").prop('disabled' , false)
//$("#submit_thing").attr('onclick', testCERN(query, N)); //$("#submit_thing").attr('onclick', testCERN(query, N));
$("#submit_thing").on("click", function(){ $("#submit_thing").on("click", function(){
saveALL(pubmedquery, N); save(pubmedquery, N, "/moissonneurs/hal/save/");
//$("#submit_thing").onclick() //$("#submit_thing").onclick()
})} })}
//(N > {{query_size}}) //(N > {{query_size}})
...@@ -745,7 +744,7 @@ ...@@ -745,7 +744,7 @@
$('#submit_thing').prop('disabled', false); $('#submit_thing').prop('disabled', false);
$("#submit_thing").html("Processing a sample file") $("#submit_thing").html("Processing a sample file")
$("#submit_thing").on("click", function(){ $("#submit_thing").on("click", function(){
saveALL(pubmedquery, N); save(pubmedquery, N, "/moissonneurs/hal/save/");
//$("#submit_thing").onclick() //$("#submit_thing").onclick()
})} })}
} }
...@@ -768,6 +767,69 @@ ...@@ -768,6 +767,69 @@
} }
//HAL = 12
if (SourceTypeId == "12"){
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/moissonneurs/isidore/query",
data: formData,
type: 'POST',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log(data)
console.log("SUCCESS")
console.log("enabling "+"#"+value.id)
// $("#"+value.id).attr('onclick','getGlobalResults(this);');
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").html("Process a {{ query_size }} sample!")
N = data["results_nb"]
if(N > 0) {
if (N <= {{query_size}}){
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
$("#submit_thing").html("Download!")
$("#submit_thing").prop('disabled' , false)
//$("#submit_thing").attr('onclick', testCERN(query, N));
$("#submit_thing").on("click", function(){
save(pubmedquery, N, "/moissonneurs/isidore/save/");
//$("#submit_thing").onclick()
})}
//(N > {{query_size}})
else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: "+N+" publications </i><br>")
$('#submit_thing').prop('disabled', false);
$("#submit_thing").html("Processing a sample file")
$("#submit_thing").on("click", function(){
saveALL(pubmedquery, N, "/moissonneurs/isidore/save/");
//$("#submit_thing").onclick()
})}
}
else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
if(data[0]==false)
$("#theresults").html(theType +" connection error!</i><br>")
$('#submit_thing').prop('disabled', true);
}
},
error: function(result) {
$("#theresults").html(theType +" connection error</i><br>")
$('#submit_thing').prop('disabled', true);
}
});
}
} }
// CSS events for selecting one Radio-Input // CSS events for selecting one Radio-Input
...@@ -819,6 +881,7 @@ ...@@ -819,6 +881,7 @@
|| selectedId == "9" || selectedId == "9"
|| selectedId == "10" || selectedId == "10"
|| selectedId == "11" || selectedId == "11"
|| selectedId == "12"
) { ) {
console.log("show the button for: " + selectedId) console.log("show the button for: " + selectedId)
$("#div-fileornot").css("visibility", "visible"); $("#div-fileornot").css("visibility", "visible");
...@@ -1001,7 +1064,7 @@ ...@@ -1001,7 +1064,7 @@
}); });
} }
function saveALL(query, N){ function save(query, N, urlGarg){
console.log("In Gargantext") console.log("In Gargantext")
if(!query || query=="") return; if(!query || query=="") return;
...@@ -1016,7 +1079,7 @@ ...@@ -1016,7 +1079,7 @@
console.log(data) console.log(data)
$.ajax({ $.ajax({
dataType: 'json', dataType: 'json',
url: window.location.origin+"/moissonneurs/hal/save/"+projectid, url: window.location.origin + urlGarg + projectid,
data: data, data: data,
type: 'POST', type: 'POST',
beforeSend: function(xhr) { beforeSend: function(xhr) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment