Commit 8b5700cc authored by c24b

CERN Crawler integration

parent 8625e425
@@ -5,28 +5,31 @@
 # ****************************
 # Author:c24b
 # Date: 27/05/2015
-from ._Crawler import Crawler
 import hmac, hashlib
 import requests
 import os
 import random
 import urllib.parse as uparse
 from lxml import etree
 from gargantext.settings import API_TOKENS
-#from gargantext.util.files import build_corpus_path
-from gargantext.util.db import session
-from gargantext.models import Node
+from ._Crawler import Crawler
+from gargantext.util.timeit_damnit import timing
 class CernCrawler(Crawler):
     '''CERN SCOAP3 API Interaction'''
+    def __init__(self):
+        API = API_TOKENS["CERN"]
+        self.apikey = API["APIKEY"].encode("utf-8")
+        self.secret = bytearray(API["APISECRET"].encode("utf-8"))
+        self.BASE_URL = u"http://api.scoap3.org/search?"
     def __generate_signature__(self, url):
         '''create the signature'''
         #hmac-sha1 salted with secret
-        return hmac.new(self.secret,url, hashlib.sha1).hexdigest()
+        return hmac.new(self.secret,url.encode("utf-8"), hashlib.sha1).hexdigest()
     def __format_query__(self, query, of="xm", fields= None):
         ''' for query filters params
@@ -45,89 +48,71 @@ class CernCrawler(Crawler):
     def __format_url__(self, dict_q):
         '''format the url with encoded query'''
-        #add the apikey
-        dict_q["apikey"] = [self.apikey]
-        params = "&".join([(str(k)+"="+str(uparse.quote(v[0]))) for k,v in sorted(dict_q.items())])
+        #add the apikey at the end
+        dict_q["apikey"] = self.apikey
+        #dict_q["p"] = dict_q["p"].replace(" ", "+") >> quote_plus
+        params = ("&").join([(str(k)+"="+uparse.quote_plus(v)) for k,v in sorted(dict_q.items())])
         return self.BASE_URL+params
     def sign_url(self, dict_q):
         '''add signature'''
-        API = API_TOKENS["CERN"]
-        self.apikey = API["APIKEY"]
-        self.secret = API["APISECRET"].encode("utf-8")
-        self.BASE_URL = u"http://api.scoap3.org/search?"
         url = self.__format_url__(dict_q)
-        return url+"&signature="+self.__generate_signature__(url.encode("utf-8"))
+        return url+"&signature="+self.__generate_signature__(url)
-    def create_corpus(self):
-        #create a corpus
-        corpus = Node(
-            name = self.query,
-            #user_id = self.user_id,
-            parent_id = self.project_id,
-            typename = 'CORPUS',
-            hyperdata = { "action" : "Scrapping data"
-                        , "language_id" : self.type["default_language"]
-                        }
-        )
-        #add the resource
-        corpus.add_resource(
-            type = self.type["type"],
-            name = self.type["name"],
-            path = self.path)
-        try:
-            print("PARSING")
-            # p = eval(self.type["parser"])()
-            session.add(corpus)
-            session.commit()
-            self.corpus_id = corpus.id
-            parse_extract_indexhyperdata(corpus.id)
-            return self
-        except Exception as error:
-            print('WORKFLOW ERROR')
-            print(error)
-            session.rollback()
-            return self
-    def download(self):
-        import time
+    @timing
+    def download(self, query):
         self.path = "/tmp/results.xml"
-        query = self.__format_query__(self.query)
+        query = self.__format_query__(query)
         url = self.sign_url(query)
-        start = time.time()
         r = requests.get(url, stream=True)
         downloaded = False
         #the long part
         with open(self.path, 'wb') as f:
             print("Downloading file")
             for chunk in r.iter_content(chunk_size=1024):
                 if chunk: # filter out keep-alive new chunks
                     #print("===")
                     f.write(chunk)
                     downloaded = True
-        end = time.time()
-        #print (">>>>>>>>>>LOAD results", end-start)
         return downloaded
+    def get_ids(self, query):
+        '''get results nb + individual ids of search query'''
+        dict_q = uparse.parse_qs(query)
+        #parameters for a global request
+        dict_q["p"] = query
+        dict_q["of"] = "id"
+        dict_q["rg"] = "10000"
+        #api key is added when formatting url
+        url = self.__format_url__(dict_q)
+        signed_url = url+"&signature="+self.__generate_signature__(url)
+        r = requests.get(signed_url)
+        print(signed_url)
+        self.ids = r.json()
+        #self.results_nb = len(self.ids)
+        #self.generate_urls()
+        return(self.ids)
+    def generate_urls(self):
+        ''' generate raw urls of ONE record'''
+        self.urls = ["http://repo.scoap3.org/record/%i/export/xm?ln=en" %rid for rid in self.ids]
+        return self.urls
+    def fetch_records(self, ids):
+        ''' for NEXT time'''
+        raise NotImplementedError
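The per-record export URLs built by generate_urls() can be fetched and inspected individually. A small sketch of that, not part of the committed code; the record id is invented and the lxml traversal is only one way to look at the MARCXML:

import requests
from lxml import etree

rid = 123456  # hypothetical record id, as returned in the list from get_ids()
url = "http://repo.scoap3.org/record/%i/export/xm?ln=en" % rid

r = requests.get(url)
root = etree.fromstring(r.content)   # MARCXML document
# list every datafield tag present in the record, ignoring namespaces
tags = sorted({el.get("tag") for el in root.iter()
               if isinstance(el.tag, str) and el.tag.endswith("datafield") and el.get("tag")})
print(tags)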
-    def scan_results(self):
-        '''scan the number of results by fetching a single result
+    def scan_results(self, query):
+        '''[OLD] scan the number of results by fetching a single result
         that shows only the author of page 1;
         the count is read from the comment at the top of the page
         '''
-        import time
         self.results_nb = 0
-        query = self.__format_query__(self.query, of="hb")
+        query = self.__format_query__(query, of="hb")
         query["ot"] = "100"
         query["jrec"]='1'
         query["rg"]='1'
         url = self.sign_url(query)
-        print(url)
+        #print(url)
         #start = time.time()
         r = requests.get(url)
         #end = time.time()
...
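A hedged usage sketch of the crawler above, assuming API_TOKENS["CERN"] is configured with APIKEY/APISECRET. The method names come from the diff; the import path, the query string and the result handling are illustrative:

from gargantext.util.crawlers.CERN import CernCrawler  # import path assumed, not shown in the diff

bot = CernCrawler()
ids = bot.get_ids("higgs boson")     # signed request with of=id, up to 10000 record ids
print(len(ids), "records match")
ok = bot.download("higgs boson")     # streams the MARCXML export to /tmp/results.xml
if ok:
    print("results written to", bot.path)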
@@ -101,7 +101,7 @@ class Crawler:
             )
             session.add(corpus)
             session.commit()
-            scheduled(parse_extract_indexhyperdata)(corpus.id)
+            scheduled(parse_extract_indexhyperdata(corpus.id))
         else:
             #add the resource
             corpus.add_resource(
...
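The two call shapes touched by this hunk behave differently with a wrapper like gargantext's scheduled, which (as an assumption here) wraps a callable so it can be run later or handed to a worker. A toy illustration with a stand-in wrapper, not the project's real scheduler:

# toy stand-in for a scheduling wrapper; gargantext's real `scheduled` may differ
def scheduled(func):
    def defer(*args, **kwargs):
        print("scheduling", getattr(func, "__name__", func), args)
        # a real wrapper would enqueue func(*args) for a worker here
        return func(*args, **kwargs)
    return defer

def parse_extract_indexhyperdata(corpus_id):
    print("processing corpus", corpus_id)

corpus_id = 42  # placeholder

scheduled(parse_extract_indexhyperdata)(corpus_id)   # the wrapper decides when/where to run
scheduled(parse_extract_indexhyperdata(corpus_id))   # runs immediately, then wraps the return value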
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # ****************************
-# ***** CERN Scrapper *****
+# ***** CERN Crawler *****
 # ****************************
-import logging
-from logging.handlers import RotatingFileHandler
-# create the logger object used to write to the logs
-logger = logging.getLogger()
-# set the logger level to DEBUG so that it writes everything
-logger.setLevel(logging.DEBUG)
-# create a formatter that adds the time and the level
-# of each message when a message is written to the log
-formatter = logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s')
-# create a handler that redirects log writes to a file
-# in 'append' mode, with 1 backup and a maximum size of 1MB
-#>>> Permission denied: conflicts with the django logs
-#file_handler = RotatingFileHandler('.activity.log', 'a', 1000000, 1)
-# set its level to DEBUG, tell it to use the formatter
-# created above, and add this handler to the logger
-#~ file_handler.setLevel(logging.DEBUG)
-#~ file_handler.setFormatter(formatter)
-#~ logger.addHandler(file_handler)
-# create a second handler that redirects each log write
-# to the console
-steam_handler = logging.StreamHandler()
-steam_handler.setLevel(logging.DEBUG)
-logger.addHandler(steam_handler)
-import json
-import datetime
-from os import path
-import threading
-import hmac, hashlib
-import requests
-import lxml
-import subprocess
-import urllib.parse as uparse
-from lxml import etree
-from bs4 import BeautifulSoup, Comment
-from collections import defaultdict
-#from gargantext.util.files import download
-from gargantext.settings import API_TOKENS as API
-#from private import API_PERMISSIONS
+RESOURCE_TYPE_SCOAP = 9
+from django.shortcuts import redirect, render
+from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
+from gargantext.constants import get_resource, load_crawler, QUERY_SIZE_N_MAX
+from gargantext.models.nodes import Node
+from gargantext.util.db import session
+from gargantext.util.db_cache import cache
+from gargantext.util.http import JsonHttpResponse
+from gargantext.util.scheduling import scheduled
+from gargantext.util.toolchain import parse_extract_indexhyperdata
-def save( request , project_id ) :
-    try:
-        project_id = int(project_id)
-    except ValueError:
-        raise Http404()
-    # do we have a valid project?
-    project = session.query( Node ).filter(Node.id == project_id).first()
-    if project is None:
-        raise Http404()
-    user = cache.User[request.user.id]
-    if not user.owns(project):
-        raise HttpResponseForbidden()
-    if request.method == "POST":
-        query = request.POST["query"]
-        name = request.POST["string"]
-        corpus = project.add_child( name=name
-                                  , typename = "CORPUS"
-                                  )
-        corpus.add_resource( type = resourcetype('Cern (MARC21 XML)')
-                           , path = filename
-                           , url = None
-                           )
-        print("Adding the resource")
-def query( request ):
-    print(request.method)
-    alist = []
+def query( request):
+    '''get GlobalResults()'''
     if request.method == "POST":
         query = request.POST["query"]
-        N = int(request.POST["N"])
+        source = get_resource(RESOURCE_TYPE_SCOAP)
+        if source["crawler"] is not None:
+            crawlerbot = load_crawler(source)()
+            #old raw way to get results_nb
+            #results = crawlerbot.scan_results(query)
+            ids = crawlerbot.get_ids(query)
+            return JsonHttpResponse({"results_nb":int(len(ids)), "ids": ids})
+def save(request, project_id):
+    '''save'''
+    if request.method == "POST":
+        query = request.POST.get("query")
+        try:
+            N = int(request.POST.get("N"))
+        except:
+            N = 0
+        print(query, N)
+        #for next time
+        #ids = request.POST["ids"]
+        source = get_resource(RESOURCE_TYPE_SCOAP)
+        if N == 0:
+            raise Http404()
         if N > QUERY_SIZE_N_MAX:
-            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
-            print("ERROR(scrap: pubmed stats): ",msg)
-            raise ValueError(msg)
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
-        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
-        #Here Requests API
-        #
-        #API_TOKEN = API["CERN"]
-        #instancia = Scraper()
-        # serialFetcher (n_last_years, query, query_size)
-        #alist = instancia.serialFetcher( 5, query , N )
-        data = alist
+            N = QUERY_SIZE_N_MAX
+        try:
+            project_id = int(project_id)
+        except ValueError:
+            raise Http404()
+        # do we have a valid project?
+        project = session.query( Node ).filter(Node.id == project_id).first()
+        if project is None:
+            raise Http404()
+        user = cache.User[request.user.id]
+        if not user.owns(project):
+            raise HttpResponseForbidden()
+        # corpus node instanciation as a Django model
+        corpus = Node(
+            name = query,
+            user_id = request.user.id,
+            parent_id = project_id,
+            typename = 'CORPUS',
+            hyperdata = { "action" : "Scrapping data"
+                        , "language_id" : "en"
+                        }
+        )
+        #download_file
+        crawler_bot = load_crawler(source)()
+        #for now no way to force downloading X records
+        #the long running command
+        filename = crawler_bot.download(query)
+        corpus.add_resource(
+            type = source["type"]
+            #, name = source["name"]
+            , path = crawler_bot.path
+        )
+        session.add(corpus)
+        session.commit()
+        #corpus_id = corpus.id
+        try:
+            scheduled(parse_extract_indexhyperdata)(corpus.id)
+        except Exception as error:
+            print('WORKFLOW ERROR')
+            print(error)
+            try:
+                print_tb(error.__traceback__)
+            except:
+                pass
+            # IMPORTANT ---------------------------------
+            # sanitize session after interrupted transact
+            session.rollback()
+            # --------------------------------------------
+        return render(
+            template_name = 'pages/projects/wait.html',
+            request = request,
+            context = {
+                'user' : request.user,
+                'project': project,
+            },
+        )
-        data = [query_string,query,N]
-        print(data)
     return JsonHttpResponse(data)
-class CERN_API(object):
-    '''CERN SCOAP3 Interaction'''
-    def __init__(self,query, filename= "./results.xml"):
-        self.query = query
-        self.apikey = API["TOKEN"]
-        self.secret = API["SECRET"].encode("utf-8")
-        self.results = self.get_results(filename)
-        self.BASE_URL= u"http://api.scoap3.org/search?"
-    def __generate_signature__(self, url):
-        '''create the signature'''
-        #hmac-sha1 salted with secret
-        return hmac.new(self.secret,url, hashlib.sha1).hexdigest()
-    def __format_url__(self):
-        '''format the url with encoded query'''
-        dict_q = uparse.parse_qs(self.query)
-        #add the apikey
-        dict_q["apikey"] = [self.apikey]
-        params = "&".join([(str(k)+"="+str(uparse.quote(v[0]))) for k,v in sorted(dict_q.items())])
-        return self.BASE_URL+params
-    def sign_url(self):
-        '''add signature'''
-        url = self.__format_url__()
-        return url+"&signature="+self.__generate_signature__(url.encode("utf-8"))
-    def get_results(self, filename):
-        url = self.sign_url()
-        r = requests.get(url, stream=True)
-        with open(filename, 'wb') as f:
-            for chunk in r.iter_content(chunk_size=1024):
-                if chunk: # filter out keep-alive new chunks
-                    f.write(chunk)
-        return filename
-def parse_xml(filename,MARCXML):
-    parser = etree.XMLParser()
-    with open(self.filename, 'r') as f:
-        root = etree.tostring(f.read())
-        data = f.read()
-    records = []
-    for record in data.split("<record>")[1:]:
-        soup = BeautifulSoup("<record>"+record, "lxml")
-        r = {v:[] for v in self.MARC21["700"].values()}
-        r["uid"] = soup.find("controlfield").text
-        for data in soup.find_all("datafield"):
-            tag = data.get("tag")
-            if tag in self.MARC21.keys():
-                for sub in data.find_all("subfield"):
-                    code = sub.get("code")
-                    if code in self.MARC21[tag].keys():
-                        if tag == "700":
-                            r[self.MARC21[tag][code]].append(sub.text)
-                        else:
-                            r[self.MARC21[tag][code]] = sub.text
-        records.append(r.decode('utf-8'))
-    return JsonHttpResponse(records)
-#query="of=xm"
-#a = CERN_API(query, "./full.xml")
-#p = CERNParser("./full.xml")
-#print(p.MARC21.keys())
-#~ #p.parse()
-#~ with open("./results_full.json", "r") as f:
-#~     data = json.load(f)
-#~     for record in data["records"]:
-#~         print(record.keys())
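The removed parse_xml helper walked MARCXML datafields with BeautifulSoup. A small self-contained sketch of that approach on an inline record; the MARC21 tag/subfield mapping and the sample data below are toy values, not the project's real table:

from bs4 import BeautifulSoup

# toy MARCXML fragment; real records come from the SCOAP3 export
xml = """<record>
  <controlfield tag="001">123456</controlfield>
  <datafield tag="245"><subfield code="a">A sample title</subfield></datafield>
  <datafield tag="700"><subfield code="a">Doe, J.</subfield></datafield>
  <datafield tag="700"><subfield code="a">Roe, R.</subfield></datafield>
</record>"""

MARC21 = {"245": {"a": "title"}, "700": {"a": "authors"}}   # toy mapping

soup = BeautifulSoup(xml, "lxml")
record = {"uid": soup.find("controlfield").text, "authors": []}
for field in soup.find_all("datafield"):
    tag = field.get("tag")
    for sub in field.find_all("subfield"):
        code = sub.get("code")
        if tag in MARC21 and code in MARC21[tag]:
            if tag == "700":
                record[MARC21[tag][code]].append(sub.text)
            else:
                record[MARC21[tag][code]] = sub.text
print(record)   # {'uid': '123456', 'authors': ['Doe, J.', 'Roe, R.'], 'title': 'A sample title'}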
@@ -13,15 +13,13 @@
 # Available databases :
 ## Pubmed
 ## IsTex,
-## CERN (in progress)
+## CERN
 from django.conf.urls import url
 import moissonneurs.pubmed as pubmed
 import moissonneurs.istex as istex
-# TODO
 import moissonneurs.cern as cern
 # TODO
@@ -38,8 +36,6 @@ urlpatterns = [ url(r'^pubmed/query$' , pubmed.query )
     , url(r'^istex/query$'      , istex.query )
     , url(r'^istex/save/(\d+)'  , istex.save  )
-    , url(r'^cern/query$'       , cern.query  )
-    # TODO , url(r'^cern/save/(\d+)'   , cern.save  )
+    , url(r'^scoap3/query$'     , cern.query  )
+    , url(r'^scoap3/save/(\d+)' , cern.save   )
 ]
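A hedged example of how a client could exercise the renamed routes, assuming the app is mounted under /moissonneurs/ on a local instance and the caller holds an authenticated session (both assumptions; neither is shown in this diff):

import requests

BASE = "http://localhost:8000/moissonneurs"   # assumed host and mount point
session = requests.Session()                  # would need an authenticated gargantext session

# count matching SCOAP3 records for a query
r = session.post(BASE + "/scoap3/query", data={"query": "higgs boson"})
print(r.json())                               # e.g. {"results_nb": ..., "ids": [...]}

# download the records into a new corpus under project 42 (placeholder id)
r = session.post(BASE + "/scoap3/save/42", data={"query": "higgs boson", "N": 100})
print(r.status_code)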