Commit 2e8dd028 authored by c24b

Adding dependencies + refresh requirements.pip => [BUG Report]

parent d19c6877
amqp==1.4.9
anyjson==0.3.3
beautifulsoup4==4.4.1
billiard==3.3.0.22
celery==3.1.20
chardet==2.3.0
dateparser==0.3.2
decorator==4.0.9
Django==1.9.2
django-celery==3.1.17
django-pgfields==1.4.4
django-pgjsonb==0.0.16
djangorestframework==3.3.2
html5lib==0.9999999
jdatetime==1.7.2
kombu==3.0.33
lxml==3.5.0
networkx==1.11
nltk==3.1
numpy==1.10.4
pandas==0.18.0
# pkg-resources==0.0.0  # artifact of `pip freeze` on Debian/Ubuntu, not an installable package
psycopg2==2.6.1
pycountry==1.20
python-dateutil==2.4.2
pytz==2015.7
PyYAML==3.11
RandomWords==0.1.12
requests==2.10.0
six==1.10.0
SQLAlchemy==1.1.0b1.dev0
ujson==1.35
umalqurra==0.2
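
# To reproduce this environment (a sketch, assuming a Python 3 virtualenv;
# the scraper below uses the python3-only urllib.parse module):
#   python3 -m venv venv && . venv/bin/activate
#   pip install -r requirements.pip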
# ****************************
# *****   CERN Scraper   *****
# ****************************
import json
import datetime
from os import path
import threading
import hmac, hashlib
import requests
import lxml
import subprocess
import urllib.parse as uparse
from traceback import print_tb
from collections import defaultdict

from lxml import etree
from bs4 import BeautifulSoup, Comment

from django.http import Http404, HttpResponseForbidden
from django.shortcuts import redirect, render

#from gargantext.util.files import download
from gargantext.settings import API_TOKENS as API
#from private import API_PERMISSIONS

# NOTE: the following names are used in this module but not imported in this
# commit; they are assumed to come from elsewhere in the gargantext project
# (exact module paths unknown): session, Node, Scraper, resourcetype,
# scheduled, parse_extract_indexhyperdata, JsonHttpResponse, QUERY_SIZE_N_MAX.

API_TOKEN = API["CERN"]
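
# API_TOKENS is assumed to be a plain dict of credentials; a hypothetical
# shape, inferred from the lookups in this module (note the inconsistency
# between API["CERN"] here and API["TOKEN"]/API["SECRET"] in CERN_API):
# API_TOKENS = {
#     "CERN":   "...",   # looked up just above
#     "TOKEN":  "...",   # looked up in CERN_API.__init__
#     "SECRET": "...",   # looked up in CERN_API.__init__
# }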
def query(request):
    print(request.method)
    alist = []
    if request.method == "POST":
        query = request.POST["query"]
        N = int(request.POST["N"])
        if N > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
            print("ERROR (scrap: CERN stats):", msg)
            raise ValueError(msg)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
    # NOTE: this view does not return an HttpResponse yet; it looks unfinished.
def save(request, project_id):
    print("testCERN:")
    print(request.method)
    alist = ["bar", "foo"]
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = (session
                .query(Node)
                .filter(Node.id == project_id)
                .filter(Node.typename == 'PROJECT')
               ).first()
    if project is None:
        raise Http404()
    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()
    # defaults, so the non-POST fallthrough at the end cannot hit a NameError
    query = "-"
    query_string = "-"
    N = 0
    if request.method == "POST":
        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url-encoded query
        if "N" in request.POST:
            N = int(request.POST["N"])  # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                print("ERROR (scrap: istex d/l):", msg)
                raise ValueError(msg)
        # NOTE: this branch still targets the ISTEX API; it was apparently
        # copied from the istex scraper into this CERN module.
        print("Scraping Istex: '%s' (%i)" % (query_string, N))
        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]  # last page may be smaller
            urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                           + "&output=*&" + "from=" + str(k[0])
                           + "&size=" + str(pagesize))
        # corpus node instantiation as a Django model
        corpus = Node(
            name=query,
            user_id=request.user.id,
            parent_id=project_id,
            typename='CORPUS',
            hyperdata={"action": "Scraping data",
                       "language_id": None,
                       }
        )
        tasks = Scraper()
        # eight daemon worker threads consume the download queue
        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when the main (only non-daemon) thread exits
            t.start()
        for url in urlreqs:
            tasks.q.put(url)    # put a task in the queue
        tasks.q.join()          # wait until everything is finished
        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the downloaded resource to the corpus
                corpus.add_resource(
                    type=resourcetype('ISTex'),
                    path=filename,
                )
                dwnldsOK += 1
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])
        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata)(corpus_id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )
    data = [query_string, query, N]
    return JsonHttpResponse(data)
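
# A minimal standalone sketch of the pagination arithmetic used in save()
# above; chunk_ranges is a hypothetical helper (not part of Scraper) that
# mimics what tasks.chunks(range(N), pagesize) produces:
def chunk_ranges(N, pagesize=50):
    """Yield (from, size) pairs covering N documents page by page."""
    for start in range(0, N, pagesize):
        yield start, min(pagesize, N - start)

# e.g. list(chunk_ranges(120)) == [(0, 50), (50, 50), (100, 20)]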
class CERN_API(object):
    '''CERN SCOAP3 interaction'''
    def __init__(self, query, filename="./results.xml"):
        self.query = query
        self.apikey = API["TOKEN"]
        self.secret = API["SECRET"].encode("utf-8")
        # BASE_URL must be set before get_results(), which builds the signed URL
        self.BASE_URL = u"http://api.scoap3.org/search?"
        self.results = self.get_results(filename)

    def __generate_signature__(self, url):
        '''create the signature'''
        # hmac-sha1 keyed with the secret
        return hmac.new(self.secret, url, hashlib.sha1).hexdigest()

    def __format_url__(self):
        '''format the url with the encoded query'''
        dict_q = uparse.parse_qs(self.query)
        # add the apikey
        dict_q["apikey"] = [self.apikey]
        params = "&".join([str(k) + "=" + str(uparse.quote(v[0]))
                           for k, v in sorted(dict_q.items())])
        return self.BASE_URL + params

    def sign_url(self):
        '''append the signature'''
        url = self.__format_url__()
        return url + "&signature=" + self.__generate_signature__(url.encode("utf-8"))

    def get_results(self, filename):
        '''stream the signed request into a local file'''
        url = self.sign_url()
        r = requests.get(url, stream=True)
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
        return filename
    def parse_xml(self, filename):
        '''parse the MARCXML records downloaded by get_results()
        NOTE: self.MARC21 (a tag -> {code: field_name} mapping) is not
        defined in this commit; it is assumed to be provided on the class.'''
        with open(filename, 'r') as f:
            data = f.read()
        records = []
        for record in data.split("<record>")[1:]:
            soup = BeautifulSoup("<record>" + record, "lxml")
            # one empty list per repeatable author (700) subfield
            r = {v: [] for v in self.MARC21["700"].values()}
            r["uid"] = soup.find("controlfield").text
            for field in soup.find_all("datafield"):
                tag = field.get("tag")
                if tag in self.MARC21.keys():
                    for sub in field.find_all("subfield"):
                        code = sub.get("code")
                        if code in self.MARC21[tag].keys():
                            if tag == "700":
                                # repeatable field: accumulate values
                                r[self.MARC21[tag][code]].append(sub.text)
                            else:
                                r[self.MARC21[tag][code]] = sub.text
            records.append(r)
        return JsonHttpResponse(records)
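
# parse_xml() expects a MARC21 mapping on the class; a hypothetical example
# of its shape (illustrative entries only, the real mapping is not part of
# this commit; 245$a is title and 700$a/$u are author/affiliation in MARC21):
# MARC21 = {
#     "245": {"a": "title"},
#     "700": {"a": "author", "u": "affiliation"},
# }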
# query = "of=xm"
# a = CERN_API(query, "./full.xml")
# p = CERNParser("./full.xml")
# print(p.MARC21.keys())
# p.parse()
# with open("./results_full.json", "r") as f:
#     data = json.load(f)
#     for record in data["records"]:
#         print(record.keys())
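
# A standalone sketch of the request-signing scheme used by CERN_API:
# HMAC-SHA1 over the full URL, keyed with the shared secret (key and URL
# values below are made up for illustration):
# import hmac, hashlib
# secret = b"my-secret"
# url = "http://api.scoap3.org/search?apikey=KEY&p=higgs"
# signature = hmac.new(secret, url.encode("utf-8"), hashlib.sha1).hexdigest()
# signed_url = url + "&signature=" + signature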