#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# *****  CERN Scraper     *****
# ****************************

import logging

from logging.handlers import RotatingFileHandler

# Create the logger object used to write to the logs.
# getLogger() without a name returns the root logger.
logger = logging.getLogger()
# Set the logger level to DEBUG so that it records everything.
logger.setLevel(logging.DEBUG)

# Create a formatter that adds the time and the level
# to each message written to the log.
# NOTE(review): this formatter is only referenced by the disabled file
# handler below; the console handler never calls setFormatter().
formatter = logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s')

# Create a handler that redirects log writes to a file in 'append' mode,
# with 1 backup and a maximum size of 1 MB.
#>>> Permission denied: conflicts with the Django logs
#file_handler = RotatingFileHandler('.activity.log', 'a', 1000000, 1)
# Set its level to DEBUG, tell it to use the formatter created above,
# and add this handler to the logger.
#~ file_handler.setLevel(logging.DEBUG)
#~ file_handler.setFormatter(formatter)
#~ logger.addHandler(file_handler)

# Create a second handler that redirects every log write
# to the console.
steam_handler = logging.StreamHandler()
steam_handler.setLevel(logging.DEBUG)
logger.addHandler(steam_handler)

import json
import datetime
from os import path
import threading
import hmac, hashlib
import requests
import lxml
import subprocess
import urllib.parse as uparse
from lxml import etree
from bs4 import BeautifulSoup, Comment
from collections import defaultdict



#from gargantext.util.files import download

from gargantext.settings import API_TOKENS as API
#from private import API_PERMISSIONS

def save(request, project_id):
    """Attach a new CERN (MARC21 XML) corpus to an existing project.

    Args:
        request: the incoming HTTP request; ``request.user.id``,
            ``request.method`` and ``request.POST`` are read.
        project_id: identifier of the parent project; must be convertible
            to ``int``.

    Returns:
        An ``HttpResponseForbidden`` instance when the requesting user does
        not own the project; otherwise ``None``.

    Raises:
        Http404: if ``project_id`` is not an integer or no ``Node`` with
            that id exists.

    NOTE(review): ``Http404``, ``HttpResponseForbidden``, ``session``,
    ``Node``, ``cache`` and ``resourcetype`` are not imported in this module
    as far as this file shows -- confirm where they are expected to come from.
    NOTE(review): no response is returned on the success path; a view
    presumably needs one -- confirm the intended response.
    """
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # Do we have a valid project?
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()

    user = cache.User[request.user.id]
    if not user.owns(project):
        # A forbidden response is a response object, not an exception
        # (assuming django.http.HttpResponseForbidden): raising it would
        # fail with a TypeError, so it has to be returned.
        return HttpResponseForbidden()

    if request.method == "POST":
        # The lookup is kept so that a request without "query" still fails
        # here; the value itself is not used yet.
        query = request.POST["query"]

        name = request.POST["string"]
        corpus = project.add_child(
            name=name,
            typename="CORPUS",
        )
        # NOTE(review): `filename` is not defined anywhere in this function;
        # this call raises NameError until the path of the downloaded
        # results file is provided.
        corpus.add_resource(
            type=resourcetype('Cern (MARC21 XML)'),
            path=filename,
            url=None,
        )
        print("Adding the resource")


def query(request):
    """Validate a CERN search request and return its (currently empty) result list.

    For a POST request the ``query`` and ``N`` fields are read, ``N`` is
    checked against ``QUERY_SIZE_N_MAX`` and both values are printed with a
    timestamp. The actual API call is still disabled, so the returned list
    is always empty.

    Args:
        request: the incoming HTTP request; ``request.method`` and
            ``request.POST`` are read.

    Returns:
        ``JsonHttpResponse`` wrapping the result list.

    Raises:
        ValueError: if ``N`` is larger than ``QUERY_SIZE_N_MAX`` (or is not
            a valid integer literal).

    NOTE(review): ``QUERY_SIZE_N_MAX`` and ``JsonHttpResponse`` are not
    defined or imported in this module as far as this file shows.
    """
    print(request.method)
    alist = []

    if request.method == "POST":
        # Named query_string so the local does not shadow this function.
        query_string = request.POST["query"]
        N = int(request.POST["N"])

        if N > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
            # This is the CERN scraper; the label previously said "pubmed".
            print("ERROR(scrap: cern stats): ", msg)
            raise ValueError(msg)

        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query_string)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)

        # Here Requests API
        #
        #API_TOKEN = API["CERN"]

        #instancia = Scraper()

        # serialFetcher (n_last_years, query, query_size)
        #alist = instancia.serialFetcher( 5, query , N )

    return JsonHttpResponse(alist)


class CERN_API(object):
    '''Client for the CERN SCOAP3 search API.

    Building an instance signs the query, performs the HTTP request and
    stores the raw response in a local file whose path is kept in
    ``self.results``.
    '''

    # Defined at class level so it is already available when __init__
    # triggers the first request (it used to be assigned only after
    # get_results() had run, which failed with AttributeError).
    BASE_URL = u"http://api.scoap3.org/search?"

    def __init__(self, query, filename="./results.xml"):
        '''Store the credentials and immediately download the results.

        Args:
            query: query string (``key=value&key=value``) sent to the API.
            filename: path of the file the response body is written to.
        '''
        self.query = query
        self.apikey = API["TOKEN"]
        # hmac needs a bytes key.
        self.secret = API["SECRET"].encode("utf-8")
        self.results = self.get_results(filename)

    def __generate_signature__(self, url):
        '''Return the hex HMAC-SHA1 digest of ``url`` keyed with the secret.

        Args:
            url: the URL to sign, as bytes.
        '''
        return hmac.new(self.secret, url, hashlib.sha1).hexdigest()

    def __format_url__(self):
        '''Return BASE_URL followed by the encoded query and the API key.

        Parameters are emitted sorted by name, and only the first value of
        each parameter is kept. ``parse_qs`` drops parameters whose value
        is blank.
        '''
        dict_q = uparse.parse_qs(self.query)
        # add the apikey
        dict_q["apikey"] = [self.apikey]
        params = "&".join(
            str(key) + "=" + uparse.quote(values[0])
            for key, values in sorted(dict_q.items())
        )
        return self.BASE_URL + params

    def sign_url(self):
        '''Return the formatted URL with its ``signature`` parameter appended.'''
        url = self.__format_url__()
        return url + "&signature=" + self.__generate_signature__(url.encode("utf-8"))

    def get_results(self, filename):
        '''Download the signed URL into ``filename`` and return that path.

        Raises:
            requests.HTTPError: if the server answers with an error status,
                so that an error page is not saved as if it were results.
        '''
        url = self.sign_url()
        r = requests.get(url, stream=True)
        try:
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        finally:
            # Release the streamed connection even when writing fails.
            r.close()
        return filename

    @staticmethod
    def parse_xml(filename, MARCXML):
        '''Parse a downloaded MARC21 XML file into a list of record dicts.

        The signature has no ``self``, so this is exposed as a static
        method; the body used to reference an undefined ``self``.

        Args:
            filename: path of the XML file to read.
            MARCXML: mapping ``tag -> {subfield code -> field name}``;
                it must contain the tag ``"700"``.
                NOTE(review): the original body read an undefined
                ``self.MARC21``; this parameter is assumed to be that
                mapping -- confirm against the callers.

        Returns:
            ``JsonHttpResponse`` wrapping the list of records. Each record
            holds ``uid`` (text of the first controlfield), a list for each
            field name declared under tag ``"700"``, and a single string
            for each other mapped subfield.
        '''
        # NOTE(review): UTF-8 is assumed for the downloaded file.
        with open(filename, 'r', encoding="utf-8") as f:
            content = f.read()

        records = []
        # Everything before the first <record> is the collection header.
        for chunk in content.split("<record>")[1:]:
            soup = BeautifulSoup("<record>" + chunk, "lxml")
            record = {name: [] for name in MARCXML["700"].values()}
            record["uid"] = soup.find("controlfield").text

            for field in soup.find_all("datafield"):
                tag = field.get("tag")
                if tag not in MARCXML:
                    continue
                for sub in field.find_all("subfield"):
                    code = sub.get("code")
                    if code not in MARCXML[tag]:
                        continue
                    if tag == "700":
                        # Tag 700 is repeatable: collect every value.
                        record[MARCXML[tag][code]].append(sub.text)
                    else:
                        record[MARCXML[tag][code]] = sub.text
            records.append(record)
        return JsonHttpResponse(records)


#query="of=xm"
#a = CERN_API(query, "./full.xml")
#p = CERNParser("./full.xml")
#print(p.MARC21.keys())
#~ #p.parse()
#~ with open("./results_full.json", "r") as f:
    #~ data = json.load(f)
    #~ for record in data["records"]:
        #~ print(record.keys())