Commit 462c9ecc authored by c24b

Integration de CERN [OK]:\n\t-Manque option search corpus\n\t-juste au stade send the job

parent 649f366b
...@@ -203,7 +203,7 @@ RESOURCETYPES = [ ...@@ -203,7 +203,7 @@ RESOURCETYPES = [
'accepted_formats':["zip",], 'accepted_formats':["zip",],
}, },
# type 10 # type 10
{ "name": 'Cern (MARC21 XML)', { "name": 'SCOAP (XML MARC21 Format)',
"parser": CernParser, "parser": CernParser,
"default_language": "en", "default_language": "en",
'accepted_formats':["zip","xml"], 'accepted_formats':["zip","xml"],
......
from ._Parser import Parser from ._Parser import Parser
from datetime import datetime from datetime import datetime
from io import BytesIO from bs4 import BeautifulSoup
#from io import BytesIO
from io import StringIO
import json import json
from lxml import etree
class CernParser(Parser): class CernParser(Parser):
MARC21 = { MARC21 = {
...@@ -34,8 +36,9 @@ class CernParser(Parser): ...@@ -34,8 +36,9 @@ class CernParser(Parser):
} }
def parse(self, filebuf): def parse(self, filebuf):
tree = etree.tostring(filebuf) doc = etree.parse(filebuf)
#root = tree.getroot() tree = etree.tostring(doc)
#parser = etree.XMLParser()
hyperdata_list =[] hyperdata_list =[]
soup = BeautifulSoup(tree, "lxml") soup = BeautifulSoup(tree, "lxml")
for record in soup.find_all("record"): for record in soup.find_all("record"):
......
...@@ -4,7 +4,6 @@ from gargantext.util.db_cache import cache ...@@ -4,7 +4,6 @@ from gargantext.util.db_cache import cache
from gargantext.util.files import upload from gargantext.util.files import upload
from gargantext.models import * from gargantext.models import *
from gargantext.constants import * from gargantext.constants import *
from gargantext.util.scheduling import scheduled from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata from gargantext.util.toolchain import parse_extract_indexhyperdata
...@@ -17,7 +16,7 @@ import re ...@@ -17,7 +16,7 @@ import re
@requires_auth @requires_auth
def overview(request): def overview(request):
'''This view show all projects for a given user. '''This view show all projects for a given user.
Each project is described with hyperdata that are updateded on each following view. Each project is described with hyperdata that are updated on each following view.
To each project, we can link a resource that can be an image. To each project, we can link a resource that can be an image.
''' '''
...@@ -63,13 +62,20 @@ class NewCorpusForm(forms.Form): ...@@ -63,13 +62,20 @@ class NewCorpusForm(forms.Form):
choices = enumerate(resource_type['name'] for resource_type in RESOURCETYPES), choices = enumerate(resource_type['name'] for resource_type in RESOURCETYPES),
widget = forms.Select(attrs={ 'onchange' :'CustomForSelect( $("option:selected", this).text() );'}) widget = forms.Select(attrs={ 'onchange' :'CustomForSelect( $("option:selected", this).text() );'})
) )
name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' })) name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
file = forms.FileField() file = forms.FileField()
def clean_file(self): def clean_file(self):
file_ = self.cleaned_data.get('file') file_ = self.cleaned_data.get('file')
if len(file_) > 1024 ** 3 : # we don't accept more than 1GB if len(file_) > UPLOAD_LIMIT : # we don't accept more than 1GB
raise forms.ValidationError(ugettext_lazy('File too heavy! (>1GB).')) raise forms.ValidationError(ugettext_lazy('File too heavy! (>1GB).'))
return file_ return file_
def check_filename(self):
    """Print the cleaned form data and the extension of the uploaded file.

    Debugging aid only: nothing is validated yet and the method always
    returns ``None``.  The uploaded value is a file object, not a string
    (``clean_file`` calls ``len()`` on it), so its filename is read from the
    ``name`` attribute; a plain string is still accepted as-is.  When the
    ``file`` field is missing, only the cleaned data is printed.
    """
    print(self.cleaned_data)
    uploaded = self.cleaned_data.get("file")
    # File objects carry their filename in ``.name``; fall back to the value
    # itself so that a bare filename string keeps working.
    filename = getattr(uploaded, "name", uploaded)
    if isinstance(filename, str):
        print(filename.split(".")[-1])
    # TODO: reject the upload when this extension is not listed in the
    # 'accepted_formats' of the selected RESOURCETYPES entry.
@requires_auth @requires_auth
...@@ -108,9 +114,9 @@ def project(request, project_id): ...@@ -108,9 +114,9 @@ def project(request, project_id):
}, },
) )
# corpora within this project # corpora within this project
corpora = project.children('CORPUS', order=True).all() corpora = project.children('CORPUS', order=True).all()
print(corpora)
sourcename2corpora = defaultdict(list) sourcename2corpora = defaultdict(list)
for corpus in corpora: for corpus in corpora:
# we only consider the first resource of the corpus to determine its type # we only consider the first resource of the corpus to determine its type
...@@ -118,8 +124,11 @@ def project(request, project_id): ...@@ -118,8 +124,11 @@ def project(request, project_id):
if len(resources): if len(resources):
resource = resources[0] resource = resources[0]
resource_type_name = RESOURCETYPES[resource['type']]['name'] resource_type_name = RESOURCETYPES[resource['type']]['name']
resource_type_accepted_formats = RESOURCETYPES[resource['type']]['accepted_formats']
else: else:
print("(WARNING) PROJECT view: no listed resource") print("(WARNING) PROJECT view: no listed resource")
print("(DEBUG) PROJECT view: one of the corpus has an invalid type")
raise Http404("One of the corpus has an invalid type")
# add some data for the viewer # add some data for the viewer
corpus.count = corpus.children('DOCUMENT').count() corpus.count = corpus.children('DOCUMENT').count()
status = corpus.status() status = corpus.status()
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# **************************** # ****************************
# ***** CERN Scrapper ***** # ***** CERN Scrapper *****
# **************************** # ****************************
import logging
from logging.handlers import RotatingFileHandler
# création de l'objet logger qui va nous servir à écrire dans les logs
logger = logging.getLogger()
# on met le niveau du logger à DEBUG, comme ça il écrit tout
logger.setLevel(logging.DEBUG)
# création d'un formateur qui va ajouter le temps, le niveau
# de chaque message quand on écrira un message dans le log
formatter = logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s')
# création d'un handler qui va rediriger une écriture du log vers
# un fichier en mode 'append', avec 1 backup et une taille max de 1Mo
#>>> Permission denied entre en conflit avec les los django
#file_handler = RotatingFileHandler('.activity.log', 'a', 1000000, 1)
# on lui met le niveau sur DEBUG, on lui dit qu'il doit utiliser le formateur
# créé précédement et on ajoute ce handler au logger
#~ file_handler.setLevel(logging.DEBUG)
#~ file_handler.setFormatter(formatter)
#~ logger.addHandler(file_handler)
# création d'un second handler qui va rediriger chaque écriture de log
# sur la console
steam_handler = logging.StreamHandler()
steam_handler.setLevel(logging.DEBUG)
logger.addHandler(steam_handler)
import json import json
import datetime import datetime
from os import path from os import path
...@@ -20,145 +52,59 @@ from collections import defaultdict ...@@ -20,145 +52,59 @@ from collections import defaultdict
from gargantext.settings import API_TOKENS as API from gargantext.settings import API_TOKENS as API
#from private import API_PERMISSIONS #from private import API_PERMISSIONS
API_TOKEN = API["CERN"]
def query( request ):
print(request.method)
alist = []
if request.method == "POST":
query = request.POST["query"]
N = int(request.POST["N"])
if N > QUERY_SIZE_N_MAX: def save( request , project_id ) :
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR(scrap: pubmed stats): ",msg)
raise ValueError(msg)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
def save(request , project_id):
print("testCERN:")
print(request.method)
alist = ["bar","foo"]
# implicit global session
# do we have a valid project id?
try: try:
project_id = int(project_id) project_id = int(project_id)
except ValueError: except ValueError:
raise Http404() raise Http404()
# do we have a valid project? # do we have a valid project?
project = (session project = session.query( Node ).filter(Node.id == project_id).first()
.query(Node)
.filter(Node.id == project_id)
.filter(Node.typename == 'PROJECT')
).first()
if project is None: if project is None:
raise Http404() raise Http404()
user = cache.User[request.user.id]
# do we have a valid user? if not user.owns(project):
user = request.user raise HttpResponseForbidden()
if not user.is_authenticated():
return redirect('/auth/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
if request.method == "POST": if request.method == "POST":
query = "-"
query_string = "-"
N = 0
if "query" in request.POST:
query = request.POST["query"] query = request.POST["query"]
query_string = query.replace(" ","+") # url encoded q
if "N" in request.POST: name = request.POST["string"]
N = int(request.POST["N"]) # query_size from views_opti corpus = project.add_child( name=name
if N > QUERY_SIZE_N_MAX: , typename = "CORPUS"
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR (scrap: istex d/l ): ",msg)
raise ValueError(msg)
print("Scrapping Istex: '%s' (%i)" % (query_string , N))
urlreqs = []
pagesize = 50
tasks = Scraper()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
# corpus node instanciation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : None
}
) )
corpus.add_resource( type = resourcetype('Cern (MARC21 XML)')
, path = filename
, url = None
)
print("Adding the resource")
def query( request ):
print(request.method)
alist = []
if request.method == "POST":
query = request.POST["query"]
N = int(request.POST["N"])
tasks = Scraper() if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
for i in range(8): print("ERROR(scrap: pubmed stats): ",msg)
t = threading.Thread(target=tasks.worker2) #thing to do raise ValueError(msg)
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
tasks.q.put( url ) #put a task in th queue
tasks.q.join() # wait until everything is finished
dwnldsOK = 0
for filename in tasks.firstResults:
if filename!=False:
# add the uploaded resource to the corpus
corpus.add_resource(
type = resourcetype('ISTex')
, path = filename
)
dwnldsOK+=1
session.add(corpus) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
session.commit() print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
corpus_id = corpus.id #Here Requests API
#
#API_TOKEN = API["CERN"]
if dwnldsOK == 0 : #instancia = Scraper()
return JsonHttpResponse(["fail"])
###########################
###########################
try:
scheduled(parse_extract_indexhyperdata)(corpus_id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
# serialFetcher (n_last_years, query, query_size)
#alist = instancia.serialFetcher( 5, query , N )
data = [query_string,query,N] data = alist
return JsonHttpResponse(data) return JsonHttpResponse(data)
......
...@@ -483,7 +483,8 @@ ...@@ -483,7 +483,8 @@
selected = selected.toLowerCase() selected = selected.toLowerCase()
var is_pubmed = (selected.indexOf('pubmed') != -1); var is_pubmed = (selected.indexOf('pubmed') != -1);
var is_istex = (selected.indexOf('istex') != -1); var is_istex = (selected.indexOf('istex') != -1);
if (is_pubmed || is_istex) { var is_cern = (selected.indexOf('istex') != -1);
if (is_pubmed || is_istex || is_cern) {
// if(selected=="pubmed") { // if(selected=="pubmed") {
console.log("show the button for: " + selected) console.log("show the button for: " + selected)
$("#pubmedcrawl").css("visibility", "visible"); $("#pubmedcrawl").css("visibility", "visible");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment