Commit 2c9ccbe5 authored by c24b's avatar c24b

Integration de CERN [OK]:\n\t-Manque option search corpus\n\t-juste au stade send the job

parent 9cf232c0
......@@ -203,7 +203,7 @@ RESOURCETYPES = [
'accepted_formats':["zip",],
},
# type 10
{ "name": 'Cern (MARC21 XML)',
{ "name": 'SCOAP (XML MARC21 Format)',
"parser": CernParser,
"default_language": "en",
'accepted_formats':["zip","xml"],
......
from ._Parser import Parser
from datetime import datetime
from io import BytesIO
from bs4 import BeautifulSoup
#from io import BytesIO
from io import StringIO
import json
from lxml import etree
class CernParser(Parser):
MARC21 = {
......@@ -34,8 +36,9 @@ class CernParser(Parser):
}
def parse(self, filebuf):
tree = etree.tostring(filebuf)
#root = tree.getroot()
doc = etree.parse(filebuf)
tree = etree.tostring(doc)
#parser = etree.XMLParser()
hyperdata_list =[]
soup = BeautifulSoup(tree, "lxml")
for record in soup.find_all("record"):
......
......@@ -4,7 +4,6 @@ from gargantext.util.db_cache import cache
from gargantext.util.files import upload
from gargantext.models import *
from gargantext.constants import *
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
......@@ -17,7 +16,7 @@ import re
@requires_auth
def overview(request):
'''This view show all projects for a given user.
Each project is described with hyperdata that are updateded on each following view.
Each project is described with hyperdata that are updated on each following view.
To each project, we can link a resource that can be an image.
'''
......@@ -63,13 +62,20 @@ class NewCorpusForm(forms.Form):
choices = enumerate(resource_type['name'] for resource_type in RESOURCETYPES),
widget = forms.Select(attrs={ 'onchange' :'CustomForSelect( $("option:selected", this).text() );'})
)
name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
file = forms.FileField()
def clean_file(self):
file_ = self.cleaned_data.get('file')
if len(file_) > 1024 ** 3 : # we don't accept more than 1GB
if len(file_) > UPLOAD_LIMIT : # we don't accept more than 1GB
raise forms.ValidationError(ugettext_lazy('File too heavy! (>1GB).'))
return file_
def check_filename(self):
    """Debug helper: print the cleaned form data and the uploaded file's extension.

    Reads ``self.cleaned_data["file"]`` and prints whatever follows the last
    ``"."`` of its name. Nothing is validated yet and the method always
    returns ``None``.

    TODO: reject the upload when the extension is not listed in the
    ``'accepted_formats'`` of the selected ``RESOURCETYPES`` entry.
    """
    print(self.cleaned_data)
    uploaded = self.cleaned_data.get("file")
    if uploaded is None:
        # no file in the cleaned data (e.g. field validation failed): nothing to inspect
        return
    # For a forms.FileField the cleaned value is normally an uploaded-file
    # object exposing ``.name`` rather than a plain string, so calling
    # ``.split`` on it directly fails. Plain strings are still accepted.
    filename = getattr(uploaded, "name", uploaded)
    print(str(filename).rsplit(".", 1)[-1])
@requires_auth
......@@ -108,9 +114,9 @@ def project(request, project_id):
},
)
# corpora within this project
corpora = project.children('CORPUS', order=True).all()
print(corpora)
sourcename2corpora = defaultdict(list)
for corpus in corpora:
# we only consider the first resource of the corpus to determine its type
......@@ -118,8 +124,11 @@ def project(request, project_id):
if len(resources):
resource = resources[0]
resource_type_name = RESOURCETYPES[resource['type']]['name']
resource_type_accepted_formats = RESOURCETYPES[resource['type']]['accepted_formats']
else:
print("(WARNING) PROJECT view: no listed resource")
print("(DEBUG) PROJECT view: one of the corpus has an invalid type")
raise Http404("One of the corpus has an invalid type")
# add some data for the viewer
corpus.count = corpus.children('DOCUMENT').count()
status = corpus.status()
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************
# ***** CERN Scraper *****
# ****************************
import logging
from logging.handlers import RotatingFileHandler
# create the logger object that we will use to write to the logs
# (getLogger() without a name returns the root logger)
logger = logging.getLogger()
# set the logger level to DEBUG so that it writes everything
logger.setLevel(logging.DEBUG)
# create a formatter that adds the time and the level
# of each message when a message is written to the log
# NOTE: this formatter is only referenced by the commented-out file handler
# below; the console handler at the end of this section does not use it.
formatter = logging.Formatter('%(asctime)s :: %(levelname)s :: %(message)s')
# create a handler that redirects log writes to
# a file in 'append' mode, with 1 backup and a max size of 1 MB
#>>> Permission denied: conflicts with the Django logs
#file_handler = RotatingFileHandler('.activity.log', 'a', 1000000, 1)
# set its level to DEBUG, tell it to use the formatter
# created above, and add this handler to the logger
#~ file_handler.setLevel(logging.DEBUG)
#~ file_handler.setFormatter(formatter)
#~ logger.addHandler(file_handler)
# create a second handler that redirects every log write
# to the console
steam_handler = logging.StreamHandler()
steam_handler.setLevel(logging.DEBUG)
logger.addHandler(steam_handler)
import json
import datetime
from os import path
......@@ -20,11 +52,38 @@ from collections import defaultdict
from gargantext.settings import API_TOKENS as API
#from private import API_PERMISSIONS
API_TOKEN = API["CERN"]
def save( request , project_id ) :
    """Create a CERN corpus under the given project and attach a resource to it.

    Parameters:
        request: the incoming HTTP request; on POST, ``request.POST`` must
            contain the keys ``"query"`` and ``"string"`` (the corpus name).
        project_id: identifier of the parent project; must be convertible
            to ``int``.

    Returns:
        ``HttpResponseForbidden`` when the requesting user does not own the
        project. No response is built on the other paths yet.

    Raises:
        Http404: if ``project_id`` is not an integer or no node has that id.
    """
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = session.query( Node ).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()

    user = cache.User[request.user.id]
    if not user.owns(project):
        # HttpResponseForbidden is a response object, not an exception:
        # it has to be returned (raising it fails with a TypeError).
        return HttpResponseForbidden()

    if request.method == "POST":
        # NOTE(review): `query` is read (a missing key raises) but not used yet.
        query = request.POST["query"]
        name = request.POST["string"]
        corpus = project.add_child( name=name
                                  , typename = "CORPUS"
                                  )
        # NOTE(review): `filename` is not defined in the visible code, so this
        # call raises NameError unless it is a module-level name -- confirm.
        # NOTE(review): the RESOURCETYPES entry touched by the same change is
        # named 'SCOAP (XML MARC21 Format)'; confirm that this lookup name
        # still exists.
        corpus.add_resource( type = resourcetype('Cern (MARC21 XML)')
                           , path = filename
                           , url = None
                           )
        print("Adding the resource")
    # NOTE(review): no response is returned on the success path or for
    # non-POST requests -- presumably still work in progress.
def query( request ):
print(request.method)
alist = []
if request.method == "POST":
query = request.POST["query"]
N = int(request.POST["N"])
......@@ -36,129 +95,16 @@ def query( request ):
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
#Here Requests API
#
#API_TOKEN = API["CERN"]
def save(request , project_id):
print("testCERN:")
print(request.method)
alist = ["bar","foo"]
# implicit global session
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
#instancia = Scraper()
# do we have a valid project?
project = (session
.query(Node)
.filter(Node.id == project_id)
.filter(Node.typename == 'PROJECT')
).first()
if project is None:
raise Http404()
# serialFetcher (n_last_years, query, query_size)
#alist = instancia.serialFetcher( 5, query , N )
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/auth/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
if request.method == "POST":
query = "-"
query_string = "-"
N = 0
if "query" in request.POST:
query = request.POST["query"]
query_string = query.replace(" ","+") # url encoded q
if "N" in request.POST:
N = int(request.POST["N"]) # query_size from views_opti
if N > QUERY_SIZE_N_MAX:
msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
print("ERROR (scrap: istex d/l ): ",msg)
raise ValueError(msg)
print("Scrapping Istex: '%s' (%i)" % (query_string , N))
urlreqs = []
pagesize = 50
tasks = Scraper()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
# corpus node instanciation as a Django model
corpus = Node(
name = query,
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scrapping data"
, "language_id" : None
}
)
tasks = Scraper()
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
tasks.q.put( url ) #put a task in th queue
tasks.q.join() # wait until everything is finished
dwnldsOK = 0
for filename in tasks.firstResults:
if filename!=False:
# add the uploaded resource to the corpus
corpus.add_resource(
type = resourcetype('ISTex')
, path = filename
)
dwnldsOK+=1
session.add(corpus)
session.commit()
corpus_id = corpus.id
if dwnldsOK == 0 :
return JsonHttpResponse(["fail"])
###########################
###########################
try:
scheduled(parse_extract_indexhyperdata)(corpus_id)
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
data = [query_string,query,N]
data = alist
return JsonHttpResponse(data)
......
......@@ -98,7 +98,7 @@
<button type="button" class="btn btn-default" data-container="body" data-toggle="popover" data-placement="bottom"
data-content="
<ul>
<li
<li
onclick=&quot;
garganrest.nodes.delete({{corpus.id}}, function(){$('#corpus_'+{{corpus.id}}).remove()});
$(this).parent().parent().remove();
......@@ -142,9 +142,9 @@
</span>
</div>
{% endifequal %}
{% ifequal state.action "ngrams_extraction" %}
<div class="progress-bar progress-bar-striped
<div class="progress-bar progress-bar-striped
{% if state.complete %}
progress-bar-success
{% else %}
......@@ -162,7 +162,7 @@
</span>
</div>
{% endifequal %}
{% endfor %}
</div>
{% endif %}
......@@ -483,7 +483,8 @@
selected = selected.toLowerCase()
var is_pubmed = (selected.indexOf('pubmed') != -1);
var is_istex = (selected.indexOf('istex') != -1);
if (is_pubmed || is_istex) {
// Detect the CERN source by its own label. The original tested for 'istex',
// which made this flag a duplicate of is_istex. The resource-type entry is
// labelled either 'Cern (...)' or 'SCOAP (...)', and `selected` is already
// lower-cased above, so both spellings are checked.
var is_cern = (selected.indexOf('cern') != -1 || selected.indexOf('scoap') != -1);
if (is_pubmed || is_istex || is_cern) {
// if(selected=="pubmed") {
console.log("show the button for: " + selected)
$("#pubmedcrawl").css("visibility", "visible");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment