Commit 54976e29 authored by Romain Loth

Scrapers: fix ISTex resource type and enhance workflow error trapping

parent dd049cd0
......@@ -128,38 +128,47 @@ LANGUAGES = {
from gargantext.util.parsers import *
RESOURCETYPES = [
# type 0
{ 'name': 'Europress (English)',
'parser': EuropressParser,
'default_language': 'en',
},
# type 1
{ 'name': 'Europress (French)',
'parser': EuropressParser,
'default_language': 'fr',
},
# type 2
{ 'name': 'Jstor (RIS format)',
'parser': RISParser,
'default_language': 'en',
},
# type 3
{ 'name': 'Pubmed (XML format)',
'parser': PubmedParser,
'default_language': 'en',
},
# type 4
{ 'name': 'Scopus (RIS format)',
'parser': RISParser,
'default_language': 'en',
},
# type 5
{ 'name': 'Web of Science (ISI format)',
'parser': ISIParser,
'default_language': 'fr',
},
# type 6
{ 'name': 'Zotero (RIS format)',
'parser': RISParser,
'default_language': 'en',
},
# type 7
{ 'name': 'CSV',
'parser': CSVParser,
'default_language': 'en',
},
# type 8
{ 'name': 'ISTex',
'parser': ISTexParser,
'default_language': 'en',
......@@ -213,4 +222,3 @@ BATCH_NGRAMSEXTRACTION_SIZE = 1024
# Scrapers config
QUERY_SIZE_N_MAX = 1000
QUERY_SIZE_N_DEFAULT = 1000
from lxml import etree
from ._Parser import Parser
from datetime import datetime
from io import BytesIO
......
......@@ -145,5 +145,6 @@ class Parser:
try:
file.seek(0)
except:pass
# debug: print(self.parse) # do we have correct parser ?
for hyperdata in self.parse(file):
yield self.format_hyperdata(hyperdata)
......@@ -2,6 +2,7 @@
from time import sleep
import datetime
import threading
from traceback import print_tb
#from gargantext.settings import MEDIA_ROOT, BASE_DIR
from django.shortcuts import redirect
......@@ -111,7 +112,7 @@ def save(request , project_id):
user_id = request.user.id,
parent_id = project_id,
typename = 'CORPUS',
hyperdata = { "action" : "Scraping data"
hyperdata = { "action" : "Scrapping data"
, "language_id" : None
}
)
......@@ -137,8 +138,8 @@ def save(request , project_id):
for filename in tasks.firstResults:
if filename!=False:
# add the uploaded resource to the corpus
# add the uploaded resource to the corpus
corpus.add_resource( type = 3
corpus.add_resource(
type = 8 # cf. constants.RESOURCETYPES
, path = filename
)
dwnldsOK+=1
......@@ -152,14 +153,17 @@ def save(request , project_id):
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
sleep(1)
return HttpResponseRedirect('/projects/' + str(project_id))
data = [query_string,query,N]
return JsonHttpResponse(data)
......@@ -12,6 +12,7 @@ import json
import datetime
from os import path
import threading
from traceback import print_tb
#from gargantext.settings import MEDIA_ROOT, BASE_DIR
from django.shortcuts import redirect
......@@ -159,10 +160,16 @@ def save( request , project_id ) :
except Exception as error:
print('WORKFLOW ERROR')
print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
sleep(1)
return HttpResponseRedirect('/projects/' + str(project_id))
data = alist
return JsonHttpResponse(data)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment