Commit 54976e29 authored by Romain Loth

Scrapers: fix ISTex resource type and enhance workflow error trapping

parent dd049cd0
...@@ -128,38 +128,47 @@ LANGUAGES = { ...@@ -128,38 +128,47 @@ LANGUAGES = {
from gargantext.util.parsers import * from gargantext.util.parsers import *
RESOURCETYPES = [ RESOURCETYPES = [
# type 0
{ 'name': 'Europress (English)', { 'name': 'Europress (English)',
'parser': EuropressParser, 'parser': EuropressParser,
'default_language': 'en', 'default_language': 'en',
}, },
# type 1
{ 'name': 'Europress (French)', { 'name': 'Europress (French)',
'parser': EuropressParser, 'parser': EuropressParser,
'default_language': 'fr', 'default_language': 'fr',
}, },
# type 2
{ 'name': 'Jstor (RIS format)', { 'name': 'Jstor (RIS format)',
'parser': RISParser, 'parser': RISParser,
'default_language': 'en', 'default_language': 'en',
}, },
# type 3
{ 'name': 'Pubmed (XML format)', { 'name': 'Pubmed (XML format)',
'parser': PubmedParser, 'parser': PubmedParser,
'default_language': 'en', 'default_language': 'en',
}, },
# type 4
{ 'name': 'Scopus (RIS format)', { 'name': 'Scopus (RIS format)',
'parser': RISParser, 'parser': RISParser,
'default_language': 'en', 'default_language': 'en',
}, },
# type 5
{ 'name': 'Web of Science (ISI format)', { 'name': 'Web of Science (ISI format)',
'parser': ISIParser, 'parser': ISIParser,
'default_language': 'fr', 'default_language': 'fr',
}, },
# type 6
{ 'name': 'Zotero (RIS format)', { 'name': 'Zotero (RIS format)',
'parser': RISParser, 'parser': RISParser,
'default_language': 'en', 'default_language': 'en',
}, },
# type 7
{ 'name': 'CSV', { 'name': 'CSV',
'parser': CSVParser, 'parser': CSVParser,
'default_language': 'en', 'default_language': 'en',
}, },
# type 8
{ 'name': 'ISTex', { 'name': 'ISTex',
'parser': ISTexParser, 'parser': ISTexParser,
'default_language': 'en', 'default_language': 'en',
...@@ -213,4 +222,3 @@ BATCH_NGRAMSEXTRACTION_SIZE = 1024 ...@@ -213,4 +222,3 @@ BATCH_NGRAMSEXTRACTION_SIZE = 1024
# Scrapers config # Scrapers config
QUERY_SIZE_N_MAX = 1000 QUERY_SIZE_N_MAX = 1000
QUERY_SIZE_N_DEFAULT = 1000 QUERY_SIZE_N_DEFAULT = 1000
from lxml import etree
from ._Parser import Parser from ._Parser import Parser
from datetime import datetime from datetime import datetime
from io import BytesIO from io import BytesIO
......
...@@ -145,5 +145,6 @@ class Parser: ...@@ -145,5 +145,6 @@ class Parser:
try: try:
file.seek(0) file.seek(0)
except:pass except:pass
# debug: print(self.parse) # do we have correct parser ?
for hyperdata in self.parse(file): for hyperdata in self.parse(file):
yield self.format_hyperdata(hyperdata) yield self.format_hyperdata(hyperdata)
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
from time import sleep from time import sleep
import datetime import datetime
import threading import threading
from traceback import print_tb
#from gargantext.settings import MEDIA_ROOT, BASE_DIR #from gargantext.settings import MEDIA_ROOT, BASE_DIR
from django.shortcuts import redirect from django.shortcuts import redirect
...@@ -111,7 +112,7 @@ def save(request , project_id): ...@@ -111,7 +112,7 @@ def save(request , project_id):
user_id = request.user.id, user_id = request.user.id,
parent_id = project_id, parent_id = project_id,
typename = 'CORPUS', typename = 'CORPUS',
hyperdata = { "action" : "Scraping data" hyperdata = { "action" : "Scrapping data"
, "language_id" : None , "language_id" : None
} }
) )
...@@ -137,9 +138,9 @@ def save(request , project_id): ...@@ -137,9 +138,9 @@ def save(request , project_id):
for filename in tasks.firstResults: for filename in tasks.firstResults:
if filename!=False: if filename!=False:
# add the uploaded resource to the corpus # add the uploaded resource to the corpus
# add the uploaded resource to the corpus corpus.add_resource(
corpus.add_resource( type = 3 type = 8 # cf. constants.RESOURCETYPES
, path = filename , path = filename
) )
dwnldsOK+=1 dwnldsOK+=1
...@@ -152,14 +153,17 @@ def save(request , project_id): ...@@ -152,14 +153,17 @@ def save(request , project_id):
except Exception as error: except Exception as error:
print('WORKFLOW ERROR') print('WORKFLOW ERROR')
print(error) print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
sleep(1) sleep(1)
return HttpResponseRedirect('/projects/' + str(project_id)) return HttpResponseRedirect('/projects/' + str(project_id))
data = [query_string,query,N] data = [query_string,query,N]
return JsonHttpResponse(data) return JsonHttpResponse(data)
...@@ -12,6 +12,7 @@ import json ...@@ -12,6 +12,7 @@ import json
import datetime import datetime
from os import path from os import path
import threading import threading
from traceback import print_tb
#from gargantext.settings import MEDIA_ROOT, BASE_DIR #from gargantext.settings import MEDIA_ROOT, BASE_DIR
from django.shortcuts import redirect from django.shortcuts import redirect
...@@ -159,10 +160,16 @@ def save( request , project_id ) : ...@@ -159,10 +160,16 @@ def save( request , project_id ) :
except Exception as error: except Exception as error:
print('WORKFLOW ERROR') print('WORKFLOW ERROR')
print(error) print(error)
try:
print_tb(error.__traceback__)
except:
pass
# IMPORTANT ---------------------------------
# sanitize session after interrupted transact
session.rollback()
# --------------------------------------------
sleep(1) sleep(1)
return HttpResponseRedirect('/projects/' + str(project_id)) return HttpResponseRedirect('/projects/' + str(project_id))
data = alist data = alist
return JsonHttpResponse(data) return JsonHttpResponse(data)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment