Commit a5a6ba76 authored by c24b

FIX CERN=>DOC PARSER

parent 49736e59
...@@ -131,7 +131,9 @@ from gargantext.util.parsers import \ ...@@ -131,7 +131,9 @@ from gargantext.util.parsers import \
def get_resource(corpus_type): def get_resource(corpus_type):
'''get resource values for a given resource_type id''' '''get resource values for a given resource_type id'''
return [n for n in RESOURCES_TYPE if n["type"] == corpus_type][0] for n in RESOURCETYPES:
if n["type"] == corpus_type:
return n
RESOURCETYPES = [ RESOURCETYPES = [
# type 0 # type 0
...@@ -176,7 +178,7 @@ RESOURCETYPES = [ ...@@ -176,7 +178,7 @@ RESOURCETYPES = [
'accepted_formats':["zip",], 'accepted_formats':["zip",],
}, },
# type 6 # type 6
{ 'type': 6 { 'type': 6,
'name': 'Web of Science (ISI format)', 'name': 'Web of Science (ISI format)',
'parser': ISIParser, 'parser': ISIParser,
'default_language': 'en', 'default_language': 'en',
...@@ -209,8 +211,8 @@ RESOURCETYPES = [ ...@@ -209,8 +211,8 @@ RESOURCETYPES = [
"parser": CernParser, "parser": CernParser,
"default_language": "en", "default_language": "en",
'accepted_formats':["zip","xml"], 'accepted_formats':["zip","xml"],
"scrapper": CernScrapper, #~ "scrapper": CernScrapper,
"base_url": "http://api.scoap3.org/search?", #~ "base_url": "http://api.scoap3.org/search?",
}, },
] ]
......
...@@ -110,6 +110,7 @@ class Node(Base): ...@@ -110,6 +110,7 @@ class Node(Base):
if order is not None: if order is not None:
query = query.order_by(Node.name) query = query.order_by(Node.name)
return query return query
def add_child(self, **kwargs): def add_child(self, **kwargs):
......
...@@ -13,11 +13,11 @@ from .ngram_groups import compute_groups ...@@ -13,11 +13,11 @@ from .ngram_groups import compute_groups
from gargantext.util.db import session from gargantext.util.db import session
from gargantext.models import Node from gargantext.models import Node
from gargantext.util.files import check_format, upload
from datetime import datetime from datetime import datetime
from celery import shared_task from celery import shared_task
def add_corpus(request): def add_corpus(request, project):
'''adding a new corpus into project corpus: '''adding a new corpus into project corpus:
verifying two prerequisites before upload: verifying two prerequisites before upload:
- file size cannot exceed UPLOAD_LIMIT set in constants - file size cannot exceed UPLOAD_LIMIT set in constants
...@@ -28,6 +28,10 @@ def add_corpus(request): ...@@ -28,6 +28,10 @@ def add_corpus(request):
corpus_msg = None corpus_msg = None
#Corpus est du type Node #Corpus est du type Node
#print(corpus.__str__) #print(corpus.__str__)
corpus = project.add_child(
name = request.POST['name'],
typename = 'CORPUS',
)
#get ressource type #get ressource type
corpus_type = int(request.POST['type']) corpus_type = int(request.POST['type'])
#corpus.type = int(request.POST['type']) #corpus.type = int(request.POST['type'])
...@@ -43,25 +47,15 @@ def add_corpus(request): ...@@ -43,25 +47,15 @@ def add_corpus(request):
except OSError as e: except OSError as e:
corpus_status = False corpus_status = False
corpus_status_msg = str(e) corpus_status_msg = str(e)
if corpus_status:
corpus.add_resource( corpus.add_resource(
type, type=corpus_type,
path, path=path,
type = corpus_type,
format = corpus_format,
)
else:
corpus.add_resource(
type,
path,
type= corpus_type,
format = corpus_format,
status = corpus_status,
status_message = corpus_status_msg,
) )
print(session.add(corpus))
print(session.commit()) session.add(corpus)
return session.query(Node).filter(Node.id == corpus_id).first() session.commit()
return session.query(Node).filter(Node.id == corpus.id).first()
#@shared_task #@shared_task
def parse_extract(corpus): def parse_extract(corpus):
......
...@@ -94,11 +94,8 @@ def project(request, project_id): ...@@ -94,11 +94,8 @@ def project(request, project_id):
# add a new corpus into Node Project > Node Corpus > Ressource # add a new corpus into Node Project > Node Corpus > Ressource
if request.method == 'POST': if request.method == 'POST':
corpus = project.add_child(
name = request.POST['name'], corpus = add_corpus(request, project)
typename = 'CORPUS',
)
corpus = add_corpus(request)
if corpus.status: if corpus.status:
# parse_extract: fileparsing -> ngram extraction -> lists # parse_extract: fileparsing -> ngram extraction -> lists
...@@ -119,32 +116,31 @@ def project(request, project_id): ...@@ -119,32 +116,31 @@ def project(request, project_id):
for corpus in corpora: for corpus in corpora:
# we only consider the first resource of the corpus to determine its type # we only consider the first resource of the corpus to determine its type
resources = corpus.resources() resources = corpus.resources()
if len(resources): if len(resources) > 0:
resource = resources[0] resource = resources[0]
resource= get_resource(resource["type"])
##here map from RESSOURCES_TYPES_ID and NOT NAME ##here map from RESSOURCES_TYPES_ID and NOT NAME
resource_type_name = RESOURCETYPES[resource['type']]['name'] resource_type_name = resource['name']
resource_type_accepted_formats = RESOURCETYPES[resource['type']]['accepted_formats'] resource_type_accepted_formats = resource['accepted_formats']
else:
print("(WARNING) PROJECT view: no listed resource or one of the corpus has an invalid type") # add some data for the viewer
corpus.count = corpus.children('DOCUMENT').count()
# add some data for the viewer status = corpus.status()
corpus.count = corpus.children('DOCUMENT').count() if status is not None and not status['complete']:
status = corpus.status() if not status['error']:
if status is not None and not status['complete']: corpus.status_message = '(in progress: %s, %d complete)' % (
if not status['error']: status['action'].replace('_', ' '),
corpus.status_message = '(in progress: %s, %d complete)' % ( status['progress'],
status['action'].replace('_', ' '), )
status['progress'], else:
) corpus.status_message = '(aborted: "%s" after %i docs)' % (
status['error'][-1],
status['progress']
)
else: else:
corpus.status_message = '(aborted: "%s" after %i docs)' % ( corpus.status_message = ''
status['error'][-1], # add
status['progress'] sourcename2corpora[resource_type_name].append(corpus)
)
else:
corpus.status_message = ''
# add
sourcename2corpora[resource_type_name].append(corpus)
# source & their respective counts # source & their respective counts
total_documentscount = 0 total_documentscount = 0
sourcename2documentscount = defaultdict(int) sourcename2documentscount = defaultdict(int)
......
...@@ -2,7 +2,7 @@ from gargantext.util.http import requires_auth, render, settings ...@@ -2,7 +2,7 @@ from gargantext.util.http import requires_auth, render, settings
from gargantext.util.db import session from gargantext.util.db import session
from gargantext.util.db_cache import cache from gargantext.util.db_cache import cache
from gargantext.models import Node from gargantext.models import Node
from gargantext.constants import resourcename from gargantext.constants import get_resource
from datetime import datetime from datetime import datetime
@requires_auth @requires_auth
...@@ -32,7 +32,7 @@ def ngramtable(request, project_id, corpus_id): ...@@ -32,7 +32,7 @@ def ngramtable(request, project_id, corpus_id):
'date': datetime.now(), 'date': datetime.now(),
'project': project, 'project': project,
'corpus' : corpus, 'corpus' : corpus,
'resourcename' : resourcename(corpus), 'resourcename' : get_ressource(corpus)["name"],
'view': 'terms' 'view': 'terms'
}, },
) )
...@@ -8,7 +8,7 @@ from traceback import print_tb ...@@ -8,7 +8,7 @@ from traceback import print_tb
from django.shortcuts import redirect, render from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX from gargantext.constants import QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node from gargantext.models.nodes import Node
from gargantext.util.db import session from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse from gargantext.util.http import JsonHttpResponse
...@@ -133,7 +133,7 @@ def save(request , project_id): ...@@ -133,7 +133,7 @@ def save(request , project_id):
if filename!=False: if filename!=False:
# add the uploaded resource to the corpus # add the uploaded resource to the corpus
corpus.add_resource( corpus.add_resource(
type = resourcetype('ISTex') type = 9
, path = filename , path = filename
) )
dwnldsOK+=1 dwnldsOK+=1
......
...@@ -18,7 +18,7 @@ from traceback import print_tb ...@@ -18,7 +18,7 @@ from traceback import print_tb
from django.shortcuts import redirect from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node from gargantext.models.nodes import Node
from gargantext.util.db import session from gargantext.util.db import session
from gargantext.util.db_cache import cache from gargantext.util.db_cache import cache
...@@ -134,7 +134,7 @@ def save( request , project_id ) : ...@@ -134,7 +134,7 @@ def save( request , project_id ) :
print(filename) print(filename)
if filename != False: if filename != False:
# add the uploaded resource to the corpus # add the uploaded resource to the corpus
corpus.add_resource( type = resourcetype('Pubmed (XML format)') corpus.add_resource( type = 4
, path = filename , path = filename
, url = None , url = None
) )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment