Commit 20787ee4 authored by c24b

FIX CERN=>DOC PARSER

parent 26c4ae31
......@@ -131,7 +131,9 @@ from gargantext.util.parsers import \
def get_resource(corpus_type):
    '''Return the resource description for a given resource type id.

    Scans RESOURCETYPES for the first entry whose "type" key equals
    ``corpus_type`` and returns that entry.

    Returns None when no entry matches, so callers must handle a missing
    resource type themselves.
    '''
    for n in RESOURCETYPES:
        if n["type"] == corpus_type:
            return n
    # No entry declares this type id.
    return None
RESOURCETYPES = [
# type 0
......@@ -176,7 +178,7 @@ RESOURCETYPES = [
'accepted_formats':["zip",],
},
# type 6
{ 'type': 6
{ 'type': 6,
'name': 'Web of Science (ISI format)',
'parser': ISIParser,
'default_language': 'en',
......@@ -209,8 +211,8 @@ RESOURCETYPES = [
"parser": CernParser,
"default_language": "en",
'accepted_formats':["zip","xml"],
"scrapper": CernScrapper,
"base_url": "http://api.scoap3.org/search?",
#~ "scrapper": CernScrapper,
#~ "base_url": "http://api.scoap3.org/search?",
},
]
......
......@@ -110,6 +110,7 @@ class Node(Base):
if order is not None:
query = query.order_by(Node.name)
return query
def add_child(self, **kwargs):
......
......@@ -13,11 +13,11 @@ from .ngram_groups import compute_groups
from gargantext.util.db import session
from gargantext.models import Node
from gargantext.util.files import check_format, upload
from datetime import datetime
from celery import shared_task
def add_corpus(request):
def add_corpus(request, project):
'''adding a new corpus into project corpus:
verifying two prerequisites before upload:
- file size must not exceed UPLOAD_LIMIT set in constants
......@@ -28,6 +28,10 @@ def add_corpus(request):
corpus_msg = None
# corpus is a Node
#print(corpus.__str__)
corpus = project.add_child(
name = request.POST['name'],
typename = 'CORPUS',
)
# get resource type
corpus_type = int(request.POST['type'])
#corpus.type = int(request.POST['type'])
......@@ -43,25 +47,15 @@ def add_corpus(request):
except OSError as e:
corpus_status = False
corpus_status_msg = str(e)
if corpus_status:
corpus.add_resource(
type,
path,
type = corpus_type,
format = corpus_format,
)
else:
corpus.add_resource(
type,
path,
type= corpus_type,
format = corpus_format,
status = corpus_status,
status_message = corpus_status_msg,
corpus.add_resource(
type=corpus_type,
path=path,
)
print(session.add(corpus))
print(session.commit())
return session.query(Node).filter(Node.id == corpus_id).first()
session.add(corpus)
session.commit()
return session.query(Node).filter(Node.id == corpus.id).first()
#@shared_task
def parse_extract(corpus):
......
......@@ -94,11 +94,8 @@ def project(request, project_id):
# add a new corpus into Node Project > Node Corpus > Ressource
if request.method == 'POST':
corpus = project.add_child(
name = request.POST['name'],
typename = 'CORPUS',
)
corpus = add_corpus(request)
corpus = add_corpus(request, project)
if corpus.status:
# parse_extract: fileparsing -> ngram extraction -> lists
......@@ -119,32 +116,31 @@ def project(request, project_id):
for corpus in corpora:
# we only consider the first resource of the corpus to determine its type
resources = corpus.resources()
if len(resources):
if len(resources) > 0:
resource = resources[0]
resource= get_resource(resource["type"])
##here map from RESSOURCES_TYPES_ID and NOT NAME
resource_type_name = RESOURCETYPES[resource['type']]['name']
resource_type_accepted_formats = RESOURCETYPES[resource['type']]['accepted_formats']
else:
print("(WARNING) PROJECT view: no listed resource or one of the corpus has an invalid type")
# add some data for the viewer
corpus.count = corpus.children('DOCUMENT').count()
status = corpus.status()
if status is not None and not status['complete']:
if not status['error']:
corpus.status_message = '(in progress: %s, %d complete)' % (
status['action'].replace('_', ' '),
status['progress'],
)
resource_type_name = resource['name']
resource_type_accepted_formats = resource['accepted_formats']
# add some data for the viewer
corpus.count = corpus.children('DOCUMENT').count()
status = corpus.status()
if status is not None and not status['complete']:
if not status['error']:
corpus.status_message = '(in progress: %s, %d complete)' % (
status['action'].replace('_', ' '),
status['progress'],
)
else:
corpus.status_message = '(aborted: "%s" after %i docs)' % (
status['error'][-1],
status['progress']
)
else:
corpus.status_message = '(aborted: "%s" after %i docs)' % (
status['error'][-1],
status['progress']
)
else:
corpus.status_message = ''
# add
sourcename2corpora[resource_type_name].append(corpus)
corpus.status_message = ''
# add
sourcename2corpora[resource_type_name].append(corpus)
# source & their respective counts
total_documentscount = 0
sourcename2documentscount = defaultdict(int)
......
......@@ -2,7 +2,7 @@ from gargantext.util.http import requires_auth, render, settings
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.models import Node
from gargantext.constants import resourcename
from gargantext.constants import get_resource
from datetime import datetime
@requires_auth
......@@ -32,7 +32,7 @@ def ngramtable(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus' : corpus,
'resourcename' : resourcename(corpus),
'resourcename' : get_ressource(corpus)["name"],
'view': 'terms'
},
)
......@@ -8,7 +8,7 @@ from traceback import print_tb
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX
from gargantext.constants import QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
......@@ -133,7 +133,7 @@ def save(request , project_id):
if filename!=False:
# add the uploaded resource to the corpus
corpus.add_resource(
type = resourcetype('ISTex')
type = 9
, path = filename
)
dwnldsOK+=1
......
......@@ -18,7 +18,7 @@ from traceback import print_tb
from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX
from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
......@@ -134,7 +134,7 @@ def save( request , project_id ) :
print(filename)
if filename != False:
# add the uploaded resource to the corpus
corpus.add_resource( type = resourcetype('Pubmed (XML format)')
corpus.add_resource( type = 4
, path = filename
, url = None
)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment