Commit aa4e82ee authored by delanoe

[FIX] Merge ready for unstable.

parent d5d87ef8
...@@ -112,18 +112,17 @@ INDEXED_HYPERDATA = { ...@@ -112,18 +112,17 @@ INDEXED_HYPERDATA = {
} }
#from gargantext.util.taggers import FrenchMeltTagger, TurboTagger from gargantext.util.taggers import FrenchMeltTagger, TurboTagger
from gargantext.util.taggers import NltkTagger
LANGUAGES = { LANGUAGES = {
'en': { 'en': {
#'tagger': EnglishMeltTagger, #'tagger': EnglishMeltTagger,
#'tagger': TurboTagger, 'tagger': TurboTagger,
'tagger': NltkTagger, #'tagger': NltkTagger,
}, },
'fr': { 'fr': {
#'tagger': FrenchMeltTagger, 'tagger': FrenchMeltTagger,
# 'tagger': TreeTagger, # 'tagger': TreeTagger,
'tagger': NltkTagger,
}, },
} }
...@@ -131,96 +130,85 @@ LANGUAGES = { ...@@ -131,96 +130,85 @@ LANGUAGES = {
from gargantext.util.parsers import \ from gargantext.util.parsers import \
EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser
#from gargantext.util.scrappers import \ def resourcetype(name):
# CernScraper '''
resourcetype :: String -> Int
Usage : resourcetype("Europress (English)") == 1
def get_resource(corpus_type): Examples in scrapers scripts (Pubmed or ISTex for instance).
'''get ressources values for a given ressource_type id''' '''
for n in RESOURCETYPES: return [n[0] for n in enumerate(r['name'] for r in RESOURCETYPES) if n[1] == name][0]
if n["type"] == corpus_type:
return n def resourcename(corpus):
'''
resourcename :: Corpus -> String
Usage : resourcename(corpus) == "ISTex"
'''
resource = corpus.resources()[0]
resourcename = RESOURCETYPES[resource['type']]['name']
return re.sub(r'\(.*', '', resourcename)
RESOURCETYPES = [ RESOURCETYPES = [
# type 0 # type 0
{ 'type':0, { 'name': 'Select database below',
'name': 'Select database below',
'parser': None, 'parser': None,
'default_language': None, 'default_language': None,
}, },
# type 1 # type 1
{ 'type':1, { 'name': 'Europress (English)',
'name': 'Europress (English)',
'parser': EuropressParser, 'parser': EuropressParser,
'default_language': 'en', 'default_language': 'en',
'accepted_formats':["zip",],
}, },
# type 2 # type 2
{ 'type':2, { 'name': 'Europress (French)',
'name': 'Europress (French)',
'parser': EuropressParser, 'parser': EuropressParser,
'default_language': 'fr', 'default_language': 'fr',
'accepted_formats':["zip",],
}, },
# type 3 # type 3
{ 'type':3, { 'name': 'Jstor (RIS format)',
'name': 'Jstor (RIS format)',
'parser': RISParser, 'parser': RISParser,
'default_language': 'en', 'default_language': 'en',
'accepted_formats':["zip",],
}, },
# type 4 # type 4
{ 'type':4, { 'name': 'Pubmed (XML format)',
'name': 'Pubmed (XML format)',
'parser': PubmedParser, 'parser': PubmedParser,
'default_language': 'en', 'default_language': 'en',
'accepted_formats':["zip",],
}, },
# type 5 # type 5
{ 'type':5, { 'name': 'Scopus (RIS format)',
'name': 'Scopus (RIS format)',
'parser': RISParser, 'parser': RISParser,
'default_language': 'en', 'default_language': 'en',
'accepted_formats':["zip",],
}, },
# type 6 # type 6
{ 'type': 6, { 'name': 'Web of Science (ISI format)',
'name': 'Web of Science (ISI format)',
'parser': ISIParser, 'parser': ISIParser,
'default_language': 'en', 'default_language': 'en',
'accepted_formats':["zip",],
}, },
# type 7 # type 7
{ 'type':7, { 'name': 'Zotero (RIS format)',
'name': 'Zotero (RIS format)',
'parser': RISParser, 'parser': RISParser,
'default_language': 'en', 'default_language': 'en',
'accepted_formats':["zip",],
}, },
# type 8 # type 8
{ 'type':8, { 'name': 'CSV',
'name': 'CSV',
'parser': CSVParser, 'parser': CSVParser,
'default_language': 'en', 'default_language': 'en',
'accepted_formats':["csv"],
}, },
# type 9 # type 9
{ "type":9, { 'name': 'ISTex',
'name': 'ISTex',
'parser': ISTexParser, 'parser': ISTexParser,
'default_language': 'en', 'default_language': 'en',
'accepted_formats':["zip",],
}, },
{ "type":10, # type 10
"name": 'SCOAP (XML MARC21 Format)', { "type":10,
"parser": CernParser, "name": 'SCOAP (XML MARC21 Format)',
"default_language": "en", "parser": CernParser,
'accepted_formats':["zip","xml"], "default_language": "en",
#~ "scrapper": CernScrapper, 'accepted_formats':["zip","xml"],
#~ "base_url": "http://api.scoap3.org/search?", #~ "scrapper": CernScrapper,
}, #~ "base_url": "http://api.scoap3.org/search?",
] },
]
# linguistic extraction parameters --------------------------------------------- # linguistic extraction parameters ---------------------------------------------
DEFAULT_RANK_CUTOFF_RATIO = .75 # MAINLIST maximum terms in % DEFAULT_RANK_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
...@@ -246,8 +234,8 @@ DEFAULT_ALL_LOWERCASE_FLAG = True # lowercase ngrams before recording ...@@ -246,8 +234,8 @@ DEFAULT_ALL_LOWERCASE_FLAG = True # lowercase ngrams before recording
#  occurring at sentence beginning) #  occurring at sentence beginning)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# other parameters
# other parameters
# default number of docs POSTed to scrappers.views.py # default number of docs POSTed to scrappers.views.py
# (at page project > add a corpus > scan/process sample) # (at page project > add a corpus > scan/process sample)
QUERY_SIZE_N_DEFAULT = 1000 QUERY_SIZE_N_DEFAULT = 1000
...@@ -257,7 +245,7 @@ from .settings import BASE_DIR ...@@ -257,7 +245,7 @@ from .settings import BASE_DIR
# uploads/.gitignore prevents corpora indexing # uploads/.gitignore prevents corpora indexing
# corpora can be either a folder or symlink towards specific partition # corpora can be either a folder or symlink towards specific partition
UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads/corpora') UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads/corpora')
UPLOAD_LIMIT = 1024* 1024 * 1024 UPLOAD_LIMIT = 1024 * 1024 * 1024
DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
......
...@@ -110,7 +110,6 @@ class Node(Base): ...@@ -110,7 +110,6 @@ class Node(Base):
if order is not None: if order is not None:
query = query.order_by(Node.name) query = query.order_by(Node.name)
return query return query
def add_child(self, **kwargs): def add_child(self, **kwargs):
...@@ -136,7 +135,7 @@ class Node(Base): ...@@ -136,7 +135,7 @@ class Node(Base):
self['resources'] = MutableList() self['resources'] = MutableList()
return self['resources'] return self['resources']
def add_resource(self, type, path=None, url=None, **kwargs): def add_resource(self, type, path=None, url=None):
"""Attach a resource to a given node. """Attach a resource to a given node.
Mainly used for corpora. Mainly used for corpora.
...@@ -146,13 +145,10 @@ class Node(Base): ...@@ -146,13 +145,10 @@ class Node(Base):
{'extracted': True, {'extracted': True,
'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Français-2015-12-11.zip', 'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Français-2015-12-11.zip',
'type': 1, 'type': 1,
'url': None, 'url': None}
'status':
'status_message':
}
""" """
self.resources().append(MutableDict( self.resources().append(MutableDict(
{'type': type, 'path':path, 'url':url, 'extracted': False, **kwargs} {'type': type, 'path':path, 'url':url, 'extracted': False}
)) ))
def status(self, action=None, progress=0, complete=False, error=None): def status(self, action=None, progress=0, complete=False, error=None):
......
...@@ -16,7 +16,6 @@ def requires_auth(func): ...@@ -16,7 +16,6 @@ def requires_auth(func):
Also passes the URL to redirect towards as a GET parameter. Also passes the URL to redirect towards as a GET parameter.
""" """
def _requires_auth(request, *args, **kwargs): def _requires_auth(request, *args, **kwargs):
#print(request.user.is_authenticated())
if not request.user.is_authenticated(): if not request.user.is_authenticated():
url = '/auth/login/?next=%s' % urlencode(request.path) url = '/auth/login/?next=%s' % urlencode(request.path)
return redirect(url) return redirect(url)
......
#from .TurboTagger import TurboTagger from .TurboTagger import TurboTagger
from .NltkTagger import NltkTagger from .NltkTagger import NltkTagger
#from .TreeTagger import TreeTagger from .TreeTagger import TreeTagger
#from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
...@@ -2,12 +2,11 @@ from gargantext.util.http import * ...@@ -2,12 +2,11 @@ from gargantext.util.http import *
from gargantext.util.db import * from gargantext.util.db import *
from gargantext.util.db_cache import cache from gargantext.util.db_cache import cache
from gargantext.util.files import upload from gargantext.util.files import upload
from gargantext.util.files import check_format
from gargantext.models import * from gargantext.models import *
from gargantext.constants import * from gargantext.constants import *
from gargantext.util.scheduling import scheduled from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata from gargantext.util.toolchain import parse_extract_indexhyperdata
from gargantext.util.toolchain import add_corpus
from datetime import datetime from datetime import datetime
from collections import defaultdict from collections import defaultdict
...@@ -18,7 +17,7 @@ import re ...@@ -18,7 +17,7 @@ import re
@requires_auth @requires_auth
def overview(request): def overview(request):
'''This view show all projects for a given user. '''This view show all projects for a given user.
Each project is described with hyperdata that are updated on each following view. Each project is described with hyperdata that are updated on each following view.
To each project, we can link a resource that can be an image. To each project, we can link a resource that can be an image.
''' '''
...@@ -60,25 +59,17 @@ def overview(request): ...@@ -60,25 +59,17 @@ def overview(request):
class NewCorpusForm(forms.Form): class NewCorpusForm(forms.Form):
'''OK: add corpus Form (NIY)'''
type = forms.ChoiceField( type = forms.ChoiceField(
choices = enumerate(resource_type['name'] for resource_type in RESOURCETYPES), choices = enumerate(resource_type['name'] for resource_type in RESOURCETYPES),
widget = forms.Select(attrs={ 'onchange' :'CustomForSelect( $("option:selected", this).text() );'}) widget = forms.Select(attrs={ 'onchange' :'CustomForSelect( $("option:selected", this).text() );'})
) )
name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' })) name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
file = forms.FileField() file = forms.FileField()
def clean_file(self): def clean_file(self):
file_ = self.cleaned_data.get('file') file_ = self.cleaned_data.get('file')
if len(file_) > UPLOAD_LIMIT : # we don't accept more than 1GB if len(file_) > 1024 ** 3 : # we don't accept more than 1GB
raise forms.ValidationError(ugettext_lazy('File too heavy! (>1GB).')) raise forms.ValidationError(ugettext_lazy('File too heavy! (>1GB).'))
return file_ return file_
def check_filename(self):
print(self.cleaned_data)
print (self.cleaned_data.get("file").split(".")[-1])
#if self.cleaned_data.get("file").split(".")[-1] not in RESSOURCETYPES[choices]
#print RESOURCETYPES[self.cleaned_data.get("
pass
@requires_auth @requires_auth
...@@ -92,55 +83,61 @@ def project(request, project_id): ...@@ -92,55 +83,61 @@ def project(request, project_id):
if not user.owns(project): if not user.owns(project):
raise HttpResponseForbidden() raise HttpResponseForbidden()
# add a new corpus into Node Project > Node Corpus > Ressource # new corpus
if request.method == 'POST': if request.method == 'POST':
corpus = project.add_child(
corpus = add_corpus(request, project) name = request.POST['name'],
typename = 'CORPUS',
if corpus.status: )
# parse_extract: fileparsing -> ngram extraction -> lists corpus.add_resource(
scheduled(parse_extract_indexhyperdata)(corpus.id) type = int(request.POST['type']),
return render( path = upload(request.FILES['file']),
template_name = 'pages/projects/wait.html', )
request = request, session.add(corpus)
context = { session.commit()
# parse_extract: fileparsing -> ngram extraction -> lists
scheduled(parse_extract_indexhyperdata)(corpus.id)
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user, 'user' : request.user,
'project': project, 'project': project,
}, },
) )
# list all the corpora within this project # corpora within this project
corpora = project.children('CORPUS', order=True).all() corpora = project.children('CORPUS', order=True).all()
#print(corpora)
sourcename2corpora = defaultdict(list) sourcename2corpora = defaultdict(list)
for corpus in corpora: for corpus in corpora:
# we only consider the first resource of the corpus to determine its type # we only consider the first resource of the corpus to determine its type
resources = corpus.resources() resources = corpus.resources()
if len(resources) > 0: if len(resources):
resource = resources[0] resource = resources[0]
resource= get_resource(resource["type"]) resource_type_name = RESOURCETYPES[resource['type']]['name']
##here map from RESSOURCES_TYPES_ID and NOT NAME else:
resource_type_name = resource['name'] print("(WARNING) PROJECT view: no listed resource")
resource_type_accepted_formats = resource['accepted_formats'] # add some data for the viewer
corpus.count = corpus.children('DOCUMENT').count()
# add some data for the viewer status = corpus.status()
corpus.count = corpus.children('DOCUMENT').count() if status is not None and not status['complete']:
status = corpus.status() if not status['error']:
if status is not None and not status['complete']: corpus.status_message = '(in progress: %s, %d complete)' % (
if not status['error']: status['action'].replace('_', ' '),
corpus.status_message = '(in progress: %s, %d complete)' % ( status['progress'],
status['action'].replace('_', ' '), )
status['progress'],
)
else:
corpus.status_message = '(aborted: "%s" after %i docs)' % (
status['error'][-1],
status['progress']
)
else: else:
corpus.status_message = '' corpus.status_message = '(aborted: "%s" after %i docs)' % (
# add status['error'][-1],
sourcename2corpora[resource_type_name].append(corpus) status['progress']
)
else:
corpus.status_message = ''
# add
sourcename2corpora[resource_type_name].append(corpus)
# source & their respective counts # source & their respective counts
total_documentscount = 0 total_documentscount = 0
sourcename2documentscount = defaultdict(int) sourcename2documentscount = defaultdict(int)
......
...@@ -2,7 +2,7 @@ from gargantext.util.http import requires_auth, render, settings ...@@ -2,7 +2,7 @@ from gargantext.util.http import requires_auth, render, settings
from gargantext.util.db import session from gargantext.util.db import session
from gargantext.util.db_cache import cache from gargantext.util.db_cache import cache
from gargantext.models import Node from gargantext.models import Node
from gargantext.constants import get_resource from gargantext.constants import resourcename
from datetime import datetime from datetime import datetime
@requires_auth @requires_auth
...@@ -32,7 +32,7 @@ def ngramtable(request, project_id, corpus_id): ...@@ -32,7 +32,7 @@ def ngramtable(request, project_id, corpus_id):
'date': datetime.now(), 'date': datetime.now(),
'project': project, 'project': project,
'corpus' : corpus, 'corpus' : corpus,
'resourcename' : get_ressource(corpus)["name"], 'resourcename' : resourcename(corpus),
'view': 'terms' 'view': 'terms'
}, },
) )
...@@ -8,7 +8,7 @@ from traceback import print_tb ...@@ -8,7 +8,7 @@ from traceback import print_tb
from django.shortcuts import redirect, render from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import QUERY_SIZE_N_MAX from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node from gargantext.models.nodes import Node
from gargantext.util.db import session from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse from gargantext.util.http import JsonHttpResponse
...@@ -133,7 +133,7 @@ def save(request , project_id): ...@@ -133,7 +133,7 @@ def save(request , project_id):
if filename!=False: if filename!=False:
# add the uploaded resource to the corpus # add the uploaded resource to the corpus
corpus.add_resource( corpus.add_resource(
type = 9 type = resourcetype('ISTex')
, path = filename , path = filename
) )
dwnldsOK+=1 dwnldsOK+=1
......
...@@ -18,7 +18,7 @@ from traceback import print_tb ...@@ -18,7 +18,7 @@ from traceback import print_tb
from django.shortcuts import redirect from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource, QUERY_SIZE_N_MAX from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node from gargantext.models.nodes import Node
from gargantext.util.db import session from gargantext.util.db import session
from gargantext.util.db_cache import cache from gargantext.util.db_cache import cache
...@@ -134,7 +134,7 @@ def save( request , project_id ) : ...@@ -134,7 +134,7 @@ def save( request , project_id ) :
print(filename) print(filename)
if filename != False: if filename != False:
# add the uploaded resource to the corpus # add the uploaded resource to the corpus
corpus.add_resource( type = 4 corpus.add_resource( type = resourcetype('Pubmed (XML format)')
, path = filename , path = filename
, url = None , url = None
) )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment