Commit aa4e82ee authored by delanoe

[FIX] Merge ready for unstable.

parent d5d87ef8
......@@ -112,18 +112,17 @@ INDEXED_HYPERDATA = {
}
#from gargantext.util.taggers import FrenchMeltTagger, TurboTagger
from gargantext.util.taggers import NltkTagger
from gargantext.util.taggers import FrenchMeltTagger, TurboTagger
# Per-language settings keyed by language code; each entry names the
# 'tagger' class to use for that language.
# NOTE(review): both entries list 'tagger' more than once. In a dict literal
# the last occurrence wins, so as written 'en' resolves to TurboTagger and
# 'fr' to NltkTagger -- confirm that this is the intended choice.
LANGUAGES = {
'en': {
#'tagger': EnglishMeltTagger,
#'tagger': TurboTagger,
'tagger': NltkTagger,
'tagger': TurboTagger,
#'tagger': NltkTagger,
},
'fr': {
#'tagger': FrenchMeltTagger,
'tagger': FrenchMeltTagger,
# 'tagger': TreeTagger,
'tagger': NltkTagger,
},
}
......@@ -131,96 +130,85 @@ LANGUAGES = {
from gargantext.util.parsers import \
EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser
#from gargantext.util.scrappers import \
# CernScraper
def get_resource(corpus_type):
    '''Return the RESOURCETYPES entry whose "type" equals corpus_type.

    The entries are scanned in list order and the first match is returned;
    None is returned when no entry matches.
    '''
    matching_entries = (
        entry for entry in RESOURCETYPES if entry["type"] == corpus_type
    )
    return next(matching_entries, None)
def resourcetype(name):
    '''
    resourcetype :: String -> Int
    Usage : resourcetype("Europress (English)") == 1
    Examples in scrapers scripts (Pubmed or ISTex for instance).

    Return the position in RESOURCETYPES of the first entry whose 'name'
    equals name; callers pass that position as the 'type' of a resource.

    Raises IndexError when no entry carries that name. The exception type is
    the one the former list-indexing implementation raised; the message now
    names the resource that could not be found.
    '''
    # Stop at the first match instead of materialising every candidate.
    for index, resource in enumerate(RESOURCETYPES):
        if resource['name'] == name:
            return index
    raise IndexError('unknown resource type name: %r' % (name,))
def resourcename(corpus):
    '''
    resourcename :: Corpus -> String
    Usage : resourcename(corpus) == "ISTex"

    Look up the 'type' of the first resource of the corpus in RESOURCETYPES
    and return that entry's name without its parenthesised suffix, e.g.
    "Europress (English)" gives "Europress".

    Raises IndexError when the corpus has no resource at all.
    '''
    first_resource = corpus.resources()[0]
    full_name = RESOURCETYPES[first_resource['type']]['name']
    # Cut at the first "(" and drop the blank left in front of it; without
    # the rstrip "Pubmed (XML format)" would come back as "Pubmed ".
    return re.sub(r'\(.*', '', full_name).rstrip()
# Catalogue of the corpus sources known to the application.
# Each entry carries a display 'name', the 'parser' class for that source,
# its 'default_language' and, for every entry except the placeholder, an
# 'accepted_formats' list (presumably upload file extensions -- confirm).
# resourcetype() addresses an entry by its position in this list;
# get_resource() matches on the 'type' key, which repeats that position.
RESOURCETYPES = [
# type 0
{ 'type':0,
'name': 'Select database below',
{ 'name': 'Select database below',
'parser': None,
'default_language': None,
},
# type 1
{ 'type':1,
'name': 'Europress (English)',
{ 'name': 'Europress (English)',
'parser': EuropressParser,
'default_language': 'en',
'accepted_formats':["zip",],
},
# type 2
{ 'type':2,
'name': 'Europress (French)',
{ 'name': 'Europress (French)',
'parser': EuropressParser,
'default_language': 'fr',
'accepted_formats':["zip",],
},
# type 3
{ 'type':3,
'name': 'Jstor (RIS format)',
{ 'name': 'Jstor (RIS format)',
'parser': RISParser,
'default_language': 'en',
'accepted_formats':["zip",],
},
# type 4
{ 'type':4,
'name': 'Pubmed (XML format)',
{ 'name': 'Pubmed (XML format)',
'parser': PubmedParser,
'default_language': 'en',
'accepted_formats':["zip",],
},
# type 5
{ 'type':5,
'name': 'Scopus (RIS format)',
{ 'name': 'Scopus (RIS format)',
'parser': RISParser,
'default_language': 'en',
'accepted_formats':["zip",],
},
# type 6
{ 'type': 6,
'name': 'Web of Science (ISI format)',
{ 'name': 'Web of Science (ISI format)',
'parser': ISIParser,
'default_language': 'en',
'accepted_formats':["zip",],
},
# type 7
{ 'type':7,
'name': 'Zotero (RIS format)',
{ 'name': 'Zotero (RIS format)',
'parser': RISParser,
'default_language': 'en',
'accepted_formats':["zip",],
},
# type 8
{ 'type':8,
'name': 'CSV',
{ 'name': 'CSV',
'parser': CSVParser,
'default_language': 'en',
'accepted_formats':["csv"],
},
# type 9
{ "type":9,
'name': 'ISTex',
{ 'name': 'ISTex',
'parser': ISTexParser,
'default_language': 'en',
'accepted_formats':["zip",],
},
{ "type":10,
"name": 'SCOAP (XML MARC21 Format)',
"parser": CernParser,
"default_language": "en",
'accepted_formats':["zip","xml"],
#~ "scrapper": CernScrapper,
#~ "base_url": "http://api.scoap3.org/search?",
},
]
# type 10
{ "type":10,
"name": 'SCOAP (XML MARC21 Format)',
"parser": CernParser,
"default_language": "en",
'accepted_formats':["zip","xml"],
#~ "scrapper": CernScrapper,
#~ "base_url": "http://api.scoap3.org/search?",
},
]
# linguistic extraction parameters ---------------------------------------------
DEFAULT_RANK_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
......@@ -246,8 +234,8 @@ DEFAULT_ALL_LOWERCASE_FLAG = True # lowercase ngrams before recording
#  occurring at sentence beginning)
# ------------------------------------------------------------------------------
# other parameters
# other parameters
# default number of docs POSTed to scrappers.views.py
# (at page project > add a corpus > scan/process sample)
QUERY_SIZE_N_DEFAULT = 1000
......@@ -257,7 +245,7 @@ from .settings import BASE_DIR
# uploads/.gitignore prevents corpora indexing
# corpora can be either a folder or a symlink to a specific partition
UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads/corpora')
UPLOAD_LIMIT = 1024* 1024 * 1024
UPLOAD_LIMIT = 1024 * 1024 * 1024
DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
......
......@@ -110,7 +110,6 @@ class Node(Base):
if order is not None:
query = query.order_by(Node.name)
return query
def add_child(self, **kwargs):
......@@ -136,7 +135,7 @@ class Node(Base):
self['resources'] = MutableList()
return self['resources']
def add_resource(self, type, path=None, url=None, **kwargs):
def add_resource(self, type, path=None, url=None):
"""Attach a resource to a given node.
Mainly used for corpora.
......@@ -146,13 +145,10 @@ class Node(Base):
{'extracted': True,
'path': '/home/me/gargantext/uploads/corpora/0c/0c5b/0c5b50/0c5b50ad8ebdeb2ae33d8e54141a52ee_Corpus_Europresse-Français-2015-12-11.zip',
'type': 1,
'url': None,
'status':
'status_message':
}
'url': None}
"""
self.resources().append(MutableDict(
{'type': type, 'path':path, 'url':url, 'extracted': False, **kwargs}
{'type': type, 'path':path, 'url':url, 'extracted': False}
))
def status(self, action=None, progress=0, complete=False, error=None):
......
......@@ -16,7 +16,6 @@ def requires_auth(func):
Also passes the URL to redirect towards as a GET parameter.
"""
def _requires_auth(request, *args, **kwargs):
#print(request.user.is_authenticated())
if not request.user.is_authenticated():
url = '/auth/login/?next=%s' % urlencode(request.path)
return redirect(url)
......
#from .TurboTagger import TurboTagger
from .TurboTagger import TurboTagger
from .NltkTagger import NltkTagger
#from .TreeTagger import TreeTagger
#from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
from .TreeTagger import TreeTagger
from .MeltTagger import EnglishMeltTagger, FrenchMeltTagger
......@@ -2,12 +2,11 @@ from gargantext.util.http import *
from gargantext.util.db import *
from gargantext.util.db_cache import cache
from gargantext.util.files import upload
from gargantext.util.files import check_format
from gargantext.models import *
from gargantext.constants import *
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract_indexhyperdata
from gargantext.util.toolchain import add_corpus
from datetime import datetime
from collections import defaultdict
......@@ -18,7 +17,7 @@ import re
@requires_auth
def overview(request):
'''This view show all projects for a given user.
Each project is described with hyperdata that are updated on each following view.
Each project is described with hyperdata that are updated on each following view.
To each project, we can link a resource that can be an image.
'''
......@@ -60,25 +59,17 @@ def overview(request):
class NewCorpusForm(forms.Form):
    '''Form used to add a corpus: source type, corpus name and uploaded file.'''

    # The choices are (position, name) pairs built from RESOURCETYPES, so the
    # submitted value is a position in that list.
    type = forms.ChoiceField(
        choices = enumerate(resource_type['name'] for resource_type in RESOURCETYPES),
        widget = forms.Select(attrs={ 'onchange' :'CustomForSelect( $("option:selected", this).text() );'})
    )
    name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
    file = forms.FileField()

    def clean_file(self):
        '''Return the uploaded file unless its len() exceeds UPLOAD_LIMIT.

        Raises forms.ValidationError for an oversized upload.
        '''
        file_ = self.cleaned_data.get('file')
        # One check against the shared constant: the former second test
        # against a literal 1024 ** 3 would silently override UPLOAD_LIMIT
        # whenever the two values differ.
        if len(file_) > UPLOAD_LIMIT:
            raise forms.ValidationError(ugettext_lazy('File too heavy! (>1GB).'))
        return file_

    def check_filename(self):
        '''Debug helper: print the cleaned data and the text after the last "."
        of the 'file' entry.

        NOTE(review): this calls .split() directly on the cleaned 'file'
        value -- confirm that value is a string; an uploaded file object may
        need its name instead.
        '''
        print(self.cleaned_data)
        print(self.cleaned_data.get("file").split(".")[-1])
@requires_auth
......@@ -92,55 +83,61 @@ def project(request, project_id):
if not user.owns(project):
raise HttpResponseForbidden()
# add a new corpus into Node Project > Node Corpus > Ressource
# new corpus
if request.method == 'POST':
corpus = add_corpus(request, project)
if corpus.status:
# parse_extract: fileparsing -> ngram extraction -> lists
scheduled(parse_extract_indexhyperdata)(corpus.id)
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
corpus = project.add_child(
name = request.POST['name'],
typename = 'CORPUS',
)
corpus.add_resource(
type = int(request.POST['type']),
path = upload(request.FILES['file']),
)
session.add(corpus)
session.commit()
# parse_extract: fileparsing -> ngram extraction -> lists
scheduled(parse_extract_indexhyperdata)(corpus.id)
return render(
template_name = 'pages/projects/wait.html',
request = request,
context = {
'user' : request.user,
'project': project,
},
)
},
)
# list all the corpora within this project
# corpora within this project
corpora = project.children('CORPUS', order=True).all()
#print(corpora)
sourcename2corpora = defaultdict(list)
for corpus in corpora:
# we only consider the first resource of the corpus to determine its type
resources = corpus.resources()
if len(resources) > 0:
if len(resources):
resource = resources[0]
resource= get_resource(resource["type"])
##here map from RESSOURCES_TYPES_ID and NOT NAME
resource_type_name = resource['name']
resource_type_accepted_formats = resource['accepted_formats']
# add some data for the viewer
corpus.count = corpus.children('DOCUMENT').count()
status = corpus.status()
if status is not None and not status['complete']:
if not status['error']:
corpus.status_message = '(in progress: %s, %d complete)' % (
status['action'].replace('_', ' '),
status['progress'],
)
else:
corpus.status_message = '(aborted: "%s" after %i docs)' % (
status['error'][-1],
status['progress']
)
resource_type_name = RESOURCETYPES[resource['type']]['name']
else:
print("(WARNING) PROJECT view: no listed resource")
# add some data for the viewer
corpus.count = corpus.children('DOCUMENT').count()
status = corpus.status()
if status is not None and not status['complete']:
if not status['error']:
corpus.status_message = '(in progress: %s, %d complete)' % (
status['action'].replace('_', ' '),
status['progress'],
)
else:
corpus.status_message = ''
# add
sourcename2corpora[resource_type_name].append(corpus)
corpus.status_message = '(aborted: "%s" after %i docs)' % (
status['error'][-1],
status['progress']
)
else:
corpus.status_message = ''
# add
sourcename2corpora[resource_type_name].append(corpus)
# source & their respective counts
total_documentscount = 0
sourcename2documentscount = defaultdict(int)
......
......@@ -2,7 +2,7 @@ from gargantext.util.http import requires_auth, render, settings
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.models import Node
from gargantext.constants import get_resource
from gargantext.constants import resourcename
from datetime import datetime
@requires_auth
......@@ -32,7 +32,7 @@ def ngramtable(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus' : corpus,
'resourcename' : get_ressource(corpus)["name"],
'resourcename' : resourcename(corpus),
'view': 'terms'
},
)
......@@ -8,7 +8,7 @@ from traceback import print_tb
from django.shortcuts import redirect, render
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import QUERY_SIZE_N_MAX
from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
......@@ -133,7 +133,7 @@ def save(request , project_id):
if filename!=False:
# add the uploaded resource to the corpus
corpus.add_resource(
type = 9
type = resourcetype('ISTex')
, path = filename
)
dwnldsOK+=1
......
......@@ -18,7 +18,7 @@ from traceback import print_tb
from django.shortcuts import redirect
from django.http import Http404, HttpResponseRedirect, HttpResponseForbidden
from gargantext.constants import get_resource, QUERY_SIZE_N_MAX
from gargantext.constants import resourcetype, QUERY_SIZE_N_MAX
from gargantext.models.nodes import Node
from gargantext.util.db import session
from gargantext.util.db_cache import cache
......@@ -134,7 +134,7 @@ def save( request , project_id ) :
print(filename)
if filename != False:
# add the uploaded resource to the corpus
corpus.add_resource( type = 4
corpus.add_resource( type = resourcetype('Pubmed (XML format)')
, path = filename
, url = None
)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment