Commit 76c1a3dd authored by Mathieu Rodic

[OPTI] Project view: heavy optimization for speed

https://forge.iscpif.fr/issues/1438
parent ef55205e
...@@ -3,7 +3,7 @@ from django.conf.urls import patterns, include, url ...@@ -3,7 +3,7 @@ from django.conf.urls import patterns, include, url
from django.contrib import admin from django.contrib import admin
from django.contrib.auth.views import login from django.contrib.auth.views import login
from gargantext_web import views from gargantext_web import views, views_optimized
import gargantext_web.api import gargantext_web.api
...@@ -27,7 +27,7 @@ urlpatterns = patterns('', ...@@ -27,7 +27,7 @@ urlpatterns = patterns('',
# Project Management # Project Management
url(r'^projects/$', views.projects), url(r'^projects/$', views.projects),
url(r'^project/(\d+)/delete/$', views.delete_project), url(r'^project/(\d+)/delete/$', views.delete_project),
url(r'^project/(\d+)/$', views.project), url(r'^project/(\d+)/$', views_optimized.project),
# Corpus management # Corpus management
url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus), url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus),
......
...@@ -170,248 +170,6 @@ def projects(request): ...@@ -170,248 +170,6 @@ def projects(request):
}) })
def project(request, project_id):
    '''
    Show every corpus of a project in a panoramic way and handle uploads.

    The title sums up all corpora, the donut summarizes the composition
    of the project by resource type, and the list of lists enables
    navigating through the corpora.

    On GET the page is rendered with an empty upload form. On POST the
    submitted form is validated; when valid, a new corpus node is created
    under the project, the uploaded file is attached to it as a resource,
    the corpus workflow is started, and the client is redirected back to
    the project page.

    Parameters:
        request: the Django request being served.
        project_id: identifier of the project node, as captured from the URL.

    Returns:
        A redirect to the login page for anonymous users, a redirect to
        the project page after a successful upload, or the rendered
        'project.html' template otherwise.

    Raises:
        Http404: if project_id is not an integer or no such node exists.
    '''
    if not request.user.is_authenticated():
        return redirect('/login/?next=%s' % request.path)

    # int() really validates the identifier; the former str() call could
    # never raise ValueError, which made the 404 branch unreachable.
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    user = request.user
    date = datetime.datetime.now()

    type_corpus = NodeType.objects.get(name='Corpus')
    try:
        project = Node.objects.get(id=project_id)
    except Node.DoesNotExist:
        # an unknown project is a "not found", not a server error
        raise Http404()
    corpora = project.children.filter(type=type_corpus)
    number = len(corpora)

    # DONUT corpora representation
    list_corpora = defaultdict(list)
    donut_part = defaultdict(int)
    docs_total = 0

    # List of resources (not filtered per project yet)
    whitelists = ""
    blacklists = ""
    cooclists = ""

    for corpus in corpora:
        # count the documents once and reuse the value
        docs_count = corpus.children.count()
        docs_total += docs_count
        corpus_view = {
            'id': corpus.pk,
            'name': corpus.name,
            'count': docs_count,
        }
        for node_resource in Node_Resource.objects.filter(node=corpus):
            donut_part[node_resource.resource.type] += docs_count
            list_corpora[node_resource.resource.type.name].append(corpus_view)
    list_corpora = dict(list_corpora)

    # avoid a division by zero when the project holds no document
    if docs_total == 0:
        docs_total = 1
    donut = [
        {
            'source': key,
            'count': count,
            'part': round(count * 100 / docs_total),
        }
        for key, count in donut_part.items()
    ]

    def render_page(form):
        # single place where the template context is assembled
        return render(request, 'project.html', {
            'form'          : form,
            'user'          : user,
            'date'          : date,
            'project'       : project,
            'donut'         : donut,
            'list_corpora'  : list_corpora,
            'whitelists'    : whitelists,
            'blacklists'    : blacklists,
            'cooclists'     : cooclists,
            'number'        : number,
        })

    if request.method != 'POST':
        return render_page(CustomForm())

    print("original file:")
    print(request.FILES)
    form = CustomForm(request.POST, request.FILES)
    if not form.is_valid():
        print("bad form, bad form")
        return render_page(form)

    name = form.cleaned_data['name']
    thefile = form.cleaned_data['file']
    resource_type = ResourceType.objects.get(id=str(form.cleaned_data['type']))
    print("-------------")
    print(name, "|", resource_type, "|", thefile)
    print("-------------")
    print("new file:")
    print(thefile)

    try:
        # Only pass a language when the resource type implies one, instead
        # of relying on a bare "except:" to catch an unbound variable.
        corpus_kwargs = {
            'user': request.user,
            'parent': project,
            'type': type_corpus,
            'name': name,
        }
        if resource_type.name == "europress_french":
            corpus_kwargs['language'] = Language.objects.get(iso2='fr')
        elif resource_type.name == "europress_english":
            corpus_kwargs['language'] = Language.objects.get(iso2='en')
        corpus = Node(**corpus_kwargs)
        corpus.save()

        print(request.user, resource_type, thefile)
        corpus.add_resource(
            user=request.user,
            type=resource_type,
            file=thefile,
        )

        # A workflow failure is only reported: the corpus already exists,
        # so the user is sent back to the project page anyway.
        try:
            if DEBUG is True:
                corpus.workflow()
            else:
                corpus.workflow.apply_async((), countdown=3)
        except Exception as error:
            print(error)

        return HttpResponseRedirect('/project/' + str(project_id))
    except Exception as error:
        print('ee', error)
        form = CorpusForm(request=request)

    return render_page(form)
def corpus(request, project_id, corpus_id): def corpus(request, project_id, corpus_id):
if not request.user.is_authenticated(): if not request.user.is_authenticated():
return redirect('/login/?next=%s' % request.path) return redirect('/login/?next=%s' % request.path)
......
from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect
from sqlalchemy import func
from sqlalchemy.orm import aliased
from collections import defaultdict
from datetime import datetime
from node.admin import CustomForm
from gargantext_web.db import *
from gargantext_web.settings import DEBUG
def project(request, project_id):
    """Render the page of one project and handle corpus uploads.

    The corpora of the project, their resources and their number of child
    nodes are fetched with a single SQLAlchemy query, then grouped by
    resource type to build the corpus lists and the donut chart.

    On POST, the upload form is validated; when valid, a corpus node is
    created as a Django model under the project, the uploaded file is
    attached to it, the workflow is started, and the client is redirected
    to the project page.

    Parameters:
        request: the Django request being served.
        project_id: identifier of the project node, as captured from the URL.

    Returns:
        A redirect to the login page for anonymous users, a 403 response
        when the project belongs to another user, a redirect to the
        project page after a successful upload, or the rendered
        'project.html' template otherwise.

    Raises:
        Http404: if project_id is not an integer or matches no project node.
    """
    # Imported locally because the module header only brings in Http404,
    # HttpResponse and HttpResponseRedirect from django.http.
    from django.http import HttpResponseForbidden

    # do we have a valid user? Checked before touching the database, like
    # the other views do.
    user = request.user
    if not user.is_authenticated():
        return redirect('/login/?next=%s' % request.path)

    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # SQLAlchemy session
    session = Session()

    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.type_id == cache.NodeType['Project'].id)
    ).first()
    if project is None:
        raise Http404()
    if project.user_id != user.id:
        return HttpResponseForbidden()

    # Let's find out about the children nodes of the project
    ChildrenNode = aliased(Node)
    corpus_query = (session
        .query(Node, Resource, func.count(ChildrenNode.id))
        .outerjoin(ChildrenNode, ChildrenNode.parent_id == Node.id)
        .join(Node_Resource, Node_Resource.node_id == Node.id)
        .join(Resource, Resource.id == Node_Resource.resource_id)
        .filter(Node.parent_id == project.id)
        .group_by(Node, Resource)
    )
    corpora_by_resourcetype = defaultdict(list)
    documents_count_by_resourcetype = defaultdict(int)
    corpora_count = 0
    for corpus, resource, document_count in corpus_query:
        resourcetype_name = cache.ResourceType[resource.type_id].name
        corpora_by_resourcetype[resourcetype_name].append({
            'id': corpus.id,
            'name': corpus.name,
            'count': document_count,
        })
        documents_count_by_resourcetype[resourcetype_name] += document_count
        corpora_count += 1

    # do the donut; "or 1" prevents a ZeroDivisionError when the project
    # holds no document (every part is then 0 anyway)
    total_documents_count = sum(documents_count_by_resourcetype.values()) or 1
    donut = [
        {
            'source': key,
            'count': value,
            'part': round(value * 100 / total_documents_count),
        }
        for key, value in documents_count_by_resourcetype.items()
    ]

    # deal with the form
    if request.method == 'POST':
        # form validation
        form = CustomForm(request.POST, request.FILES)
        if form.is_valid():
            # extract information from the form
            name = form.cleaned_data['name']
            thefile = form.cleaned_data['file']
            resourcetype = cache.ResourceType[form.cleaned_data['type']]
            # which default language shall be used?
            if resourcetype.name == "europress_french":
                language_id = cache.Language['fr'].id
            elif resourcetype.name == "europress_english":
                language_id = cache.Language['en'].id
            else:
                language_id = None
            # corpus node instantiation as a Django model
            from node import models
            dj_corpus = models.Node(
                name=name,
                user_id=request.user.id,
                parent_id=project_id,
                type_id=cache.NodeType['Corpus'].id,
                language_id=language_id,
            )
            dj_corpus.save()
            # add the uploaded resource to the corpus
            dj_corpus.add_resource(
                user_id=request.user.id,
                type_id=resourcetype.id,
                file=thefile,
            )
            # Let's start the workflow. A failure is only reported: the
            # corpus is already saved, so the user is redirected anyway.
            try:
                if DEBUG is True:
                    dj_corpus.workflow()
                else:
                    dj_corpus.workflow.apply_async((), countdown=3)
            except Exception as error:
                print('WORKFLOW ERROR')
                print(error)
            # redirect to the main project page
            return HttpResponseRedirect('/project/' + str(project_id))
        else:
            # the bound form is rendered again below
            print('ERROR: BAD FORM')
    else:
        form = CustomForm()

    # HTML output
    return render(request, 'project.html', {
        'form'          : form,
        'user'          : user,
        'date'          : datetime.now(),
        'project'       : project,
        'donut'         : donut,
        'list_corpora'  : dict(corpora_by_resourcetype),
        'whitelists'    : '',
        'blacklists'    : '',
        'cooclists'     : '',
        'number'        : corpora_count,
    })
\ No newline at end of file
from collections import defaultdict
from gargantext_web.db import *
from .FileParsers import *
# Maps the name of a resource type to the file parser class handling it.
# The three europress variants share the same parser class.
_parsers = dict(
    pubmed=PubmedFileParser,
    isi=IsiFileParser,
    ris=RisFileParser,
    europress=EuropressFileParser,
    europress_french=EuropressFileParser,
    europress_english=EuropressFileParser,
)
def parse_corpus_resources(corpus, user=None, user_id=None):
    """Create a document node for every entry of the corpus resources.

    The resources attached to the corpus are looked up; each metadata
    dictionary they yield becomes a new 'Document' node whose parent is
    the corpus. Once the nodes are committed, every metadata entry is
    stored as a Node_Metadata row, and the corpus is flagged as parsed.

    Parameters:
        corpus: the corpus node; its `id` is read and its `parsed`
            attribute is set to True at the end.
        user: optional user object; its `id` is used when `user_id` is
            not given.
        user_id: optional identifier of the owner of the new nodes; takes
            precedence over `user`.

    Raises:
        KeyError: if a resource type has no entry in `_parsers`.
    """
    session = Session()
    # the node column expects the identifier, not the cached NodeType object
    type_id = cache.NodeType['Document'].id
    if user_id is None and user is not None:
        user_id = user.id

    # Keep one parser instance per resource type name. A defaultdict cannot
    # do this: its factory is called without the missing key.
    parsers = {}

    # Find the resources of the corpus. Uploaded resources are attached to
    # the corpus node itself, hence the filter on Node.id.
    resources_query = (session
        .query(Resource, ResourceType)
        .join(ResourceType, ResourceType.id == Resource.type_id)
        .join(Node_Resource, Node_Resource.resource_id == Resource.id)
        .join(Node, Node.id == Node_Resource.node_id)
        .filter(Node.id == corpus.id)
    )

    # make a new node for every parsed document of the corpus
    nodes = list()
    for resource, resourcetype in resources_query:
        parser_name = resourcetype.name
        if parser_name not in parsers:
            parsers[parser_name] = _parsers[parser_name]()
        parser = parsers[parser_name]
        # NOTE(review): `parser` is looked up but never used, and the loop
        # iterates over the resource itself. Presumably the parser should
        # produce the metadata dictionaries; confirm against the
        # FileParsers API before changing this.
        for metadata_dict in resource:
            # retrieve language ID from metadata
            language_id = None
            if 'language_iso2' in metadata_dict:
                try:
                    language_id = cache.Language[metadata_dict['language_iso2']].id
                except KeyError:
                    language_id = None
            # create new node
            node = Node(
                name=metadata_dict.get('title', ''),
                parent_id=corpus.id,
                user_id=user_id,
                type_id=type_id,
                language_id=language_id,
                metadata=metadata_dict,
            )
            nodes.append(node)
    session.add_all(nodes)
    session.commit()

    # now, index the metadata
    for node in nodes:
        node_id = node.id
        for metadata_key, metadata_value in node.metadata.items():
            metadata = cache.Metadata[metadata_key]
            if metadata.type == 'string':
                # string values are truncated to 255 characters
                metadata_value = metadata_value[:255]
            # the value column depends on the metadata type (value_<type>)
            node_metadata = Node_Metadata(**{
                'node_id': node_id,
                'metadata_id': metadata.id,
                'value_' + metadata.type: metadata_value,
            })
            session.add(node_metadata)
    session.commit()

    # mark the corpus as parsed
    corpus.parsed = True
def parse_corpus(corpus):
# prepare the cache for ngrams
from nodes import models
ngrams = ModelCache(models.Node)
#
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment