Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
76c1a3dd
Commit
76c1a3dd
authored
Feb 27, 2015
by
Mathieu Rodic
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[OPTI] Project view: heavy optimization for speed
https://forge.iscpif.fr/issues/1438
parent
ef55205e
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
219 additions
and
244 deletions
+219
-244
urls.py
gargantext_web/urls.py
+2
-2
views.py
gargantext_web/views.py
+0
-242
views_optimized.py
gargantext_web/views_optimized.py
+137
-0
corpus.py
parsing/corpus.py
+80
-0
No files found.
gargantext_web/urls.py
View file @
76c1a3dd
...
...
@@ -3,7 +3,7 @@ from django.conf.urls import patterns, include, url
from
django.contrib
import
admin
from
django.contrib.auth.views
import
login
from
gargantext_web
import
views
from
gargantext_web
import
views
,
views_optimized
import
gargantext_web.api
...
...
@@ -27,7 +27,7 @@ urlpatterns = patterns('',
# Project Management
url
(
r'^projects/$'
,
views
.
projects
),
url
(
r'^project/(\d+)/delete/$'
,
views
.
delete_project
),
url
(
r'^project/(\d+)/$'
,
views
.
project
),
url
(
r'^project/(\d+)/$'
,
views
_optimized
.
project
),
# Corpus management
url
(
r'^project/(\d+)/corpus/(\d+)/$'
,
views
.
corpus
),
...
...
gargantext_web/views.py
View file @
76c1a3dd
...
...
@@ -170,248 +170,6 @@ def projects(request):
})
def project(request, project_id):
    """Render the project page and handle the corpus upload form.

    The page gives a panoramic view of every corpus of the project:
    ``donut`` summarizes how the documents are spread over the resource
    types, and ``list_corpora`` groups the corpora by resource-type name so
    the user can navigate through them.

    On a valid POST, a corpus node is created under the project, the
    uploaded file is attached to it as a resource, the workflow is started
    and the user is redirected back to the project page. On an invalid POST
    the page is rendered again with the bound form.

    Anonymous users are redirected to the login page.
    """
    if not request.user.is_authenticated():
        return redirect('/login/?next=%s' % request.path)

    user = request.user
    date = datetime.datetime.now()

    type_corpus = NodeType.objects.get(name='Corpus')
    project = Node.objects.get(id=project_id)
    corpora = project.children.filter(type=type_corpus)
    number = len(corpora)

    # Donut representation of the corpora.
    list_corpora = defaultdict(list)
    donut_part = defaultdict(int)
    docs_total = 0

    # Placeholders: the per-project filtering of these lists is not done yet.
    whitelists = ""
    blacklists = ""
    cooclists = ""

    for corpus in corpora:
        # Count the children once and reuse the figure everywhere below.
        docs_count = corpus.children.count()
        docs_total += docs_count
        corpus_view = {
            'id': corpus.pk,
            'name': corpus.name,
            'count': docs_count,
        }
        for node_resource in Node_Resource.objects.filter(node=corpus):
            source_type = node_resource.resource.type
            donut_part[source_type] += docs_count
            list_corpora[source_type.name].append(corpus_view)
    list_corpora = dict(list_corpora)

    # Avoid a division by zero when the project holds no document at all.
    if not docs_total:
        docs_total = 1
    donut = [
        {
            'source': key,
            'count': count,
            'part': round(count * 100 / docs_total),
        }
        for key, count in donut_part.items()
    ]

    if request.method == 'POST':
        print("original file:")
        print(request.FILES)
        form = CustomForm(request.POST, request.FILES)
        if form.is_valid():
            name = form.cleaned_data['name']
            thefile = form.cleaned_data['file']
            resource_type = ResourceType.objects.get(
                id=str(form.cleaned_data['type'])
            )
            print("-------------")
            print(name, "|", resource_type, "|", thefile)
            print("-------------")
            print("new file:")
            print(thefile)
            try:
                node_fields = {
                    'user': request.user,
                    'parent': Node.objects.get(id=project_id),
                    'type': NodeType.objects.get(name='Corpus'),
                    'name': name,
                }
                # Only the europress resource types imply a language; for
                # the others the node is created without one.
                if resource_type.name == "europress_french":
                    node_fields['language'] = Language.objects.get(iso2='fr')
                elif resource_type.name == "europress_english":
                    node_fields['language'] = Language.objects.get(iso2='en')
                corpus = Node(**node_fields)
                corpus.save()
                print(request.user, resource_type, thefile)
                corpus.add_resource(
                    user=request.user,
                    type=resource_type,
                    file=thefile,
                )
                # A workflow failure is reported but must not prevent the
                # redirection: the corpus itself has been saved.
                try:
                    if DEBUG is True:
                        corpus.workflow()
                    else:
                        corpus.workflow.apply_async((), countdown=3)
                except Exception as error:
                    print(error)
                return HttpResponseRedirect('/project/' + str(project_id))
            except Exception as error:
                print('ee', error)
                form = CorpusForm(request=request)
                formResource = ResourceForm()
        else:
            # The bound (invalid) form is rendered below with its errors.
            print("bad form, bad form")
    else:
        form = CustomForm()

    return render(request, 'project.html', {
        'form': form,
        'user': user,
        'date': date,
        'project': project,
        'donut': donut,
        'list_corpora': list_corpora,
        'whitelists': whitelists,
        'blacklists': blacklists,
        'cooclists': cooclists,
        'number': number,
    })
def
corpus
(
request
,
project_id
,
corpus_id
):
if
not
request
.
user
.
is_authenticated
():
return
redirect
(
'/login/?next=
%
s'
%
request
.
path
)
...
...
gargantext_web/views_optimized.py
0 → 100644
View file @
76c1a3dd
from
django.shortcuts
import
redirect
from
django.shortcuts
import
render
from
django.http
import
Http404
,
HttpResponse
,
HttpResponseRedirect
from
sqlalchemy
import
func
from
sqlalchemy.orm
import
aliased
from
collections
import
defaultdict
from
datetime
import
datetime
from
node.admin
import
CustomForm
from
gargantext_web.db
import
*
from
gargantext_web.settings
import
DEBUG
def project(request, project_id):
    """Render the project page and handle the corpus upload form.

    The corpora of the project, their resource and their number of children
    nodes are fetched with a single SQLAlchemy query, then grouped by
    resource-type name to build the ``list_corpora`` and ``donut`` template
    variables.

    On a valid POST, a corpus node is created under the project (as a Django
    model), the uploaded file is attached to it, the workflow is started and
    the user is redirected to the project page.

    Returns a redirection to the login page for anonymous users and a 403
    response when the project belongs to another user.

    Raises:
        Http404: ``project_id`` is not an integer, or no node of type
            'Project' has this id.
    """
    # Imported here because the module-level ``django.http`` import only
    # brings in Http404, HttpResponse and HttpResponseRedirect.
    from django.http import HttpResponseForbidden

    # do we have a valid user? (checked before anything is read from the
    # database, as the previous implementation of this view did)
    user = request.user
    if not user.is_authenticated():
        return redirect('/login/?next=%s' % request.path)

    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # SQLAlchemy session
    session = Session()

    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.type_id == cache.NodeType['Project'].id)
    ).first()
    if project is None:
        raise Http404()
    if project.user_id != user.id:
        return HttpResponseForbidden()

    # Let's find out about the children nodes of the project
    ChildrenNode = aliased(Node)
    corpus_query = (session
        .query(Node, Resource, func.count(ChildrenNode.id))
        # outer join: corpora without any child must still be listed
        .outerjoin(ChildrenNode, ChildrenNode.parent_id == Node.id)
        .join(Node_Resource, Node_Resource.node_id == Node.id)
        .join(Resource, Resource.id == Node_Resource.resource_id)
        .filter(Node.parent_id == project.id)
        .group_by(Node, Resource)
    )
    corpora_by_resourcetype = defaultdict(list)
    documents_count_by_resourcetype = defaultdict(int)
    corpora_count = 0
    for corpus, resource, document_count in corpus_query:
        resourcetype_name = cache.ResourceType[resource.type_id].name
        corpora_by_resourcetype[resourcetype_name].append({
            'id': corpus.id,
            'name': corpus.name,
            'count': document_count,
        })
        documents_count_by_resourcetype[resourcetype_name] += document_count
        corpora_count += 1

    # do the donut
    total_documents_count = sum(documents_count_by_resourcetype.values())
    # A freshly uploaded corpus has no document yet, so the total can be 0
    # while resource types are listed: never divide by zero.
    percent_base = total_documents_count or 1
    donut = [
        {
            'source': key,
            'count': value,
            'part': round(value * 100 / percent_base),
        }
        for key, value in documents_count_by_resourcetype.items()
    ]

    # deal with the form
    if request.method == 'POST':
        # form validation
        form = CustomForm(request.POST, request.FILES)
        if form.is_valid():
            # extract information from the form
            name = form.cleaned_data['name']
            thefile = form.cleaned_data['file']
            resourcetype = cache.ResourceType[form.cleaned_data['type']]
            # which default language shall be used?
            if resourcetype.name == "europress_french":
                language_id = cache.Language['fr'].id
            elif resourcetype.name == "europress_english":
                language_id = cache.Language['en'].id
            else:
                language_id = None
            # corpus node instantiation as a Django model
            from node import models
            dj_corpus = models.Node(
                name=name,
                user_id=request.user.id,
                parent_id=project_id,
                type_id=cache.NodeType['Corpus'].id,
                language_id=language_id,
            )
            dj_corpus.save()
            # add the uploaded resource to the corpus
            dj_corpus.add_resource(
                user_id=request.user.id,
                type_id=resourcetype.id,
                file=thefile,
            )
            # let's start the workflow; a failure is reported but does not
            # prevent the redirection, since the corpus has been saved
            try:
                if DEBUG is True:
                    dj_corpus.workflow()
                else:
                    dj_corpus.workflow.apply_async((), countdown=3)
            except Exception as error:
                print('WORKFLOW ERROR')
                print(error)
            # redirect to the main project page
            return HttpResponseRedirect('/project/' + str(project_id))
        else:
            # the bound (invalid) form is rendered below
            print('ERROR: BAD FORM')
    else:
        form = CustomForm()

    # HTML output
    return render(request, 'project.html', {
        'form': form,
        'user': user,
        'date': datetime.now(),
        'project': project,
        'donut': donut,
        'list_corpora': dict(corpora_by_resourcetype),
        'whitelists': '',
        'blacklists': '',
        'cooclists': '',
        'number': corpora_count,
    })
\ No newline at end of file
parsing/corpus.py
0 → 100644
View file @
76c1a3dd
from
collections
import
defaultdict
from
gargantext_web.db
import
*
from
.FileParsers
import
*
# Parser class to use for each resource-type name. The three europress
# variants share the same parser class.
_parsers = dict(
    pubmed=PubmedFileParser,
    isi=IsiFileParser,
    ris=RisFileParser,
    europress=EuropressFileParser,
    europress_french=EuropressFileParser,
    europress_english=EuropressFileParser,
)
def parse_corpus_resources(corpus, user=None, user_id=None):
    """Create one 'Document' node per document parsed from a corpus.

    Every resource attached to ``corpus`` is parsed with the parser
    registered in ``_parsers`` for its resource type. A child node of the
    corpus is created for each parsed document, then each metadata of these
    nodes is stored as a ``Node_Metadata`` row. ``corpus.parsed`` is set to
    True at the end.

    Args:
        corpus: node whose resources are parsed; its ``id`` is read and its
            ``parsed`` attribute is set.
        user: optional user object; only its ``id`` is used, and only when
            ``user_id`` is not given.
        user_id: optional id of the owner of the created nodes.

    Raises:
        KeyError: a resource has a type name that is absent from
            ``_parsers``.
    """
    session = Session()
    type_id = cache.NodeType['Document'].id
    if user_id is None and user is not None:
        user_id = user.id

    # keep all the parsers in a cache: one instance per resource-type name,
    # created the first time the type is met
    parsers = {}

    # find resource of the corpus
    resources_query = (session
        .query(Resource, ResourceType)
        .join(ResourceType, ResourceType.id == Resource.type_id)
        .join(Node_Resource, Node_Resource.resource_id == Resource.id)
        .join(Node, Node.id == Node_Resource.node_id)
        .filter(Node.id == corpus.id)
    )

    # make a new node for every parsed document of the corpus
    nodes = []
    for resource, resourcetype in resources_query:
        parser_name = resourcetype.name
        if parser_name not in parsers:
            parsers[parser_name] = _parsers[parser_name]()
        parser = parsers[parser_name]
        # NOTE(review): assumes the file parsers expose ``parse(file)``
        # yielding one metadata dict per document, and that the resource
        # stores its file in ``resource.file`` -- confirm against
        # FileParsers and the Resource model.
        for metadata_dict in parser.parse(resource.file):
            # retrieve language ID from metadata; a missing key and an
            # unknown language both end up without language
            try:
                language_id = cache.Language[metadata_dict['language_iso2']].id
            except KeyError:
                language_id = None
            # create new node
            nodes.append(Node(
                name=metadata_dict.get('title', ''),
                parent_id=corpus.id,
                user_id=user_id,
                type_id=type_id,
                language_id=language_id,
                metadata=metadata_dict,
            ))
    session.add_all(nodes)
    # commit now so that the nodes get their ids before being indexed
    session.commit()

    # now, index the metadata
    for node in nodes:
        node_id = node.id
        for metadata_key, metadata_value in node.metadata.items():
            metadata = cache.Metadata[metadata_key]
            if metadata.type == 'string':
                # presumably the length of the string column -- TODO confirm
                metadata_value = metadata_value[:255]
            # the value goes into the column matching the metadata type
            session.add(Node_Metadata(**{
                'node_id': node_id,
                'metadata_id': metadata.id,
                'value_' + metadata.type: metadata_value,
            }))
    session.commit()

    # mark the corpus as parsed
    corpus.parsed = True
def parse_corpus(corpus):
    """Prepare the parsing of ``corpus`` (unfinished).

    For now this only builds a ``ModelCache`` over the Django ``Node``
    model; ``corpus`` itself is not used yet and nothing is returned.
    """
    # prepare the cache for ngrams
    # The Django models live in the ``node`` package (see the imports of
    # gargantext_web/views_optimized.py), not in ``nodes``.
    from node import models
    # NOTE(review): the cache is named after ngrams but is built over
    # models.Node -- confirm which model was intended.
    ngrams = ModelCache(models.Node)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment