Commit 07bec393 authored by Mathieu Rodic

[FEAT] greatly improved status management

parent b1edb743
......@@ -89,14 +89,30 @@ class Node(Base):
{'type': type, 'path':path, 'url':url, 'extracted': False}
))
def status(self, action=None, progress=None):
    """Read or update the single status stored in the node's hyperdata

    When the hyperdata hold no 'status' entry yet, one is created from the
    given `action` and `progress` (either may be None). Otherwise only the
    values that are not None overwrite the stored ones.
    Returns the stored status.
    """
    if 'status' in self.hyperdata:
        # only overwrite the fields for which a value was actually given
        for field, value in (('action', action), ('progress', progress)):
            if value is not None:
                self['status'][field] = value
    else:
        # first call for this node: store a fresh status
        self['status'] = MutableDict(
            {'action': action, 'progress': progress}
        )
    return self['status']
def status(self, action=None, progress=None, complete=False):
    """Read, update or create an entry in the node's list of statuses

    Without `action`: returns the first status that is not complete; if
    all of them are complete, the last one of the list; None when the
    list is empty. `progress` and `complete` are ignored in that case.

    With `action`: the status bearing that action name gets its progress
    replaced (when `progress` is not None) and is flagged as complete
    (when `complete` is true), then is returned. If no status bears that
    name, a new one is appended to the list and returned.
    """
    # make sure the hyperdata hold a list of statuses
    if 'statuses' not in self.hyperdata:
        self['statuses'] = MutableList()
    # no action name given: look for the status to be reported
    if action is None:
        for entry in self['statuses']:
            if entry['complete']:
                continue
            return entry
        # every status is complete (or there is none at all)
        if not self['statuses']:
            return None
        return self['statuses'][-1]
    # an action name is given: update the matching status, if any
    for entry in self['statuses']:
        if entry['action'] != action:
            continue
        if progress is not None:
            entry['progress'] = progress
        # a status can be flagged as complete, but never flagged back
        if complete:
            entry['complete'] = complete
        return entry
    # unknown action name: append a new status for it
    new_entry = MutableDict(
        {'action': action, 'progress': progress, 'complete': complete}
    )
    self['statuses'].append(new_entry)
    return self['statuses'][-1]
......@@ -44,7 +44,7 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstr
resource_type_index = corpus.resources()[0]['type']
resource_type = RESOURCETYPES[resource_type_index]
default_language_iso2 = resource_type['default_language']
for document in corpus.children('DOCUMENT'):
for documents_count, document in enumerate(corpus.children('DOCUMENT')):
# get ngrams extractor for the current document
language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
try:
......@@ -68,5 +68,12 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstr
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
nodes_ngrams_count.clear()
ngrams_data.clear()
if documents_count % 1024 == 0:
corpus.status('ngrams_extraction', progress=documents_count+1)
corpus.save_hyperdata()
session.commit()
# integrate ngrams and nodes-ngrams
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
corpus.status('ngrams_extraction', progress=documents_count+1, complete=True)
corpus.save_hyperdata()
session.commit()
......@@ -21,12 +21,13 @@ def parse(corpus):
)
session.add(document)
if documents_count % 64 == 0:
corpus.status(action='parsing', progress=documents_count)
corpus.status('parsing', progress=documents_count)
corpus.save_hyperdata()
session.commit()
documents_count += 1
# update info about the resource
resource['extracted'] = True
corpus.save_hyperdata()
# commit all changes
corpus.status(action='parsing', progress=documents_count)
corpus.status('parsing', progress=documents_count, complete=True)
corpus.save_hyperdata()
session.commit()
......@@ -35,7 +35,7 @@ def corpus(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus': corpus,
# 'processing': processing,
# 'processing': corpus['extracted'],
# 'number': number,
'view': 'documents'
},
......
......@@ -102,10 +102,20 @@ def project(request, project_id):
sourcename2corpora = defaultdict(list)
for corpus in corpora:
# we only consider the first resource of the corpus to determine its type
corpus.count = corpus.children('DOCUMENT').count()
resource = corpus.resources()[0]
resource_type = RESOURCETYPES[resource['type']]
sourcename2corpora[resource_type['name']].append(corpus)
resource_type_name = RESOURCETYPES[resource['type']]['name']
# add some data for the viewer
corpus.count = corpus.children('DOCUMENT').count()
status = corpus.status()
if status is not None and not status['complete']:
corpus.status_message = '(in progress: %s, %d complete)' % (
status['action'].replace('_', ' '),
status['progress'],
)
else:
corpus.status_message = ''
# add the corpus to the list of corpora sharing its source type
sourcename2corpora[resource_type_name].append(corpus)
# sources & their respective counts
total_documentscount = 0
sourcename2documentscount = defaultdict(int)
......
......@@ -82,7 +82,9 @@
<img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}">
Processing, drink a cup of tea, and refresh the page :)
{% else %}
<a href="/projects/{{project.id}}/corpora/{{corpus.id}}"> {{corpus.name}} , {{ corpus.count }} documents</a>
<a href="/projects/{{project.id}}/corpora/{{corpus.id}}">
{{corpus.name}}, {{ corpus.count }} documents {{ corpus.status_message }}
</a>
{% endifequal %}
<button type="button" class="btn btn-xs btn-default" data-container="body" data-toggle="popover" data-placement="bottom"
data-content="
......@@ -108,7 +110,9 @@
<h3>
<a href="/projects/{{project.id}}/corpora/{{corpus.id}}">{{corpus.name}}</a>
</h3>
<h4>{{ corpus.count }} Documents </h4>
<h4>
{{ corpus.count }} documents
</h4>
<h5>Activity:</h5>
<div class="chart" data-percent="73">73%</div>
</div>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment