Commit 7e8d0634 authored by Mathieu Rodic

[FEAT] status has even more info

[FEAT] JSON serialization is much improved and now has its own module
[CODE] upgraded to Python 3.5 (`traceback.TracebackException`)
[CODE] did some cleaning, here and there...
parent 07bec393
# Installation
First, install Python 3.5 (see https://www.python.org/downloads/ for
download links).
```bash
cd /tmp
wget https://www.python.org/ftp/python/3.5.1/Python-3.5.1.tar.xz
tar xvfJ Python-3.5.1.tar.xz
cd Python-3.5.1
./configure
make -j4 # -j4 runs the build with 4 parallel jobs
sudo make install
```
Other components are required:
```bash
sudo pip3.5 install virtualenv
sudo apt-get install rabbitmq-server
```
Then build a virtual environment:
```bash
virtualenv-3.5 VENV
source VENV/bin/activate
pip3.5 install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1
pip3.5 install -U -r requirements.txt
```
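To confirm the virtual environment picked up the interpreter built above, a quick check from inside the activated `VENV` (nothing project-specific is assumed):

```python
# run with `python` inside the activated VENV
import sys
assert sys.version_info >= (3, 5), 'expected the Python 3.5 build from above'
print(sys.version)
```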
@@ -39,6 +58,6 @@ pip install -U -r requirements.txt
# Start the Django server
```bash
./manage.py celeryd --loglevel=INFO # to ensure Celery is properly started
./manage.py runserver
```
@@ -2,10 +2,7 @@
Be more careful about authorizations.
# Constants
Remove "magic numbers" (such as 4096, etc.) from the code and put them in
`constants.py`.
cf. "ng-resource".
# Projects
@@ -77,3 +77,8 @@ from .settings import BASE_DIR
```python
UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads')
UPLOAD_LIMIT = 1024 * 1024 * 1024
DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY

# batch sizes for background processing
BATCH_PARSING_SIZE = 256
BATCH_NGRAMSEXTRACTION_SIZE = 1024
```
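A minimal sketch of how these are meant to be consumed (assuming they are importable from `gargantext.constants`, as the star-import in the parsing code further down suggests):

```python
from gargantext.constants import BATCH_PARSING_SIZE

# commit periodically instead of hard-coding 256 inline
for documents_count in range(1000):
    if documents_count % BATCH_PARSING_SIZE == 0:
        pass  # the real code saves status and commits the session here
```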
@@ -89,9 +89,10 @@ class Node(Base):
```python
            {'type': type, 'path': path, 'url': url, 'extracted': False}
        ))

    def status(self, action=None, progress=0, complete=False, error=None):
        """Get or update the status of the given action.
        """
        date = datetime.now()
        # if the hyperdata does not hold status data yet
        if 'statuses' not in self.hyperdata:
            self['statuses'] = MutableList()
```
@@ -106,13 +107,17 @@
```python
        # retrieve the status corresponding to the given action name
        for status in self['statuses']:
            if status['action'] == action:
                if error:
                    status['error'] = error
                if progress:
                    status['progress'] = progress
                if complete:
                    status['complete'] = complete
                if error or progress or complete:
                    status['date'] = date
                return status
        # if no status was found for the action, append a new one
        self['statuses'].append(MutableDict(
            {'action': action, 'progress': progress,
             'complete': complete, 'error': error, 'date': date}
        ))
        return self['statuses'][-1]
```
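A minimal usage sketch, assuming `corpus` is a `Node` instance: each call creates or updates the entry for its action, stamping it with `date`, and exceptions can now be attached directly.

```python
# create or update the status entry for the 'parsing' action
status = corpus.status('parsing', progress=128)
print(status['progress'], status['date'])   # 128, datetime of this update

# attach a real exception object to the same entry
try:
    raise ValueError('bad resource')
except Exception as error:
    corpus.status('parsing', error=error)

corpus.status('parsing', complete=True)     # mark the action as finished
```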
@@ -5,13 +5,17 @@ from gargantext import settings
```python
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
from gargantext.util.json import json_dumps


def get_engine():
    from sqlalchemy import create_engine
    url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'.format(
        **settings.DATABASES['default']
    )
    return create_engine(url,
        use_native_hstore = True,
        json_serializer = json_dumps,
    )


engine = get_engine()
```
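`json_serializer` is the hook SQLAlchemy calls when binding values for JSON columns, so hyperdata holding datetimes or exceptions now serializes cleanly instead of raising `TypeError`. A quick illustration of what the custom dumper handles (assuming the `gargantext.util.json` module shown further down):

```python
from datetime import datetime
from gargantext.util.json import json_dumps

# the stdlib encoder would raise TypeError on the datetime value below
print(json_dumps({'action': 'parsing', 'date': datetime(2016, 2, 5, 12, 0)}))
# -> {"action": "parsing", "date": "2016-02-05T12:00:00Z"}
```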
@@ -36,8 +36,6 @@ def get(url):
```python
def get_parameters(request):
    parameters = {}
    for key, value in request.GET._iterlists():
        if key.endswith('[]'):
            parameters[key[:-2]] = value
```
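For reference, a self-contained sketch of the `[]` convention this loop implements, using `QueryDict.lists()`, the public counterpart of the private `_iterlists()`:

```python
from django.http import QueryDict

# simulate "?ids[]=1&ids[]=2&page=3" (explicit encoding avoids needing settings)
query = QueryDict('ids[]=1&ids[]=2&page=3', encoding='utf-8')
parameters = {}
for key, value in query.lists():
    if key.endswith('[]'):
        parameters[key[:-2]] = value
print(parameters)   # {'ids': ['1', '2']}
```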
@@ -53,18 +51,7 @@ from rest_framework.views import APIView
```python
# provide a JSON response
from gargantext.util.json import json_encoder


def JsonHttpResponse(data, status=200):
    return HttpResponse(
        content = json_encoder.encode(data),
```
```python
import json
import datetime
import traceback


class JSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, datetime.datetime):
            # serialize datetimes as ISO-8601 strings, truncated to the second
            return obj.isoformat()[:19] + 'Z'
        elif isinstance(obj, (set, tuple)):
            return list(obj)
        elif isinstance(obj, Exception):
            # Python 3.5: serialize exceptions as their formatted traceback lines
            tbe = traceback.TracebackException.from_exception(obj)
            return list(line.strip() for line in tbe.format())
        else:
            return super(self.__class__, self).default(obj)

json_encoder = JSONEncoder(indent=4)

def json_dumps(obj):
    return json.dumps(obj, cls=JSONEncoder)
```
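With the new `Exception` branch, a raised error serializes to its formatted traceback lines; this is what ends up stored when hyperdata written via `Node.status(..., error=...)` reaches the database. A quick demonstration (assuming the module above is importable as `gargantext.util.json`):

```python
from gargantext.util.json import json_dumps

try:
    1 / 0
except Exception as error:
    print(json_dumps({'error': error}))
# the exception comes out as a JSON list of traceback lines, roughly:
# {"error": ["Traceback (most recent call last):", "...", "ZeroDivisionError: division by zero"]}
```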
@@ -37,43 +37,49 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstr
```python
    The result is then inserted into the database.
    Only the fields indicated in `keys` are tagged.
    """
    try:
        db, cursor = get_cursor()
        nodes_ngrams_count = defaultdict(int)
        ngrams_data = set()
        # extract ngrams
        resource_type_index = corpus.resources()[0]['type']
        resource_type = RESOURCETYPES[resource_type_index]
        default_language_iso2 = resource_type['default_language']
        for documents_count, document in enumerate(corpus.children('DOCUMENT')):
            # get the ngrams extractor for the current document
            language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
            try:
                ngramsextractor = ngramsextractors[language_iso2]
            except KeyError:
                print('Unrecognized language: `%s`' % (language_iso2, ))
                continue
            # extract ngrams from each of the considered keys
            for key in keys:
                value = document.hyperdata.get(key, None)
                if not isinstance(value, str):
                    continue
                # get ngrams
                for ngram in ngramsextractor.extract(value):
                    tokens = tuple(token[0] for token in ngram)
                    terms = ' '.join(tokens)
                    nodes_ngrams_count[(document.id, terms)] += 1
                    ngrams_data.add((terms[:255], len(tokens), ))
            # integrate ngrams and nodes-ngrams
            if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
                _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
                nodes_ngrams_count.clear()
                ngrams_data.clear()
            if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
                corpus.status('ngrams_extraction', progress=documents_count+1)
                corpus.save_hyperdata()
                session.commit()
        # integrate the remaining ngrams and nodes-ngrams
        _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
        corpus.status('ngrams_extraction', progress=documents_count+1, complete=True)
        corpus.save_hyperdata()
        session.commit()
    except Exception as error:
        corpus.status('ngrams_extraction', error=error)
        corpus.save_hyperdata()
        session.commit()
        raise error
```
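The buffering above follows one pattern: accumulate `(document, term)` counts and distinct `(term, length)` rows, then flush to the database whenever the buffer crosses `BATCH_NGRAMSEXTRACTION_SIZE`, plus once at the end. A toy, database-free sketch of that pattern:

```python
from collections import defaultdict

BATCH_SIZE = 3   # stands in for BATCH_NGRAMSEXTRACTION_SIZE
nodes_ngrams_count = defaultdict(int)
ngrams_data = set()

def flush():
    # stands in for _integrate_associations(): bulk-insert, then reset buffers
    print('flushing', dict(nodes_ngrams_count), ngrams_data)
    nodes_ngrams_count.clear()
    ngrams_data.clear()

for document_id, terms in [(1, 'neural network'), (1, 'neural network'),
                           (2, 'graph'), (2, 'random walk')]:
    nodes_ngrams_count[(document_id, terms)] += 1
    ngrams_data.add((terms[:255], len(terms.split())))
    if len(nodes_ngrams_count) >= BATCH_SIZE:
        flush()
flush()   # integrate whatever remains after the loop
```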
@@ -4,30 +4,37 @@ from gargantext.constants import *
```python
def parse(corpus):
    try:
        documents_count = 0
        corpus.status('parsing', progress=0)
        # retrieve resource information
        for resource in corpus.resources():
            # skip resources that have already been extracted
            if resource['extracted']:
                continue
            resource_parser = RESOURCETYPES[resource['type']]['parser']
            resource_path = resource['path']
            # extract and insert documents from the corpus resource into the database
            for hyperdata in resource_parser(resource_path):
                document = corpus.add_child(
                    typename = 'DOCUMENT',
                    name = hyperdata.get('title', '')[:255],
                    hyperdata = hyperdata,
                )
                session.add(document)
                if documents_count % BATCH_PARSING_SIZE == 0:
                    corpus.status('parsing', progress=documents_count)
                    corpus.save_hyperdata()
                    session.commit()
                documents_count += 1
            # update info about the resource
            resource['extracted'] = True
        # commit all changes
        corpus.status('parsing', progress=documents_count, complete=True)
        corpus.save_hyperdata()
        session.commit()
    except Exception as error:
        corpus.status('parsing', error=error)
        corpus.save_hyperdata()
        session.commit()
        raise error
```
@@ -18,7 +18,6 @@ class NodeListResource(APIView):
```python
        parameters = validate(parameters, {'type': dict, 'items': {
            'pagination_limit': {'type': int, 'default': 10},
            'pagination_offset': {'type': int, 'default': 0},
            'fields': {'type': list, 'default': self._fields, 'items': {
                'type': str, 'range': self._fields,
            }},
```
@@ -94,7 +94,6 @@ def project(request, project_id):
```python
    )
    session.add(corpus)
    session.commit()
    scheduled(parse_extract)(corpus.id)
    # corpora within this project
```
amqp==1.4.9
anyjson==0.3.3
billiard==3.3.0.22
celery==3.1.20
dateparser==0.3.2
Django==1.9.2
django-celery==3.1.17
django-pgfields==1.4.4
django-pgjsonb==0.0.16
@@ -21,6 +18,10 @@ psycopg2==2.6.1
pycountry==1.20
python-dateutil==2.4.2
pytz==2015.7
PyYAML==3.11
RandomWords==0.1.12
six==1.10.0
SQLAlchemy==1.1.0b1.dev0
ujson==1.35
umalqurra==0.2
wheel==0.29.0