Commit 4d4384a4 authored by delanoe's avatar delanoe

Merge branch 'refactoring' into refactoring-alex

parents 5badb85d 6e5bf987
# Installation # Installation
First, install Python 3.5 (see https://www.python.org/downloads/ for
download links).
```bash
cd /tmp
wget https://www.python.org/ftp/python/3.5.1/Python-3.5.1.tar.xz
tar xvfJ Python-3.5.1.tar.xz
cd Python-3.5.1
./configure
make -j4 # -j4 runs up to four build jobs in parallel
sudo make install
```
Other components are required:
```bash ```bash
sudo apt-get install python3.4 sudo pip3.5 install virtualenv
sudo pip3 install virtualenv
sudo apt-get install rabbitmq-server sudo apt-get install rabbitmq-server
virtualenv-3.4 VENV ```
Then build a virtual environment:
```bash
virtualenv-3.5 VENV
source VENV/bin/activate source VENV/bin/activate
pip install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1 pip3.5 install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1
pip install -U -r requirements.txt pip3.5 install -U -r requirements.txt
``` ```
...@@ -39,6 +58,6 @@ pip install -U -r requirements.txt ...@@ -39,6 +58,6 @@ pip install -U -r requirements.txt
# Start the Django server # Start the Django server
```bash ```bash
manage.py celeryd --loglevel=INFO # to ensure Celery is properly started ./manage.py celeryd --loglevel=INFO # to ensure Celery is properly started
manage.py runserver ./manage.py runserver
``` ```
...@@ -2,10 +2,7 @@ ...@@ -2,10 +2,7 @@
Be more careful about authorizations. Be more careful about authorizations.
# Constants cf. "ng-resource".
Remove "magic numbers" (such as 4096, etc.) from the code and put them in
`constants.py`.
# Projects # Projects
......
...@@ -77,3 +77,8 @@ from .settings import BASE_DIR ...@@ -77,3 +77,8 @@ from .settings import BASE_DIR
UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads') UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads')
UPLOAD_LIMIT = 1024 * 1024 * 1024 UPLOAD_LIMIT = 1024 * 1024 * 1024
DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
# about batch processing...
BATCH_PARSING_SIZE = 256
BATCH_NGRAMSEXTRACTION_SIZE = 1024
...@@ -89,9 +89,10 @@ class Node(Base): ...@@ -89,9 +89,10 @@ class Node(Base):
{'type': type, 'path':path, 'url':url, 'extracted': False} {'type': type, 'path':path, 'url':url, 'extracted': False}
)) ))
def status(self, action=None, progress=None, complete=False): def status(self, action=None, progress=0, complete=False, error=None):
"""Get the status of the given action """Get the status of the given action
""" """
date = datetime.now()
# if the hyperdata do not have data about status # if the hyperdata do not have data about status
if 'statuses' not in self.hyperdata: if 'statuses' not in self.hyperdata:
self['statuses'] = MutableList() self['statuses'] = MutableList()
...@@ -106,13 +107,17 @@ class Node(Base): ...@@ -106,13 +107,17 @@ class Node(Base):
# retrieve the status concerning by the given action name # retrieve the status concerning by the given action name
for status in self['statuses']: for status in self['statuses']:
if status['action'] == action: if status['action'] == action:
if progress is not None: if error:
status['error'] = error
if progress:
status['progress'] = progress status['progress'] = progress
if complete: if complete:
status['complete'] = complete status['complete'] = complete
if error or progress or complete:
status['date'] = date
return status return status
# if no status has been found for the action, append a new one # if no status has been found for the action, append a new one
self['statuses'].append(MutableDict( self['statuses'].append(MutableDict(
{'action': action, 'progress': progress, 'complete': complete} {'action':action, 'progress':progress, 'complete':complete, 'error':error, 'date':date}
)) ))
return self['statuses'][-1] return self['statuses'][-1]
...@@ -5,13 +5,17 @@ from gargantext import settings ...@@ -5,13 +5,17 @@ from gargantext import settings
from sqlalchemy.orm import sessionmaker, scoped_session from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from gargantext.util.json import json_dumps
def get_engine(): def get_engine():
from sqlalchemy import create_engine from sqlalchemy import create_engine
url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'.format( url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'.format(
**settings.DATABASES['default'] **settings.DATABASES['default']
) )
return create_engine(url, use_native_hstore=True) return create_engine(url,
use_native_hstore = True,
json_serializer = json_dumps,
)
engine = get_engine() engine = get_engine()
......
...@@ -36,8 +36,6 @@ def get(url): ...@@ -36,8 +36,6 @@ def get(url):
def get_parameters(request): def get_parameters(request):
parameters = {} parameters = {}
print(request.GET)
print(request.GET._iterlists())
for key, value in request.GET._iterlists(): for key, value in request.GET._iterlists():
if key.endswith('[]'): if key.endswith('[]'):
parameters[key[:-2]] = value parameters[key[:-2]] = value
...@@ -53,18 +51,7 @@ from rest_framework.views import APIView ...@@ -53,18 +51,7 @@ from rest_framework.views import APIView
# provide a JSON response # provide a JSON response
import json from gargantext.util.json import json_encoder
import datetime
class JSONEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, datetime.datetime):
return obj.isoformat()[:19] + 'Z'
elif isinstance(obj, (set, tuple)):
return list(obj)
else:
return super(self.__class__, self).default(obj)
json_encoder = JSONEncoder(indent=4)
def JsonHttpResponse(data, status=200): def JsonHttpResponse(data, status=200):
return HttpResponse( return HttpResponse(
content = json_encoder.encode(data), content = json_encoder.encode(data),
......
import json
import datetime
import traceback
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that also handles datetimes, sets, tuples and exceptions."""

    def default(self, obj):
        """Return a JSON-serializable replacement for `obj`.

        - `datetime.datetime` becomes its ISO-8601 text cut to the first 19
          characters (seconds precision) with a literal 'Z' appended; no
          timezone conversion is performed.
        - `set` and `tuple` become lists.
        - `Exception` becomes the list of its formatted traceback lines,
          each stripped of surrounding whitespace.

        Any other type is delegated to the base class, which raises
        `TypeError`.
        """
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()[:19] + 'Z'
        if isinstance(obj, (set, tuple)):
            return list(obj)
        if isinstance(obj, Exception):
            tbe = traceback.TracebackException.from_exception(obj)
            return [line.strip() for line in tbe.format()]
        # Zero-argument super(): `super(self.__class__, self)` resolves to the
        # subclass when this encoder is subclassed and then recurses forever.
        return super().default(obj)


# Shared, pretty-printing encoder instance (4-space indentation).
json_encoder = JSONEncoder(indent=4)
def json_dumps(obj):
    """Serialize `obj` to a compact JSON string using the project `JSONEncoder`."""
    serialized = json.dumps(obj, cls=JSONEncoder)
    return serialized
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
# In case this bash file is placed in another directory (e.g., /etc/init.d), # In case this bash file is placed in another directory (e.g., /etc/init.d),
# the following line should be changed to an absolute path # the following line should be changed to an absolute path
PYTHON_VERSION=3.4
DAEMON_DIR=$( cd "$(dirname "$BASH_SOURCE[0]")" && pwd) DAEMON_DIR=$( cd "$(dirname "$BASH_SOURCE[0]")" && pwd)
DAEMON_SCRIPT=$DAEMON_DIR/server.py DAEMON_SCRIPT=$DAEMON_DIR/server.py
DAEMON_NAME=nlpserver DAEMON_NAME=nlpserver
...@@ -17,7 +18,7 @@ do_start () { ...@@ -17,7 +18,7 @@ do_start () {
log_daemon_msg "Starting system '$DAEMON_NAME' daemon..." log_daemon_msg "Starting system '$DAEMON_NAME' daemon..."
/sbin/start-stop-daemon --start --quiet \ /sbin/start-stop-daemon --start --quiet \
--make-pidfile --pidfile $DAEMON_PID --background \ --make-pidfile --pidfile $DAEMON_PID --background \
--startas /bin/bash -- -c "python3 $DAEMON_SCRIPT $DAEMON_ARGS > /tmp/$DAEMON_NAME.log 2>&1" --startas /bin/bash -- -c "python$PYTHON_VERSION $DAEMON_SCRIPT $DAEMON_ARGS > /tmp/$DAEMON_NAME.log 2>&1"
# --exec $DAEMON_SCRIPT \ # --exec $DAEMON_SCRIPT \
# --user $DAEMON_USER --chuid $DAEMON_USER # --user $DAEMON_USER --chuid $DAEMON_USER
log_end_msg $? log_end_msg $?
...@@ -25,6 +26,7 @@ do_start () { ...@@ -25,6 +26,7 @@ do_start () {
do_stop () { do_stop () {
log_daemon_msg "Stopping system '$DAEMON_NAME' daemon..." log_daemon_msg "Stopping system '$DAEMON_NAME' daemon..."
/sbin/start-stop-daemon --stop --pidfile $DAEMON_PID --retry 10 /sbin/start-stop-daemon --stop --pidfile $DAEMON_PID --retry 10
rm $DAEMON_PID
log_end_msg $? log_end_msg $?
} }
......
...@@ -37,43 +37,49 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstr ...@@ -37,43 +37,49 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstr
The result is then inserted into database. The result is then inserted into database.
Only fields indicated in `keys` are tagged. Only fields indicated in `keys` are tagged.
""" """
db, cursor = get_cursor() try:
nodes_ngrams_count = defaultdict(int) db, cursor = get_cursor()
ngrams_data = set() nodes_ngrams_count = defaultdict(int)
# extract ngrams ngrams_data = set()
resource_type_index = corpus.resources()[0]['type'] # extract ngrams
resource_type = RESOURCETYPES[resource_type_index] resource_type_index = corpus.resources()[0]['type']
default_language_iso2 = resource_type['default_language'] resource_type = RESOURCETYPES[resource_type_index]
for documents_count, document in enumerate(corpus.children('DOCUMENT')): default_language_iso2 = resource_type['default_language']
# get ngrams extractor for the current document for documents_count, document in enumerate(corpus.children('DOCUMENT')):
language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2) # get ngrams extractor for the current document
try: language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
ngramsextractor = ngramsextractors[language_iso2] try:
except KeyError: ngramsextractor = ngramsextractors[language_iso2]
print('Unrecognized language: `%s`' % (language_iso2, )) except KeyError:
continue print('Unrecognized language: `%s`' % (language_iso2, ))
# extract ngrams on each of the considered keys
for key in keys:
value = document.hyperdata.get(key, None)
if not isinstance(value, str):
continue continue
# get ngrams # extract ngrams on each of the considered keys
for ngram in ngramsextractor.extract(value): for key in keys:
tokens = tuple(token[0] for token in ngram) value = document.hyperdata.get(key, None)
terms = ' '.join(tokens) if not isinstance(value, str):
nodes_ngrams_count[(document.id, terms)] += 1 continue
ngrams_data.add((terms[:255], len(tokens), )) # get ngrams
for ngram in ngramsextractor.extract(value):
tokens = tuple(token[0] for token in ngram)
terms = ' '.join(tokens)
nodes_ngrams_count[(document.id, terms)] += 1
ngrams_data.add((terms[:255], len(tokens), ))
# integrate ngrams and nodes-ngrams
if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
nodes_ngrams_count.clear()
ngrams_data.clear()
if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
corpus.status('ngrams_extraction', progress=documents_count+1)
corpus.save_hyperdata()
session.commit()
# integrate ngrams and nodes-ngrams # integrate ngrams and nodes-ngrams
if len(nodes_ngrams_count) >= 4096: _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor) corpus.status('ngrams_extraction', progress=documents_count+1, complete=True)
nodes_ngrams_count.clear() corpus.save_hyperdata()
ngrams_data.clear() session.commit()
if documents_count % 1024 == 0: except Exception as error:
corpus.status('ngrams_extraction', progress=documents_count+1) corpus.status('ngrams_extraction', error=error)
corpus.save_hyperdata() corpus.save_hyperdata()
session.commit() session.commit()
# integrate ngrams and nodes-ngrams raise error
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
corpus.status('ngrams_extraction', progress=documents_count+1, complete=True)
corpus.save_hyperdata()
session.commit()
...@@ -4,30 +4,37 @@ from gargantext.constants import * ...@@ -4,30 +4,37 @@ from gargantext.constants import *
def parse(corpus): def parse(corpus):
# retrieve resource information try:
documents_count = 0 documents_count = 0
for resource in corpus.resources(): corpus.status('parsing', progress=0)
# information about the resource # retrieve resource information
if resource['extracted']: for resource in corpus.resources():
continue # information about the resource
resource_parser = RESOURCETYPES[resource['type']]['parser'] if resource['extracted']:
resource_path = resource['path'] continue
# extract and insert documents from corpus resource into database resource_parser = RESOURCETYPES[resource['type']]['parser']
for hyperdata in resource_parser(resource_path): resource_path = resource['path']
document = corpus.add_child( # extract and insert documents from corpus resource into database
typename = 'DOCUMENT', for hyperdata in resource_parser(resource_path):
name = hyperdata.get('title', '')[:255], document = corpus.add_child(
hyperdata = hyperdata, typename = 'DOCUMENT',
) name = hyperdata.get('title', '')[:255],
session.add(document) hyperdata = hyperdata,
if documents_count % 64 == 0: )
corpus.status('parsing', progress=documents_count) session.add(document)
corpus.save_hyperdata() if documents_count % BATCH_PARSING_SIZE == 0:
session.commit() corpus.status('parsing', progress=documents_count)
documents_count += 1 corpus.save_hyperdata()
# update info about the resource session.commit()
resource['extracted'] = True documents_count += 1
# commit all changes # update info about the resource
corpus.status('parsing', progress=documents_count, complete=True) resource['extracted'] = True
corpus.save_hyperdata() # commit all changes
session.commit() corpus.status('parsing', progress=documents_count, complete=True)
corpus.save_hyperdata()
session.commit()
except Exception as error:
corpus.status('parsing', error=error)
corpus.save_hyperdata()
session.commit()
raise error
...@@ -18,7 +18,6 @@ class NodeListResource(APIView): ...@@ -18,7 +18,6 @@ class NodeListResource(APIView):
parameters = validate(parameters, {'type': dict, 'items': { parameters = validate(parameters, {'type': dict, 'items': {
'pagination_limit': {'type': int, 'default': 10}, 'pagination_limit': {'type': int, 'default': 10},
'pagination_offset': {'type': int, 'default': 0}, 'pagination_offset': {'type': int, 'default': 0},
'pagination_offset': {'type': int, 'default': 0},
'fields': {'type': list, 'default': self._fields, 'items': { 'fields': {'type': list, 'default': self._fields, 'items': {
'type': str, 'range': self._fields, 'type': str, 'range': self._fields,
}}, }},
......
...@@ -94,7 +94,6 @@ def project(request, project_id): ...@@ -94,7 +94,6 @@ def project(request, project_id):
) )
session.add(corpus) session.add(corpus)
session.commit() session.commit()
parse_extract
scheduled(parse_extract)(corpus.id) scheduled(parse_extract)(corpus.id)
# corpora within this project # corpora within this project
......
Django==1.9.2
PyYAML==3.11
RandomWords==0.1.12
SQLAlchemy==1.1.0b1dev
amqp==1.4.9 amqp==1.4.9
anyjson==0.3.3 anyjson==0.3.3
billiard==3.3.0.22 billiard==3.3.0.22
celery==3.1.20 celery==3.1.20
dateparser==0.3.2 dateparser==0.3.2
Django==1.9.2
django-celery==3.1.17 django-celery==3.1.17
django-pgfields==1.4.4 django-pgfields==1.4.4
django-pgjsonb==0.0.16 django-pgjsonb==0.0.16
...@@ -21,6 +18,10 @@ psycopg2==2.6.1 ...@@ -21,6 +18,10 @@ psycopg2==2.6.1
pycountry==1.20 pycountry==1.20
python-dateutil==2.4.2 python-dateutil==2.4.2
pytz==2015.7 pytz==2015.7
PyYAML==3.11
RandomWords==0.1.12
six==1.10.0 six==1.10.0
SQLAlchemy==1.1.0b1.dev0
ujson==1.35 ujson==1.35
umalqurra==0.2 umalqurra==0.2
wheel==0.29.0
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment