Merge branch 'refactoring' into refactoring-alex

4d4384a4 · delanoe · 5badb85d · 6e5bf987 · 4d4384a4 · 4d4384a4
Commit 4d4384a4 authored Feb 25, 2016 by delanoe
13 changed files
--- a/README.md
+++ b/README.md
 # Installation
+First, install Python 3.5 (see https://www.python.org/downloads/ for
+download links).
+```bash
+cd /tmp
+wget https://www.python.org/ftp/python/3.5.1/Python-3.5.1.tar.xz
+tar xvfJ Python-3.5.1.tar.xz
+cd Python-3.5.1
+./configure
+make -j4 # -j4 runs up to four build jobs in parallel
+sudo make install
+```
+Other components are required:
 ```bash
-sudo apt-get install python3.4
+sudo pip3.5 install virtualenv
-sudo pip3 install virtualenv
 sudo apt-get install rabbitmq-server
-virtualenv-3.4 VENV
+```
+Then build a virtual environment:
+```bash
+virtualenv-3.5 VENV
 source VENV/bin/activate
-pip install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1
+pip3.5 install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1
-pip install -U -r requirements.txt
+pip3.5 install -U -r requirements.txt
 ```
@@ -39,6 +58,6 @@ pip install -U -r requirements.txt
 # Start the Django server
 ```bash
-manage.py celeryd --loglevel=INFO # to ensure Celery is properly started
+./manage.py celeryd --loglevel=INFO # to ensure Celery is properly started
-manage.py runserver
+./manage.py runserver
 ```
--- a/TODO.md
+++ b/TODO.md
@@ -2,10 +2,7 @@
 Be more careful about authorizations.
-# Constants
+cf. "ng-resource".
-Remove "magic numbers" (such as 4096, etc.) from the code and put them in
-`constants.py`.
 # Projects

--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -77,3 +77,8 @@ from .settings import BASE_DIR
 UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads')
 UPLOAD_LIMIT = 1024 * 1024 * 1024
 DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
+# batch sizes used by the parsing and ngrams-extraction toolchain steps
+BATCH_PARSING_SIZE = 256
+BATCH_NGRAMSEXTRACTION_SIZE = 1024
--- a/gargantext/models/nodes.py
+++ b/gargantext/models/nodes.py
@@ -89,9 +89,10 @@ class Node(Base):
            {'type': type, 'path':path, 'url':url, 'extracted': False}
        ))
-    def status(self, action=None, progress=None, complete=False):
+    def status(self, action=None, progress=0, complete=False, error=None):
        """Get the status of the given action
        """
+        date = datetime.now()
        # if the hyperdata do not have data about status
        if 'statuses' not in self.hyperdata:
            self['statuses'] = MutableList()
@@ -106,13 +107,17 @@ class Node(Base):
        # retrieve the status corresponding to the given action name
        for status in self['statuses']:
            if status['action'] == action:
-                if progress is not None:
+                if error:
+                    status['error'] = error
+                if progress:
                    status['progress'] = progress
                if complete:
                    status['complete'] = complete
+                if error or progress or complete:
+                    status['date'] = date
                return status
        # if no status has been found for the action, append a new one
        self['statuses'].append(MutableDict(
-            {'action': action, 'progress': progress, 'complete': complete}
+            {'action':action, 'progress':progress, 'complete':complete, 'error':error, 'date':date}
        ))
        return self['statuses'][-1]
--- a/gargantext/util/db.py
+++ b/gargantext/util/db.py
@@ -5,13 +5,17 @@ from gargantext import settings
 from sqlalchemy.orm import sessionmaker, scoped_session
 from sqlalchemy.ext.declarative import declarative_base
+from gargantext.util.json import json_dumps
 def get_engine():
    from sqlalchemy import create_engine
    url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'.format(
        **settings.DATABASES['default']
    )
-    return create_engine(url, use_native_hstore=True)
+    return create_engine(url,
+        use_native_hstore = True,
+        json_serializer = json_dumps,
+    )
 engine = get_engine()

--- a/gargantext/util/http.py
+++ b/gargantext/util/http.py
@@ -36,8 +36,6 @@ def get(url):
 def get_parameters(request):
    parameters = {}
-    print(request.GET)
-    print(request.GET._iterlists())
    for key, value in request.GET._iterlists():
        if key.endswith('[]'):
            parameters[key[:-2]] = value
@@ -53,18 +51,7 @@ from rest_framework.views import APIView
 # provide a JSON response
-import json
+from gargantext.util.json import json_encoder
-import datetime
-class JSONEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, datetime.datetime):
-            return obj.isoformat()[:19] + 'Z'
-        elif isinstance(obj, (set, tuple)):
-            return list(obj)
-        else:
-            return super(self.__class__, self).default(obj)
-json_encoder = JSONEncoder(indent=4)
 def JsonHttpResponse(data, status=200):
    return HttpResponse(
        content      = json_encoder.encode(data),

--- a/gargantext/util/json.py
+++ b/gargantext/util/json.py
+import json
+import datetime
+import traceback
+class JSONEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, datetime.datetime):
+            return obj.isoformat()[:19] + 'Z'
+        elif isinstance(obj, (set, tuple)):
+            return list(obj)
+        elif isinstance(obj, Exception):
+            tbe = traceback.TracebackException.from_exception(obj)
+            return list(line.strip() for line in tbe.format())
+        else:
+            return super(self.__class__, self).default(obj)
+json_encoder = JSONEncoder(indent=4)
+def json_dumps(obj):
+    return json.dumps(obj, cls=JSONEncoder)
--- a/gargantext/util/taggers/lib/nlpserver/nlpserver
+++ b/gargantext/util/taggers/lib/nlpserver/nlpserver
@@ -2,6 +2,7 @@
 # In case this bash file is placed in another directory (e.g., /etc/init.d),
 # the DAEMON_DIR line below should be changed to an absolute path
+PYTHON_VERSION=3.4
 DAEMON_DIR=$( cd "$(dirname "$BASH_SOURCE[0]")" && pwd)
 DAEMON_SCRIPT=$DAEMON_DIR/server.py
 DAEMON_NAME=nlpserver
@@ -17,7 +18,7 @@ do_start () {
    log_daemon_msg "Starting system '$DAEMON_NAME' daemon..."
    /sbin/start-stop-daemon --start --quiet \
        --make-pidfile --pidfile $DAEMON_PID --background \
-        --startas /bin/bash -- -c "python3 $DAEMON_SCRIPT $DAEMON_ARGS > /tmp/$DAEMON_NAME.log 2>&1"
+        --startas /bin/bash -- -c "python$PYTHON_VERSION $DAEMON_SCRIPT $DAEMON_ARGS > /tmp/$DAEMON_NAME.log 2>&1"
        # --exec $DAEMON_SCRIPT \
        # --user $DAEMON_USER --chuid $DAEMON_USER
    log_end_msg $?
@@ -25,6 +26,7 @@ do_start () {
 do_stop () {
    log_daemon_msg "Stopping system '$DAEMON_NAME' daemon..."
    /sbin/start-stop-daemon --stop --pidfile $DAEMON_PID --retry 10
+    rm $DAEMON_PID
    log_end_msg $?
 }

--- a/gargantext/util/toolchain/ngrams_extraction.py
+++ b/gargantext/util/toolchain/ngrams_extraction.py
@@ -37,43 +37,49 @@ def extract_ngrams(corpus, rule='{<JJ.*>*<NN.*>+<JJ.*>*}', keys=('title', 'abstr
    The result is then inserted into database.
    Only fields indicated in `keys` are tagged.
    """
-    db, cursor = get_cursor()
+    try:
-    nodes_ngrams_count = defaultdict(int)
+        db, cursor = get_cursor()
-    ngrams_data = set()
+        nodes_ngrams_count = defaultdict(int)
-    # extract ngrams
+        ngrams_data = set()
-    resource_type_index = corpus.resources()[0]['type']
+        # extract ngrams
-    resource_type = RESOURCETYPES[resource_type_index]
+        resource_type_index = corpus.resources()[0]['type']
-    default_language_iso2 = resource_type['default_language']
+        resource_type = RESOURCETYPES[resource_type_index]
-    for documents_count, document in enumerate(corpus.children('DOCUMENT')):
+        default_language_iso2 = resource_type['default_language']
-        # get ngrams extractor for the current document
+        for documents_count, document in enumerate(corpus.children('DOCUMENT')):
-        language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
+            # get ngrams extractor for the current document
-        try:
+            language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
-            ngramsextractor = ngramsextractors[language_iso2]
+            try:
-        except KeyError:
+                ngramsextractor = ngramsextractors[language_iso2]
-            print('Unrecognized language: `%s`' % (language_iso2, ))
+            except KeyError:
-            continue
+                print('Unrecognized language: `%s`' % (language_iso2, ))
-        # extract ngrams on each of the considered keys
-        for key in keys:
-            value = document.hyperdata.get(key, None)
-            if not isinstance(value, str):
                continue
-            # get ngrams
+            # extract ngrams on each of the considered keys
-            for ngram in ngramsextractor.extract(value):
+            for key in keys:
-                tokens = tuple(token[0] for token in ngram)
+                value = document.hyperdata.get(key, None)
-                terms = ' '.join(tokens)
+                if not isinstance(value, str):
-                nodes_ngrams_count[(document.id, terms)] += 1
+                    continue
-                ngrams_data.add((terms[:255], len(tokens), ))
+                # get ngrams
+                for ngram in ngramsextractor.extract(value):
+                    tokens = tuple(token[0] for token in ngram)
+                    terms = ' '.join(tokens)
+                    nodes_ngrams_count[(document.id, terms)] += 1
+                    ngrams_data.add((terms[:255], len(tokens), ))
+            # integrate ngrams and nodes-ngrams
+            if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
+                _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
+                nodes_ngrams_count.clear()
+                ngrams_data.clear()
+            if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
+                corpus.status('ngrams_extraction', progress=documents_count+1)
+                corpus.save_hyperdata()
+                session.commit()
        # integrate ngrams and nodes-ngrams
-        if len(nodes_ngrams_count) >= 4096:
+        _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
-            _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
+        corpus.status('ngrams_extraction', progress=documents_count+1, complete=True)
-            nodes_ngrams_count.clear()
+        corpus.save_hyperdata()
-            ngrams_data.clear()
+        session.commit()
-        if documents_count % 1024 == 0:
+    except Exception as error:
-            corpus.status('ngrams_extraction', progress=documents_count+1)
+        corpus.status('ngrams_extraction', error=error)
-            corpus.save_hyperdata()
+        corpus.save_hyperdata()
-            session.commit()
+        session.commit()
-    # integrate ngrams and nodes-ngrams
+        raise error
-    _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
-    corpus.status('ngrams_extraction', progress=documents_count+1, complete=True)
-    corpus.save_hyperdata()
-    session.commit()
--- a/gargantext/util/toolchain/parsing.py
+++ b/gargantext/util/toolchain/parsing.py
@@ -4,30 +4,37 @@ from gargantext.constants import *
 def parse(corpus):
-    # retrieve resource information
+    try:
-    documents_count = 0
+        documents_count = 0
-    for resource in corpus.resources():
+        corpus.status('parsing', progress=0)
-        # information about the resource
+        # retrieve resource information
-        if resource['extracted']:
+        for resource in corpus.resources():
-            continue
+            # information about the resource
-        resource_parser = RESOURCETYPES[resource['type']]['parser']
+            if resource['extracted']:
-        resource_path = resource['path']
+                continue
-        # extract and insert documents from corpus resource into database
+            resource_parser = RESOURCETYPES[resource['type']]['parser']
-        for hyperdata in resource_parser(resource_path):
+            resource_path = resource['path']
-            document = corpus.add_child(
+            # extract and insert documents from corpus resource into database
-                typename = 'DOCUMENT',
+            for hyperdata in resource_parser(resource_path):
-                name = hyperdata.get('title', '')[:255],
+                document = corpus.add_child(
-                hyperdata = hyperdata,
+                    typename = 'DOCUMENT',
-            )
+                    name = hyperdata.get('title', '')[:255],
-            session.add(document)
+                    hyperdata = hyperdata,
-            if documents_count % 64 == 0:
+                )
-                corpus.status('parsing', progress=documents_count)
+                session.add(document)
-                corpus.save_hyperdata()
+                if documents_count % BATCH_PARSING_SIZE == 0:
-                session.commit()
+                    corpus.status('parsing', progress=documents_count)
-            documents_count += 1
+                    corpus.save_hyperdata()
-        # update info about the resource
+                    session.commit()
-        resource['extracted'] = True
+                documents_count += 1
-    # commit all changes
+            # update info about the resource
-    corpus.status('parsing', progress=documents_count, complete=True)
+            resource['extracted'] = True
-    corpus.save_hyperdata()
+        # commit all changes
-    session.commit()
+        corpus.status('parsing', progress=documents_count, complete=True)
+        corpus.save_hyperdata()
+        session.commit()
+    except Exception as error:
+        corpus.status('parsing', error=error)
+        corpus.save_hyperdata()
+        session.commit()
+        raise error
--- a/gargantext/views/api/nodes.py
+++ b/gargantext/views/api/nodes.py
@@ -18,7 +18,6 @@ class NodeListResource(APIView):
        parameters = validate(parameters, {'type': dict, 'items': {
            'pagination_limit': {'type': int, 'default': 10},
            'pagination_offset': {'type': int, 'default': 0},
-            'pagination_offset': {'type': int, 'default': 0},
            'fields': {'type': list, 'default': self._fields, 'items': {
                'type': str, 'range': self._fields,
            }},

--- a/gargantext/views/pages/projects.py
+++ b/gargantext/views/pages/projects.py
@@ -94,7 +94,6 @@ def project(request, project_id):
        )
        session.add(corpus)
        session.commit()
-        parse_extract
        scheduled(parse_extract)(corpus.id)
    # corpora within this project

--- a/requirements.txt
+++ b/requirements.txt
-Django==1.9.2
-PyYAML==3.11
-RandomWords==0.1.12
-SQLAlchemy==1.1.0b1dev
 amqp==1.4.9
 anyjson==0.3.3
 billiard==3.3.0.22
 celery==3.1.20
 dateparser==0.3.2
+Django==1.9.2
 django-celery==3.1.17
 django-pgfields==1.4.4
 django-pgjsonb==0.0.16
@@ -21,6 +18,10 @@ psycopg2==2.6.1
 pycountry==1.20
 python-dateutil==2.4.2
 pytz==2015.7
+PyYAML==3.11
+RandomWords==0.1.12
 six==1.10.0
+SQLAlchemy==1.1.0b1.dev0
 ujson==1.35
 umalqurra==0.2
+wheel==0.29.0