[FEAT] uploading a corpus has been made possible

[FEAT] added some file managers (save, upload, download) [FEAT] added Celery workers [CODE] planned an interface for PubMed scraping, see `gargantext.util.scrappers.pubmed` [DOC] added a README and a TODO

[FEAT] uploading a corpus has been made possible
[FEAT] added some file managers (save, upload, download) [FEAT] added Celery workers [CODE] planned an interface for PubMed scraping, see `gargantext.util.scrappers.pubmed` [DOC] added a README and a TODO
6e2bc79c · Mathieu Rodic · 204bfc6d · 6e2bc79c · 6e2bc79c · 6e2bc79c
Commit 6e2bc79c authored Feb 16, 2016 by Mathieu Rodic
16 changed files
--- a/README.md
+++ b/README.md
+# Installation
+```bash
+sudo apt-get install python3.4
+sudo pip3 install virtualenv
+sudo apt-get install rabbitmq-server
+virtualenv-3.4 VENV
+source VENV/bin/activate
+pip install -U -r requirements.txt
+```
+# Migrate database
+## Django models
+```bash
+./manage.py makemigrations
+./manage.py migrate --fake-initial
+```
+...or...
+```bash
+./manage.py makemigrations
+./manage.py migrate --run-syncdb
+```
+(see [Django documentation](https://docs.djangoproject.com/en/1.9/topics/migrations/))
+## SQLAlchemy models
+```bash
+./dbmigrate.py
+```
+# Start the Django server
+```bash
+./manage.py celeryd --loglevel=INFO # to ensure Celery is properly started
+./manage.py runserver
+```
--- a/TODO.md
+++ b/TODO.md
+# Projects
+## Overview of all projects
+- re-implement deletion
+## Single project view
+- re-implement deletion
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
+# WARNING: to ensure consistency and backward compatibility, lists must keep
+#   their initial order (i.e., new elements should be appended at the end of the lists)
 NODETYPES = [
    None,
    'USER',
@@ -16,22 +19,14 @@ LANGUAGES = {
 from gargantext.util.parsers import *
 RESOURCETYPES = [
-    # {   'name': 'CSV',
-    #     # 'parser': CSVParser,
-    #     'default_language': 'en',
-    # },
    {   'name': 'Europress (English)',
        'parser': EuropressParser,
        'default_language': 'en',
    },
    {   'name': 'Europress (French)',
-        # 'parser': EuropressParser,
+        'parser': EuropressParser,
        'default_language': 'fr',
    },
-    # {   'name': 'ISTex',
-    #     # 'parser': ISTexParser,
-    #     'default_language': 'en',
-    # },
    {   'name': 'Jstor (RIS format)',
        # 'parser': RISParser,
        'default_language': 'en',
@@ -52,4 +47,24 @@ RESOURCETYPES = [
        # 'parser': RISParser,
        'default_language': 'en',
    },
+    # {   'name': 'CSV',
+    #     # 'parser': CSVParser,
+    #     'default_language': 'en',
+    # },
+    # {   'name': 'ISTex',
+    #     # 'parser': ISTexParser,
+    #     'default_language': 'en',
+    # },
 ]
+# other parameters
+# default number of docs POSTed to scrappers.views.py
+#  (at page  project > add a corpus > scan/process sample)
+QUERY_SIZE_N_DEFAULT = 1000
+import os
+from .settings import BASE_DIR
+# directory, located under BASE_DIR, in which uploaded files are stored
+UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads')
+# upper bound checked against `uploaded.size` in `gargantext.util.files.upload`
+# (16 * 1024 * 1024; presumably bytes, i.e. 16 MiB -- confirm against the upload object)
+UPLOAD_LIMIT = 16 * 1024 * 1024
+# downloaded files share the directory used for uploads
+DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
--- a/gargantext/models/nodes.py
+++ b/gargantext/models/nodes.py
 from gargantext.util.db import *
+from gargantext.util.files import upload
+from gargantext.util import workflow
 from gargantext.constants import *
 from datetime import datetime
@@ -23,10 +25,51 @@ class NodeType(TypeDecorator):
 class Node(Base):
    __tablename__ = 'nodes'
    id = Column(Integer, primary_key=True)
-    type = Column(NodeType, index=True)
+    typename = Column(NodeType, index=True)
    user_id = Column(Integer, ForeignKey(User.id))
+    parent_id = Column(Integer, ForeignKey('nodes.id'))
    # main data
-    name = Column(String(255), unique=True)
+    name = Column(String(255))
    date  = Column(DateTime(), default=datetime.now)
    # metadata
    hyperdata = Column(JSONB, default={})
+    def __getitem__(self, key):
+        """Dict-like read access: `node[key]` looks `key` up in `hyperdata`."""
+        value = self.hyperdata[key]
+        return value
+    def __setitem__(self, key, value):
+        """Dict-like write access: `node[key] = value` stores into `hyperdata`."""
+        hyperdata = self.hyperdata
+        hyperdata[key] = value
+    def children(self, typename=None):
+        """Build a query over the direct children of this node.
+        When `typename` is given, only children of that type are selected
+        (see `constants.py`). The query is returned unexecuted.
+        """
+        criteria = [Node.parent_id == self.id]
+        if typename is not None:
+            criteria.append(Node.typename == typename)
+        return session.query(Node).filter(*criteria)
+    def add_child(self, typename, **kwargs):
+        """Instantiate and return a new direct child of this node.
+        The child gets the same `user_id` as the current node and the current
+        node as parent; extra keyword arguments are passed on to `Node`.
+        The new node is neither added to the session nor committed here.
+        """
+        child = Node(
+            user_id=self.user_id,
+            typename=typename,
+            parent_id=self.id,
+            **kwargs
+        )
+        return child
+    def add_corpus(self, name, resource_type, resource_upload=None, resource_url=None):
+        """Create a 'CORPUS' child of this node, persist it and schedule its parsing.
+        If `resource_upload` is given it is first handed to `upload()`, whose
+        result is stored as `resource_path` in the corpus hyperdata, next to
+        the integer `resource_type` and the `resource_url`.
+        Returns the committed corpus node.
+        """
+        resource_path = None if resource_upload is None else upload(resource_upload)
+        hyperdata = {
+            'resource_type': int(resource_type),
+            'resource_path': resource_path,
+            'resource_url': resource_url,
+        }
+        corpus = self.add_child('CORPUS', name=name, hyperdata=hyperdata)
+        # the corpus must be committed before planning the parsing, which uses its id
+        session.add(corpus)
+        session.commit()
+        workflow.parse(corpus)
+        return corpus
--- a/gargantext/models/users.py
+++ b/gargantext/models/users.py
@@ -22,7 +22,7 @@ class User(Base):
    is_active = Column(Boolean())
    date_joined = DateTime(timezone=False)
-    def get_contacts(self):
+    def contacts(self):
        """get all contacts in relation with the user"""
        Friend = aliased(User)
        query = (session
@@ -32,7 +32,7 @@ class User(Base):
        )
        return query.all()
-    def get_nodes(self, type=None):
+    def nodes(self, typename=None):
        """get all nodes belonging to the user"""
        # ↓ this below is a workaround because of Python's lame import system
        from .nodes import Node
@@ -41,13 +41,23 @@ class User(Base):
            .filter(Node.user_id == self.id)
            .order_by(Node.date)
        )
-        if type is not None:
+        if typename is not None:
-            query = query.filter(Node.type == type)
+            query = query.filter(Node.typename == typename)
        return query.all()
-    def owns(user, node):
+    def contacts_nodes(self, typename=None):
+        """Iterate over the user's contacts together with the nodes they own.
+        Yields `(contact, nodes)` pairs, where `nodes` is the list of nodes
+        belonging to that contact, ordered by date. When `typename` is given,
+        only nodes of that type are listed; otherwise all of them are.
+        """
+        # same local import as in `nodes()`: `Node` is not available at module level
+        from .nodes import Node
+        for contact in self.contacts():
+            query = (session
+                .query(Node)
+                .filter(Node.user_id == contact.id)
+                .order_by(Node.date)
+            )
+            # only restrict on type when one was requested, as `nodes()` does
+            if typename is not None:
+                query = query.filter(Node.typename == typename)
+            yield contact, query.all()
+    def owns(self, node):
        """check if a given node is owned by the user"""
-        return True
+        # the node is owned when it belongs to the user itself...
+        if node.user_id == self.id:
+            return True
+        # ...or to one of the user's contacts: compare the node's *owner* id
+        # (not the node's own id) with the contacts' ids
+        return any(node.user_id == contact.id for contact in self.contacts())
 class Contact(Base):

--- a/gargantext/settings.py
+++ b/gargantext/settings.py
@@ -29,6 +29,14 @@ MAINTENANCE = False
 ALLOWED_HOSTS = []
+# Asynchronous tasks
+import djcelery
+djcelery.setup_loader()
+BROKER_URL = 'amqp://guest:guest@localhost:5672/'
+CELERY_IMPORTS = ('gargantext.util.workflow', )
 # Application definition
 INSTALLED_APPS = [
@@ -38,6 +46,7 @@ INSTALLED_APPS = [
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
+    'djcelery',
 ]
 MIDDLEWARE_CLASSES = [

--- a/gargantext/util/db.py
+++ b/gargantext/util/db.py
@@ -27,3 +27,8 @@ from sqlalchemy.types import *
 from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint
 from sqlalchemy.dialects.postgresql import JSONB
 from sqlalchemy.ext.hybrid import hybrid_property
+# other useful database stuff
+from sqlalchemy import func
--- a/gargantext/util/digest.py
+++ b/gargantext/util/digest.py
+import hashlib
+import binascii
+def digest(value, algorithm='md5'):
+    """Return the raw (binary) digest of the bytes `value`.
+    `algorithm` is any name accepted by `hashlib.new`; MD5 by default.
+    """
+    return hashlib.new(algorithm, value).digest()
+def str_digest(value, algorithm='md5'):
+    """Return the digest of the bytes `value` as a hexadecimal string."""
+    raw = digest(value, algorithm)
+    hexadecimal = binascii.hexlify(raw)
+    return hexadecimal.decode()
--- a/gargantext/util/files.py
+++ b/gargantext/util/files.py
+from gargantext.constants import *
+from gargantext.util.digest import str_digest
+from gargantext.util import http
+def save(contents, name='', basedir=''):
+    """Write `contents` (bytes) to disk and return the path of the file.
+    The file is stored under `basedir`, in nested directories named after
+    the first 2, 4 and 6 characters of the hexadecimal digest of `contents`
+    (as computed by `str_digest`); the file itself is named `<digest>_<name>`.
+    """
+    digest = str_digest(contents)
+    path = basedir
+    for i in range(2, 8, 2):
+        path += '/' + digest[:i]
+    # `exist_ok` avoids the race between checking for and creating the directory
+    os.makedirs(path, exist_ok=True)
+    # save file and return its path
+    path = '%s/%s_%s' % (path, digest, name, )
+    # the context manager guarantees the file is flushed and closed
+    with open(path, 'wb') as output:
+        output.write(contents)
+    return path
+def download(url, name=''):
+    """Fetch `url` and store what `http.get` returns in DOWNLOAD_DIRECTORY.
+    Returns the path of the saved file, as given by `save`.
+    """
+    return save(
+        contents = http.get(url),
+        name = name,
+        basedir = DOWNLOAD_DIRECTORY,
+    )
+def upload(uploaded):
+    """Store an uploaded file in UPLOAD_DIRECTORY and return its path.
+    `uploaded` must expose `size`, `name` and `file` attributes.
+    Raises IOError when `uploaded.size` exceeds UPLOAD_LIMIT.
+    """
+    if uploaded.size > UPLOAD_LIMIT:
+        raise IOError('Uploaded file is bigger than allowed: %d > %d' % (
+            uploaded.size,
+            UPLOAD_LIMIT,
+        ))
+    # the path must be returned: callers store it alongside the corpus
+    return save(
+        contents = uploaded.file.read(),
+        name = uploaded.name,
+        basedir = UPLOAD_DIRECTORY,
+    )
--- a/gargantext/util/http.py
+++ b/gargantext/util/http.py
@@ -19,3 +19,10 @@ def requires_auth(func):
            return redirect(url)
        return func(request, *args, **kwargs)
    return _requires_auth
+import urllib.request
+def get(url):
+    """Fetch `url` and return the raw body of the response, as bytes."""
+    # the context manager closes the connection once the body has been read
+    with urllib.request.urlopen(url) as response:
+        return response.read()
--- a/gargantext/util/scrappers/pubmed.py
+++ b/gargantext/util/scrappers/pubmed.py
+def suggest(keywords):
+    """Placeholder: return a fixed list of five suggestions, whatever `keywords` is."""
+    suggestions = ['Suggestion #1', 'Suggestion #2', 'Suggestion #3', 'Suggestion #4', 'Suggestion #5']
+    return suggestions
+def count(keywords):
+    """Placeholder: return a fixed count (42), whatever `keywords` is."""
+    total = 42
+    return total
+def query_save(keywords):
+    """Placeholder: return a fixed path string, whatever `keywords` is."""
+    path = 'path/to/query.xml'
+    return path
--- a/gargantext/util/workflow.py
+++ b/gargantext/util/workflow.py
+from celery import shared_task
+from time import sleep
+@shared_task
+def _parse(corpus_id):
+    """Shared task standing in for the parsing of a corpus.
+    It only prints a message, sleeps for two seconds and prints again.
+    """
+    before = 'ABOUT TO PARSE CORPUS #%d'
+    after = 'PARSED CORPUS #%d'
+    print(before % corpus_id)
+    sleep(2)
+    print(after % corpus_id)
+def parse(corpus):
+    """Plan the asynchronous parsing of `corpus`, identified by its id."""
+    print('ABOUT TO PLAN PARSING')
+    task_arguments = (corpus.id,)
+    _parse.apply_async(args=task_arguments)
+    print('PLANNED PARSING')
--- a/gargantext/views/pages/projects.py
+++ b/gargantext/views/pages/projects.py
@@ -5,6 +5,8 @@ from gargantext.models import *
 from gargantext.constants import *
 from datetime import datetime
+from collections import defaultdict
+import re
 @requires_auth
@@ -22,26 +24,17 @@ def overview(request):
        if name != '':
            new_project = Node(
                user_id = user.id,
-                type = 'PROJECT',
+                typename = 'PROJECT',
                name = name,
            )
            session.add(new_project)
            session.commit()
    # list of projects created by the logged user
-    user_projects = user.get_nodes(type='PROJECT')
+    user_projects = user.nodes(typename='PROJECT')
    # list of contacts of the logged user
-    contacts = user.get_contacts()
+    contacts_projects = list(user.contacts_nodes(typename='PROJECT'))
-    contacts_projects = []
-    for contact in contacts:
-        contact_projects = (session
-            .query(Node)
-            .filter(Node.user_id == contact.id)
-            .filter(Node.type == 'PROJECT')
-            .order_by(Node.date)
-        ).all()
-        contacts_projects += contact_projects
    # render page
    return render(
@@ -54,8 +47,8 @@ def overview(request):
            'number': len(user_projects),
            'projects': user_projects,
            # projects owned by the user's contacts
-            'common_users': contacts if len(contacts) else False,
+            'common_users': (contact for contact, projects in contacts_projects),
-            'common_projects': contacts_projects if len(contacts_projects) else False,
+            'common_projects': sum((projects for contact, projects in contacts_projects), []),
        },
    )
@@ -63,7 +56,7 @@ def overview(request):
 from django.utils.translation import ugettext_lazy
 class NewCorpusForm(forms.Form):
    type = forms.ChoiceField(
-        choices = enumerate(resourcetype['name'] for resourcetype in RESOURCETYPES),
+        choices = enumerate(resource_type['name'] for resource_type in RESOURCETYPES),
        widget = forms.Select(attrs={'onchange':'CustomForSelect( $("option:selected", this).text() );'})
    )
    name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
@@ -76,7 +69,46 @@ class NewCorpusForm(forms.Form):
 @requires_auth
 def project(request, project_id):
-    project = session.query(Node).filter(project_id == project_id).first()
+    """Display a single project and, on POST, add a new corpus to it.
+    Responds with a 404 when the project does not exist, and with a
+    403 when the logged user does not own it.
+    """
+    # current user
+    user = cache.User[request.user.username]
+    # viewed project
+    project = session.query(Node).filter(Node.id == project_id).first()
+    if project is None:
+        raise Http404()
+    if not user.owns(project):
+        # HttpResponseForbidden is a response, not an exception: return it
+        return HttpResponseForbidden()
+    # new corpus
+    if request.method == 'POST':
+        corpus = project.add_corpus(
+            name = request.POST['name'],
+            resource_type = request.POST['type'],
+            resource_upload = request.FILES['file'],
+        )
+    # corpora within this project
+    corpora = project.children('CORPUS').all()
+    corpora_by_source = defaultdict(list)
+    for corpus in corpora:
+        resource_type = RESOURCETYPES[corpus['resource_type']]
+        corpora_by_source[resource_type['name']].append(corpus)
+    # source & their respective counts
+    total_count = 0
+    sources_counts = defaultdict(int)
+    for document in corpora:
+        source = RESOURCETYPES[document['resource_type']]
+        # strip the parenthesised suffix, eg. 'Europress (French)' -> 'Europress'
+        sourcename = re.sub(r' \(.*$', '', source['name'])
+        count = document.children('DOCUMENT').count()
+        sources_counts[sourcename] += count
+        # accumulate the grand total, used below to compute each source's share
+        total_count += count
+    donut = [
+        {   'source': sourcename,
+            'count': count,
+            'part' : round(count * 100.0 / total_count) if total_count else 0,
+        }
+        for sourcename, count in sources_counts.items()
+    ]
+    # response!
    return render(
        template_name = 'pages/projects/project.html',
        request = request,
@@ -86,11 +118,11 @@ def project(request, project_id):
            'date': datetime.now(),
            'project': project,
            'donut': donut,
-            # 'list_corpora'  : dict(corpora_by_resourcetype),
+            'list_corpora': dict(corpora_by_source),
            'whitelists': [],
            'blacklists': [],
            'cooclists': [],
-            # 'number'        : corpora_count,
+            'number': len(corpora),
-            # 'query_size'    : QUERY_SIZE_N_DEFAULT,
+            'query_size': QUERY_SIZE_N_DEFAULT,
        },
    )
--- a/requirements.txt
+++ b/requirements.txt
 Django==1.9.2
+PyYAML==3.11
 RandomWords==0.1.12
 SQLAlchemy==1.0.11
+amqp==1.4.9
+anyjson==0.3.3
+billiard==3.3.0.22
+celery==3.1.20
+dateparser==0.3.2
+django-celery==3.1.17
 django-pgfields==1.4.4
 django-pgjsonb==0.0.16
+html5lib==0.9999999
+jdatetime==1.7.2
+kombu==3.0.33
+lxml==3.5.0
 psycopg2==2.6.1
+python-dateutil==2.4.2
 pytz==2015.7
 six==1.10.0
 ujson==1.35
+umalqurra==0.2
--- a/static/js/raphael-min.js
+++ b/static/js/raphael-min.js
--- a/uploads/.gitignore
+++ b/uploads/.gitignore
+*