Commit 6e2bc79c authored by Mathieu Rodic's avatar Mathieu Rodic

[FEAT] uploading a corpus has been made possible

[FEAT] added some file managers (save, upload, download)
[FEAT] added Celery workers
[CODE] planned an interface for PubMed scraping, see `gargantext.util.scrapping.pubmed`
[DOC] added a README and a TODO
parent 204bfc6d
# Installation
```bash
sudo apt-get install python3.4
sudo pip3 install virtualenv
sudo apt-get install rabbitmq-server
virtualenv-3.4 VENV
source VENV/bin/activate
pip install -U -r requirements.txt
```
# Migrate database
## Django models
```bash
./manage.py makemigrations
./manage.py migrate --fake-initial
```
...or...
```bash
./manage.py makemigrations
./manage.py migrate --run-syncdb
```
(see [Django documentation](https://docs.djangoproject.com/en/1.9/topics/migrations/))
## SQLAlchemy models
```bash
./dbmigrate.py
```
# Start the Django server
```bash
./manage.py celeryd --loglevel=INFO  # to ensure Celery is properly started
./manage.py runserver
```
# Projects
## Overview of all projects
- re-implement deletion
## Single project view
- re-implement deletion
# WARNING: to ensure consistency and backward compatibility, lists should keep
# their initial order (i.e., new elements should be appended at the end of the lists)
NODETYPES = [ NODETYPES = [
None, None,
'USER', 'USER',
...@@ -16,22 +19,14 @@ LANGUAGES = { ...@@ -16,22 +19,14 @@ LANGUAGES = {
from gargantext.util.parsers import * from gargantext.util.parsers import *
RESOURCETYPES = [ RESOURCETYPES = [
# { 'name': 'CSV',
# # 'parser': CSVParser,
# 'default_language': 'en',
# },
{ 'name': 'Europress (English)', { 'name': 'Europress (English)',
'parser': EuropressParser, 'parser': EuropressParser,
'default_language': 'en', 'default_language': 'en',
}, },
{ 'name': 'Europress (French)', { 'name': 'Europress (French)',
# 'parser': EuropressParser, 'parser': EuropressParser,
'default_language': 'fr', 'default_language': 'fr',
}, },
# { 'name': 'ISTex',
# # 'parser': ISTexParser,
# 'default_language': 'en',
# },
{ 'name': 'Jstor (RIS format)', { 'name': 'Jstor (RIS format)',
# 'parser': RISParser, # 'parser': RISParser,
'default_language': 'en', 'default_language': 'en',
...@@ -52,4 +47,24 @@ RESOURCETYPES = [ ...@@ -52,4 +47,24 @@ RESOURCETYPES = [
# 'parser': RISParser, # 'parser': RISParser,
'default_language': 'en', 'default_language': 'en',
}, },
# { 'name': 'CSV',
# # 'parser': CSVParser,
# 'default_language': 'en',
# },
# { 'name': 'ISTex',
# # 'parser': ISTexParser,
# 'default_language': 'en',
# },
] ]
# other parameters
# default number of docs POSTed to scrappers.views.py
# (at page project > add a corpus > scan/process sample)
QUERY_SIZE_N_DEFAULT = 1000
import os
from .settings import BASE_DIR
UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads')
UPLOAD_LIMIT = 16 * 1024 * 1024
DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
from gargantext.util.db import * from gargantext.util.db import *
from gargantext.util.files import upload
from gargantext.util import workflow
from gargantext.constants import * from gargantext.constants import *
from datetime import datetime from datetime import datetime
...@@ -23,10 +25,51 @@ class NodeType(TypeDecorator): ...@@ -23,10 +25,51 @@ class NodeType(TypeDecorator):
class Node(Base): class Node(Base):
__tablename__ = 'nodes' __tablename__ = 'nodes'
id = Column(Integer, primary_key=True) id = Column(Integer, primary_key=True)
type = Column(NodeType, index=True) typename = Column(NodeType, index=True)
user_id = Column(Integer, ForeignKey(User.id)) user_id = Column(Integer, ForeignKey(User.id))
parent_id = Column(Integer, ForeignKey('nodes.id'))
# main data # main data
name = Column(String(255), unique=True) name = Column(String(255))
date = Column(DateTime(), default=datetime.now) date = Column(DateTime(), default=datetime.now)
# metadata # metadata
hyperdata = Column(JSONB, default={}) hyperdata = Column(JSONB, default={})
def __getitem__(self, key):
    """Return the hyperdata entry stored under `key`.

    Lets a node be read like a mapping (`node[key]`). The lookup is made
    directly on `self.hyperdata`, so a missing key raises whatever that
    lookup raises.
    """
    metadata = self.hyperdata
    return metadata[key]
def __setitem__(self, key, value):
    """Store `value` under `key` in the node's hyperdata (`node[key] = value`).

    The hyperdata dictionary is rebuilt and re-assigned instead of being
    mutated in place, for two reasons:
    - when `self.hyperdata` is still `None`, item assignment on it would
      fail with a `TypeError`;
    - SQLAlchemy does not detect in-place changes made to a plain JSON/JSONB
      value, so such a change would not be written on commit, whereas
      assigning a new value to the attribute is tracked.
    """
    updated = dict(self.hyperdata or {})
    updated[key] = value
    self.hyperdata = updated
def children(self, typename=None):
    """Build a query selecting the direct children of this node.

    A child is a node whose `parent_id` equals this node's `id`. When
    `typename` is given, only children of that type are selected (see
    `constants.py` for the known type names).

    The query is returned unexecuted, so the caller can refine it further
    or run it with `.all()`, `.count()`, etc.
    """
    direct_children = session.query(Node).filter(Node.parent_id == self.id)
    if typename is None:
        return direct_children
    return direct_children.filter(Node.typename == typename)
def add_child(self, typename, **kwargs):
    """Build and return a new node that is a direct child of this one.

    The child gets this node's `user_id`, has this node as parent and is
    of type `typename`; any extra keyword argument is forwarded to the
    `Node` constructor.

    The new node is only constructed here: it is not added to the session.
    """
    child = Node(
        user_id = self.user_id,
        typename = typename,
        parent_id = self.id,
        **kwargs
    )
    return child
def add_corpus(self, name, resource_type, resource_upload=None, resource_url=None):
    """Create a 'CORPUS' child of this node, persist it and plan its parsing.

    Parameters:
    - name: name given to the new corpus
    - resource_type: value converted with `int()` and stored as
      'resource_type' in the corpus hyperdata
    - resource_upload: uploaded file, handed to `upload()` when provided
    - resource_url: stored as is in the corpus hyperdata

    Returns the newly created corpus node, once committed.
    """
    # store the uploaded file first, if any
    resource_path = upload(resource_upload) if resource_upload is not None else None
    hyperdata = {
        'resource_type': int(resource_type),
        'resource_path': resource_path,
        'resource_url': resource_url,
    }
    corpus = self.add_child('CORPUS', name=name, hyperdata=hyperdata)
    # the corpus is committed before its parsing is planned
    session.add(corpus)
    session.commit()
    workflow.parse(corpus)
    return corpus
...@@ -22,7 +22,7 @@ class User(Base): ...@@ -22,7 +22,7 @@ class User(Base):
is_active = Column(Boolean()) is_active = Column(Boolean())
date_joined = DateTime(timezone=False) date_joined = DateTime(timezone=False)
def get_contacts(self): def contacts(self):
"""get all contacts in relation with the user""" """get all contacts in relation with the user"""
Friend = aliased(User) Friend = aliased(User)
query = (session query = (session
...@@ -32,7 +32,7 @@ class User(Base): ...@@ -32,7 +32,7 @@ class User(Base):
) )
return query.all() return query.all()
def get_nodes(self, type=None): def nodes(self, typename=None):
"""get all nodes belonging to the user""" """get all nodes belonging to the user"""
# ↓ this below is a workaround because of Python's lame import system # ↓ this below is a workaround because of Python's lame import system
from .nodes import Node from .nodes import Node
...@@ -41,13 +41,23 @@ class User(Base): ...@@ -41,13 +41,23 @@ class User(Base):
.filter(Node.user_id == self.id) .filter(Node.user_id == self.id)
.order_by(Node.date) .order_by(Node.date)
) )
if type is not None: if typename is not None:
query = query.filter(Node.type == type) query = query.filter(Node.typename == typename)
return query.all() return query.all()
def owns(user, node): def contacts_nodes(self, typename=None):
for contact in self.contacts():
contact_nodes = (session
.query(Node)
.filter(Node.user_id == contact.id)
.filter(Node.typename == typename)
.order_by(Node.date)
).all()
yield contact, contact_nodes
def owns(self, node):
"""check if a given node is owned by the user""" """check if a given node is owned by the user"""
return True return (node.user_id == self.id) or node.id in (contact.id for contact in self.contacts())
class Contact(Base): class Contact(Base):
......
...@@ -29,6 +29,14 @@ MAINTENANCE = False ...@@ -29,6 +29,14 @@ MAINTENANCE = False
ALLOWED_HOSTS = [] ALLOWED_HOSTS = []
# Asynchronous tasks
import djcelery
djcelery.setup_loader()
BROKER_URL = 'amqp://guest:guest@localhost:5672/'
CELERY_IMPORTS = ('gargantext.util.workflow', )
# Application definition # Application definition
INSTALLED_APPS = [ INSTALLED_APPS = [
...@@ -38,6 +46,7 @@ INSTALLED_APPS = [ ...@@ -38,6 +46,7 @@ INSTALLED_APPS = [
'django.contrib.sessions', 'django.contrib.sessions',
'django.contrib.messages', 'django.contrib.messages',
'django.contrib.staticfiles', 'django.contrib.staticfiles',
'djcelery',
] ]
MIDDLEWARE_CLASSES = [ MIDDLEWARE_CLASSES = [
......
...@@ -27,3 +27,8 @@ from sqlalchemy.types import * ...@@ -27,3 +27,8 @@ from sqlalchemy.types import *
from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint
from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.ext.hybrid import hybrid_property from sqlalchemy.ext.hybrid import hybrid_property
# other useful database stuff
from sqlalchemy import func
import hashlib
import binascii
def digest(value, algorithm='md5'):
    """Return the raw (binary) digest of `value`.

    Parameters:
    - value: the bytes to be hashed
    - algorithm: any algorithm name accepted by `hashlib.new`
    """
    hasher = hashlib.new(algorithm, value)
    return hasher.digest()
def str_digest(value, algorithm='md5'):
    """Return the digest of `value` as a hexadecimal string.

    Same parameters as `digest`, whose binary result is hex-encoded here.
    """
    raw_digest = digest(value, algorithm)
    hexadecimal = binascii.hexlify(raw_digest)
    return hexadecimal.decode()
from gargantext.constants import *
from gargantext.util.digest import str_digest
from gargantext.util import http
def save(contents, name='', basedir=''):
    """Write `contents` to a content-addressed location and return its path.

    The file is stored under three nested directories named after the first
    2, 4 and 6 characters of the digest of `contents` (as computed by
    `str_digest`), and is itself named `<digest>_<name>`.

    Parameters:
    - contents: the bytes to be written
    - name: label appended to the file name; only its last path component
      is used, so that it cannot point outside of the target directory
    - basedir: directory under which the nested directories are created

    Returns the path of the written file.
    """
    # local import, so that this function does not rely on `os` being
    # re-exported by a wildcard import
    import os
    digest = str_digest(contents)
    directory = os.path.join(basedir, *(digest[:i] for i in range(2, 8, 2)))
    # `exist_ok` avoids the race between checking for and creating the directory
    os.makedirs(directory, exist_ok=True)
    # save file and return its path
    path = os.path.join(directory, '%s_%s' % (digest, os.path.basename(name)))
    with open(path, 'wb') as output:
        output.write(contents)
    return path
def download(url, name=''):
    """Fetch `url` and store what `http.get` returns under `DOWNLOAD_DIRECTORY`.

    Parameters:
    - url: address of the resource to be fetched
    - name: label passed to `save` for naming the stored file

    Returns the path of the stored file, as returned by `save` (this path
    used to be discarded, leaving callers unable to locate the file).
    """
    return save(
        contents = http.get(url),
        name = name,
        basedir = DOWNLOAD_DIRECTORY,
    )
def upload(uploaded):
    """Store an uploaded file under `UPLOAD_DIRECTORY` and return its path.

    Parameters:
    - uploaded: uploaded file object; its `size`, `name` and `file`
      attributes are used

    Returns the path of the stored file, as returned by `save`. Callers
    such as `Node.add_corpus` record this value as the corpus resource
    path, so it must not be discarded.

    Raises `IOError` when the file is bigger than `UPLOAD_LIMIT`.
    """
    if uploaded.size > UPLOAD_LIMIT:
        raise IOError('Uploaded file is bigger than allowed: %d > %d' % (
            uploaded.size,
            UPLOAD_LIMIT,
        ))
    return save(
        contents = uploaded.file.read(),
        name = uploaded.name,
        basedir = UPLOAD_DIRECTORY,
    )
...@@ -19,3 +19,10 @@ def requires_auth(func): ...@@ -19,3 +19,10 @@ def requires_auth(func):
return redirect(url) return redirect(url)
return func(request, *args, **kwargs) return func(request, *args, **kwargs)
return _requires_auth return _requires_auth
import urllib.request
def get(url):
    """Fetch `url` and return the raw body of the response, as bytes.

    The body used to be read but never returned, so every caller received
    `None`. The response is also closed once it has been read.

    Errors raised by `urllib.request.urlopen` are propagated unchanged.
    """
    with urllib.request.urlopen(url) as response:
        return response.read()
def suggest(keywords):
    """Return a list of suggestions for `keywords`.

    Placeholder implementation: `keywords` is ignored and the same five
    hard-coded suggestions are always returned.
    """
    suggestions = [
        'Suggestion #1',
        'Suggestion #2',
        'Suggestion #3',
        'Suggestion #4',
        'Suggestion #5',
    ]
    return suggestions
def count(keywords):
    """Return a number of results for `keywords`.

    Placeholder implementation: `keywords` is ignored and a constant is
    always returned.
    """
    placeholder_total = 42
    return placeholder_total
def query_save(keywords):
    """Return the path of the file holding the results for `keywords`.

    Placeholder implementation: `keywords` is ignored, nothing is written
    and a constant path is always returned.
    """
    placeholder_path = 'path/to/query.xml'
    return placeholder_path
from celery import shared_task
from time import sleep
@shared_task
def _parse(corpus_id):
    """Task parsing the corpus identified by `corpus_id`.

    Placeholder implementation: it only prints a message, waits for two
    seconds and prints a second message. `corpus_id` must be an integer,
    since it is formatted with `%d`.
    """
    starting = 'ABOUT TO PARSE CORPUS #%d' % corpus_id
    print(starting)
    # stands in for the actual parsing work
    sleep(2)
    finished = 'PARSED CORPUS #%d' % corpus_id
    print(finished)
def parse(corpus):
    """Plan the asynchronous parsing of `corpus`.

    Only the identifier of the corpus is handed over to the `_parse` task,
    through `apply_async`; a message is printed before and after planning.
    """
    print('ABOUT TO PLAN PARSING')
    task_arguments = (corpus.id,)
    _parse.apply_async(task_arguments)
    print('PLANNED PARSING')
...@@ -5,6 +5,8 @@ from gargantext.models import * ...@@ -5,6 +5,8 @@ from gargantext.models import *
from gargantext.constants import * from gargantext.constants import *
from datetime import datetime from datetime import datetime
from collections import defaultdict
import re
@requires_auth @requires_auth
...@@ -22,26 +24,17 @@ def overview(request): ...@@ -22,26 +24,17 @@ def overview(request):
if name != '': if name != '':
new_project = Node( new_project = Node(
user_id = user.id, user_id = user.id,
type = 'PROJECT', typename = 'PROJECT',
name = name, name = name,
) )
session.add(new_project) session.add(new_project)
session.commit() session.commit()
# list of projects created by the logged user # list of projects created by the logged user
user_projects = user.get_nodes(type='PROJECT') user_projects = user.nodes(typename='PROJECT')
# list of contacts of the logged user # list of contacts of the logged user
contacts = user.get_contacts() contacts_projects = list(user.contacts_nodes(typename='PROJECT'))
contacts_projects = []
for contact in contacts:
contact_projects = (session
.query(Node)
.filter(Node.user_id == contact.id)
.filter(Node.type == 'PROJECT')
.order_by(Node.date)
).all()
contacts_projects += contact_projects
# render page # render page
return render( return render(
...@@ -54,8 +47,8 @@ def overview(request): ...@@ -54,8 +47,8 @@ def overview(request):
'number': len(user_projects), 'number': len(user_projects),
'projects': user_projects, 'projects': user_projects,
# projects owned by the user's contacts # projects owned by the user's contacts
'common_users': contacts if len(contacts) else False, 'common_users': (contact for contact, projects in contacts_projects),
'common_projects': contacts_projects if len(contacts_projects) else False, 'common_projects': sum((projects for contact, projects in contacts_projects), []),
}, },
) )
...@@ -63,7 +56,7 @@ def overview(request): ...@@ -63,7 +56,7 @@ def overview(request):
from django.utils.translation import ugettext_lazy from django.utils.translation import ugettext_lazy
class NewCorpusForm(forms.Form): class NewCorpusForm(forms.Form):
type = forms.ChoiceField( type = forms.ChoiceField(
choices = enumerate(resourcetype['name'] for resourcetype in RESOURCETYPES), choices = enumerate(resource_type['name'] for resource_type in RESOURCETYPES),
widget = forms.Select(attrs={'onchange':'CustomForSelect( $("option:selected", this).text() );'}) widget = forms.Select(attrs={'onchange':'CustomForSelect( $("option:selected", this).text() );'})
) )
name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' })) name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
...@@ -76,7 +69,46 @@ class NewCorpusForm(forms.Form): ...@@ -76,7 +69,46 @@ class NewCorpusForm(forms.Form):
@requires_auth @requires_auth
def project(request, project_id): def project(request, project_id):
project = session.query(Node).filter(project_id == project_id).first() # current user
user = cache.User[request.user.username]
# viewed project
project = session.query(Node).filter(Node.id == project_id).first()
if project is None:
raise Http404()
if not user.owns(project):
raise HttpResponseForbidden()
# new corpus
if request.method == 'POST':
corpus = project.add_corpus(
name = request.POST['name'],
resource_type = request.POST['type'],
resource_upload = request.FILES['file'],
)
# corpora within this project
corpora = project.children('CORPUS').all()
corpora_by_source = defaultdict(list)
for corpus in corpora:
resource_type = RESOURCETYPES[corpus['resource_type']]
corpora_by_source[resource_type['name']].append(corpus)
# source & their respective counts
total_count = 0
sources_counts = defaultdict(int)
for document in corpora:
source = RESOURCETYPES[document['resource_type']]
sourcename = re.sub(' \(.*$', '', source['name'])
count = document.children('DOCUMENT').count()
sources_counts[sourcename] += count
count += total_count
donut = [
{ 'source': sourcename,
'count': count,
'part' : round(count * 100.0 / total_count) if total_count else 0,
}
for sourcename, count in sources_counts.items()
]
# response!
return render( return render(
template_name = 'pages/projects/project.html', template_name = 'pages/projects/project.html',
request = request, request = request,
...@@ -86,11 +118,11 @@ def project(request, project_id): ...@@ -86,11 +118,11 @@ def project(request, project_id):
'date': datetime.now(), 'date': datetime.now(),
'project': project, 'project': project,
'donut': donut, 'donut': donut,
# 'list_corpora' : dict(corpora_by_resourcetype), 'list_corpora': dict(corpora_by_source),
'whitelists': [], 'whitelists': [],
'blacklists': [], 'blacklists': [],
'cooclists': [], 'cooclists': [],
# 'number' : corpora_count, 'number': len(corpora),
# 'query_size' : QUERY_SIZE_N_DEFAULT, 'query_size': QUERY_SIZE_N_DEFAULT,
}, },
) )
Django==1.9.2 Django==1.9.2
PyYAML==3.11
RandomWords==0.1.12 RandomWords==0.1.12
SQLAlchemy==1.0.11 SQLAlchemy==1.0.11
amqp==1.4.9
anyjson==0.3.3
billiard==3.3.0.22
celery==3.1.20
dateparser==0.3.2
django-celery==3.1.17
django-pgfields==1.4.4 django-pgfields==1.4.4
django-pgjsonb==0.0.16 django-pgjsonb==0.0.16
html5lib==0.9999999
jdatetime==1.7.2
kombu==3.0.33
lxml==3.5.0
psycopg2==2.6.1 psycopg2==2.6.1
python-dateutil==2.4.2
pytz==2015.7 pytz==2015.7
six==1.10.0 six==1.10.0
ujson==1.35 ujson==1.35
umalqurra==0.2
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment