Commit 1eb89617 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch 'testing' into stable

parents 0b4f0739 c9254ca2
......@@ -6,6 +6,21 @@ Keep in mind that Alembic only handles SQLAlchemy models: tables created from
the Django ORM must be kept out of Alembic's sight. See the [alembic:exclude]
section in alembic.ini.
To bootstrap Alembic where a gargantext database already exists, see below:
TELL ALEMBIC TO NOT START FROM SCRATCH.
USUAL WORKFLOW WITH ALEMBIC
1. Make changes to models in gargantext/models
2. Autogenerate revision (see below GENERATE A REVISION)
3. Manually check and edit revision file in alembic/versions
4. Commit alembic revision (it should never be reverted)
5. Commit changes in models (it can be reverted if needed)
To create, drop or modify views, schemas, roles, stored procedures, triggers
or policies, see below: REPLACEABLE OBJECTS.
TELL ALEMBIC TO NOT START FROM SCRATCH
......@@ -29,25 +44,76 @@ DOWNGRADE TO INITIAL DATABASE STATE
alembic downgrade base
GENERATE A NEW REVISION
GENERATE A REVISION
alembic revision -m "Message for this migration"
alembic revision --autogenerate -m "Message for this migration"
# A migration script is then created in alembic/versions directory. For
# example alembic/versions/3adcc9a56557_message_for_this_migration.py
# where 3adcc9a56557 is the revision id generated by Alembic.
#
# Alembic should generate a script reflecting changes already made in
# models or database. However, it is always a good idea to check it and
# edit it manually: Alembic is not always accurate and can't detect every
# kind of alteration. It should work with basic changes such as model or
# column creation. See
# http://alembic.zzzcomputing.com/en/latest/autogenerate.html#what-does-autogenerate-detect-and-what-does-it-not-detect
GENERATE AN EMPTY REVISION
alembic revision -m "Message for this migration"
# This script must be edited to write the migration itself, mainly in the
# `upgrade` and `downgrade` functions. See the Alembic documentation for
# further details.
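For instance, an edited empty revision could end up looking like this (a
minimal sketch: the revision ids and the added column are illustrative, not
taken from this repository):

from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic (illustrative values)
revision = '0123456789ab'
down_revision = 'ba9876543210'
branch_labels = None
depends_on = None

def upgrade():
    # Apply the change; here, adding a nullable column as an example
    op.add_column('nodes', sa.Column('comment', sa.Text(), nullable=True))

def downgrade():
    # Revert exactly what upgrade() did
    op.drop_column('nodes', 'comment')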
GENERATE A REVISION FROM CURRENT STATE

alembic revision --autogenerate -m "Message for this migration"

# Alembic should generate a script reflecting changes already made in
# database. However it is always a good idea to check it and edit it
# manually, Alembic is not always accurate and can't see all alterations.
# It should work with basic changes such as model or column creation. See
# http://alembic.zzzcomputing.com/en/latest/autogenerate.html#what-does-autogenerate-detect-and-what-does-it-not-detect

REPLACEABLE OBJECTS

There is no specific way to handle views, schemas, roles, stored procedures,
triggers or policies with Alembic. To ease revisions of such objects, avoid
boilerplate code, and spare repeated calls to op.execute, we use an enhanced
version of the ReplaceableObject recipe (see the Alembic documentation).

To create, drop or modify such an object, make a ReplaceableObject instance,
then use the create_*, drop_* or replace_* methods of alembic.op. Conversion
between ReplaceableObject and SQL is implemented in gargantext/util/alembic.py.
* Views: create_view(ReplaceableObject(<name>, <query>))
* Roles: create_role(ReplaceableObject(<name>, <options>))
* Schemas: create_schema(ReplaceableObject(<name>))
* Stored procedures: create_sp(ReplaceableObject(<name(arguments)>, <body>))
* Triggers: create_trigger(ReplaceableObject(<name>, <when>, <table>, <body>))
* Policies: create_policy(ReplaceableObject(<name>, <table>, <body>))
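For instance, a view could be declared and managed like this (a sketch: the
view name and query are illustrative):

from gargantext.util.alembic import ReplaceableObject

my_view = ReplaceableObject(
    "my_view", "SELECT id, name FROM nodes WHERE name <> ''")

def upgrade():
    op.create_view(my_view)

def downgrade():
    op.drop_view(my_view)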
Here is an example with a stored procedure:
...
from gargantext.util.alembic import ReplaceableObject

revision = '08230100f512'
...

my_function_sp = ReplaceableObject(
    "my_function()", "RETURNS integer AS $$ SELECT 42 $$ LANGUAGE sql")

def upgrade():
    op.create_sp(my_function_sp)

def downgrade():
    op.drop_sp(my_function_sp)
To modify this stored procedure in a later revision:
...
from gargantext.util.alembic import ReplaceableObject

my_function_sp = ReplaceableObject(
    "my_function()", "RETURNS integer AS $$ SELECT 43 $$ LANGUAGE sql")

def upgrade():
    op.replace_sp(my_function_sp, replaces="08230100f512.my_function_sp")

def downgrade():
    op.replace_sp(my_function_sp, replace_with="08230100f512.my_function_sp")
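The replaces and replace_with arguments refer to the other version of the
object as "<revision id>.<variable name>", so Alembic knows which definition
to drop and which to (re)create when crossing this revision in either
direction.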
......@@ -52,6 +52,14 @@ def include_object(obj, name, typ, reflected, compare_to):
return True
context_opts = dict(
    target_metadata=target_metadata,
    include_object=include_object,
    # Also let autogenerate detect changes of server-side defaults and of
    # column types, which it doesn't compare by default.
    compare_server_default=True,
    compare_type=True,
)
def run_migrations_offline():
"""Run migrations in 'offline' mode.
......@@ -65,9 +73,7 @@ def run_migrations_offline():
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url, target_metadata=target_metadata, literal_binds=True,
include_object=include_object)
context.configure(url=url, literal_binds=True, **context_opts)
with context.begin_transaction():
context.run_migrations()
......@@ -86,11 +92,7 @@ def run_migrations_online():
poolclass=pool.NullPool)
with connectable.connect() as connection:
context.configure(
connection=connection,
target_metadata=target_metadata,
include_object=include_object
)
context.configure(connection=connection, **context_opts)
with context.begin_transaction():
context.run_migrations()
......
"""Add server side sensible defaults for nodes
Revision ID: 73304ae9f1fb
Revises: 159a5154362b
Create Date: 2017-10-05 14:17:58.326646
"""
from alembic import op
import sqlalchemy as sa
import gargantext
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = '73304ae9f1fb'
down_revision = '159a5154362b'
branch_labels = None
depends_on = None
def upgrade():
op.alter_column('nodes', 'date',
existing_type=postgresql.TIMESTAMP(timezone=True),
server_default=sa.text('CURRENT_TIMESTAMP'),
nullable=False)
op.alter_column('nodes', 'hyperdata',
existing_type=postgresql.JSONB(astext_type=sa.Text()),
server_default=sa.text("'{}'::jsonb"),
nullable=False)
op.alter_column('nodes', 'name',
existing_type=sa.VARCHAR(length=255),
server_default='',
nullable=False)
op.alter_column('nodes', 'typename',
existing_type=sa.INTEGER(),
nullable=False)
op.alter_column('nodes', 'user_id',
existing_type=sa.INTEGER(),
nullable=False)
def downgrade():
op.alter_column('nodes', 'user_id',
existing_type=sa.INTEGER(),
nullable=True)
op.alter_column('nodes', 'typename',
existing_type=sa.INTEGER(),
nullable=True)
op.alter_column('nodes', 'name',
existing_type=sa.VARCHAR(length=255),
server_default=None,
nullable=True)
op.alter_column('nodes', 'hyperdata',
existing_type=postgresql.JSONB(astext_type=sa.Text()),
server_default=None,
nullable=True)
op.alter_column('nodes', 'date',
existing_type=postgresql.TIMESTAMP(timezone=True),
server_default=None,
nullable=True)
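Once reviewed, a revision like this one is applied with the usual Alembic
command:

alembic upgrade head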
from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
from sqlalchemy.orm import relationship, validates
from sqlalchemy.types import TypeDecorator, \
Integer, Float, Boolean, DateTime, String, Text
Integer, REAL, Boolean, DateTime, String, Text
from sqlalchemy_utils.types import TSVectorType
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
from sqlalchemy.ext.mutable import MutableDict, MutableList
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import text
__all__ = ["Column", "ForeignKey", "UniqueConstraint", "Index", "relationship",
"text",
"validates", "ValidatorMixin",
"Integer", "Float", "Boolean", "DateTime", "String", "Text",
"TSVectorType",
......@@ -27,6 +29,16 @@ Base = declarative_base()
DjangoBase = declarative_base()
class Float(REAL):
    """Reflect exact REAL type for PostgreSQL in order to avoid confusion
    within Alembic type comparison"""
    def __init__(self, *args, **kwargs):
        # SQLAlchemy maps Float(precision=24) to PostgreSQL REAL; drop this
        # implicit precision so reflected and declared types compare as equal
        # and autogenerate doesn't emit spurious type changes.
        if kwargs.get('precision') == 24:
            kwargs.pop('precision')
        super(Float, self).__init__(*args, **kwargs)
class ValidatorMixin(object):
def enforce_length(self, key, value):
"""Truncate a string according to its column length
......
......@@ -6,7 +6,7 @@ from datetime import datetime
from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
Integer, Float, String, DateTime, JSONB, TSVectorType, \
MutableList, MutableDict, validates, ValidatorMixin
MutableList, MutableDict, validates, ValidatorMixin, text
from .users import User
__all__ = ['Node', 'NodeNode', 'CorpusNode']
......@@ -44,7 +44,7 @@ class Node(ValidatorMixin, Base):
>>> session.query(Node).filter_by(typename='USER').first() # doctest: +ELLIPSIS
<UserNode(...)>
But beware, there are some caveats with bulk queries. In this case typename
But beware, there are some pitfalls with bulk queries. In this case typename
MUST be specified manually.
>>> session.query(UserNode).delete() # doctest: +SKIP
......@@ -59,20 +59,23 @@ class Node(ValidatorMixin, Base):
id = Column(Integer, primary_key=True)
typename = Column(NodeType, index=True)
typename = Column(NodeType, index=True, nullable=False)
__mapper_args__ = { 'polymorphic_on': typename }
# foreign keys
user_id = Column(Integer, ForeignKey(User.id, ondelete='CASCADE'))
user_id = Column(Integer, ForeignKey(User.id, ondelete='CASCADE'),
nullable=False)
user = relationship(User)
parent_id = Column(Integer, ForeignKey('nodes.id', ondelete='CASCADE'))
parent = relationship('Node', remote_side=[id])
name = Column(String(255))
date = Column(DateTime(timezone=True), default=datetime.now)
name = Column(String(255), nullable=False, server_default='')
date = Column(DateTime(timezone=True), nullable=False,
server_default=text('CURRENT_TIMESTAMP'))
hyperdata = Column(JSONB, default=dict)
hyperdata = Column(JSONB, default=dict, nullable=False,
server_default=text("'{}'::jsonb"))
# Create a TSVECTOR column to use fulltext search feature of PostgreSQL.
# We need to create a trigger to update this column on update and insert,
......
......@@ -7,6 +7,8 @@ the name of "ReplaceableObject" class.
This recipe is directly borrowed from Alembic documentation, see
http://alembic.zzzcomputing.com/en/latest/cookbook.html#replaceable-objects
**2017-10-09** ReversibleOp.define has been added to reduce boilerplate code.
"""
from alembic.operations import Operations, MigrateOperation
......@@ -58,44 +60,34 @@ class ReversibleOp(MigrateOperation):
operations.invoke(drop_old)
operations.invoke(create_new)
    @classmethod
    def define(cls, name, cname=None, register=Operations.register_operation):
        def create(self):
            return CreateOp(self.target)

        def drop(self):
            return DropOp(self.target)

        name = name.lower()
        cname = cname or name.capitalize()
        CreateOp = type('Create%sOp' % cname, (ReversibleOp,), {'reverse': drop})
        DropOp = type('Drop%sOp' % cname, (ReversibleOp,), {'reverse': create})

        CreateOp = register('create_' + name, 'invoke_for_target')(CreateOp)
        CreateOp = register('replace_' + name, 'replace')(CreateOp)
        DropOp = register('drop_' + name, 'invoke_for_target')(DropOp)

        return (CreateOp, DropOp)


@Operations.register_operation("create_view", "invoke_for_target")
@Operations.register_operation("replace_view", "replace")
class CreateViewOp(ReversibleOp):
    def reverse(self):
        return DropViewOp(self.target)


@Operations.register_operation("drop_view", "invoke_for_target")
class DropViewOp(ReversibleOp):
    def reverse(self):
        return CreateViewOp(self.view)


@Operations.register_operation("create_sp", "invoke_for_target")
@Operations.register_operation("replace_sp", "replace")
class CreateSPOp(ReversibleOp):
    def reverse(self):
        return DropSPOp(self.target)


@Operations.register_operation("drop_sp", "invoke_for_target")
class DropSPOp(ReversibleOp):
    def reverse(self):
        return CreateSPOp(self.target)


@Operations.register_operation("create_trigger", "invoke_for_target")
@Operations.register_operation("replace_trigger", "replace")
class CreateTriggerOp(ReversibleOp):
    def reverse(self):
        return DropTriggerOp(self.target)


@Operations.register_operation("drop_trigger", "invoke_for_target")
class DropTriggerOp(ReversibleOp):
    def reverse(self):
        return CreateTriggerOp(self.target)


CreateViewOp, DropViewOp = ReversibleOp.define('view')
CreateRoleOp, DropRoleOp = ReversibleOp.define('role')
CreateSchemaOp, DropSchemaOp = ReversibleOp.define('schema')
CreateSPOp, DropSPOp = ReversibleOp.define('sp', 'SP')
CreateTriggerOp, DropTriggerOp = ReversibleOp.define('trigger')
CreatePolicyOp, DropPolicyOp = ReversibleOp.define('policy')
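# Adding a new object type is one `define` call plus its two SQL
# implementations. A hypothetical sketch for materialized views (name and SQL
# are illustrative, not part of this revision):
#
#     CreateMatViewOp, DropMatViewOp = ReversibleOp.define('matview', 'MatView')
#
#     @Operations.implementation_for(CreateMatViewOp)
#     def create_matview(operations, operation):
#         operations.execute("CREATE MATERIALIZED VIEW %s AS %s" % (
#             operation.target.name, operation.target.args[0]))
#
#     @Operations.implementation_for(DropMatViewOp)
#     def drop_matview(operations, operation):
#         operations.execute("DROP MATERIALIZED VIEW %s" % operation.target.name)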
@Operations.implementation_for(CreateViewOp)
......@@ -111,6 +103,32 @@ def drop_view(operations, operation):
operations.execute("DROP VIEW %s" % operation.target.name)
@Operations.implementation_for(CreateRoleOp)
def create_role(operations, operation):
    args = operation.target.args
    operations.execute(
        "CREATE ROLE %s WITH %s" % (
            operation.target.name,
            # Default to NOLOGIN when no options are given
            args[0] if len(args) else 'NOLOGIN'
        )
    )
@Operations.implementation_for(DropRoleOp)
def drop_role(operations, operation):
operations.execute("DROP ROLE %s" % operation.target.name)
@Operations.implementation_for(CreateSchemaOp)
def create_schema(operations, operation):
operations.execute("CREATE SCHEMA %s" % operation.target.name)
@Operations.implementation_for(DropSchemaOp)
def drop_schema(operations, operation):
operations.execute("DROP SCHEMA %s" % operation.target.name)
@Operations.implementation_for(CreateSPOp)
def create_sp(operations, operation):
operations.execute(
......@@ -143,3 +161,24 @@ def drop_trigger(operations, operation):
operation.target.args[1]
)
)
@Operations.implementation_for(CreatePolicyOp)
def create_policy(operations, operation):
operations.execute(
"CREATE POLICY %s ON %s %s" % (
operation.target.name,
operation.target.args[0],
operation.target.args[1],
)
)
@Operations.implementation_for(DropPolicyOp)
def drop_policy(operations, operation):
operations.execute(
"DROP POLICY %s ON %s" % (
operation.target.name,
operation.target.args[0],
)
)
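# Usage sketch in a revision file (policy name, table and predicate are
# illustrative):
#
#     own_nodes_policy = ReplaceableObject(
#         'own_nodes', 'nodes', 'FOR ALL USING (user_id = 42)')
#
#     def upgrade():
#         op.create_policy(own_nodes_policy)
#
#     def downgrade():
#         op.drop_policy(own_nodes_policy)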
......@@ -2,6 +2,7 @@ import os
from gargantext.settings import MEDIA_ROOT
from datetime import MINYEAR
from dateutil.parser import parse as parse_datetime_flexible
from django.utils.dateparse import parse_datetime
from django.utils.timezone import datetime as _datetime, utc as UTC, now as utcnow
......@@ -19,7 +20,8 @@ class datetime(_datetime):
@staticmethod
def parse(s):
dt = parse_datetime(s)
dt = parse_datetime(s) or \
parse_datetime_flexible(s, default=datetime(MINYEAR, 1, 1))
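        # Illustrative behaviour (example inputs are assumptions):
        #   '2017-10-05T14:17:58+02:00' -> handled by Django's parse_datetime
        #   'Oct 5, 2017' -> falls back to dateutil's flexible parser, with
        #                    missing fields taken from the MINYEAR default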
return dt.astimezone(UTC) if dt.tzinfo else dt.replace(tzinfo=UTC)
......
from ._Parser import Parser
# from ..NgramsExtractors import *
import sys
import csv
csv.field_size_limit(sys.maxsize)
import numpy as np


class CSVParser(Parser):

    DELIMITERS = ", \t;|:"

    def detect_delimiter(self, lines, sample_size=10):
        sample = lines[:sample_size]

        # Compute frequency of each delimiter on each input line
        delimiters_freqs = {
            d: [line.count(d) for line in sample]
            for d in self.DELIMITERS
        }

        # Select delimiters with a standard deviation of zero, ie. delimiters
        # for which we have the same number of fields on each line
        selected_delimiters = [
            (d, np.sum(freqs))
            for d, freqs in delimiters_freqs.items()
            if any(freqs) and np.std(freqs) == 0
        ]

        if selected_delimiters:
            # Choose the delimiter with highest frequency amongst selected ones
            sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1])
            return sorted_delimiters[-1][0]

    def parse(self, filebuf):
        print("CSV: parsing (assuming UTF-8 and LF line endings)")
        contents = filebuf.read().decode("UTF-8").split("\n")

        # Filter out empty lines
        contents = [line for line in contents if line.strip()]

        # Delimiter auto-detection
        delimiter = self.detect_delimiter(contents, sample_size=10)
        if delimiter is None:
            raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")
        print("CSV: selected delimiter: %r" % delimiter)

        # Parse CSV
        reader = csv.reader(contents, delimiter=delimiter)

        # Get first not empty row and its fields (ie. header row), or (0, [])
        first_row, headers = \
            next(((i, fields) for i, fields in enumerate(reader) if any(fields)),
                 (0, []))

        # Get first not empty column of the first row, or 0
        first_col = next((i for i, field in enumerate(headers) if field), 0)

        # Strip out potential empty fields in headers
        headers = headers[first_col:]

        # Return a generator of dictionaries with column labels as keys,
        # filtering out empty rows
        for i, fields in enumerate(reader):
            if i % 500 == 0:
                print("CSV: parsing row #%s..." % (i+1))
            if any(fields):
                yield dict(zip(headers, fields[first_col:]))


import pandas
import io


class CSVParser(Parser):

    ENCODING = "utf-8"

    def open(self, file):
        f = super(CSVParser, self).open(file)

        if isinstance(file, str) and file.endswith('.zip'):
            return f

        return io.TextIOWrapper(f, encoding=self.ENCODING)

    def parse(self, fp=None):
        fp = fp or self._file
        df = pandas.read_csv(fp, dtype=object, skip_blank_lines=True)

        # Return a generator of dictionaries with column labels as keys,
        # filtering out empty rows
        for i, fields in enumerate(df.itertuples(index=False)):
            if i % 500 == 0:
                print("CSV: parsing row #%s..." % (i+1))
            # See https://docs.python.org/3/library/collections.html#collections.somenamedtuple._asdict
            yield fields._asdict()
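# Usage sketch (file name and column are illustrative):
#
#     parser = CSVParser('corpus.csv')
#     for hyperdata in parser.parse():
#         print(hyperdata.get('title'))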
......@@ -12,7 +12,7 @@ class ISIParser(RISParser):
"DI": {"type": "hyperdata", "key": "doi"},
"SO": {"type": "hyperdata", "key": "source"},
"PY": {"type": "hyperdata", "key": "publication_year"},
"PD": {"type": "hyperdata", "key": "publication_month"},
"PD": {"type": "hyperdata", "key": "publication_date_to_parse"},
"LA": {"type": "hyperdata", "key": "language_fullname"},
"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
"WC": {"type": "hyperdata", "key": "fields"},
......
......@@ -3,10 +3,7 @@ import zipfile
import re
import dateparser as date_parser
from gargantext.util.languages import languages
from gargantext.util import datetime, convert_to_datetime, MINYEAR
DEFAULT_DATE = datetime(MINYEAR, 1, 1)
from gargantext.util import datetime, convert_to_datetime
class Parser:
......@@ -14,15 +11,14 @@ class Parser:
"""
def __init__(self, file):
if isinstance(file, str):
self._file = open(file, 'rb')
else:
self._file = file
self._file = self.open(file)
def __del__(self):
if hasattr(self, '_file'):
self._file.close()
def open(self, file):
return open(file, 'rb') if isinstance(file, str) else file
def detect_encoding(self, string):
"""Useful method to detect the encoding of a document.
......@@ -47,10 +43,7 @@ class Parser:
if date_string is not None:
date_string = re.sub(r'\/\/+(\w*|\d*)', '', date_string)
try:
hyperdata['publication_date'] = dateutil.parser.parse(
date_string,
default=DEFAULT_DATE
)
hyperdata['publication_date'] = datetime.parse(date_string)
except Exception as error:
print(error, 'Date not parsed for:', date_string)
hyperdata['publication_date'] = datetime.now()
......@@ -165,11 +158,10 @@ class Parser:
file = self._file
# if the file is a ZIP archive, recurse on each of its files...
if zipfile.is_zipfile(file):
zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist():
f = zipArchive.open(filename, 'r')
yield from self.__iter__(f)
f.close()
with zipfile.ZipFile(file) as zf:
for filename in zf.namelist():
with zf.open(filename) as df, self.open(df) as f:
yield from self.__iter__(f)
# ...otherwise, let's parse it directly!
else:
try:
......
......@@ -61,12 +61,14 @@ def nodes(parent=None, group_by='typename', order_by='typename', has_child='chec
def node_show(node, prefix='', maxlen=60):
if node.children > 0 or node.cnt == 1:
node_id = '<{}> '.format(node.id)
name = node.name[:maxlen] + '..' if len(node.name) > maxlen else node.name
label = Fore.CYAN + name + Fore.RESET
label = node_id + Fore.CYAN + name + Fore.RESET
else:
label = Fore.MAGENTA + str(node.cnt) + Fore.RESET
print(prefix, '%s%s %s' % (Fore.GREEN, node.typename, label), sep='')
typename = Fore.GREEN + node.typename + Fore.RESET
print(prefix, '%s %s' % (typename, label), sep='')
def tree_show(node, pos=FIRST|LAST, level=0, prefix='', maxlen=60, compact=True):
......
......@@ -6,7 +6,7 @@ from gargantext.settings import BASE_URL
drafts = {
'workflowEnd' : '''
Bonjour,
votre analyse sur Gargantext vient de se terminer.
......@@ -42,18 +42,33 @@ drafts = {
''',
'recountDone': '''
Bonjour,
le recalcul que vous avez lancé est terminé.
Vous pouvez accéder à votre corpus intitulé
\"%s\"
à l'adresse:
http://%s/projects/%d/corpora/%d
Nous restons à votre disposition pour tout complément d'information.
Cordialement
--
L'équipe de Gargantext (CNRS)
'''
}
def notification(corpus,draft):
def notification(corpus, draft, subject='Update'):
user = session.query(User).filter(User.id == corpus.user_id).first()
message = draft % (corpus.name, BASE_URL, corpus.parent_id, corpus.id)
if user.email != "" :
send_mail('[Gargantext] Update'
send_mail('[Gargantext] %s' % subject
, message
, 'contact@gargantext.org'
, [user.email], fail_silently=False )
......@@ -63,11 +78,12 @@ def notification(corpus,draft):
def notify_owner(corpus):
notification(corpus, drafts['workflowEnd'])
notification(corpus, drafts['workflowEnd'], 'Corpus updated')
def notify_listMerged(corpus):
notification(corpus, drafts['listMerged'])
notification(corpus, drafts['listMerged'], 'List merged')
def notify_recount(corpus):
notification(corpus, drafts['recountDone'], 'Recount done')
......@@ -13,7 +13,7 @@ from .ngram_coocs import compute_coocs
#from .ngram_coocs_old_sqlalchemy_version import compute_coocs
from .metric_specgen import compute_specgen
from .list_map import do_maplist
from .mail_notification import notify_owner
from .mail_notification import notify_owner, notify_recount
from gargantext.util.db import session
from gargantext.models import Node
......@@ -299,5 +299,10 @@ def recount(corpus_id):
corpus.save_hyperdata()
session.commit()
if not DEBUG:
print('RECOUNT #%d: [%s] FINISHED Sending email notification' % (corpus.id, t()))
notify_recount(corpus)
def t():
return datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
......@@ -1170,10 +1170,10 @@
// REST and callback
garganrest.metrics.update(corpusId, function(){
statusDiv.innerHTML = '<div class="statusinfo">Corpus updated</div>'
statusDiv.innerHTML = '<div class="statusinfo">Recount is started, please wait, you will be sent a notification email.</div>'
// revert visual
setTimeout(function(){ statusDiv.innerHTML = previousStatus }, 2000);
//setTimeout(function(){ statusDiv.innerHTML = previousStatus }, 2000);
})
}
......