Commit 372d44d1 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'origin/simon-unstable-fix-csv-issue' into dev-merge

parents 3ef753ca 7367f14d
...@@ -6,6 +6,21 @@ Keep in mind that Alembic only handles SQLAlchemy models: tables created from ...@@ -6,6 +6,21 @@ Keep in mind that Alembic only handles SQLAlchemy models: tables created from
Django ORM must be put out of Alembic sight. See [alembic:exclude] section in Django ORM must be put out of Alembic sight. See [alembic:exclude] section in
alembic.ini. alembic.ini.
To bootstrap Alembic where a gargantext database is already existing see
below: TELL ALEMBIC TO NOT START FROM SCRATCH.
USUAL WORKFLOW WITH ALEMBIC
1. Make change to models in gargantext/models
2. Autogenerate revision (see below GENERATE A REVISION)
3. Manually check and edit revision file in alembic/versions
4. Commit alembic revision (it should never be reverted)
5. Commit changes in models (it can be reverted if needed)
To create, drop or modify views, schemas, roles, stored procedures, triggers or
policies see below: REPLACEABLE OBJECTS.
TELL ALEMBIC TO NOT START FROM SCRATCH TELL ALEMBIC TO NOT START FROM SCRATCH
...@@ -29,25 +44,76 @@ DOWNGRADE TO INITIAL DATABASE STATE ...@@ -29,25 +44,76 @@ DOWNGRADE TO INITIAL DATABASE STATE
alembic downgrade base alembic downgrade base
GENERATE A NEW REVISION GENERATE A REVISION
alembic revision -m "Message for this migration" alembic revision --autogenerate -m "Message for this migration"
# A migration script is then created in alembic/versions directory. For # A migration script is then created in alembic/versions directory. For
# example alembic/versions/3adcc9a56557_message_for_this_migration.py # example alembic/versions/3adcc9a56557_message_for_this_migration.py
# where 3adcc9a56557 is the revision id generated by Alembic. # where 3adcc9a56557 is the revision id generated by Alembic.
# #
# Alembic should generate a script reflecting changes already made in
# models or database. However it is always a good idea to check it and edit
# it manually, Alembic is not always accurate and can't see all alterations.
# It should work with basic changes such as model or column creation. See
# http://alembic.zzzcomputing.com/en/latest/autogenerate.html#what-does-autogenerate-detect-and-what-does-it-not-detect
GENERATE AN EMPTY REVISION
alembic revision -m "Message for this migration"
# This script must be edited to write the migration itself, mainly # This script must be edited to write the migration itself, mainly
# in `upgrade` and `downgrade` functions. See Alembic documentation for # in `upgrade` and `downgrade` functions. See Alembic documentation for
# further details. # further details.
GENERATE A REVISION FROM CURRENT STATE REPLACEABLE OBJECTS
alembic revision --autogenerate -m "Message for this migration" There is no specific way to handle views, schemas, roles, stored procedures,
triggers or policies with Alembic. To ease revisions of such objects, avoid
boilerplate code and too much op.execute we use an enhanced version of
ReplaceableObject recipe (see Alembic documentation).
# Alembic should generate a script reflecting changes already made in To create, drop or modify such object you need to make a ReplaceableObject
# database. However it is always a good idea to check it and edit it instance, and then use create_*, drop_* or replace_* method of alembic.op.
# manually, Alembic is not always accurate and can't see all alterations. Conversion between ReplaceableObject and SQL is implemented in
# It should work with basic changes such as model or column creation. See gargantext/util/alembic.py.
# http://alembic.zzzcomputing.com/en/latest/autogenerate.html#what-does-autogenerate-detect-and-what-does-it-not-detect
* Views: create_view(ReplaceableObject(<name>, <query>))
* Roles: create_role(ReplaceableObject(<name>, <options>))
* Schemas: create_schema(ReplaceableObject(<name>))
* Stored procedures: create_sp(ReplaceableObject(<name(arguments)>, <body>))
* Triggers: create_trigger(ReplaceableObject(<name>, <when>, <table>, <body>))
* Policies: create_policy(ReplaceableObject(<name>, <table>, <body>))
Here is an example with a stored procedure:
...
from gargantext.util.alembic import ReplaceableObject
revision = '08230100f512'
...
my_function_sp = ReplaceableObject(
"my_function()", "RETURNS integer AS $$ SELECT 42 $$ LANGUAGE sql")
def upgrade():
op.create_sp(my_function_sp)
def downgrade():
op.drop_sp(my_function_sp)
To modify this stored procedure in a later revision:
...
from gargantext.util.alembic import ReplaceableObject
my_function_sp = ReplaceableObject(
"my_function()", "RETURNS integer AS $$ SELECT 43 $$ LANGUAGE sql")
def upgrade():
op.replace_sp(my_function_sp, replaces="08230100f512.my_function_sp")
def downgrade():
op.replace_sp(my_function_sp, replace_with="08230100f512.my_function_sp")
...@@ -52,6 +52,14 @@ def include_object(obj, name, typ, reflected, compare_to): ...@@ -52,6 +52,14 @@ def include_object(obj, name, typ, reflected, compare_to):
return True return True
context_opts = dict(
target_metadata=target_metadata,
include_object=include_object,
compare_server_default=True,
compare_type=True,
)
def run_migrations_offline(): def run_migrations_offline():
"""Run migrations in 'offline' mode. """Run migrations in 'offline' mode.
...@@ -65,9 +73,7 @@ def run_migrations_offline(): ...@@ -65,9 +73,7 @@ def run_migrations_offline():
""" """
url = config.get_main_option("sqlalchemy.url") url = config.get_main_option("sqlalchemy.url")
context.configure( context.configure(url=url, literal_binds=True, **context_opts)
url=url, target_metadata=target_metadata, literal_binds=True,
include_object=include_object)
with context.begin_transaction(): with context.begin_transaction():
context.run_migrations() context.run_migrations()
...@@ -86,11 +92,7 @@ def run_migrations_online(): ...@@ -86,11 +92,7 @@ def run_migrations_online():
poolclass=pool.NullPool) poolclass=pool.NullPool)
with connectable.connect() as connection: with connectable.connect() as connection:
context.configure( context.configure(connection=connection, **context_opts)
connection=connection,
target_metadata=target_metadata,
include_object=include_object
)
with context.begin_transaction(): with context.begin_transaction():
context.run_migrations() context.run_migrations()
......
"""Add server side sensible defaults for nodes
Revision ID: 73304ae9f1fb
Revises: 159a5154362b
Create Date: 2017-10-05 14:17:58.326646
"""
from alembic import op
import sqlalchemy as sa
import gargantext
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = '73304ae9f1fb'
down_revision = '159a5154362b'
branch_labels = None
depends_on = None
def upgrade():
    """Tighten the ``nodes`` table: make core columns NOT NULL and give them
    server-side defaults, so rows inserted outside the ORM stay well-formed.

    Order mirrors the model changes; each ``alter_column`` is reversed in
    :func:`downgrade` in the opposite order.
    """
    # `date` now defaults to the insertion time, computed by the server.
    op.alter_column('nodes', 'date',
                    existing_type=postgresql.TIMESTAMP(timezone=True),
                    server_default=sa.text('CURRENT_TIMESTAMP'),
                    nullable=False)
    # `hyperdata` defaults to an empty JSONB object instead of NULL.
    op.alter_column('nodes', 'hyperdata',
                    existing_type=postgresql.JSONB(astext_type=sa.Text()),
                    server_default=sa.text("'{}'::jsonb"),
                    nullable=False)
    # `name` defaults to the empty string instead of NULL.
    op.alter_column('nodes', 'name',
                    existing_type=sa.VARCHAR(length=255),
                    server_default='',
                    nullable=False)
    # `typename` and `user_id` get no default: callers must supply them.
    op.alter_column('nodes', 'typename',
                    existing_type=sa.INTEGER(),
                    nullable=False)
    op.alter_column('nodes', 'user_id',
                    existing_type=sa.INTEGER(),
                    nullable=False)
def downgrade():
    """Revert :func:`upgrade`: restore nullability and drop the server-side
    defaults on the ``nodes`` columns, in the reverse order of `upgrade`.
    """
    op.alter_column('nodes', 'user_id',
                    existing_type=sa.INTEGER(),
                    nullable=True)
    op.alter_column('nodes', 'typename',
                    existing_type=sa.INTEGER(),
                    nullable=True)
    # `server_default=None` removes the default set during upgrade.
    op.alter_column('nodes', 'name',
                    existing_type=sa.VARCHAR(length=255),
                    server_default=None,
                    nullable=True)
    op.alter_column('nodes', 'hyperdata',
                    existing_type=postgresql.JSONB(astext_type=sa.Text()),
                    server_default=None,
                    nullable=True)
    op.alter_column('nodes', 'date',
                    existing_type=postgresql.TIMESTAMP(timezone=True),
                    server_default=None,
                    nullable=True)
from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
from sqlalchemy.orm import relationship, validates from sqlalchemy.orm import relationship, validates
from sqlalchemy.types import TypeDecorator, \ from sqlalchemy.types import TypeDecorator, \
Integer, Float, Boolean, DateTime, String, Text Integer, REAL, Boolean, DateTime, String, Text
from sqlalchemy_utils.types import TSVectorType from sqlalchemy_utils.types import TSVectorType
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
from sqlalchemy.ext.mutable import MutableDict, MutableList from sqlalchemy.ext.mutable import MutableDict, MutableList
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import text
__all__ = ["Column", "ForeignKey", "UniqueConstraint", "Index", "relationship", __all__ = ["Column", "ForeignKey", "UniqueConstraint", "Index", "relationship",
"text",
"validates", "ValidatorMixin", "validates", "ValidatorMixin",
"Integer", "Float", "Boolean", "DateTime", "String", "Text", "Integer", "Float", "Boolean", "DateTime", "String", "Text",
"TSVectorType", "TSVectorType",
...@@ -27,6 +29,16 @@ Base = declarative_base() ...@@ -27,6 +29,16 @@ Base = declarative_base()
DjangoBase = declarative_base() DjangoBase = declarative_base()
class Float(REAL):
"""Reflect exact REAL type for PostgreSQL in order to avoid confusion
within Alembic type comparison"""
def __init__(self, *args, **kwargs):
if kwargs.get('precision') == 24:
kwargs.pop('precision')
super(Float, self).__init__(*args, **kwargs)
class ValidatorMixin(object): class ValidatorMixin(object):
def enforce_length(self, key, value): def enforce_length(self, key, value):
"""Truncate a string according to its column length """Truncate a string according to its column length
......
...@@ -6,7 +6,7 @@ from datetime import datetime ...@@ -6,7 +6,7 @@ from datetime import datetime
from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \ from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
Integer, Float, String, DateTime, JSONB, TSVectorType, \ Integer, Float, String, DateTime, JSONB, TSVectorType, \
MutableList, MutableDict, validates, ValidatorMixin MutableList, MutableDict, validates, ValidatorMixin, text
from .users import User from .users import User
__all__ = ['Node', 'NodeNode', 'CorpusNode'] __all__ = ['Node', 'NodeNode', 'CorpusNode']
...@@ -59,20 +59,23 @@ class Node(ValidatorMixin, Base): ...@@ -59,20 +59,23 @@ class Node(ValidatorMixin, Base):
id = Column(Integer, primary_key=True) id = Column(Integer, primary_key=True)
typename = Column(NodeType, index=True) typename = Column(NodeType, index=True, nullable=False)
__mapper_args__ = { 'polymorphic_on': typename } __mapper_args__ = { 'polymorphic_on': typename }
# foreign keys # foreign keys
user_id = Column(Integer, ForeignKey(User.id, ondelete='CASCADE')) user_id = Column(Integer, ForeignKey(User.id, ondelete='CASCADE'),
nullable=False)
user = relationship(User) user = relationship(User)
parent_id = Column(Integer, ForeignKey('nodes.id', ondelete='CASCADE')) parent_id = Column(Integer, ForeignKey('nodes.id', ondelete='CASCADE'))
parent = relationship('Node', remote_side=[id]) parent = relationship('Node', remote_side=[id])
name = Column(String(255)) name = Column(String(255), nullable=False, server_default='')
date = Column(DateTime(timezone=True), default=datetime.now) date = Column(DateTime(timezone=True), nullable=False,
server_default=text('CURRENT_TIMESTAMP'))
hyperdata = Column(JSONB, default=dict) hyperdata = Column(JSONB, default=dict, nullable=False,
server_default=text("'{}'::jsonb"))
# Create a TSVECTOR column to use fulltext search feature of PostgreSQL. # Create a TSVECTOR column to use fulltext search feature of PostgreSQL.
# We need to create a trigger to update this column on update and insert, # We need to create a trigger to update this column on update and insert,
......
...@@ -7,6 +7,8 @@ the name of "ReplaceableObject" class. ...@@ -7,6 +7,8 @@ the name of "ReplaceableObject" class.
This recipe is directly borrowed from Alembic documentation, see This recipe is directly borrowed from Alembic documentation, see
http://alembic.zzzcomputing.com/en/latest/cookbook.html#replaceable-objects http://alembic.zzzcomputing.com/en/latest/cookbook.html#replaceable-objects
**2017-10-09** ReversibleOp.define has been added to reduce boilerplate code.
""" """
from alembic.operations import Operations, MigrateOperation from alembic.operations import Operations, MigrateOperation
...@@ -58,44 +60,34 @@ class ReversibleOp(MigrateOperation): ...@@ -58,44 +60,34 @@ class ReversibleOp(MigrateOperation):
operations.invoke(drop_old) operations.invoke(drop_old)
operations.invoke(create_new) operations.invoke(create_new)
@classmethod
def define(cls, name, cname=None, register=Operations.register_operation):
def create(self):
return CreateOp(self.target)
@Operations.register_operation("create_view", "invoke_for_target") def drop(self):
@Operations.register_operation("replace_view", "replace") return DropOp(self.target)
class CreateViewOp(ReversibleOp):
def reverse(self):
return DropViewOp(self.target)
@Operations.register_operation("drop_view", "invoke_for_target")
class DropViewOp(ReversibleOp):
def reverse(self):
return CreateViewOp(self.view)
@Operations.register_operation("create_sp", "invoke_for_target") name = name.lower()
@Operations.register_operation("replace_sp", "replace") cname = cname or name.capitalize()
class CreateSPOp(ReversibleOp):
def reverse(self):
return DropSPOp(self.target)
CreateOp = type('Create%sOp' % cname, (ReversibleOp,), {'reverse': drop})
DropOp = type('Drop%sOp' % cname, (ReversibleOp,), {'reverse': create})
@Operations.register_operation("drop_sp", "invoke_for_target") CreateOp = register('create_' + name, 'invoke_for_target')(CreateOp)
class DropSPOp(ReversibleOp): CreateOp = register('replace_' + name, 'replace')(CreateOp)
def reverse(self):
return CreateSPOp(self.target)
DropOp = register('drop_' + name, 'invoke_for_target')(DropOp)
@Operations.register_operation("create_trigger", "invoke_for_target") return (CreateOp, DropOp)
@Operations.register_operation("replace_trigger", "replace")
class CreateTriggerOp(ReversibleOp):
def reverse(self):
return DropTriggerOp(self.target)
@Operations.register_operation("drop_trigger", "invoke_for_target") CreateViewOp, DropViewOp = ReversibleOp.define('view')
class DropTriggerOp(ReversibleOp): CreateRoleOp, DropRoleOp = ReversibleOp.define('role')
def reverse(self): CreateSchemaOp, DropSchemaOp = ReversibleOp.define('schema')
return CreateTriggerOp(self.target) CreateSPOp, DropSPOp = ReversibleOp.define('sp', 'SP')
CreateTriggerOp, DropTriggerOp = ReversibleOp.define('trigger')
CreatePolicyOp, DropPolicyOp = ReversibleOp.define('policy')
@Operations.implementation_for(CreateViewOp) @Operations.implementation_for(CreateViewOp)
...@@ -111,6 +103,32 @@ def drop_view(operations, operation): ...@@ -111,6 +103,32 @@ def drop_view(operations, operation):
operations.execute("DROP VIEW %s" % operation.target.name) operations.execute("DROP VIEW %s" % operation.target.name)
@Operations.implementation_for(CreateRoleOp)
def create_role(operations, operation):
    """Emit ``CREATE ROLE <name> WITH <options>`` for *operation*'s target.

    The role name comes from ``operation.target.name``; the first positional
    argument of the ReplaceableObject, if any, supplies the ``WITH`` options.
    Defaults to ``NOLOGIN`` so roles are not accidentally login-enabled.
    """
    args = operation.target.args
    # Idiomatic truthiness test instead of `len(args)`.
    options = args[0] if args else 'NOLOGIN'
    # NOTE(review): name/options are interpolated straight into SQL; they
    # originate from migration authors, not untrusted input.
    operations.execute(
        "CREATE ROLE %s WITH %s" % (operation.target.name, options)
    )
@Operations.implementation_for(DropRoleOp)
def drop_role(operations, operation):
    """Emit ``DROP ROLE`` for the role named by *operation*'s target."""
    role_name = operation.target.name
    operations.execute("DROP ROLE %s" % role_name)
@Operations.implementation_for(CreateSchemaOp)
def create_schema(operations, operation):
    """Emit ``CREATE SCHEMA`` for the schema named by *operation*'s target."""
    schema_name = operation.target.name
    operations.execute("CREATE SCHEMA %s" % schema_name)
@Operations.implementation_for(DropSchemaOp)
def drop_schema(operations, operation):
    """Emit ``DROP SCHEMA`` for the schema named by *operation*'s target."""
    schema_name = operation.target.name
    operations.execute("DROP SCHEMA %s" % schema_name)
@Operations.implementation_for(CreateSPOp) @Operations.implementation_for(CreateSPOp)
def create_sp(operations, operation): def create_sp(operations, operation):
operations.execute( operations.execute(
...@@ -143,3 +161,24 @@ def drop_trigger(operations, operation): ...@@ -143,3 +161,24 @@ def drop_trigger(operations, operation):
operation.target.args[1] operation.target.args[1]
) )
) )
@Operations.implementation_for(CreatePolicyOp)
def create_policy(operations, operation):
    """Emit ``CREATE POLICY <name> ON <table> <body>``.

    ``target.args[0]`` is the table name, ``target.args[1]`` the policy body.
    """
    target = operation.target
    table, body = target.args[0], target.args[1]
    operations.execute(
        "CREATE POLICY %s ON %s %s" % (target.name, table, body)
    )
@Operations.implementation_for(DropPolicyOp)
def drop_policy(operations, operation):
    """Emit ``DROP POLICY <name> ON <table>``.

    ``target.args[0]`` is the table the policy is attached to.
    """
    target = operation.target
    table = target.args[0]
    operations.execute("DROP POLICY %s ON %s" % (target.name, table))
from ._Parser import Parser from ._Parser import Parser
# from ..NgramsExtractors import * import pandas
import sys import io
import csv
csv.field_size_limit(sys.maxsize)
import numpy as np
class CSVParser(Parser): class CSVParser(Parser):
DELIMITERS = ", \t;|:" ENCODING = "utf-8"
def detect_delimiter(self, lines, sample_size=10): def open(self, file):
sample = lines[:sample_size] f = super(CSVParser, self).open(file)
# Compute frequency of each delimiter on each input line if isinstance(file, str) and file.endswith('.zip'):
delimiters_freqs = { return f
d: [line.count(d) for line in sample]
for d in self.DELIMITERS
}
# Select delimiters with a standard deviation of zero, ie. delimiters return io.TextIOWrapper(f, encoding=self.ENCODING)
# for which we have the same number of fields on each line
selected_delimiters = [
(d, np.sum(freqs))
for d, freqs in delimiters_freqs.items()
if any(freqs) and np.std(freqs) == 0
]
if selected_delimiters: def parse(self, fp=None):
# Choose the delimiter with highest frequency amongst selected ones fp = fp or self._file
sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1]) df = pandas.read_csv(fp, dtype=object, skip_blank_lines=True)
return sorted_delimiters[-1][0]
def parse(self, filebuf):
print("CSV: parsing (assuming UTF-8 and LF line endings)")
contents = filebuf.read().decode("UTF-8").split("\n")
# Filter out empty lines
contents = [line for line in contents if line.strip()]
# Delimiter auto-detection
delimiter = self.detect_delimiter(contents, sample_size=10)
if delimiter is None:
raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")
print("CSV: selected delimiter: %r" % delimiter)
# Parse CSV
reader = csv.reader(contents, delimiter=delimiter)
# Get first not empty row and its fields (ie. header row), or (0, [])
first_row, headers = \
next(((i, fields) for i, fields in enumerate(reader) if any(fields)),
(0, []))
# Get first not empty column of the first row, or 0
first_col = next((i for i, field in enumerate(headers) if field), 0)
# Strip out potential empty fields in headers
headers = headers[first_col:]
# Return a generator of dictionaries with column labels as keys, # Return a generator of dictionaries with column labels as keys,
# filtering out empty rows # filtering out empty rows
for i, fields in enumerate(reader): for i, fields in enumerate(df.itertuples(index=False)):
if i % 500 == 0: if i % 500 == 0:
print("CSV: parsing row #%s..." % (i+1)) print("CSV: parsing row #%s..." % (i+1))
if any(fields):
yield dict(zip(headers, fields[first_col:])) # See https://docs.python.org/3/library/collections.html#collections.somenamedtuple._asdict
yield fields._asdict()
...@@ -14,15 +14,14 @@ class Parser: ...@@ -14,15 +14,14 @@ class Parser:
""" """
def __init__(self, file): def __init__(self, file):
if isinstance(file, str): self._file = self.open(file)
self._file = open(file, 'rb')
else:
self._file = file
def __del__(self): def __del__(self):
if hasattr(self, '_file'): if hasattr(self, '_file'):
self._file.close() self._file.close()
def open(self, file):
return open(file, 'rb') if isinstance(file, str) else file
def detect_encoding(self, string): def detect_encoding(self, string):
"""Useful method to detect the encoding of a document. """Useful method to detect the encoding of a document.
...@@ -165,11 +164,10 @@ class Parser: ...@@ -165,11 +164,10 @@ class Parser:
file = self._file file = self._file
# if the file is a ZIP archive, recurse on each of its files... # if the file is a ZIP archive, recurse on each of its files...
if zipfile.is_zipfile(file): if zipfile.is_zipfile(file):
zipArchive = zipfile.ZipFile(file) with zipfile.ZipFile(file) as zf:
for filename in zipArchive.namelist(): for filename in zf.namelist():
f = zipArchive.open(filename, 'r') with zf.open(filename) as df, self.open(df) as f:
yield from self.__iter__(f) yield from self.__iter__(f)
f.close()
# ...otherwise, let's parse it directly! # ...otherwise, let's parse it directly!
else: else:
try: try:
......
...@@ -61,12 +61,14 @@ def nodes(parent=None, group_by='typename', order_by='typename', has_child='chec ...@@ -61,12 +61,14 @@ def nodes(parent=None, group_by='typename', order_by='typename', has_child='chec
def node_show(node, prefix='', maxlen=60): def node_show(node, prefix='', maxlen=60):
if node.children > 0 or node.cnt == 1: if node.children > 0 or node.cnt == 1:
node_id = '<{}> '.format(node.id)
name = node.name[:maxlen] + '..' if len(node.name) > maxlen else node.name name = node.name[:maxlen] + '..' if len(node.name) > maxlen else node.name
label = Fore.CYAN + name + Fore.RESET label = node_id + Fore.CYAN + name + Fore.RESET
else: else:
label = Fore.MAGENTA + str(node.cnt) + Fore.RESET label = Fore.MAGENTA + str(node.cnt) + Fore.RESET
print(prefix, '%s%s %s' % (Fore.GREEN, node.typename, label), sep='') typename = Fore.GREEN + node.typename + Fore.RESET
print(prefix, '%s %s' % (typename, label), sep='')
def tree_show(node, pos=FIRST|LAST, level=0, prefix='', maxlen=60, compact=True): def tree_show(node, pos=FIRST|LAST, level=0, prefix='', maxlen=60, compact=True):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment