Commit 372d44d1 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'origin/simon-unstable-fix-csv-issue' into dev-merge

parents 3ef753ca 7367f14d
......@@ -6,6 +6,21 @@ Keep in mind that Alembic only handles SQLAlchemy models: tables created from
Django ORM must be put out of Alembic sight. See [alembic:exclude] section in
alembic.ini.
To bootstrap Alembic where a gargantext database is already existing see
below: TELL ALEMBIC TO NOT START FROM SCRATCH.
USUAL WORKFLOW WITH ALEMBIC
1. Make change to models in gargantext/models
2. Autogenerate revision (see below GENERATE A REVISION)
3. Manually check and edit revision file in alembic/versions
4. Commit alembic revision (it should never be reverted)
5. Commit changes in models (it can be reverted if needed)
To create, drop or modify views, schemas, roles, stored procedures, triggers or
policies see below: REPLACEABLE OBJECTS.
TELL ALEMBIC TO NOT START FROM SCRATCH
......@@ -29,25 +44,76 @@ DOWNGRADE TO INITIAL DATABASE STATE
alembic downgrade base
GENERATE A NEW REVISION
GENERATE A REVISION
alembic revision -m "Message for this migration"
alembic revision --autogenerate -m "Message for this migration"
# A migration script is then created in alembic/versions directory. For
# example alembic/versions/3adcc9a56557_message_for_this_migration.py
# where 3adcc9a56557 is the revision id generated by Alembic.
#
# Alembic should generate a script reflecting changes already made in
# models or database. However it is always a good idea to check it and edit
# it manually, Alembic is not always accurate and can't see all alterations.
# It should work with basic changes such as model or column creation. See
# http://alembic.zzzcomputing.com/en/latest/autogenerate.html#what-does-autogenerate-detect-and-what-does-it-not-detect
GENERATE AN EMPTY REVISION
alembic revision -m "Message for this migration"
# This script must be edited to write the migration itself, mainly
# in `upgrade` and `downgrade` functions. See Alembic documentation for
# further details.
GENERATE A REVISION FROM CURRENT STATE
REPLACEABLE OBJECTS
alembic revision --autogenerate -m "Message for this migration"
There is no specific way no handle views, schemas, roles, stored procedures,
triggers or policies with Alembic. To ease revisions of such objects, avoid
boilerplate code and too much op.execute we use an enhanced version of
ReplaceableObject recipe (see Alembic documentation).
# Alembic should generate a script reflecting changes already made in
# database. However it is always a good idea to check it and edit it
# manually, Alembic is not always accurate and can't see all alterations.
# It should work with basic changes such as model or column creation. See
# http://alembic.zzzcomputing.com/en/latest/autogenerate.html#what-does-autogenerate-detect-and-what-does-it-not-detect
To create, drop or modify such object you need to make a ReplaceableObject
instance, and then use create_*, drop_* or replace_* method of alembic.op.
Conversion between ReplaceableObject and SQL is implemented in
gargantext/util/alembic.py.
* Views: create_view(ReplaceableObject(<name>, <query>))
* Roles: create_role(ReplaceableObject(<name>, <options>))
* Schemas: create_schema(ReplaceableObject(<name>))
* Stored procedures: create_sp(ReplaceableObject(<name(arguments)>, <body>)
* Triggers: create_trigger(ReplaceableObject(<name>, <when>, <table>, <body>))
* Policies: create_policy(ReplaceableObject(<name>, <table>, <body>))
Here is an example with a stored procedure:
...
from gargantext.util.alembic import ReplaceableObject
revision = '08230100f512'
...
my_function_sp = ReplaceableObject(
"my_function()", "RETURNS integer AS $$ SELECT 42 $$ LANGUAGE sql")
def upgrade():
op.create_sp(my_function_sp)
def downgrade():
op.drop_sp(my_function_sp)
To modify this stored procedure in a later revision:
...
from gargantext.util.alembic import ReplaceableObject
my_function_sp = ReplaceableObject(
"my_function()", "RETURNS integer AS $$ SELECT 43 $$ LANGUAGE sql")
def upgrade():
op.replace_sp(my_function_sp, replaces="08230100f512.my_function_sp")
def downgrade():
op.replace_sp(my_function_sp, replace_with="08230100f512.my_function_sp")
......@@ -52,6 +52,14 @@ def include_object(obj, name, typ, reflected, compare_to):
return True
context_opts = dict(
target_metadata=target_metadata,
include_object=include_object,
compare_server_default=True,
compare_type=True,
)
def run_migrations_offline():
"""Run migrations in 'offline' mode.
......@@ -65,9 +73,7 @@ def run_migrations_offline():
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url, target_metadata=target_metadata, literal_binds=True,
include_object=include_object)
context.configure(url=url, literal_binds=True, **context_opts)
with context.begin_transaction():
context.run_migrations()
......@@ -86,11 +92,7 @@ def run_migrations_online():
poolclass=pool.NullPool)
with connectable.connect() as connection:
context.configure(
connection=connection,
target_metadata=target_metadata,
include_object=include_object
)
context.configure(connection=connection, **context_opts)
with context.begin_transaction():
context.run_migrations()
......
"""Add server side sensible defaults for nodes
Revision ID: 73304ae9f1fb
Revises: 159a5154362b
Create Date: 2017-10-05 14:17:58.326646
"""
from alembic import op
import sqlalchemy as sa
import gargantext
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = '73304ae9f1fb'
down_revision = '159a5154362b'
branch_labels = None
depends_on = None
def upgrade():
op.alter_column('nodes', 'date',
existing_type=postgresql.TIMESTAMP(timezone=True),
server_default=sa.text('CURRENT_TIMESTAMP'),
nullable=False)
op.alter_column('nodes', 'hyperdata',
existing_type=postgresql.JSONB(astext_type=sa.Text()),
server_default=sa.text("'{}'::jsonb"),
nullable=False)
op.alter_column('nodes', 'name',
existing_type=sa.VARCHAR(length=255),
server_default='',
nullable=False)
op.alter_column('nodes', 'typename',
existing_type=sa.INTEGER(),
nullable=False)
op.alter_column('nodes', 'user_id',
existing_type=sa.INTEGER(),
nullable=False)
def downgrade():
op.alter_column('nodes', 'user_id',
existing_type=sa.INTEGER(),
nullable=True)
op.alter_column('nodes', 'typename',
existing_type=sa.INTEGER(),
nullable=True)
op.alter_column('nodes', 'name',
existing_type=sa.VARCHAR(length=255),
server_default=None,
nullable=True)
op.alter_column('nodes', 'hyperdata',
existing_type=postgresql.JSONB(astext_type=sa.Text()),
server_default=None,
nullable=True)
op.alter_column('nodes', 'date',
existing_type=postgresql.TIMESTAMP(timezone=True),
server_default=None,
nullable=True)
from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
from sqlalchemy.orm import relationship, validates
from sqlalchemy.types import TypeDecorator, \
Integer, Float, Boolean, DateTime, String, Text
Integer, REAL, Boolean, DateTime, String, Text
from sqlalchemy_utils.types import TSVectorType
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
from sqlalchemy.ext.mutable import MutableDict, MutableList
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import text
__all__ = ["Column", "ForeignKey", "UniqueConstraint", "Index", "relationship",
"text",
"validates", "ValidatorMixin",
"Integer", "Float", "Boolean", "DateTime", "String", "Text",
"TSVectorType",
......@@ -27,6 +29,16 @@ Base = declarative_base()
DjangoBase = declarative_base()
class Float(REAL):
"""Reflect exact REAL type for PostgreSQL in order to avoid confusion
within Alembic type comparison"""
def __init__(self, *args, **kwargs):
if kwargs.get('precision') == 24:
kwargs.pop('precision')
super(Float, self).__init__(*args, **kwargs)
class ValidatorMixin(object):
def enforce_length(self, key, value):
"""Truncate a string according to its column length
......
......@@ -6,7 +6,7 @@ from datetime import datetime
from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
Integer, Float, String, DateTime, JSONB, TSVectorType, \
MutableList, MutableDict, validates, ValidatorMixin
MutableList, MutableDict, validates, ValidatorMixin, text
from .users import User
__all__ = ['Node', 'NodeNode', 'CorpusNode']
......@@ -59,20 +59,23 @@ class Node(ValidatorMixin, Base):
id = Column(Integer, primary_key=True)
typename = Column(NodeType, index=True)
typename = Column(NodeType, index=True, nullable=False)
__mapper_args__ = { 'polymorphic_on': typename }
# foreign keys
user_id = Column(Integer, ForeignKey(User.id, ondelete='CASCADE'))
user_id = Column(Integer, ForeignKey(User.id, ondelete='CASCADE'),
nullable=False)
user = relationship(User)
parent_id = Column(Integer, ForeignKey('nodes.id', ondelete='CASCADE'))
parent = relationship('Node', remote_side=[id])
name = Column(String(255))
date = Column(DateTime(timezone=True), default=datetime.now)
name = Column(String(255), nullable=False, server_default='')
date = Column(DateTime(timezone=True), nullable=False,
server_default=text('CURRENT_TIMESTAMP'))
hyperdata = Column(JSONB, default=dict)
hyperdata = Column(JSONB, default=dict, nullable=False,
server_default=text("'{}'::jsonb"))
# Create a TSVECTOR column to use fulltext search feature of PostgreSQL.
# We need to create a trigger to update this column on update and insert,
......
......@@ -7,6 +7,8 @@ the name of "ReplaceableObject" class.
This recipe is directly borrowed from Alembic documentation, see
http://alembic.zzzcomputing.com/en/latest/cookbook.html#replaceable-objects
**2017-10-09** ReversibleOp.define has been added to reduce boilerplate code.
"""
from alembic.operations import Operations, MigrateOperation
......@@ -58,44 +60,34 @@ class ReversibleOp(MigrateOperation):
operations.invoke(drop_old)
operations.invoke(create_new)
@classmethod
def define(cls, name, cname=None, register=Operations.register_operation):
def create(self):
return CreateOp(self.target)
@Operations.register_operation("create_view", "invoke_for_target")
@Operations.register_operation("replace_view", "replace")
class CreateViewOp(ReversibleOp):
def reverse(self):
return DropViewOp(self.target)
@Operations.register_operation("drop_view", "invoke_for_target")
class DropViewOp(ReversibleOp):
def reverse(self):
return CreateViewOp(self.view)
def drop(self):
return DropOp(self.target)
@Operations.register_operation("create_sp", "invoke_for_target")
@Operations.register_operation("replace_sp", "replace")
class CreateSPOp(ReversibleOp):
def reverse(self):
return DropSPOp(self.target)
name = name.lower()
cname = cname or name.capitalize()
CreateOp = type('Create%sOp' % cname, (ReversibleOp,), {'reverse': drop})
DropOp = type('Drop%sOp' % cname, (ReversibleOp,), {'reverse': create})
@Operations.register_operation("drop_sp", "invoke_for_target")
class DropSPOp(ReversibleOp):
def reverse(self):
return CreateSPOp(self.target)
CreateOp = register('create_' + name, 'invoke_for_target')(CreateOp)
CreateOp = register('replace_' + name, 'replace')(CreateOp)
DropOp = register('drop_' + name, 'invoke_for_target')(DropOp)
@Operations.register_operation("create_trigger", "invoke_for_target")
@Operations.register_operation("replace_trigger", "replace")
class CreateTriggerOp(ReversibleOp):
def reverse(self):
return DropTriggerOp(self.target)
return (CreateOp, DropOp)
@Operations.register_operation("drop_trigger", "invoke_for_target")
class DropTriggerOp(ReversibleOp):
def reverse(self):
return CreateTriggerOp(self.target)
CreateViewOp, DropViewOp = ReversibleOp.define('view')
CreateRoleOp, DropRoleOp = ReversibleOp.define('role')
CreateSchemaOp, DropSchemaOp = ReversibleOp.define('schema')
CreateSPOp, DropSPOp = ReversibleOp.define('sp', 'SP')
CreateTriggerOp, DropTriggerOp = ReversibleOp.define('trigger')
CreatePolicyOp, DropPolicyOp = ReversibleOp.define('policy')
@Operations.implementation_for(CreateViewOp)
......@@ -111,6 +103,32 @@ def drop_view(operations, operation):
operations.execute("DROP VIEW %s" % operation.target.name)
@Operations.implementation_for(CreateRoleOp)
def create_role(operations, operation):
args = operation.target.args
operations.execute(
"CREATE ROLE %s WITH %s" % (
operation.target.name,
args[0] if len(args) else 'NOLOGIN'
)
)
@Operations.implementation_for(DropRoleOp)
def drop_role(operations, operation):
operations.execute("DROP ROLE %s" % operation.target.name)
@Operations.implementation_for(CreateSchemaOp)
def create_schema(operations, operation):
operations.execute("CREATE SCHEMA %s" % operation.target.name)
@Operations.implementation_for(DropSchemaOp)
def drop_schema(operations, operation):
operations.execute("DROP SCHEMA %s" % operation.target.name)
@Operations.implementation_for(CreateSPOp)
def create_sp(operations, operation):
operations.execute(
......@@ -143,3 +161,24 @@ def drop_trigger(operations, operation):
operation.target.args[1]
)
)
@Operations.implementation_for(CreatePolicyOp)
def create_policy(operations, operation):
operations.execute(
"CREATE POLICY %s ON %s %s" % (
operation.target.name,
operation.target.args[0],
operation.target.args[1],
)
)
@Operations.implementation_for(DropPolicyOp)
def drop_policy(operations, operation):
operations.execute(
"DROP POLICY %s ON %s" % (
operation.target.name,
operation.target.args[0],
)
)
from ._Parser import Parser
# from ..NgramsExtractors import *
import sys
import csv
csv.field_size_limit(sys.maxsize)
import numpy as np
import pandas
import io
class CSVParser(Parser):
DELIMITERS = ", \t;|:"
ENCODING = "utf-8"
def detect_delimiter(self, lines, sample_size=10):
sample = lines[:sample_size]
def open(self, file):
f = super(CSVParser, self).open(file)
# Compute frequency of each delimiter on each input line
delimiters_freqs = {
d: [line.count(d) for line in sample]
for d in self.DELIMITERS
}
if isinstance(file, str) and file.endswith('.zip'):
return f
# Select delimiters with a standard deviation of zero, ie. delimiters
# for which we have the same number of fields on each line
selected_delimiters = [
(d, np.sum(freqs))
for d, freqs in delimiters_freqs.items()
if any(freqs) and np.std(freqs) == 0
]
return io.TextIOWrapper(f, encoding=self.ENCODING)
if selected_delimiters:
# Choose the delimiter with highest frequency amongst selected ones
sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1])
return sorted_delimiters[-1][0]
def parse(self, filebuf):
print("CSV: parsing (assuming UTF-8 and LF line endings)")
contents = filebuf.read().decode("UTF-8").split("\n")
# Filter out empty lines
contents = [line for line in contents if line.strip()]
# Delimiter auto-detection
delimiter = self.detect_delimiter(contents, sample_size=10)
if delimiter is None:
raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")
print("CSV: selected delimiter: %r" % delimiter)
# Parse CSV
reader = csv.reader(contents, delimiter=delimiter)
# Get first not empty row and its fields (ie. header row), or (0, [])
first_row, headers = \
next(((i, fields) for i, fields in enumerate(reader) if any(fields)),
(0, []))
# Get first not empty column of the first row, or 0
first_col = next((i for i, field in enumerate(headers) if field), 0)
# Strip out potential empty fields in headers
headers = headers[first_col:]
def parse(self, fp=None):
fp = fp or self._file
df = pandas.read_csv(fp, dtype=object, skip_blank_lines=True)
# Return a generator of dictionaries with column labels as keys,
# filtering out empty rows
for i, fields in enumerate(reader):
for i, fields in enumerate(df.itertuples(index=False)):
if i % 500 == 0:
print("CSV: parsing row #%s..." % (i+1))
if any(fields):
yield dict(zip(headers, fields[first_col:]))
# See https://docs.python.org/3/library/collections.html#collections.somenamedtuple._asdict
yield fields._asdict()
......@@ -14,15 +14,14 @@ class Parser:
"""
def __init__(self, file):
if isinstance(file, str):
self._file = open(file, 'rb')
else:
self._file = file
self._file = self.open(file)
def __del__(self):
if hasattr(self, '_file'):
self._file.close()
def open(self, file):
return open(file, 'rb') if isinstance(file, str) else file
def detect_encoding(self, string):
"""Useful method to detect the encoding of a document.
......@@ -165,11 +164,10 @@ class Parser:
file = self._file
# if the file is a ZIP archive, recurse on each of its files...
if zipfile.is_zipfile(file):
zipArchive = zipfile.ZipFile(file)
for filename in zipArchive.namelist():
f = zipArchive.open(filename, 'r')
with zipfile.ZipFile(file) as zf:
for filename in zf.namelist():
with zf.open(filename) as df, self.open(df) as f:
yield from self.__iter__(f)
f.close()
# ...otherwise, let's parse it directly!
else:
try:
......
......@@ -61,12 +61,14 @@ def nodes(parent=None, group_by='typename', order_by='typename', has_child='chec
def node_show(node, prefix='', maxlen=60):
if node.children > 0 or node.cnt == 1:
node_id = '<{}> '.format(node.id)
name = node.name[:maxlen] + '..' if len(node.name) > maxlen else node.name
label = Fore.CYAN + name + Fore.RESET
label = node_id + Fore.CYAN + name + Fore.RESET
else:
label = Fore.MAGENTA + str(node.cnt) + Fore.RESET
print(prefix, '%s%s %s' % (Fore.GREEN, node.typename, label), sep='')
typename = Fore.GREEN + node.typename + Fore.RESET
print(prefix, '%s %s' % (typename, label), sep='')
def tree_show(node, pos=FIRST|LAST, level=0, prefix='', maxlen=60, compact=True):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment