Merge remote-tracking branch 'origin/simon-unstable-fix-csv-issue' into dev-merge

372d44d1 · Alexandre Delanoë · 3ef753ca · 7367f14d · 372d44d1 · 372d44d1
Commit 372d44d1 authored Oct 23, 2017 by Alexandre Delanoë
9 changed files
--- a/alembic/README
+++ b/alembic/README
@@ -6,6 +6,21 @@ Keep in mind that Alembic only handles SQLAlchemy models: tables created from
 Django ORM must be put out of Alembic sight. See [alembic:exclude] section in
 alembic.ini.
+To bootstrap Alembic where a gargantext database is already existing see
+below: TELL ALEMBIC TO NOT START FROM SCRATCH.
+USUAL WORKFLOW WITH ALEMBIC
+1. Make change to models in gargantext/models
+2. Autogenerate revision (see below GENERATE A REVISION)
+3. Manually check and edit revision file in alembic/versions
+4. Commit alembic revision (it should never be reverted)
+5. Commit changes in models (it can be reverted if needed)
+To create, drop or modify views, schemas, roles, stored procedures, triggers or
+policies see below: REPLACEABLE OBJECTS.
 TELL ALEMBIC TO NOT START FROM SCRATCH
@@ -29,25 +44,76 @@ DOWNGRADE TO INITIAL DATABASE STATE
    alembic downgrade base
-GENERATE A NEW REVISION
+GENERATE A REVISION
-    alembic revision -m "Message for this migration"
+    alembic revision --autogenerate -m "Message for this migration"
    # A migration script is then created in alembic/versions directory. For
    # example alembic/versions/3adcc9a56557_message_for_this_migration.py
    # where 3adcc9a56557 is the revision id generated by Alembic.
    #
+    # Alembic should generate a script reflecting changes already made in
+    # models or database. However it is always a good idea to check it and edit
+    # it manually, Alembic is not always accurate and can't see all alterations.
+    # It should work with basic changes such as model or column creation. See
+    # http://alembic.zzzcomputing.com/en/latest/autogenerate.html#what-does-autogenerate-detect-and-what-does-it-not-detect
+GENERATE AN EMPTY REVISION
+    alembic revision -m "Message for this migration"
    # This script must be edited to write the migration itself, mainly
    # in `upgrade` and `downgrade` functions. See Alembic documentation for
    # further details.
-GENERATE A REVISION FROM CURRENT STATE
+REPLACEABLE OBJECTS
-    alembic revision --autogenerate -m "Message for this migration"
+There is no specific way no handle views, schemas, roles, stored procedures,
+triggers or policies with Alembic. To ease revisions of such objects, avoid
+boilerplate code and too much op.execute we use an enhanced version of
+ReplaceableObject recipe (see Alembic documentation).
-    # Alembic should generate a script reflecting changes already made in
+To create, drop or modify such object you need to make a ReplaceableObject
-    # database. However it is always a good idea to check it and edit it
+instance, and then use create_*, drop_* or replace_* method of alembic.op.
-    # manually, Alembic is not always accurate and can't see all alterations.
+Conversion between ReplaceableObject and SQL is implemented in
-    # It should work with basic changes such as model or column creation. See
+gargantext/util/alembic.py.
-    # http://alembic.zzzcomputing.com/en/latest/autogenerate.html#what-does-autogenerate-detect-and-what-does-it-not-detect
+* Views: create_view(ReplaceableObject(<name>, <query>))
+* Roles: create_role(ReplaceableObject(<name>, <options>))
+* Schemas: create_schema(ReplaceableObject(<name>))
+* Stored procedures: create_sp(ReplaceableObject(<name(arguments)>, <body>)
+* Triggers: create_trigger(ReplaceableObject(<name>, <when>, <table>, <body>))
+* Policies: create_policy(ReplaceableObject(<name>, <table>, <body>))
+Here is an example with a stored procedure:
+    ...
+    from gargantext.util.alembic import ReplaceableObject
+    revision = '08230100f512'
+    ...
+    my_function_sp = ReplaceableObject(
+        "my_function()", "RETURNS integer AS $$ SELECT 42 $$ LANGUAGE sql")
+    def upgrade():
+        op.create_sp(my_function_sp)
+    def downgrade():
+        op.drop_sp(my_function_sp)
+To modify this stored procedure in a later revision:
+    ...
+    from gargantext.util.alembic import ReplaceableObject
+    my_function_sp = ReplaceableObject(
+        "my_function()", "RETURNS integer AS $$ SELECT 43 $$ LANGUAGE sql")
+    def upgrade():
+        op.replace_sp(my_function_sp, replaces="08230100f512.my_function_sp")
+    def downgrade():
+        op.replace_sp(my_function_sp, replace_with="08230100f512.my_function_sp")
--- a/alembic/env.py
+++ b/alembic/env.py
@@ -52,6 +52,14 @@ def include_object(obj, name, typ, reflected, compare_to):
        return True
+context_opts = dict(
+    target_metadata=target_metadata,
+    include_object=include_object,
+    compare_server_default=True,
+    compare_type=True,
+)
 def run_migrations_offline():
    """Run migrations in 'offline' mode.
@@ -65,9 +73,7 @@ def run_migrations_offline():
    """
    url = config.get_main_option("sqlalchemy.url")
-    context.configure(
+    context.configure(url=url, literal_binds=True, **context_opts)
-        url=url, target_metadata=target_metadata, literal_binds=True,
-        include_object=include_object)
    with context.begin_transaction():
        context.run_migrations()
@@ -86,11 +92,7 @@ def run_migrations_online():
        poolclass=pool.NullPool)
    with connectable.connect() as connection:
-        context.configure(
+        context.configure(connection=connection, **context_opts)
-            connection=connection,
-            target_metadata=target_metadata,
-            include_object=include_object
-        )
        with context.begin_transaction():
            context.run_migrations()

--- a/alembic/versions/73304ae9f1fb_add_server_side_sensible_defaults_for_.py
+++ b/alembic/versions/73304ae9f1fb_add_server_side_sensible_defaults_for_.py
+"""Add server side sensible defaults for nodes
+Revision ID: 73304ae9f1fb
+Revises: 159a5154362b
+Create Date: 2017-10-05 14:17:58.326646
+"""
+from alembic import op
+import sqlalchemy as sa
+import gargantext
+from sqlalchemy.dialects import postgresql
+# revision identifiers, used by Alembic.
+revision = '73304ae9f1fb'
+down_revision = '159a5154362b'
+branch_labels = None
+depends_on = None
+def upgrade():
+    op.alter_column('nodes', 'date',
+               existing_type=postgresql.TIMESTAMP(timezone=True),
+               server_default=sa.text('CURRENT_TIMESTAMP'),
+               nullable=False)
+    op.alter_column('nodes', 'hyperdata',
+               existing_type=postgresql.JSONB(astext_type=sa.Text()),
+               server_default=sa.text("'{}'::jsonb"),
+               nullable=False)
+    op.alter_column('nodes', 'name',
+               existing_type=sa.VARCHAR(length=255),
+               server_default='',
+               nullable=False)
+    op.alter_column('nodes', 'typename',
+               existing_type=sa.INTEGER(),
+               nullable=False)
+    op.alter_column('nodes', 'user_id',
+               existing_type=sa.INTEGER(),
+               nullable=False)
+def downgrade():
+    op.alter_column('nodes', 'user_id',
+               existing_type=sa.INTEGER(),
+               nullable=True)
+    op.alter_column('nodes', 'typename',
+               existing_type=sa.INTEGER(),
+               nullable=True)
+    op.alter_column('nodes', 'name',
+               existing_type=sa.VARCHAR(length=255),
+               server_default=None,
+               nullable=True)
+    op.alter_column('nodes', 'hyperdata',
+               existing_type=postgresql.JSONB(astext_type=sa.Text()),
+               server_default=None,
+               nullable=True)
+    op.alter_column('nodes', 'date',
+               existing_type=postgresql.TIMESTAMP(timezone=True),
+               server_default=None,
+               nullable=True)
--- a/gargantext/models/base.py
+++ b/gargantext/models/base.py
 from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
 from sqlalchemy.orm import relationship, validates
 from sqlalchemy.types import TypeDecorator, \
-                             Integer, Float, Boolean, DateTime, String, Text
+                             Integer, REAL, Boolean, DateTime, String, Text
 from sqlalchemy_utils.types import TSVectorType
 from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
 from sqlalchemy.ext.mutable import MutableDict, MutableList
 from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import text
 __all__ = ["Column", "ForeignKey", "UniqueConstraint", "Index", "relationship",
+           "text",
           "validates", "ValidatorMixin",
           "Integer", "Float", "Boolean", "DateTime", "String", "Text",
           "TSVectorType",
@@ -27,6 +29,16 @@ Base = declarative_base()
 DjangoBase = declarative_base()
+class Float(REAL):
+    """Reflect exact REAL type for PostgreSQL in order to avoid confusion
+    within Alembic type comparison"""
+    def __init__(self, *args, **kwargs):
+        if kwargs.get('precision') == 24:
+            kwargs.pop('precision')
+        super(Float, self).__init__(*args, **kwargs)
 class ValidatorMixin(object):
    def enforce_length(self, key, value):
        """Truncate a string according to its column length

--- a/gargantext/models/nodes.py
+++ b/gargantext/models/nodes.py
@@ -6,7 +6,7 @@ from datetime import datetime
 from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
                  Integer, Float, String, DateTime, JSONB, TSVectorType, \
-                  MutableList, MutableDict, validates, ValidatorMixin
+                  MutableList, MutableDict, validates, ValidatorMixin, text
 from .users import User
 __all__ = ['Node', 'NodeNode', 'CorpusNode']
@@ -59,20 +59,23 @@ class Node(ValidatorMixin, Base):
    id = Column(Integer, primary_key=True)
-    typename = Column(NodeType, index=True)
+    typename = Column(NodeType, index=True, nullable=False)
    __mapper_args__ = { 'polymorphic_on': typename }
    # foreign keys
-    user_id       = Column(Integer, ForeignKey(User.id, ondelete='CASCADE'))
+    user_id       = Column(Integer, ForeignKey(User.id, ondelete='CASCADE'),
+                           nullable=False)
    user          = relationship(User)
    parent_id     = Column(Integer, ForeignKey('nodes.id', ondelete='CASCADE'))
    parent        = relationship('Node', remote_side=[id])
-    name = Column(String(255))
+    name = Column(String(255), nullable=False, server_default='')
-    date  = Column(DateTime(timezone=True), default=datetime.now)
+    date = Column(DateTime(timezone=True), nullable=False,
+                  server_default=text('CURRENT_TIMESTAMP'))
-    hyperdata      = Column(JSONB, default=dict)
+    hyperdata = Column(JSONB, default=dict, nullable=False,
+                       server_default=text("'{}'::jsonb"))
    # Create a TSVECTOR column to use fulltext search feature of PostgreSQL.
    # We need to create a trigger to update this column on update and insert,

--- a/gargantext/util/alembic.py
+++ b/gargantext/util/alembic.py
@@ -7,6 +7,8 @@ the name of "ReplaceableObject" class.
 This recipe is directly borrowed from Alembic documentation, see
 http://alembic.zzzcomputing.com/en/latest/cookbook.html#replaceable-objects
+**2017-10-09** ReversibleOp.define has been added to reduce boilerplate code.
 """
 from alembic.operations import Operations, MigrateOperation
@@ -58,44 +60,34 @@ class ReversibleOp(MigrateOperation):
        operations.invoke(drop_old)
        operations.invoke(create_new)
+    @classmethod
+    def define(cls, name, cname=None, register=Operations.register_operation):
+        def create(self):
+            return CreateOp(self.target)
-@Operations.register_operation("create_view", "invoke_for_target")
+        def drop(self):
-@Operations.register_operation("replace_view", "replace")
+            return DropOp(self.target)
-class CreateViewOp(ReversibleOp):
-    def reverse(self):
-        return DropViewOp(self.target)
-@Operations.register_operation("drop_view", "invoke_for_target")
-class DropViewOp(ReversibleOp):
-    def reverse(self):
-        return CreateViewOp(self.view)
-@Operations.register_operation("create_sp", "invoke_for_target")
+        name = name.lower()
-@Operations.register_operation("replace_sp", "replace")
+        cname = cname or name.capitalize()
-class CreateSPOp(ReversibleOp):
-    def reverse(self):
-        return DropSPOp(self.target)
+        CreateOp = type('Create%sOp' % cname, (ReversibleOp,), {'reverse': drop})
+        DropOp = type('Drop%sOp' % cname, (ReversibleOp,), {'reverse': create})
-@Operations.register_operation("drop_sp", "invoke_for_target")
+        CreateOp = register('create_' + name, 'invoke_for_target')(CreateOp)
-class DropSPOp(ReversibleOp):
+        CreateOp = register('replace_' + name, 'replace')(CreateOp)
-    def reverse(self):
-        return CreateSPOp(self.target)
+        DropOp = register('drop_' + name, 'invoke_for_target')(DropOp)
-@Operations.register_operation("create_trigger", "invoke_for_target")
+        return (CreateOp, DropOp)
-@Operations.register_operation("replace_trigger", "replace")
-class CreateTriggerOp(ReversibleOp):
-    def reverse(self):
-        return DropTriggerOp(self.target)
-@Operations.register_operation("drop_trigger", "invoke_for_target")
+CreateViewOp,    DropViewOp    = ReversibleOp.define('view')
-class DropTriggerOp(ReversibleOp):
+CreateRoleOp,    DropRoleOp    = ReversibleOp.define('role')
-    def reverse(self):
+CreateSchemaOp,  DropSchemaOp  = ReversibleOp.define('schema')
-        return CreateTriggerOp(self.target)
+CreateSPOp,      DropSPOp      = ReversibleOp.define('sp', 'SP')
+CreateTriggerOp, DropTriggerOp = ReversibleOp.define('trigger')
+CreatePolicyOp,  DropPolicyOp  = ReversibleOp.define('policy')
 @Operations.implementation_for(CreateViewOp)
@@ -111,6 +103,32 @@ def drop_view(operations, operation):
    operations.execute("DROP VIEW %s" % operation.target.name)
+@Operations.implementation_for(CreateRoleOp)
+def create_role(operations, operation):
+    args = operation.target.args
+    operations.execute(
+        "CREATE ROLE %s WITH %s" % (
+            operation.target.name,
+            args[0] if len(args) else 'NOLOGIN'
+        )
+    )
+@Operations.implementation_for(DropRoleOp)
+def drop_role(operations, operation):
+    operations.execute("DROP ROLE %s" % operation.target.name)
+@Operations.implementation_for(CreateSchemaOp)
+def create_schema(operations, operation):
+    operations.execute("CREATE SCHEMA %s" % operation.target.name)
+@Operations.implementation_for(DropSchemaOp)
+def drop_schema(operations, operation):
+    operations.execute("DROP SCHEMA %s" % operation.target.name)
 @Operations.implementation_for(CreateSPOp)
 def create_sp(operations, operation):
    operations.execute(
@@ -143,3 +161,24 @@ def drop_trigger(operations, operation):
            operation.target.args[1]
        )
    )
+@Operations.implementation_for(CreatePolicyOp)
+def create_policy(operations, operation):
+    operations.execute(
+        "CREATE POLICY %s ON %s %s" % (
+            operation.target.name,
+            operation.target.args[0],
+            operation.target.args[1],
+        )
+    )
+@Operations.implementation_for(DropPolicyOp)
+def drop_policy(operations, operation):
+    operations.execute(
+        "DROP POLICY %s ON %s" % (
+            operation.target.name,
+            operation.target.args[0],
+        )
+    )
--- a/gargantext/util/parsers/CSV.py
+++ b/gargantext/util/parsers/CSV.py
 from ._Parser import Parser
-# from ..NgramsExtractors import *
+import pandas
-import sys
+import io
-import csv
-csv.field_size_limit(sys.maxsize)
-import numpy as np
 class CSVParser(Parser):
-    DELIMITERS = ", \t;|:"
+    ENCODING = "utf-8"
-    def detect_delimiter(self, lines, sample_size=10):
+    def open(self, file):
-        sample = lines[:sample_size]
+        f = super(CSVParser, self).open(file)
-        # Compute frequency of each delimiter on each input line
+        if isinstance(file, str) and file.endswith('.zip'):
-        delimiters_freqs = {
+            return f
-            d: [line.count(d) for line in sample]
-            for d in self.DELIMITERS
-        }
-        # Select delimiters with a standard deviation of zero, ie. delimiters
+        return io.TextIOWrapper(f, encoding=self.ENCODING)
-        # for which we have the same number of fields on each line
-        selected_delimiters = [
-            (d, np.sum(freqs))
-            for d, freqs in delimiters_freqs.items()
-            if any(freqs) and np.std(freqs) == 0
-        ]
-        if selected_delimiters:
+    def parse(self, fp=None):
-            # Choose the delimiter with highest frequency amongst selected ones
+        fp = fp or self._file
-            sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1])
+        df = pandas.read_csv(fp, dtype=object, skip_blank_lines=True)
-            return sorted_delimiters[-1][0]
-    def parse(self, filebuf):
-        print("CSV: parsing (assuming UTF-8 and LF line endings)")
-        contents = filebuf.read().decode("UTF-8").split("\n")
-        # Filter out empty lines
-        contents = [line for line in contents if line.strip()]
-        # Delimiter auto-detection
-        delimiter = self.detect_delimiter(contents, sample_size=10)
-        if delimiter is None:
-            raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")
-        print("CSV: selected delimiter: %r" % delimiter)
-        # Parse CSV
-        reader = csv.reader(contents, delimiter=delimiter)
-        # Get first not empty row and its fields (ie. header row), or (0, [])
-        first_row, headers = \
-            next(((i, fields) for i, fields in enumerate(reader) if any(fields)),
-                 (0, []))
-        # Get first not empty column of the first row, or 0
-        first_col = next((i for i, field in enumerate(headers) if field), 0)
-        # Strip out potential empty fields in headers
-        headers = headers[first_col:]
        # Return a generator of dictionaries with column labels as keys,
        # filtering out empty rows
-        for i, fields in enumerate(reader):
+        for i, fields in enumerate(df.itertuples(index=False)):
            if i % 500 == 0:
                print("CSV: parsing row #%s..." % (i+1))
-            if any(fields):
-                yield dict(zip(headers, fields[first_col:]))
+            # See https://docs.python.org/3/library/collections.html#collections.somenamedtuple._asdict
+            yield fields._asdict()
--- a/gargantext/util/parsers/_Parser.py
+++ b/gargantext/util/parsers/_Parser.py
@@ -14,15 +14,14 @@ class Parser:
    """
    def __init__(self, file):
-        if isinstance(file, str):
+        self._file = self.open(file)
-            self._file = open(file, 'rb')
-        else:
-            self._file = file
    def __del__(self):
        if hasattr(self, '_file'):
            self._file.close()
+    def open(self, file):
+        return open(file, 'rb') if isinstance(file, str) else file
    def detect_encoding(self, string):
        """Useful method to detect the encoding of a document.
@@ -165,11 +164,10 @@ class Parser:
            file = self._file
        # if the file is a ZIP archive, recurse on each of its files...
        if zipfile.is_zipfile(file):
-            zipArchive = zipfile.ZipFile(file)
+            with zipfile.ZipFile(file) as zf:
-            for filename in zipArchive.namelist():
+                for filename in zf.namelist():
-                f = zipArchive.open(filename, 'r')
+                    with zf.open(filename) as df, self.open(df) as f:
                        yield from self.__iter__(f)
-                f.close()
        # ...otherwise, let's parse it directly!
        else:
            try:

--- a/gargantext/util/show_nodes.py
+++ b/gargantext/util/show_nodes.py
@@ -61,12 +61,14 @@ def nodes(parent=None, group_by='typename', order_by='typename', has_child='chec
 def node_show(node, prefix='', maxlen=60):
    if node.children > 0 or node.cnt == 1:
+        node_id = '<{}> '.format(node.id)
        name = node.name[:maxlen] + '..' if len(node.name) > maxlen else node.name
-        label = Fore.CYAN + name + Fore.RESET
+        label = node_id + Fore.CYAN + name + Fore.RESET
    else:
        label = Fore.MAGENTA + str(node.cnt) + Fore.RESET
-    print(prefix, '%s%s %s' % (Fore.GREEN, node.typename, label), sep='')
+    typename = Fore.GREEN + node.typename + Fore.RESET
+    print(prefix, '%s %s' % (typename, label), sep='')
 def tree_show(node, pos=FIRST|LAST, level=0, prefix='', maxlen=60, compact=True):