[FEAT] Add Node.title_abstract to search in doc title and abstract

cc7461f0 · sim · 1f5457df · cc7461f0 · cc7461f0 · cc7461f0
Commit cc7461f0 authored Sep 14, 2017 by sim
4 changed files
--- a/alembic/versions/1fb4405b59e1_add_english_fulltext_index_on_nodes_.py
+++ b/alembic/versions/1fb4405b59e1_add_english_fulltext_index_on_nodes_.py
+"""Add english fulltext index on Nodes.hyperdata for abstract and title
+Revision ID: 1fb4405b59e1
+Revises: bedce47c9e34
+Create Date: 2017-09-13 16:31:36.926692
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy_utils.types import TSVectorType
+from gargantext.util.alembic import ReplaceableObject
+# revision identifiers, used by Alembic.
+revision = '1fb4405b59e1'
+# Parent revision (the "Revises" entry of the module docstring).
+down_revision = 'bedce47c9e34'
+branch_labels = None
+depends_on = None
+# Trigger function that recomputes nodes.title_abstract from the 'title' and
+# 'abstract' keys of hyperdata, using the 'english' text search configuration.
+#
+# coalesce() is required: `->>` yields NULL for a missing key and `||`
+# propagates NULL, so without it a node having a title but no abstract (or
+# the reverse) would get a NULL vector and could never match a search.
+title_abstract_update_trigger = ReplaceableObject(
+    'title_abstract_update_trigger()',
+    """
+RETURNS trigger AS $$
+begin
+  new.title_abstract := to_tsvector('english', coalesce(new.hyperdata ->> 'title', '') || ' ' || coalesce(new.hyperdata ->> 'abstract', ''));
+  return new;
+end
+$$ LANGUAGE plpgsql;
+    """
+)
+# Row-level trigger on the nodes table that runs
+# title_abstract_update_trigger() before each INSERT or UPDATE.
+title_abstract_update = ReplaceableObject(
+    'title_abstract_update',
+    'BEFORE INSERT OR UPDATE',
+    'nodes',
+    'FOR EACH ROW EXECUTE PROCEDURE title_abstract_update_trigger()'
+)
+def upgrade():
+    """Add nodes.title_abstract and keep it populated through a trigger."""
+    op.add_column('nodes', sa.Column('title_abstract', TSVectorType))
+    # The function must exist before the trigger that executes it.
+    op.create_sp(title_abstract_update_trigger)
+    op.create_trigger(title_abstract_update)
+    # Initialize index with already existing data: this no-op UPDATE fires the
+    # row-level trigger on every existing node, which fills title_abstract.
+    op.execute('UPDATE nodes SET hyperdata = hyperdata')
+def downgrade():
+    """Undo upgrade(): drop the trigger, its function, then the column."""
+    # Reverse order of creation: the trigger goes first, then the function it
+    # executes.
+    op.drop_trigger(title_abstract_update)
+    op.drop_sp(title_abstract_update_trigger)
+    op.drop_column('nodes', 'title_abstract')
--- a/gargantext/models/base.py
+++ b/gargantext/models/base.py
@@ -2,6 +2,7 @@ from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
 from sqlalchemy.orm import relationship, validates
 from sqlalchemy.types import TypeDecorator, \
                             Integer, Float, Boolean, DateTime, String, Text
+from sqlalchemy_utils.types import TSVectorType
 from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
 from sqlalchemy.ext.mutable import MutableDict, MutableList
 from sqlalchemy.ext.declarative import declarative_base
@@ -9,6 +10,7 @@ from sqlalchemy.ext.declarative import declarative_base
 __all__ = ["Column", "ForeignKey", "UniqueConstraint", "Index", "relationship",
           "validates", "ValidatorMixin",
           "Integer", "Float", "Boolean", "DateTime", "String", "Text",
+           "TSVectorType",
           "TypeDecorator",
           "JSONB", "Double",
           "MutableDict", "MutableList",

--- a/gargantext/models/nodes.py
+++ b/gargantext/models/nodes.py
@@ -2,13 +2,10 @@ from gargantext.util.db import session
 from gargantext.util.files import upload
 from gargantext.constants import *
-# Uncomment to make column full text searchable
-#from sqlalchemy_utils.types import TSVectorType
 from datetime import datetime
 from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
-                  Integer, Float, String, DateTime, JSONB, \
+                  Integer, Float, String, DateTime, JSONB, TSVectorType, \
                  MutableList, MutableDict, validates, ValidatorMixin
 from .users import User
@@ -60,9 +57,6 @@ class Node(ValidatorMixin, Base):
            Index('nodes_user_id_typename_parent_id_idx', 'user_id', 'typename', 'parent_id'),
            Index('nodes_hyperdata_idx', 'hyperdata', postgresql_using='gin'))
-    # TODO
-    # create INDEX full_text_idx on nodes using gin(to_tsvector('english', hyperdata ->> 'abstract' || 'title'));
    id = Column(Integer, primary_key=True)
    typename = Column(NodeType, index=True)
@@ -78,10 +72,15 @@ class Node(ValidatorMixin, Base):
    name = Column(String(255))
    date  = Column(DateTime(timezone=True), default=datetime.now)
-    hyperdata     = Column(JSONB, default=dict)
+    hyperdata      = Column(JSONB, default=dict)
-    # metadata (see https://bashelton.com/2014/03/updating-postgresql-json-fields-via-sqlalchemy/)
-    # To make search possible uncomment the line below
+    # Create a TSVECTOR column to use fulltext search feature of PostgreSQL.
-    #search_vector = Column(TSVectorType('hyperdata'))
+    # A trigger keeps this column up to date on insert and update; it is
+    # created in alembic/versions/1fb4405b59e1_add_english_fulltext_index_on_nodes_.py
+    #
+    # To use this column: session.query(DocumentNode) \
+    #                            .filter(Node.title_abstract.match('keyword'))
+    title_abstract = Column(TSVectorType(regconfig='english'))
    def __new__(cls, *args, **kwargs):
        if cls is Node and kwargs.get('typename'):

--- a/install/gargamelle/requirements.txt
+++ b/install/gargamelle/requirements.txt
@@ -35,4 +35,6 @@ requests-futures==0.9.7
 bs4==0.0.1
 requests==2.10.0
 alembic>=0.9.2
-# SQLAlchemy-Searchable==0.10.4
+SQLAlchemy==1.1.14
+SQLAlchemy-Searchable==0.10.4
+SQLAlchemy-Utils==0.32.16