diff --git a/alembic/versions/1fb4405b59e1_add_english_fulltext_index_on_nodes_.py b/alembic/versions/1fb4405b59e1_add_english_fulltext_index_on_nodes_.py new file mode 100644 index 0000000000000000000000000000000000000000..fa9318e624c03a9649a3b5624c5c812a79570058 --- /dev/null +++ b/alembic/versions/1fb4405b59e1_add_english_fulltext_index_on_nodes_.py @@ -0,0 +1,54 @@ +"""Add english fulltext index on Nodes.hyperdata for abstract and title + +Revision ID: 1fb4405b59e1 +Revises: bedce47c9e34 +Create Date: 2017-09-13 16:31:36.926692 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy_utils.types import TSVectorType +from gargantext.util.alembic import ReplaceableObject + + +# revision identifiers, used by Alembic. +revision = '1fb4405b59e1' +down_revision = 'bedce47c9e34' +branch_labels = None +depends_on = None + + +title_abstract_update_trigger = ReplaceableObject( + 'title_abstract_update_trigger()', + """ +RETURNS trigger AS $$ +begin + new.title_abstract := to_tsvector('english', (new.hyperdata ->> 'title') || ' ' || (new.hyperdata ->> 'abstract')); + return new; +end +$$ LANGUAGE plpgsql; + """ +) + + +title_abstract_update = ReplaceableObject( + 'title_abstract_update', + 'BEFORE INSERT OR UPDATE', + 'nodes', + 'FOR EACH ROW EXECUTE PROCEDURE title_abstract_update_trigger()' +) + + +def upgrade(): + op.add_column('nodes', sa.Column('title_abstract', TSVectorType)) + op.create_sp(title_abstract_update_trigger) + op.create_trigger(title_abstract_update) + + # Initialize index with already existing data + op.execute('UPDATE nodes SET hyperdata = hyperdata'); + + +def downgrade(): + op.drop_trigger(title_abstract_update) + op.drop_sp(title_abstract_update_trigger) + op.drop_column('nodes', 'title_abstract') diff --git a/annotations/static/annotations/http.js b/annotations/static/annotations/http.js index 5146abd8ee8c88794f653af9a83a213dea6d5a41..d631ca275906ed98d035e92366a10017ed9e8f2a 100644 --- a/annotations/static/annotations/http.js +++ b/annotations/static/annotations/http.js @@ -98,8 +98,8 @@ */ http.factory('MainApiAddNgramHttpService', function($resource) { return $resource( - // adding explicit "http://" b/c this a cross origin request - 'http://' + window.GARG_ROOT_URL + // adding explicit "https://" b/c this a cross origin request + 'https://' + window.GARG_ROOT_URL + "/api/ngrams?text=:ngramStr&corpus=:corpusId&testgroup", { ngramStr: '@ngramStr', @@ -131,8 +131,8 @@ http.factory('MainApiChangeNgramHttpService', function($resource) { return $resource( - // adding explicit "http://" b/c this a cross origin request - 'http://' + window.GARG_ROOT_URL + // adding explicit "https://" b/c this a cross origin request + 'https://' + window.GARG_ROOT_URL + "/api/ngramlists/change?list=:listId&ngrams=:ngramIdList", { listId: '@listId', @@ -171,8 +171,8 @@ */ http.factory('MainApiFavoritesHttpService', function($resource) { return $resource( - // adding explicit "http://" b/c this a cross origin request - 'http://' + window.GARG_ROOT_URL + "/api/nodes/:corpusId/favorites?docs=:docId", + // adding explicit "https://" b/c this a cross origin request + 'https://' + window.GARG_ROOT_URL + "/api/nodes/:corpusId/favorites?docs=:docId", { corpusId: '@corpusId', docId: '@docId' diff --git a/gargantext/models/base.py b/gargantext/models/base.py index 98e2cec827da09c10d80598c32f2c8b24733b30d..dcb9be7b17f345decae47b9adf3138bd8ae45fa1 100644 --- a/gargantext/models/base.py +++ b/gargantext/models/base.py @@ -2,13 +2,15 @@ from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index from sqlalchemy.orm import relationship, validates from sqlalchemy.types import TypeDecorator, \ Integer, Float, Boolean, DateTime, String, Text +from sqlalchemy_utils.types import TSVectorType from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double from sqlalchemy.ext.mutable import MutableDict, MutableList from sqlalchemy.ext.declarative import declarative_base -__all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship", +__all__ = ["Column", "ForeignKey", "UniqueConstraint", "Index", "relationship", "validates", "ValidatorMixin", "Integer", "Float", "Boolean", "DateTime", "String", "Text", + "TSVectorType", "TypeDecorator", "JSONB", "Double", "MutableDict", "MutableList", diff --git a/gargantext/models/nodes.py b/gargantext/models/nodes.py index c42ece26c45913bf9d0d6ab4bd4017214e926e4c..fed8bf3d561e6a9a1158049886c585953dd65c9b 100644 --- a/gargantext/models/nodes.py +++ b/gargantext/models/nodes.py @@ -2,13 +2,10 @@ from gargantext.util.db import session from gargantext.util.files import upload from gargantext.constants import * -# Uncomment to make column full text searchable -#from sqlalchemy_utils.types import TSVectorType - from datetime import datetime from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \ - Integer, Float, String, DateTime, JSONB, \ + Integer, Float, String, DateTime, JSONB, TSVectorType, \ MutableList, MutableDict, validates, ValidatorMixin from .users import User @@ -60,9 +57,6 @@ class Node(ValidatorMixin, Base): Index('nodes_user_id_typename_parent_id_idx', 'user_id', 'typename', 'parent_id'), Index('nodes_hyperdata_idx', 'hyperdata', postgresql_using='gin')) - # TODO - # create INDEX full_text_idx on nodes using gin(to_tsvector('english', hyperdata ->> 'abstract' || 'title')); - id = Column(Integer, primary_key=True) typename = Column(NodeType, index=True) @@ -78,10 +72,15 @@ class Node(ValidatorMixin, Base): name = Column(String(255)) date = Column(DateTime(timezone=True), default=datetime.now) - hyperdata = Column(JSONB, default=dict) - # metadata (see https://bashelton.com/2014/03/updating-postgresql-json-fields-via-sqlalchemy/) - # To make search possible uncomment the line below - #search_vector = Column(TSVectorType('hyperdata')) + hyperdata = Column(JSONB, default=dict) + + # Create a TSVECTOR column to use fulltext search feature of PostgreSQL. + # We need to create a trigger to update this column on update and insert, + # it's created in alembic/version/1fb4405b59e1_add_english_fulltext_index_on_nodes_.py + # + # To use this column: session.query(DocumentNode) \ + # .filter(Node.title_abstract.match('keyword')) + title_abstract = Column(TSVectorType(regconfig='english')) def __new__(cls, *args, **kwargs): if cls is Node and kwargs.get('typename'): diff --git a/gargantext/util/alembic.py b/gargantext/util/alembic.py index 6fce0800ebbf013eee2cbea35d486d1491952d90..f663a9d3d25cda27d2e2d7cbd368d92bd855ddec 100644 --- a/gargantext/util/alembic.py +++ b/gargantext/util/alembic.py @@ -16,9 +16,9 @@ __all__ = ['ReplaceableObject'] class ReplaceableObject(object): - def __init__(self, name, sqltext): + def __init__(self, name, *args): self.name = name - self.sqltext = sqltext + self.args = args class ReversibleOp(MigrateOperation): @@ -85,11 +85,24 @@ class DropSPOp(ReversibleOp): return CreateSPOp(self.target) +@Operations.register_operation("create_trigger", "invoke_for_target") +@Operations.register_operation("replace_trigger", "replace") +class CreateTriggerOp(ReversibleOp): + def reverse(self): + return DropTriggerOp(self.target) + + +@Operations.register_operation("drop_trigger", "invoke_for_target") +class DropTriggerOp(ReversibleOp): + def reverse(self): + return CreateTriggerOp(self.target) + + @Operations.implementation_for(CreateViewOp) def create_view(operations, operation): operations.execute("CREATE VIEW %s AS %s" % ( operation.target.name, - operation.target.sqltext + operation.target.args[0] )) @@ -102,7 +115,7 @@ def drop_view(operations, operation): def create_sp(operations, operation): operations.execute( "CREATE FUNCTION %s %s" % ( - operation.target.name, operation.target.sqltext + operation.target.name, operation.target.args[0] ) ) @@ -110,3 +123,23 @@ def create_sp(operations, operation): @Operations.implementation_for(DropSPOp) def drop_sp(operations, operation): operations.execute("DROP FUNCTION %s" % operation.target.name) + + +@Operations.implementation_for(CreateTriggerOp) +def create_trigger(operations, operation): + args = operation.target.args + operations.execute( + "CREATE TRIGGER %s %s ON %s %s" % ( + operation.target.name, args[0], args[1], args[2] + ) + ) + + +@Operations.implementation_for(DropTriggerOp) +def drop_trigger(operations, operation): + operations.execute( + "DROP TRIGGER %s ON %s" % ( + operation.target.name, + operation.target.args[1] + ) + ) diff --git a/install/gargamelle/requirements.txt b/install/gargamelle/requirements.txt index f61a9f77c91e89e62545b4f769c5f5fff83f8a3c..c66783d7284bf48b480b89e86c1ec14b5559ddc8 100644 --- a/install/gargamelle/requirements.txt +++ b/install/gargamelle/requirements.txt @@ -35,4 +35,6 @@ requests-futures==0.9.7 bs4==0.0.1 requests==2.10.0 alembic>=0.9.2 -# SQLAlchemy-Searchable==0.10.4 +SQLAlchemy==1.1.14 +SQLAlchemy-Searchable==0.10.4 +SQLAlchemy-Utils==0.32.16 diff --git a/install/notebook/gargantext_notebook.py b/install/notebook/gargantext_notebook.py index d074ace7620f9efc9bb394f06d5accdc9b234f47..84986ffe6582eeb366d309d0fa9d73cda34fbe67 100644 --- a/install/notebook/gargantext_notebook.py +++ b/install/notebook/gargantext_notebook.py @@ -15,7 +15,7 @@ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings') django.setup() from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name -from gargantext.models import ProjectNode, DocumentNode +from gargantext.models import Node, ProjectNode, DocumentNode from gargantext.util.db import session, get_engine from collections import Counter import importlib @@ -53,15 +53,11 @@ def scan_hal(request): return hal.scan_results(request) -def scan_gargantext(corpus_id, lang, request): - connection = get_engine().connect() - # TODO add some sugar the request (ideally request should be the same for hal and garg) - query = """select count(n.id) from nodes n - where to_tsvector('%s', hyperdata ->> 'abstract' || 'title') - @@ to_tsquery('%s') - AND n.parent_id = %s;""" % (lang, request, corpus_id) - return [i for i in connection.execute(query)][0][0] - connection.close() +def scan_gargantext(corpus_id, request): + return (session.query(DocumentNode) + .filter_by(parent_id=corpus_id) + .filter(Node.title_abstract.match(request)) + .count()) def myProject_fromUrl(url):