Commit 9314a5fd authored by sim

Merge branch 'testing' into simon-testing

parents 6f3b91d3 6d567904
"""Add english fulltext index on Nodes.hyperdata for abstract and title
Revision ID: 1fb4405b59e1
Revises: bedce47c9e34
Create Date: 2017-09-13 16:31:36.926692
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy_utils.types import TSVectorType
from gargantext.util.alembic import ReplaceableObject
# revision identifiers, used by Alembic.
revision = '1fb4405b59e1'
down_revision = 'bedce47c9e34'
branch_labels = None
depends_on = None
# Replaceable migration object for the trigger *function*: recomputes the
# `title_abstract` tsvector from the JSONB hyperdata 'title' and 'abstract'
# keys whenever a row changes.
# NOTE(review): `(title || ' ' || abstract)` is SQL NULL if either JSON key
# is absent, leaving title_abstract NULL for such rows — confirm intended.
title_abstract_update_trigger = ReplaceableObject(
    'title_abstract_update_trigger()',
    """
RETURNS trigger AS $$
begin
new.title_abstract := to_tsvector('english', (new.hyperdata ->> 'title') || ' ' || (new.hyperdata ->> 'abstract'));
return new;
end
$$ LANGUAGE plpgsql;
"""
)
# Replaceable migration object for the trigger itself: attaches
# title_abstract_update_trigger() to the `nodes` table, firing row-by-row
# before every INSERT or UPDATE.  Args are (timing, table, action), matching
# what the create_trigger/drop_trigger implementations expect.
title_abstract_update = ReplaceableObject(
    'title_abstract_update',
    'BEFORE INSERT OR UPDATE',
    'nodes',
    'FOR EACH ROW EXECUTE PROCEDURE title_abstract_update_trigger()'
)
def upgrade():
    """Add the ``title_abstract`` TSVECTOR column to ``nodes``, install the
    trigger that keeps it up to date, and backfill existing rows."""
    op.add_column('nodes', sa.Column('title_abstract', TSVectorType))
    op.create_sp(title_abstract_update_trigger)
    op.create_trigger(title_abstract_update)
    # No-op UPDATE: fires the freshly created BEFORE UPDATE trigger on every
    # row, which populates title_abstract for already existing data.
    # (Stray C-style trailing semicolon removed.)
    op.execute('UPDATE nodes SET hyperdata = hyperdata')
def downgrade():
    """Reverse :func:`upgrade`.

    Order matters: the trigger must be dropped before the function it
    executes, and the column last.
    """
    op.drop_trigger(title_abstract_update)
    op.drop_sp(title_abstract_update_trigger)
    op.drop_column('nodes', 'title_abstract')
......@@ -98,8 +98,8 @@
*/
http.factory('MainApiAddNgramHttpService', function($resource) {
return $resource(
// adding explicit "http://" b/c this a cross origin request
'http://' + window.GARG_ROOT_URL
// adding explicit "https://" b/c this a cross origin request
'https://' + window.GARG_ROOT_URL
+ "/api/ngrams?text=:ngramStr&corpus=:corpusId&testgroup",
{
ngramStr: '@ngramStr',
......@@ -131,8 +131,8 @@
http.factory('MainApiChangeNgramHttpService', function($resource) {
return $resource(
// adding explicit "http://" b/c this a cross origin request
'http://' + window.GARG_ROOT_URL
// adding explicit "https://" b/c this a cross origin request
'https://' + window.GARG_ROOT_URL
+ "/api/ngramlists/change?list=:listId&ngrams=:ngramIdList",
{
listId: '@listId',
......@@ -171,8 +171,8 @@
*/
http.factory('MainApiFavoritesHttpService', function($resource) {
return $resource(
// adding explicit "http://" b/c this a cross origin request
'http://' + window.GARG_ROOT_URL + "/api/nodes/:corpusId/favorites?docs=:docId",
// adding explicit "https://" b/c this a cross origin request
'https://' + window.GARG_ROOT_URL + "/api/nodes/:corpusId/favorites?docs=:docId",
{
corpusId: '@corpusId',
docId: '@docId'
......
......@@ -2,13 +2,15 @@ from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
from sqlalchemy.orm import relationship, validates
from sqlalchemy.types import TypeDecorator, \
Integer, Float, Boolean, DateTime, String, Text
from sqlalchemy_utils.types import TSVectorType
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
from sqlalchemy.ext.mutable import MutableDict, MutableList
from sqlalchemy.ext.declarative import declarative_base
__all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship",
__all__ = ["Column", "ForeignKey", "UniqueConstraint", "Index", "relationship",
"validates", "ValidatorMixin",
"Integer", "Float", "Boolean", "DateTime", "String", "Text",
"TSVectorType",
"TypeDecorator",
"JSONB", "Double",
"MutableDict", "MutableList",
......
......@@ -2,13 +2,10 @@ from gargantext.util.db import session
from gargantext.util.files import upload
from gargantext.constants import *
# Uncomment to make column full text searchable
#from sqlalchemy_utils.types import TSVectorType
from datetime import datetime
from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
Integer, Float, String, DateTime, JSONB, \
Integer, Float, String, DateTime, JSONB, TSVectorType, \
MutableList, MutableDict, validates, ValidatorMixin
from .users import User
......@@ -60,9 +57,6 @@ class Node(ValidatorMixin, Base):
Index('nodes_user_id_typename_parent_id_idx', 'user_id', 'typename', 'parent_id'),
Index('nodes_hyperdata_idx', 'hyperdata', postgresql_using='gin'))
# TODO
# create INDEX full_text_idx on nodes using gin(to_tsvector('english', hyperdata ->> 'abstract' || 'title'));
id = Column(Integer, primary_key=True)
typename = Column(NodeType, index=True)
......@@ -78,10 +72,15 @@ class Node(ValidatorMixin, Base):
name = Column(String(255))
date = Column(DateTime(timezone=True), default=datetime.now)
hyperdata = Column(JSONB, default=dict)
# metadata (see https://bashelton.com/2014/03/updating-postgresql-json-fields-via-sqlalchemy/)
# To make search possible uncomment the line below
#search_vector = Column(TSVectorType('hyperdata'))
hyperdata = Column(JSONB, default=dict)
# Create a TSVECTOR column to use fulltext search feature of PostgreSQL.
# We need to create a trigger to update this column on update and insert,
# it's created in alembic/version/1fb4405b59e1_add_english_fulltext_index_on_nodes_.py
#
# To use this column: session.query(DocumentNode) \
# .filter(Node.title_abstract.match('keyword'))
title_abstract = Column(TSVectorType(regconfig='english'))
def __new__(cls, *args, **kwargs):
if cls is Node and kwargs.get('typename'):
......
......@@ -16,9 +16,9 @@ __all__ = ['ReplaceableObject']
class ReplaceableObject(object):
def __init__(self, name, sqltext):
def __init__(self, name, *args):
self.name = name
self.sqltext = sqltext
self.args = args
class ReversibleOp(MigrateOperation):
......@@ -85,11 +85,24 @@ class DropSPOp(ReversibleOp):
return CreateSPOp(self.target)
@Operations.register_operation("create_trigger", "invoke_for_target")
@Operations.register_operation("replace_trigger", "replace")
class CreateTriggerOp(ReversibleOp):
    """Reversible CREATE TRIGGER migration operation, exposed on Alembic's
    Operations namespace as ``op.create_trigger`` / ``op.replace_trigger``."""

    def reverse(self):
        """Inverse operation: drop the same trigger target."""
        return DropTriggerOp(self.target)
@Operations.register_operation("drop_trigger", "invoke_for_target")
class DropTriggerOp(ReversibleOp):
    """Reversible DROP TRIGGER migration operation, exposed on Alembic's
    Operations namespace as ``op.drop_trigger``."""

    def reverse(self):
        """Inverse operation: recreate the same trigger target."""
        return CreateTriggerOp(self.target)
@Operations.implementation_for(CreateViewOp)
def create_view(operations, operation):
operations.execute("CREATE VIEW %s AS %s" % (
operation.target.name,
operation.target.sqltext
operation.target.args[0]
))
......@@ -102,7 +115,7 @@ def drop_view(operations, operation):
def create_sp(operations, operation):
operations.execute(
"CREATE FUNCTION %s %s" % (
operation.target.name, operation.target.sqltext
operation.target.name, operation.target.args[0]
)
)
......@@ -110,3 +123,23 @@ def create_sp(operations, operation):
@Operations.implementation_for(DropSPOp)
def drop_sp(operations, operation):
operations.execute("DROP FUNCTION %s" % operation.target.name)
@Operations.implementation_for(CreateTriggerOp)
def create_trigger(operations, operation):
    """Emit CREATE TRIGGER for a trigger ReplaceableObject.

    The target's args are expected to be (timing, table, action), e.g.
    ('BEFORE INSERT OR UPDATE', 'nodes', 'FOR EACH ROW EXECUTE ...').
    """
    target = operation.target
    timing, table, action = target.args[0], target.args[1], target.args[2]
    operations.execute(
        "CREATE TRIGGER %s %s ON %s %s" % (target.name, timing, table, action)
    )
@Operations.implementation_for(DropTriggerOp)
def drop_trigger(operations, operation):
    """Emit DROP TRIGGER for a trigger ReplaceableObject.

    args[1] holds the table name the trigger is attached to (args[0] is the
    timing clause, unused here).
    """
    target = operation.target
    statement = "DROP TRIGGER %s ON %s" % (target.name, target.args[1])
    operations.execute(statement)
......@@ -14,12 +14,12 @@ from gargantext.util.files import save
class HalCrawler(Crawler):
''' HAL API CLIENT'''
def __init__(self):
# Main EndPoints
self.BASE_URL = "https://api.archives-ouvertes.fr"
self.API_URL = "search"
# Final EndPoints
# TODO : Change endpoint according type of database
self.URL = self.BASE_URL + "/" + self.API_URL
......@@ -38,7 +38,9 @@ class HalCrawler(Crawler):
def _get(self, query, fromPage=1, count=10, lang=None):
# Parameters
fl = """ en_title_s
fl = """ docid
, title_s
, abstract_s
, en_title_s
, en_abstract_s
, submittedDate_s
......@@ -59,7 +61,7 @@ class HalCrawler(Crawler):
"""
#, authUrl_s
#, type_s
wt = "json"
querystring = { "q" : query
......@@ -68,18 +70,18 @@ class HalCrawler(Crawler):
, "fl" : fl
, "wt" : wt
}
# Specify Headers
headers = { "cache-control" : "no-cache" }
# Do Request and get response
response = requests.request( "GET"
, self.URL
, headers = headers
, params = querystring
)
#print(querystring)
# Validation : 200 if ok else raise Value
if response.status_code == 200:
......@@ -90,27 +92,27 @@ class HalCrawler(Crawler):
return (json.loads(response.content.decode(charset)))
else:
raise ValueError(response.status_code, response.reason)
def scan_results(self, query):
    """Return the number of HAL results for *query* (Query String -> Int).

    The count is also cached on ``self.results_nb``.
    """
    self.results_nb = 0
    response = self._get(query).get("response", {})
    self.results_nb = response.get("numFound", 0)
    return self.results_nb
def download(self, query):
downloaded = False
self.status.append("fetching results")
corpus = []
......@@ -124,7 +126,7 @@ class HalCrawler(Crawler):
)
print("ERROR (scrap: HAL d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX
#for page in range(1, trunc(self.query_max / 100) + 2):
for page in range(0, self.query_max, paging):
print("Downloading page %s to %s results" % (page, paging))
......@@ -141,5 +143,5 @@ class HalCrawler(Crawler):
, basedir=UPLOAD_DIRECTORY
)
downloaded = True
return downloaded
......@@ -8,7 +8,7 @@ Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
"""
from gargantext.util.group_tools import query_groups, group_union
from gargantext.util.db import session, bulk_insert_ifnotexists
from gargantext.util.db import session, bulk_insert_ifnotexists, desc
from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, \
NodeNgramNgram, Node
......
......@@ -12,12 +12,12 @@ import json
class HalParser(Parser):
def _parse(self, json_docs):
hyperdata_list = []
hyperdata_path = { "id" : "isbn_s"
, "title" : "en_title_s"
, "abstract" : "en_abstract_s"
hyperdata_path = { "id" : "docid"
, "title" : ["en_title_s", "title_s"]
, "abstract" : ["en_abstract_s", "abstract_s"]
, "source" : "journalTitle_s"
, "url" : "uri_s"
, "authors" : "authFullName_s"
......@@ -29,8 +29,8 @@ class HalParser(Parser):
, "instStructId_i" : "instStructId_i"
, "deptStructId_i" : "deptStructId_i"
, "labStructId_i" : "labStructId_i"
, "rteamStructId_i" : "rteamStructId_i"
, "docType_s" : "docType_s"
, "rteamStructId_i" : "rteamStructId_i"
, "docType_s" : "docType_s"
}
uris = set()
......@@ -38,29 +38,32 @@ class HalParser(Parser):
for doc in json_docs:
hyperdata = {}
for key, path in hyperdata_path.items():
field = doc.get(path, "NOT FOUND")
if isinstance(field, list):
hyperdata[key] = ", ".join(map(lambda x: str(x), field))
else:
hyperdata[key] = str(field)
# A path can be a field name or a sequence of field names
if isinstance(path, (list, tuple)):
# Get first non-empty value of fields in path sequence, or None
field = next((x for x in (doc.get(p) for p in path) if x), None)
else:
# Get field value
field = doc.get(path)
if field is None:
field = "NOT FOUND"
if isinstance(field, list):
hyperdata[key] = ", ".join(map(str, field))
else:
hyperdata[key] = str(field)
if hyperdata["url"] in uris:
print("Document already parsed")
else:
uris.add(hyperdata["url"])
# hyperdata["authors"] = ", ".join(
# [ p.get("person", {})
# .get("name" , "")
#
# for p in doc.get("hasauthor", [])
# ]
# )
#
maybeDate = doc.get("submittedDate_s", None)
maybeDate = doc.get("submittedDate_s", None)
if maybeDate is not None:
date = datetime.strptime(maybeDate, "%Y-%m-%d %H:%M:%S")
else:
......@@ -70,9 +73,9 @@ class HalParser(Parser):
hyperdata["publication_year"] = str(date.year)
hyperdata["publication_month"] = str(date.month)
hyperdata["publication_day"] = str(date.day)
hyperdata_list.append(hyperdata)
return hyperdata_list
def parse(self, filebuf):
......
......@@ -35,4 +35,6 @@ requests-futures==0.9.7
bs4==0.0.1
requests==2.10.0
alembic>=0.9.2
# SQLAlchemy-Searchable==0.10.4
SQLAlchemy==1.1.14
SQLAlchemy-Searchable==0.10.4
SQLAlchemy-Utils==0.32.16
......@@ -15,12 +15,16 @@ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django.setup()
from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
from gargantext.models import ProjectNode, DocumentNode, UserNode, User
from gargantext.models import Node, ProjectNode, DocumentNode
from gargantext.util.db import session, get_engine
from collections import Counter
import importlib
from django.http import Http404
# Import those to be available by notebook user
from langdetect import detect as detect_lang
from gargantext.models import UserNode, User
class NotebookError(Exception):
pass
......@@ -49,16 +53,19 @@ def scan_hal(request):
return hal.scan_results(request)
def scan_gargantext(corpus_id, lang, request):
    """Count documents of corpus *corpus_id* whose hyperdata abstract/title
    matches the fulltext *request* in language *lang*; returns an int.
    """
    # TODO add some sugar the request (ideally request should be the same for hal and garg)
    # SECURITY NOTE(review): lang/request/corpus_id are interpolated directly
    # into the SQL text — SQL injection risk if any is user-controlled;
    # switch to bound parameters.
    query = """select count(n.id) from nodes n
    where to_tsvector('%s', hyperdata ->> 'abstract' || 'title')
    @@ to_tsquery('%s')
    AND n.parent_id = %s;""" % (lang, request, corpus_id)
    connection = get_engine().connect()
    try:
        return [i for i in connection.execute(query)][0][0]
    finally:
        # BUG FIX: connection.close() was placed after the return statement
        # and therefore never executed, leaking one connection per call.
        connection.close()
def scan_gargantext(corpus_id, request):
    """Count documents of corpus *corpus_id* whose ``title_abstract``
    tsvector matches the fulltext *request* (ORM version; the language is
    fixed by the column's regconfig, so no ``lang`` parameter)."""
    return (session.query(DocumentNode)
            .filter_by(parent_id=corpus_id)
            .filter(Node.title_abstract.match(request))
            .count())
def scan_gargantext_and_delete(corpus_id, request):
    """Delete documents of corpus *corpus_id* whose ``title_abstract``
    matches the fulltext *request*; returns the number of rows deleted.

    ``synchronize_session='fetch'`` makes the bulk DELETE also expire the
    matching objects from the in-memory session.
    """
    return (session.query(DocumentNode)
            .filter_by(parent_id=corpus_id)
            .filter(Node.title_abstract.match(request))
            .delete(synchronize_session='fetch')
            )
def myProject_fromUrl(url):
"""
......
This diff is collapsed.
......@@ -203,6 +203,7 @@
// do something…
resetStatusForm("#createForm");
})
return false;
})
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment