Commit 9314a5fd authored by sim

Merge branch 'testing' into simon-testing

parents 6f3b91d3 6d567904
"""Add english fulltext index on Nodes.hyperdata for abstract and title
Revision ID: 1fb4405b59e1
Revises: bedce47c9e34
Create Date: 2017-09-13 16:31:36.926692
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy_utils.types import TSVectorType
from gargantext.util.alembic import ReplaceableObject
# revision identifiers, used by Alembic.
revision = '1fb4405b59e1'
down_revision = 'bedce47c9e34'
branch_labels = None
depends_on = None
# Replaceable migration object for the trigger *function*: recomputes the
# `title_abstract` tsvector from the JSONB hyperdata 'title' and 'abstract'
# keys whenever a row changes.
# NOTE(review): `(title || ' ' || abstract)` is SQL NULL if either JSON key
# is absent, leaving title_abstract NULL for such rows — confirm intended.
title_abstract_update_trigger = ReplaceableObject(
    'title_abstract_update_trigger()',
    """
RETURNS trigger AS $$
begin
new.title_abstract := to_tsvector('english', (new.hyperdata ->> 'title') || ' ' || (new.hyperdata ->> 'abstract'));
return new;
end
$$ LANGUAGE plpgsql;
"""
)
# Replaceable migration object for the trigger itself: attaches
# title_abstract_update_trigger() to the `nodes` table, firing row-by-row
# before every INSERT or UPDATE.  Args are (timing, table, action), matching
# what the create_trigger/drop_trigger implementations expect.
title_abstract_update = ReplaceableObject(
    'title_abstract_update',
    'BEFORE INSERT OR UPDATE',
    'nodes',
    'FOR EACH ROW EXECUTE PROCEDURE title_abstract_update_trigger()'
)
def upgrade():
    """Add the ``title_abstract`` TSVECTOR column to ``nodes``, install the
    trigger that keeps it up to date, and backfill existing rows."""
    op.add_column('nodes', sa.Column('title_abstract', TSVectorType))
    op.create_sp(title_abstract_update_trigger)
    op.create_trigger(title_abstract_update)
    # No-op UPDATE: fires the freshly created BEFORE UPDATE trigger on every
    # row, which populates title_abstract for already existing data.
    # (Stray C-style trailing semicolon removed.)
    op.execute('UPDATE nodes SET hyperdata = hyperdata')
def downgrade():
    """Reverse :func:`upgrade`.

    Order matters: the trigger must be dropped before the function it
    executes, and the column last.
    """
    op.drop_trigger(title_abstract_update)
    op.drop_sp(title_abstract_update_trigger)
    op.drop_column('nodes', 'title_abstract')
......@@ -98,8 +98,8 @@
*/
http.factory('MainApiAddNgramHttpService', function($resource) {
return $resource(
// adding explicit "http://" b/c this a cross origin request
'http://' + window.GARG_ROOT_URL
// adding explicit "https://" b/c this a cross origin request
'https://' + window.GARG_ROOT_URL
+ "/api/ngrams?text=:ngramStr&corpus=:corpusId&testgroup",
{
ngramStr: '@ngramStr',
......@@ -131,8 +131,8 @@
http.factory('MainApiChangeNgramHttpService', function($resource) {
return $resource(
// adding explicit "http://" b/c this a cross origin request
'http://' + window.GARG_ROOT_URL
// adding explicit "https://" b/c this a cross origin request
'https://' + window.GARG_ROOT_URL
+ "/api/ngramlists/change?list=:listId&ngrams=:ngramIdList",
{
listId: '@listId',
......@@ -171,8 +171,8 @@
*/
http.factory('MainApiFavoritesHttpService', function($resource) {
return $resource(
// adding explicit "http://" b/c this a cross origin request
'http://' + window.GARG_ROOT_URL + "/api/nodes/:corpusId/favorites?docs=:docId",
// adding explicit "https://" b/c this a cross origin request
'https://' + window.GARG_ROOT_URL + "/api/nodes/:corpusId/favorites?docs=:docId",
{
corpusId: '@corpusId',
docId: '@docId'
......
......@@ -2,13 +2,15 @@ from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint, Index
from sqlalchemy.orm import relationship, validates
from sqlalchemy.types import TypeDecorator, \
Integer, Float, Boolean, DateTime, String, Text
from sqlalchemy_utils.types import TSVectorType
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION as Double
from sqlalchemy.ext.mutable import MutableDict, MutableList
from sqlalchemy.ext.declarative import declarative_base
__all__ = ["Column", "ForeignKey", "UniqueConstraint", "relationship",
__all__ = ["Column", "ForeignKey", "UniqueConstraint", "Index", "relationship",
"validates", "ValidatorMixin",
"Integer", "Float", "Boolean", "DateTime", "String", "Text",
"TSVectorType",
"TypeDecorator",
"JSONB", "Double",
"MutableDict", "MutableList",
......
......@@ -2,13 +2,10 @@ from gargantext.util.db import session
from gargantext.util.files import upload
from gargantext.constants import *
# Uncomment to make column full text searchable
#from sqlalchemy_utils.types import TSVectorType
from datetime import datetime
from .base import Base, Column, ForeignKey, relationship, TypeDecorator, Index, \
Integer, Float, String, DateTime, JSONB, \
Integer, Float, String, DateTime, JSONB, TSVectorType, \
MutableList, MutableDict, validates, ValidatorMixin
from .users import User
......@@ -60,9 +57,6 @@ class Node(ValidatorMixin, Base):
Index('nodes_user_id_typename_parent_id_idx', 'user_id', 'typename', 'parent_id'),
Index('nodes_hyperdata_idx', 'hyperdata', postgresql_using='gin'))
# TODO
# create INDEX full_text_idx on nodes using gin(to_tsvector('english', hyperdata ->> 'abstract' || 'title'));
id = Column(Integer, primary_key=True)
typename = Column(NodeType, index=True)
......@@ -78,10 +72,15 @@ class Node(ValidatorMixin, Base):
name = Column(String(255))
date = Column(DateTime(timezone=True), default=datetime.now)
hyperdata = Column(JSONB, default=dict)
# metadata (see https://bashelton.com/2014/03/updating-postgresql-json-fields-via-sqlalchemy/)
# To make search possible uncomment the line below
#search_vector = Column(TSVectorType('hyperdata'))
hyperdata = Column(JSONB, default=dict)
# Create a TSVECTOR column to use fulltext search feature of PostgreSQL.
# We need to create a trigger to update this column on update and insert,
# it's created in alembic/version/1fb4405b59e1_add_english_fulltext_index_on_nodes_.py
#
# To use this column: session.query(DocumentNode) \
# .filter(Node.title_abstract.match('keyword'))
title_abstract = Column(TSVectorType(regconfig='english'))
def __new__(cls, *args, **kwargs):
if cls is Node and kwargs.get('typename'):
......
......@@ -16,9 +16,9 @@ __all__ = ['ReplaceableObject']
class ReplaceableObject(object):
def __init__(self, name, sqltext):
def __init__(self, name, *args):
self.name = name
self.sqltext = sqltext
self.args = args
class ReversibleOp(MigrateOperation):
......@@ -85,11 +85,24 @@ class DropSPOp(ReversibleOp):
return CreateSPOp(self.target)
@Operations.register_operation("create_trigger", "invoke_for_target")
@Operations.register_operation("replace_trigger", "replace")
class CreateTriggerOp(ReversibleOp):
    """Reversible CREATE TRIGGER migration operation, exposed on Alembic's
    Operations namespace as ``op.create_trigger`` / ``op.replace_trigger``."""

    def reverse(self):
        """Inverse operation: drop the same trigger target."""
        return DropTriggerOp(self.target)
@Operations.register_operation("drop_trigger", "invoke_for_target")
class DropTriggerOp(ReversibleOp):
    """Reversible DROP TRIGGER migration operation, exposed on Alembic's
    Operations namespace as ``op.drop_trigger``."""

    def reverse(self):
        """Inverse operation: recreate the same trigger target."""
        return CreateTriggerOp(self.target)
@Operations.implementation_for(CreateViewOp)
def create_view(operations, operation):
operations.execute("CREATE VIEW %s AS %s" % (
operation.target.name,
operation.target.sqltext
operation.target.args[0]
))
......@@ -102,7 +115,7 @@ def drop_view(operations, operation):
def create_sp(operations, operation):
operations.execute(
"CREATE FUNCTION %s %s" % (
operation.target.name, operation.target.sqltext
operation.target.name, operation.target.args[0]
)
)
......@@ -110,3 +123,23 @@ def create_sp(operations, operation):
@Operations.implementation_for(DropSPOp)
def drop_sp(operations, operation):
operations.execute("DROP FUNCTION %s" % operation.target.name)
@Operations.implementation_for(CreateTriggerOp)
def create_trigger(operations, operation):
    """Emit CREATE TRIGGER for a trigger ReplaceableObject.

    The target's args are expected to be (timing, table, action), e.g.
    ('BEFORE INSERT OR UPDATE', 'nodes', 'FOR EACH ROW EXECUTE ...').
    """
    target = operation.target
    timing, table, action = target.args[0], target.args[1], target.args[2]
    operations.execute(
        "CREATE TRIGGER %s %s ON %s %s" % (target.name, timing, table, action)
    )
@Operations.implementation_for(DropTriggerOp)
def drop_trigger(operations, operation):
    """Emit DROP TRIGGER for a trigger ReplaceableObject.

    args[1] holds the table name the trigger is attached to (args[0] is the
    timing clause, unused here).
    """
    target = operation.target
    statement = "DROP TRIGGER %s ON %s" % (target.name, target.args[1])
    operations.execute(statement)
......@@ -14,12 +14,12 @@ from gargantext.util.files import save
class HalCrawler(Crawler):
''' HAL API CLIENT'''
def __init__(self):
# Main EndPoints
self.BASE_URL = "https://api.archives-ouvertes.fr"
self.API_URL = "search"
# Final EndPoints
# TODO : Change endpoint according type of database
self.URL = self.BASE_URL + "/" + self.API_URL
......@@ -38,7 +38,9 @@ class HalCrawler(Crawler):
def _get(self, query, fromPage=1, count=10, lang=None):
# Parameters
fl = """ en_title_s
fl = """ docid
, title_s
, abstract_s
, en_title_s
, en_abstract_s
, submittedDate_s
......@@ -59,7 +61,7 @@ class HalCrawler(Crawler):
"""
#, authUrl_s
#, type_s
wt = "json"
querystring = { "q" : query
......@@ -68,18 +70,18 @@ class HalCrawler(Crawler):
, "fl" : fl
, "wt" : wt
}
# Specify Headers
headers = { "cache-control" : "no-cache" }
# Do Request and get response
response = requests.request( "GET"
, self.URL
, headers = headers
, params = querystring
)
#print(querystring)
# Validation : 200 if ok else raise Value
if response.status_code == 200:
......@@ -90,27 +92,27 @@ class HalCrawler(Crawler):
return (json.loads(response.content.decode(charset)))
else:
raise ValueError(response.status_code, response.reason)
def scan_results(self, query):
    """Return the number of HAL results for *query* (Query String -> Int).

    The count is also cached on ``self.results_nb``.
    """
    self.results_nb = 0
    response = self._get(query).get("response", {})
    self.results_nb = response.get("numFound", 0)
    return self.results_nb
def download(self, query):
downloaded = False
self.status.append("fetching results")
corpus = []
......@@ -124,7 +126,7 @@ class HalCrawler(Crawler):
)
print("ERROR (scrap: HAL d/l ): " , msg)
self.query_max = QUERY_SIZE_N_MAX
#for page in range(1, trunc(self.query_max / 100) + 2):
for page in range(0, self.query_max, paging):
print("Downloading page %s to %s results" % (page, paging))
......@@ -141,5 +143,5 @@ class HalCrawler(Crawler):
, basedir=UPLOAD_DIRECTORY
)
downloaded = True
return downloaded
......@@ -8,7 +8,7 @@ Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
"""
from gargantext.util.group_tools import query_groups, group_union
from gargantext.util.db import session, bulk_insert_ifnotexists
from gargantext.util.db import session, bulk_insert_ifnotexists, desc
from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, \
NodeNgramNgram, Node
......
......@@ -12,12 +12,12 @@ import json
class HalParser(Parser):
def _parse(self, json_docs):
hyperdata_list = []
hyperdata_path = { "id" : "isbn_s"
, "title" : "en_title_s"
, "abstract" : "en_abstract_s"
hyperdata_path = { "id" : "docid"
, "title" : ["en_title_s", "title_s"]
, "abstract" : ["en_abstract_s", "abstract_s"]
, "source" : "journalTitle_s"
, "url" : "uri_s"
, "authors" : "authFullName_s"
......@@ -29,8 +29,8 @@ class HalParser(Parser):
, "instStructId_i" : "instStructId_i"
, "deptStructId_i" : "deptStructId_i"
, "labStructId_i" : "labStructId_i"
, "rteamStructId_i" : "rteamStructId_i"
, "docType_s" : "docType_s"
, "rteamStructId_i" : "rteamStructId_i"
, "docType_s" : "docType_s"
}
uris = set()
......@@ -38,29 +38,32 @@ class HalParser(Parser):
for doc in json_docs:
hyperdata = {}
for key, path in hyperdata_path.items():
field = doc.get(path, "NOT FOUND")
if isinstance(field, list):
hyperdata[key] = ", ".join(map(lambda x: str(x), field))
else:
hyperdata[key] = str(field)
# A path can be a field name or a sequence of field names
if isinstance(path, (list, tuple)):
# Get first non-empty value of fields in path sequence, or None
field = next((x for x in (doc.get(p) for p in path) if x), None)
else:
# Get field value
field = doc.get(path)
if field is None:
field = "NOT FOUND"
if isinstance(field, list):
hyperdata[key] = ", ".join(map(str, field))
else:
hyperdata[key] = str(field)
if hyperdata["url"] in uris:
print("Document already parsed")
else:
uris.add(hyperdata["url"])
# hyperdata["authors"] = ", ".join(
# [ p.get("person", {})
# .get("name" , "")
#
# for p in doc.get("hasauthor", [])
# ]
# )
#
maybeDate = doc.get("submittedDate_s", None)
maybeDate = doc.get("submittedDate_s", None)
if maybeDate is not None:
date = datetime.strptime(maybeDate, "%Y-%m-%d %H:%M:%S")
else:
......@@ -70,9 +73,9 @@ class HalParser(Parser):
hyperdata["publication_year"] = str(date.year)
hyperdata["publication_month"] = str(date.month)
hyperdata["publication_day"] = str(date.day)
hyperdata_list.append(hyperdata)
return hyperdata_list
def parse(self, filebuf):
......
......@@ -35,4 +35,6 @@ requests-futures==0.9.7
bs4==0.0.1
requests==2.10.0
alembic>=0.9.2
# SQLAlchemy-Searchable==0.10.4
SQLAlchemy==1.1.14
SQLAlchemy-Searchable==0.10.4
SQLAlchemy-Utils==0.32.16
......@@ -15,12 +15,16 @@ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django.setup()
from gargantext.constants import QUERY_SIZE_N_MAX, get_resource, get_resource_by_name
from gargantext.models import ProjectNode, DocumentNode, UserNode, User
from gargantext.models import Node, ProjectNode, DocumentNode
from gargantext.util.db import session, get_engine
from collections import Counter
import importlib
from django.http import Http404
# Import those to be available by notebook user
from langdetect import detect as detect_lang
from gargantext.models import UserNode, User
class NotebookError(Exception):
pass
......@@ -49,16 +53,19 @@ def scan_hal(request):
return hal.scan_results(request)
def scan_gargantext(corpus_id, lang, request):
    """Count documents of corpus *corpus_id* whose hyperdata abstract/title
    matches the fulltext *request* in language *lang*; returns an int.
    """
    # TODO add some sugar the request (ideally request should be the same for hal and garg)
    # SECURITY NOTE(review): lang/request/corpus_id are interpolated directly
    # into the SQL text — SQL injection risk if any is user-controlled;
    # switch to bound parameters.
    query = """select count(n.id) from nodes n
    where to_tsvector('%s', hyperdata ->> 'abstract' || 'title')
    @@ to_tsquery('%s')
    AND n.parent_id = %s;""" % (lang, request, corpus_id)
    connection = get_engine().connect()
    try:
        return [i for i in connection.execute(query)][0][0]
    finally:
        # BUG FIX: connection.close() was placed after the return statement
        # and therefore never executed, leaking one connection per call.
        connection.close()
def scan_gargantext(corpus_id, request):
    """Count documents of corpus *corpus_id* whose ``title_abstract``
    tsvector matches the fulltext *request* (ORM version; the language is
    fixed by the column's regconfig, so no ``lang`` parameter)."""
    return (session.query(DocumentNode)
            .filter_by(parent_id=corpus_id)
            .filter(Node.title_abstract.match(request))
            .count())
def scan_gargantext_and_delete(corpus_id, request):
    """Delete documents of corpus *corpus_id* whose ``title_abstract``
    matches the fulltext *request*; returns the number of rows deleted.

    ``synchronize_session='fetch'`` makes the bulk DELETE also expire the
    matching objects from the in-memory session.
    """
    return (session.query(DocumentNode)
            .filter_by(parent_id=corpus_id)
            .filter(Node.title_abstract.match(request))
            .delete(synchronize_session='fetch')
            )
def myProject_fromUrl(url):
"""
......
This diff is collapsed.
......@@ -203,6 +203,7 @@
// do something…
resetStatusForm("#createForm");
})
return false;
})
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment