Commit a7330cb9 authored by Mathieu Rodic

[OPTI] using 'node_metadata' and 'metadata' tables for quick metadata filtering

[CODE] using SQLAlchemy for REST metadata retrieval
metadata retrieval is so much quicker now!
parent 50cbc12d
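
For context: this commit replaces per-key lookups inside the documents' JSON metadata with two relational tables, 'metadata' (one row per known key and its type) and 'node_metadata' (one typed, indexed value per document and key). Below is a minimal sketch of the kind of filter this layout enables, using the aldjemy-backed SQLAlchemy aliases set up in the first file of the diff; the 'publication_date' key and the corpus_id value are hypothetical and not part of this commit.

from datetime import datetime

corpus_id = 123  # hypothetical corpus node id

# find the Metadata row describing the hypothetical 'publication_date' key
publication_date = (Metadata
    .query(Metadata)
    .filter(Metadata.name == 'publication_date')
    .first()
)
# documents of the corpus whose value for that key is in 2014 or later,
# filtered on the typed, indexed 'value_datetime' column
documents = (Node
    .query(Node)
    .join(Node_Metadata, Node_Metadata.node_id == Node.id)
    .filter(Node.parent_id == corpus_id)
    .filter(Node_Metadata.metadata_id == publication_date.id)
    .filter(Node_Metadata.value_datetime >= datetime(2014, 1, 1))
    .all()
)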
@@ -6,6 +6,7 @@ from django.db.models import Avg, Max, Min, Count, Sum
 # from node.models import Language, ResourceType, Resource
 # from node.models import Node, NodeType, Node_Resource, Project, Corpus
+from sqlalchemy import text
 from sqlalchemy.sql import func
 from sqlalchemy.orm import aliased
@@ -14,7 +15,65 @@ NodeType = node.models.NodeType.sa
 Node = node.models.Node.sa
 Node_Ngram = node.models.Node_Ngram.sa
 Ngram = node.models.Ngram.sa
+Metadata = node.models.Metadata.sa
+Node_Metadata = node.models.Node_Metadata.sa
+
+
+def literalquery(statement, dialect=None):
+    """Generate an SQL expression string with bound parameters rendered inline
+    for the given SQLAlchemy statement.
+
+    WARNING: This method of escaping is insecure, incomplete, and for debugging
+    purposes only. Executing SQL statements with inline-rendered user values is
+    extremely insecure.
+    """
+    from datetime import datetime
+    import sqlalchemy.orm
+    if isinstance(statement, sqlalchemy.orm.Query):
+        if dialect is None:
+            dialect = statement.session.get_bind(
+                statement._mapper_zero_or_none()
+            ).dialect
+        statement = statement.statement
+    if dialect is None:
+        dialect = getattr(statement.bind, 'dialect', None)
+    if dialect is None:
+        from sqlalchemy.dialects import mysql
+        dialect = mysql.dialect()
+    Compiler = type(statement._compiler(dialect))
+
+    class LiteralCompiler(Compiler):
+        visit_bindparam = Compiler.render_literal_bindparam
+
+        def render_literal_value(self, value, type_):
+            return str(value)
+            # if isinstance(value, (float, int)):
+            #     return str(value)
+            # elif isinstance(value, datetime):
+            #     return repr(str(value))
+            # else:  # fallback
+            #     value = super(LiteralCompiler, self).render_literal_value(
+            #         value, type_,
+            #     )
+            #     if isinstance(value, unicode):
+            #         return value.encode('UTF-8')
+            #     else:
+            #         return value
+
+    return LiteralCompiler(dialect, statement)
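
A possible way to use it while debugging, for instance to inspect the SQL generated for a metadata query like the ones built further down (sketch only):

# print the SQL of an SQLAlchemy query with its bound parameters rendered
# inline (debugging only, as per the warning above)
query = (Metadata
    .query(Metadata)
    .filter(Metadata.type == 'datetime')
)
print(literalquery(query))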
+
+
+def get_connection():
+    """Return a raw connection to the default database, via the aldjemy engine."""
+    import sqlalchemy.orm
+    from django.db import connections
+    from aldjemy.core import get_engine
+    alias = 'default'
+    connection = connections[alias]
+    session = sqlalchemy.orm.create_session()
+    engine = get_engine()
+    return engine.connect()
+    # connection = engine.connect()
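
A sketch of how the raw connection might be combined with the text() construct imported above; the table name is a guess at Django's default naming and is not taken from this commit:

# count the nodes with a raw SQL statement over the aldjemy engine
connection = get_connection()
result = connection.execute(text('SELECT COUNT(*) FROM node_node'))
print(result.scalar())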
 # _sql_cte = '''
 #     WITH RECURSIVE cte ("depth", "path", "ordering", "id") AS (
@@ -154,81 +213,48 @@ class CorpusController:
     @classmethod
     def metadata(cls, request, node_id):
+        # query metadata keys
         ParentNode = aliased(Node)
-        query = (Ngram
-            .query(Ngram.metadata[''], func.count('*'))
-            .join(Node, Node.id == Node_Ngram.node_id)
-            .join(ParentNode, ParentNode.id == Node.parent_id)
-            .filter(ParentNode.id == node_id)
-            .group_by(Ngram.terms)
-            .order_by(func.count('*').desc())
+        metadata_query = (Metadata
+            .query(Metadata)
+            .join(Node_Metadata, Node_Metadata.metadata_id == Metadata.id)
+            .join(Node, Node.id == Node_Metadata.node_id)
+            .filter(Node.parent_id == node_id)
+            .group_by(Metadata)
         )
-        collection = query.all()
-        return JsonHttpResponse(collection)
-        # parameters retrieval and validation
-        corpus = cls.get(corpus_id)
-        # query building
-        cursor = connection.cursor()
-        # cursor.execute(_sql_cte + '''
-        #     SELECT key
-        #     FROM (
-        #         SELECT skeys(metadata) AS key, COUNT(*)
-        #         FROM cte
-        #         INNER JOIN %s AS node ON node.id = cte.id
-        #         WHERE (NOT cte.id = \'%d\') AND (\'%d\' = ANY(cte."path"))
-        #     ) AS keys
-        #     GROUP BY key
-        #     ORDER BY COUNT(*) DESC
-        # ''' % (Node._meta.db_table, corpus.id, corpus.id, ))
-        cursor.execute('''
-            SELECT key, COUNT(*) AS count, (
-                SELECT COUNT(DISTINCT metadata->key) FROM %s
-            ) AS values
-            FROM (
-                SELECT skeys(metadata) AS key
-                FROM %s
-                WHERE parent_id = \'%d\'
-            ) AS keys
-            GROUP BY key
-            ORDER BY count
-        ''' % (Node._meta.db_table, Node._meta.db_table, corpus.id, ))
-        # response building
+        # build a collection with the metadata keys
         collection = []
-        for row in cursor.fetchall():
-            type = 'string'
-            key = row[0]
-            split_key = key.split('_')
-            name = split_key[0]
-            if len(split_key) == 2:
-                if split_key[1] == 'date':
-                    name = split_key[0]
-                    type = 'datetime'
-            elif row[0] == 'language_fullname':
-                name = 'language'
-                type = 'string'
-            else:
-                continue
+        for metadata in metadata_query:
+            # count the distinct values for this metadata key
+            value_column = getattr(Node_Metadata, 'value_' + metadata.type)
+            node_metadata_query = (Node_Metadata
+                .query(value_column)
+                .join(Node, Node.id == Node_Metadata.node_id)
+                .filter(Node.parent_id == node_id)
+                .filter(Node_Metadata.metadata_id == metadata.id)
+                .group_by(value_column)
+                .order_by(value_column)
+            )
+            valuesCount = node_metadata_query.count()
+            # if there are no more than 32 distinct values, retrieve them
             values = None
-            if row[2] < 32:
-                cursor.execute('''
-                    SELECT DISTINCT metadata->'%s'
-                    FROM %s
-                    WHERE parent_id = %s
-                    AND metadata ? '%s'
-                    ORDER BY metadata->'%s'
-                ''' % (key, Node._meta.db_table, corpus.id, key, key, ))
-                values = [row[0] for row in cursor.fetchall()]
+            if valuesCount <= 32:
+                values = [row[0] for row in node_metadata_query.all()]
+                if metadata.type == 'datetime':
+                    values = [value.isoformat() for value in values]
             collection.append({
-                'key': key,
-                'text': name,
-                'documents': row[1],
-                'valuesCount': row[2],
+                'key': metadata.name,
                 'values': values,
-                'type': type,
+                'valuesCount': valuesCount,
             })
-
-        return JsonHttpResponse({
-            'test' : 123,
-            'collection': collection,
-        })
+        return JsonHttpResponse(collection)
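
For illustration, the endpoint now returns a flat JSON list with one entry per metadata key; shape only, the key names and numbers below are made up:

# illustrative response of CorpusController.metadata() (values invented)
[
    {'key': 'publication_date', 'values': ['2014-01-01T00:00:00'], 'valuesCount': 1},
    {'key': 'title', 'values': None, 'valuesCount': 4213},
]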
     @classmethod
     def data(cls, request, corpus_id):
...
@@ -75,6 +75,23 @@ class NodeQuerySet(CTENodeManager.CTEQuerySet):
         ngramscaches = NgramsCaches()
         for node in self:
             node.extract_ngrams(keys, ngramsextractorscache, ngramscaches)
+
+    def make_metadata_filterable(self):
+        metadata_cache = {metadata.name: metadata for metadata in Metadata.objects.all()}
+        data = []
+        for node in self:
+            print(node.id)
+            for key, value in node.metadata.items():
+                if key in metadata_cache:
+                    metadata = metadata_cache[key]
+                    if metadata.type == 'string':
+                        value = value[:255]
+                    data.append(Node_Metadata(**{
+                        'node_id': node.id,
+                        'metadata_id': metadata.id,
+                        ('value_' + metadata.type): value,
+                    }))
+        Node_Metadata.objects.bulk_create(data)
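
As used later in parse_resources(), this method is meant to be called on a queryset of freshly inserted documents; a sketch, where 'corpus' stands for any Node whose children carry JSON metadata:

# copy each document's JSON metadata into typed Node_Metadata rows, e.g.
# {'title': 'Foo'} becomes Node_Metadata(metadata_id=<id of 'title'>, value_string='Foo')
corpus.children.all().make_metadata_filterable()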
 class NodeManager(CTENodeManager):
     """Methods available from Node.object."""
@@ -85,6 +102,10 @@ class NodeManager(CTENodeManager):
         if name.startswith("_"):
             raise AttributeError
         return getattr(self.get_queryset(), name, *args)
+
+
+class Metadata(models.Model):
+    name = models.CharField(max_length=32, db_index=True)
+    type = models.CharField(max_length=16, db_index=True)
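
The table is expected to hold one row per filterable key, with 'type' naming the value_* column of Node_Metadata (defined further down) that stores the values. A hypothetical seeding sketch; the key names and types here are assumptions, not part of this commit:

# hypothetical seed data for the 'metadata' table
Metadata.objects.bulk_create([
    Metadata(name='title', type='string'),
    Metadata(name='publication_date', type='datetime'),
    Metadata(name='language_fullname', type='string'),
])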
 class Node(CTENode):
     """The node."""
@@ -137,7 +158,7 @@ class Node(CTENode):
         return resource
 
     @current_app.task(filter=task_method)
-    def parse_resources(self):
+    def parse_resources(self, verbose=False):
         # parse all resources into a list of metadata
         metadata_list = []
         for node_resource in self.node_resource.filter(parsed=False):
@@ -151,22 +172,33 @@ class Node(CTENode):
                 'europress_english' : EuropressFileParser,
             })[resource.type.name]()
             metadata_list += parser.parse(str(resource.file))
-        # insert the new resources in the database!
-        type = NodeType.objects.get(name='Document')
+        # retrieve info from the database
+        type_id = NodeType.objects.get(name='Document').id
         langages_cache = LanguagesCache()
-        Node.objects.bulk_create([
-            Node(
-                user = self.user,
-                type = type,
-                name = metadata['title'][0:199] if 'title' in metadata else '',
-                parent = self,
-                language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
-                metadata = metadata,
-            )
-            for metadata in metadata_list
-        ])
+        user_id = self.user.id
+        # insert the new documents into the database!
+        for i, metadata_values in enumerate(metadata_list):
+            if verbose:
+                print(i, end='\r', flush=True)
+            name = metadata_values.get('title', '')[:200]
+            language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None
+            Node(
+                user_id = user_id,
+                type_id = type_id,
+                name = name,
+                parent = self,
+                language_id = language.id if language else None,
+                metadata = metadata_values,
+            ).save()
+        # make metadata filterable
+        self.children.all().make_metadata_filterable()
         # mark the resources as parsed for this node
         self.node_resource.update(parsed=True)
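
Since parse_resources() is registered as a Celery task method, it can presumably be run either inline or queued; a sketch, where 'corpus' is a Node with unparsed resources attached:

# run synchronously, printing a progress counter on a single line
corpus.parse_resources(verbose=True)
# or hand the work to a Celery worker instead
corpus.parse_resources.delay()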
     def extract_ngrams(self, keys, ngramsextractorscache=None, ngramscaches=None):
         # if there is no cache...
@@ -202,6 +234,15 @@ class Node(CTENode):
             for ngram_text, weight in associations.items()
         ])
+
+
+class Node_Metadata(models.Model):
+    node = models.ForeignKey(Node)
+    metadata = models.ForeignKey(Metadata)
+    value_int = models.IntegerField(null=True, db_index=True)
+    value_float = models.FloatField(null=True, db_index=True)
+    value_string = models.CharField(max_length=255, null=True, db_index=True)
+    value_datetime = models.DateTimeField(null=True, db_index=True)
+    value_text = models.TextField(null=True)
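
On the Django side, the same table can be filtered through its forward foreign keys; a sketch, again assuming a hypothetical 'publication_date' key and an existing 'corpus' node:

# documents of a corpus whose 'publication_date' value falls in 2014,
# resolved via the indexed value_datetime column
rows = (Node_Metadata.objects
    .filter(node__parent_id=corpus.id,
            metadata__name='publication_date',
            value_datetime__year=2014)
    .select_related('node'))
documents = [row.node for row in rows]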
 class Node_Resource(models.Model):
     node = models.ForeignKey(Node, related_name='node_resource')
     resource = models.ForeignKey(Resource)
...
@@ -75,9 +75,10 @@ class LanguagesCache(defaultdict):
             self[str(language.iso2.lower())] = language
             self[str(language.iso3.lower())] = language
             self[str(language.fullname.lower())] = language
-        betterKey = key.strip().lower()
-        self[key] = self[betterKey] if betterKey in self.keys() else None
-        return self[betterKey]
+        if key not in self.keys():
+            betterKey = key.strip().lower()
+            self[key] = self[betterKey] if betterKey in self.keys() else None
+        return self[key]
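
A sketch of how the cache is looked up after this change (behaviour inferred from the code above):

langages_cache = LanguagesCache()
langages_cache['en']         # iso2 match, returns the corresponding Language
langages_cache[' English ']  # stripped and lowercased to 'english', cached under the original key
langages_cache['unknown']    # no match even after normalisation, cached and returned as None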
...
-/srv/gargantext_lib/treetagger
\ No newline at end of file
+/home/mat/projects/gargantext/old/gargantext-stable-2/shared/treetagger
\ No newline at end of file