Commit 50cbc12d authored by Mathieu Rodic

[CODE] the route "/api/corpus/{id}/ngrams" now builds its query with SQLAlchemy instead of raw SQL.

See: https://github.com/mathieurodic/aldjemy
parent 542ddf49
...@@ -3,33 +3,39 @@ from django.core.exceptions import PermissionDenied, SuspiciousOperation ...@@ -3,33 +3,39 @@ from django.core.exceptions import PermissionDenied, SuspiciousOperation
from django.core.exceptions import ValidationError from django.core.exceptions import ValidationError
from django.db.models import Avg, Max, Min, Count, Sum from django.db.models import Avg, Max, Min, Count, Sum
from node.models import NodeType, Node, Node_Ngram, Ngram
from django.db import connection
# from node.models import Language, ResourceType, Resource # from node.models import Language, ResourceType, Resource
# from node.models import Node, NodeType, Node_Resource, Project, Corpus # from node.models import Node, NodeType, Node_Resource, Project, Corpus
# from node.admin import CorpusForm, ProjectForm, ResourceForm
_sql_cte = ''' from sqlalchemy.sql import func
WITH RECURSIVE cte ("depth", "path", "ordering", "id") AS ( from sqlalchemy.orm import aliased
SELECT 1 AS depth,
array[T."id"] AS path,
array[T."id"] AS ordering,
T."id"
FROM %s T
WHERE T."parent_id" IS NULL
UNION ALL import node.models
NodeType = node.models.NodeType.sa
Node = node.models.Node.sa
Node_Ngram = node.models.Node_Ngram.sa
Ngram = node.models.Ngram.sa
# _sql_cte = '''
# WITH RECURSIVE cte ("depth", "path", "ordering", "id") AS (
# SELECT 1 AS depth,
# array[T."id"] AS path,
# array[T."id"] AS ordering,
# T."id"
# FROM %s T
# WHERE T."parent_id" IS NULL
# UNION ALL
# SELECT cte.depth + 1 AS depth,
# cte.path || T."id",
# cte.ordering || array[T."id"],
# T."id"
# FROM %s T
# JOIN cte ON T."parent_id" = cte."id"
# )
# ''' % (Node._meta.db_table, Node._meta.db_table, )
SELECT cte.depth + 1 AS depth,
cte.path || T."id",
cte.ordering || array[T."id"],
T."id"
FROM %s T
JOIN cte ON T."parent_id" = cte."id"
)
''' % (Node._meta.db_table, Node._meta.db_table, )
def DebugHttpResponse(data): def DebugHttpResponse(data):
return HttpResponse('<html><body style="background:#000;color:#FFF"><pre>%s</pre></body></html>' % (str(data), )) return HttpResponse('<html><body style="background:#000;color:#FFF"><pre>%s</pre></body></html>' % (str(data), ))
...@@ -111,59 +117,56 @@ class CorpusController: ...@@ -111,59 +117,56 @@ class CorpusController:
@classmethod @classmethod
def ngrams(cls, request, corpus_id): def ngrams(cls, request, node_id):
# parameters retrieval and validation # parameters retrieval and validation
corpus = cls.get(corpus_id) startwith = request.GET.get('startwith', '').replace("'", "\\'")
order = request.GET.get('order', 'frequency')
if order not in _ngrams_order_columns: # build query
raise ValidationError('The order parameter should take one of the following values: ' + ', '.join(_ngrams_order_columns), 400) ParentNode = aliased(Node)
order_column = _ngrams_order_columns[order] query = (Ngram
# query building .query(Ngram.terms, func.count('*'))
cursor = connection.cursor() .join(Node_Ngram, Node_Ngram.ngram_id == Ngram.id)
cursor.execute(_sql_cte + ''' .join(Node, Node.id == Node_Ngram.node_id)
SELECT ngram.terms, COUNT(*) AS occurrences .join(ParentNode, ParentNode.id == Node.parent_id)
FROM cte .filter(ParentNode.id == node_id)
INNER JOIN %s AS node ON node.id = cte.id .filter(Ngram.terms.like('%s%%' % (startwith, )))
INNER JOIN %s AS nodetype ON nodetype.id = node.type_id .group_by(Ngram.terms)
INNER JOIN %s AS node_ngram ON node_ngram.node_id = node.id .order_by(func.count('*').desc())
INNER JOIN %s AS ngram ON ngram.id = node_ngram.ngram_id )
WHERE (NOT cte.id = \'%d\') AND (\'%d\' = ANY(cte."path"))
AND nodetype.name = 'Document'
AND ngram.terms LIKE '%s%%'
GROUP BY ngram.terms
ORDER BY occurrences DESC
''' % (
Node._meta.db_table,
NodeType._meta.db_table,
Node_Ngram._meta.db_table,
Ngram._meta.db_table,
corpus.id,
corpus.id,
request.GET.get('startwith', '').replace("'", "\\'"),
))
# # response building
# return JsonHttpResponse({
# "list" : [row[0] for row in cursor.fetchall()],
# })
# response building # response building
format = request.GET.get('format', 'json') format = request.GET.get('format', 'json')
if format == 'json': if format == 'json':
return JsonHttpResponse({ return JsonHttpResponse({
"list": [{ "collection": [{
'terms': row[0], 'terms': row[0],
'occurrences': row[1] 'occurrences': row[1]
} for row in cursor.fetchall()], } for row in query.all()],
}) })
elif format == 'csv': elif format == 'csv':
return CsvHttpResponse( return CsvHttpResponse(
[['terms', 'occurences']] + [row for row in cursor.fetchall()] [['terms', 'occurences']] + [row for row in query.all()]
) )
else: else:
raise ValidationError('Unrecognized "format=%s", should be "csv" or "json"' % (format, )) raise ValidationError('Unrecognized "format=%s", should be "csv" or "json"' % (format, ))
@classmethod @classmethod
def metadata(cls, request, corpus_id): def metadata(cls, request, node_id):
ParentNode = aliased(Node)
query = (Ngram
.query(Ngram.metadata[''], func.count('*'))
.join(Node, Node.id == Node_Ngram.node_id)
.join(ParentNode, ParentNode.id == Node.parent_id)
.filter(ParentNode.id == node_id)
.group_by(Ngram.terms)
.order_by(func.count('*').desc())
)
collection = query.all()
return JsonHttpResponse(collection)
# parameters retrieval and validation # parameters retrieval and validation
corpus = cls.get(corpus_id) corpus = cls.get(corpus_id)
# query building # query building
......
...@@ -68,6 +68,7 @@ INSTALLED_APPS = ( ...@@ -68,6 +68,7 @@ INSTALLED_APPS = (
'ngram', 'ngram',
'django_hstore', 'django_hstore',
'djcelery', 'djcelery',
'aldjemy',
) )
MIDDLEWARE_CLASSES = ( MIDDLEWARE_CLASSES = (
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment