Commit 63b11f79 authored by Mathieu Rodic

[FEATURE] Added a route to remove duplicates in children of a node, depending on their metadata

https://forge.iscpif.fr/issues/1360
parent b7f451ba
......@@ -168,7 +168,7 @@ def Root(request, format=None):
class NodesChildrenDuplicates(APIView):
def get(self, request, node_id):
def _fetch_duplicates(self, request, node_id, extra_columns=[], min_count=1):
# input validation
if 'keys' not in request.GET:
raise APIException('Missing GET parameter: "keys"', 400)
......@@ -190,10 +190,10 @@ class NodesChildrenDuplicates(APIView):
columns.append(
getattr(_Node_Metadata, 'value_' + metadata.type)
)
# build the query!
# build the query
groups = list(columns)
duplicates_query = (get_session()
.query(*( [func.count()] + columns ))
.query(*(extra_columns + [func.count()] + columns))
.select_from(Node)
)
for _Node_Metadata, metadata in zip(aliases, metadata_query):
......@@ -202,15 +202,53 @@ class NodesChildrenDuplicates(APIView):
duplicates_query = duplicates_query.filter(Node.parent_id == node_id)
duplicates_query = duplicates_query.group_by(*columns)
duplicates_query = duplicates_query.order_by(func.count().desc())
duplicates_query = duplicates_query.having(func.count() > 1)
# return results
return JsonHttpResponse([
{
'count': duplicate[0],
'values': duplicate[1:],
}
for duplicate in duplicates_query
])
duplicates_query = duplicates_query.having(func.count() > min_count)
# and now, return it
return duplicates_query
# def get(self, request, node_id):
# # data to be returned
# duplicates = self._fetch_duplicates(request, node_id)
# # pagination
# offset = int(request.GET.get('offset', 0))
# limit = int(request.GET.get('limit', 10))
# total = duplicates.count()
# # response building
# return JsonHttpResponse({
# 'pagination': {
# 'offset': offset,
# 'limit': limit,
# 'total': total,
# },
# 'data': [
# {
# 'count': duplicate[0],
# 'values': duplicate[1:],
# }
# for duplicate in duplicates[offset : offset+limit]
# ]
# })
def delete(self, request, node_id):
    """Delete duplicate children of a node, keeping one node per group.

    The children of ``node_id`` are grouped by ``_fetch_duplicates``
    (driven by the GET parameters of ``request``). Within each group the
    child with the smallest ID is kept; every other child of ``node_id``
    is deleted.

    Returns a JSON response of the form ``{'deleted': <row count>}``.
    """
    session = get_session()
    # One row per group of children sharing the same metadata values,
    # labelled with the smallest node ID of that group. min_count=0 makes
    # the HAVING clause accept groups of a single node, so that nodes
    # without any duplicate are listed as "kept" too.
    kept_node_ids_query = self._fetch_duplicates(
        request, node_id, [func.min(Node.id).label('id')], 0
    )
    kept_node_ids = [kept_node.id for kept_node in kept_node_ids_query]
    # No group at all means no duplicate was identified. An empty
    # "NOT IN" list would match every child of the node, so bail out
    # instead of wiping all of them.
    if not kept_node_ids:
        return JsonHttpResponse({
            'deleted': 0,
        })
    # NOTE(review): children that appear in no group returned by
    # _fetch_duplicates are not in kept_node_ids and are deleted as well;
    # confirm that this is the intended behaviour.
    # Query.delete() returns the number of rows matched, which avoids a
    # separate COUNT query that could disagree with the actual deletion.
    count = (session
        .query(Node)
        .filter(Node.parent_id == node_id)
        .filter(~Node.id.in_(kept_node_ids))
        .delete(synchronize_session=False)
    )
    session.flush()
    # return the result
    return JsonHttpResponse({
        'deleted': count,
    })
# return duplicates_query
class NodesChildrenMetatadata(APIView):
......
......@@ -53,7 +53,7 @@ INSTALLED_APPS = (
'django.contrib.messages',
'django.contrib.staticfiles',
'django_extensions',
#'south',
'south',
'cte_tree',
'node',
'ngram',
......
......@@ -243,7 +243,7 @@ class Node(CTENode):
do_tfidf(self)
class Node_Metadata(models.Model):
node = models.ForeignKey(Node)
node = models.ForeignKey(Node, on_delete=models.CASCADE)
metadata = models.ForeignKey(Metadata)
value_int = models.IntegerField(null=True, db_index=True)
value_float = models.FloatField(null=True, db_index=True)
......@@ -252,12 +252,12 @@ class Node_Metadata(models.Model):
value_text = models.TextField(null=True)
class Node_Resource(models.Model):
node = models.ForeignKey(Node, related_name='node_resource')
node = models.ForeignKey(Node, related_name='node_resource', on_delete=models.CASCADE)
resource = models.ForeignKey(Resource)
parsed = models.BooleanField(default=False)
class Node_Ngram(models.Model):
node = models.ForeignKey(Node)
node = models.ForeignKey(Node, on_delete=models.CASCADE)
ngram = models.ForeignKey(Ngram)
weight = models.FloatField()
def __str__(self):
......@@ -289,7 +289,7 @@ class Document(Node):
proxy=True
class NodeNgramNgram(models.Model):
node = models.ForeignKey(Node)
node = models.ForeignKey(Node, on_delete=models.CASCADE)
ngramx = models.ForeignKey(Ngram, related_name="nodengramngramx", on_delete=models.CASCADE)
ngramy = models.ForeignKey(Ngram, related_name="nodengramngramy", on_delete=models.CASCADE)
......@@ -301,8 +301,8 @@ class NodeNgramNgram(models.Model):
class NodeNodeNgram(models.Model):
nodex = models.ForeignKey(Node, related_name="nodex")
nodey = models.ForeignKey(Node, related_name="nodey")
nodex = models.ForeignKey(Node, related_name="nodex", on_delete=models.CASCADE)
nodey = models.ForeignKey(Node, related_name="nodey", on_delete=models.CASCADE)
ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
......@@ -312,8 +312,8 @@ class NodeNodeNgram(models.Model):
return "%s: %s / %s = %s" % (self.nodex.name, self.nodey.name, self.ngram.terms, self.score)
class NodeNodeNgram(models.Model):
nodex = models.ForeignKey(Node, related_name="nodex")
nodey = models.ForeignKey(Node, related_name="nodey")
nodex = models.ForeignKey(Node, related_name="nodex", on_delete=models.CASCADE)
nodey = models.ForeignKey(Node, related_name="nodey", on_delete=models.CASCADE)
ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment