Commit 63b11f79 authored by Mathieu Rodic

[FEATURE] Added a route to remove duplicates in children of a node, depending on their metadata

https://forge.iscpif.fr/issues/1360
parent b7f451ba
...@@ -168,7 +168,7 @@ def Root(request, format=None): ...@@ -168,7 +168,7 @@ def Root(request, format=None):
class NodesChildrenDuplicates(APIView): class NodesChildrenDuplicates(APIView):
def get(self, request, node_id): def _fetch_duplicates(self, request, node_id, extra_columns=[], min_count=1):
# input validation # input validation
if 'keys' not in request.GET: if 'keys' not in request.GET:
raise APIException('Missing GET parameter: "keys"', 400) raise APIException('Missing GET parameter: "keys"', 400)
...@@ -190,10 +190,10 @@ class NodesChildrenDuplicates(APIView): ...@@ -190,10 +190,10 @@ class NodesChildrenDuplicates(APIView):
columns.append( columns.append(
getattr(_Node_Metadata, 'value_' + metadata.type) getattr(_Node_Metadata, 'value_' + metadata.type)
) )
# build the query! # build the query
groups = list(columns) groups = list(columns)
duplicates_query = (get_session() duplicates_query = (get_session()
.query(*( [func.count()] + columns )) .query(*(extra_columns + [func.count()] + columns))
.select_from(Node) .select_from(Node)
) )
for _Node_Metadata, metadata in zip(aliases, metadata_query): for _Node_Metadata, metadata in zip(aliases, metadata_query):
...@@ -202,15 +202,53 @@ class NodesChildrenDuplicates(APIView): ...@@ -202,15 +202,53 @@ class NodesChildrenDuplicates(APIView):
duplicates_query = duplicates_query.filter(Node.parent_id == node_id) duplicates_query = duplicates_query.filter(Node.parent_id == node_id)
duplicates_query = duplicates_query.group_by(*columns) duplicates_query = duplicates_query.group_by(*columns)
duplicates_query = duplicates_query.order_by(func.count().desc()) duplicates_query = duplicates_query.order_by(func.count().desc())
duplicates_query = duplicates_query.having(func.count() > 1) duplicates_query = duplicates_query.having(func.count() > min_count)
# return results # and now, return it
return JsonHttpResponse([ return duplicates_query
{
'count': duplicate[0], # def get(self, request, node_id):
'values': duplicate[1:], # # data to be returned
} # duplicates = self._fetch_duplicates(request, node_id)
for duplicate in duplicates_query # # pagination
]) # offset = int(request.GET.get('offset', 0))
# limit = int(request.GET.get('limit', 10))
# total = duplicates.count()
# # response building
# return JsonHttpResponse({
# 'pagination': {
# 'offset': offset,
# 'limit': limit,
# 'total': total,
# },
# 'data': [
# {
# 'count': duplicate[0],
# 'values': duplicate[1:],
# }
# for duplicate in duplicates[offset : offset+limit]
# ]
# })
def delete(self, request, node_id):
    """Delete duplicated children of a node, keeping one node per group.

    The children of ``node_id`` are grouped through ``_fetch_duplicates``
    (which reads the "keys" GET parameter of ``request``). In every group
    the node with the smallest ID is kept; every other child of
    ``node_id`` is deleted.

    Returns a JSON response of the form ``{'deleted': <number of rows>}``.
    Raises whatever ``_fetch_duplicates`` raises on invalid input.
    """
    session = get_session()
    # One row per group, carrying the smallest node ID of that group.
    # min_count=0 so that groups made of a single node are listed (and
    # therefore kept) as well.
    kept_nodes_query = self._fetch_duplicates(
        request,
        node_id,
        extra_columns=[func.min(Node.id).label('id')],
        min_count=0,
    )
    kept_node_ids = [kept_node.id for kept_node in kept_nodes_query]
    if not kept_node_ids:
        # No group was found, hence there is no duplicate to remove. Without
        # this guard, the empty exclusion list below would make the filter
        # match every child of the node.
        return JsonHttpResponse({
            'deleted': 0,
        })
    # Delete every child that is not the kept representative of its group.
    # Query.delete() returns the number of matched rows, so no separate
    # (and possibly stale) count() query is needed.
    deleted_count = (session
        .query(Node)
        .filter(Node.parent_id == node_id)
        .filter(~Node.id.in_(kept_node_ids))
        .delete(synchronize_session=False)
    )
    session.flush()
    # return the result
    return JsonHttpResponse({
        'deleted': deleted_count,
    })
# return duplicates_query
class NodesChildrenMetatadata(APIView): class NodesChildrenMetatadata(APIView):
......
...@@ -53,7 +53,7 @@ INSTALLED_APPS = ( ...@@ -53,7 +53,7 @@ INSTALLED_APPS = (
'django.contrib.messages', 'django.contrib.messages',
'django.contrib.staticfiles', 'django.contrib.staticfiles',
'django_extensions', 'django_extensions',
#'south', 'south',
'cte_tree', 'cte_tree',
'node', 'node',
'ngram', 'ngram',
......
...@@ -243,7 +243,7 @@ class Node(CTENode): ...@@ -243,7 +243,7 @@ class Node(CTENode):
do_tfidf(self) do_tfidf(self)
class Node_Metadata(models.Model): class Node_Metadata(models.Model):
node = models.ForeignKey(Node) node = models.ForeignKey(Node, on_delete=models.CASCADE)
metadata = models.ForeignKey(Metadata) metadata = models.ForeignKey(Metadata)
value_int = models.IntegerField(null=True, db_index=True) value_int = models.IntegerField(null=True, db_index=True)
value_float = models.FloatField(null=True, db_index=True) value_float = models.FloatField(null=True, db_index=True)
...@@ -252,12 +252,12 @@ class Node_Metadata(models.Model): ...@@ -252,12 +252,12 @@ class Node_Metadata(models.Model):
value_text = models.TextField(null=True) value_text = models.TextField(null=True)
class Node_Resource(models.Model): class Node_Resource(models.Model):
node = models.ForeignKey(Node, related_name='node_resource') node = models.ForeignKey(Node, related_name='node_resource', on_delete=models.CASCADE)
resource = models.ForeignKey(Resource) resource = models.ForeignKey(Resource)
parsed = models.BooleanField(default=False) parsed = models.BooleanField(default=False)
class Node_Ngram(models.Model): class Node_Ngram(models.Model):
node = models.ForeignKey(Node) node = models.ForeignKey(Node, on_delete=models.CASCADE)
ngram = models.ForeignKey(Ngram) ngram = models.ForeignKey(Ngram)
weight = models.FloatField() weight = models.FloatField()
def __str__(self): def __str__(self):
...@@ -289,7 +289,7 @@ class Document(Node): ...@@ -289,7 +289,7 @@ class Document(Node):
proxy=True proxy=True
class NodeNgramNgram(models.Model): class NodeNgramNgram(models.Model):
node = models.ForeignKey(Node) node = models.ForeignKey(Node, on_delete=models.CASCADE)
ngramx = models.ForeignKey(Ngram, related_name="nodengramngramx", on_delete=models.CASCADE) ngramx = models.ForeignKey(Ngram, related_name="nodengramngramx", on_delete=models.CASCADE)
ngramy = models.ForeignKey(Ngram, related_name="nodengramngramy", on_delete=models.CASCADE) ngramy = models.ForeignKey(Ngram, related_name="nodengramngramy", on_delete=models.CASCADE)
...@@ -301,8 +301,8 @@ class NodeNgramNgram(models.Model): ...@@ -301,8 +301,8 @@ class NodeNgramNgram(models.Model):
class NodeNodeNgram(models.Model): class NodeNodeNgram(models.Model):
nodex = models.ForeignKey(Node, related_name="nodex") nodex = models.ForeignKey(Node, related_name="nodex", on_delete=models.CASCADE)
nodey = models.ForeignKey(Node, related_name="nodey") nodey = models.ForeignKey(Node, related_name="nodey", on_delete=models.CASCADE)
ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE) ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
...@@ -312,8 +312,8 @@ class NodeNodeNgram(models.Model): ...@@ -312,8 +312,8 @@ class NodeNodeNgram(models.Model):
return "%s: %s / %s = %s" % (self.nodex.name, self.nodey.name, self.ngram.terms, self.score) return "%s: %s / %s = %s" % (self.nodex.name, self.nodey.name, self.ngram.terms, self.score)
class NodeNodeNgram(models.Model): class NodeNodeNgram(models.Model):
nodex = models.ForeignKey(Node, related_name="nodex") nodex = models.ForeignKey(Node, related_name="nodex", on_delete=models.CASCADE)
nodey = models.ForeignKey(Node, related_name="nodey") nodey = models.ForeignKey(Node, related_name="nodey", on_delete=models.CASCADE)
ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE) ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment