Commit af9ec6af authored by Mathieu Rodic's avatar Mathieu Rodic

[OPTI] Improved determination of most relevant documents from ngrams in `/api/tfidf`

Also, values are now comma-separated instead of 'a'-separated
https://forge.iscpif.fr/issues/1481
parent 32341612
...@@ -46,7 +46,7 @@ urlpatterns = patterns('', ...@@ -46,7 +46,7 @@ urlpatterns = patterns('',
url(r'^chart/corpus/(\d+)/data.csv$', views.send_csv), url(r'^chart/corpus/(\d+)/data.csv$', views.send_csv),
url(r'^corpus/(\d+)/node_link.json$', views.node_link), url(r'^corpus/(\d+)/node_link.json$', views.node_link),
url(r'^corpus/(\d+)/adjacency.json$', views.adjacency), url(r'^corpus/(\d+)/adjacency.json$', views.adjacency),
url(r'^api/tfidf/(\d+)/(\w+)$', views.tfidf), url(r'^api/tfidf/(\d+)/(\d+(?:,\d+)+)$', views_optimized.tfidf),
# Data management # Data management
url(r'^api$', gargantext_web.api.Root), url(r'^api$', gargantext_web.api.Root),
......
...@@ -655,46 +655,3 @@ def nodeinfo(request , node_id): ...@@ -655,46 +655,3 @@ def nodeinfo(request , node_id):
'node_id' : node_id, 'node_id' : node_id,
})) }))
return HttpResponse(html) return HttpResponse(html)
def tfidf(request, corpus_id, ngram_id, limit=6):
    """Return a JSON list of the documents of corpus `corpus_id` that are most
    relevant to the given ngrams, ordered by decreasing TFIDF score.

    `ngram_id` is an "a"-separated list of ngram ids, as received from
    api/tfidf/corpus_id/NGRAM1aNGRAM2aNGRAM3a...
    At most `limit` documents are returned (default: 6).
    """
    ngramsids = ngram_id.split("a")
    corpus = Node.objects.get(id=corpus_id)
    # queryset is already sorted by decreasing TFIDF score
    node_node_ngrams = (NodeNodeNgram.objects
                        .filter(nodex=corpus, ngram__in=ngramsids)
                        .order_by('-score'))
    tfidf_list = []
    seen_node_ids = set()  # deduplicate documents while preserving score order
    for node_node_ngram in node_node_ngrams:
        pub = node_node_ngram.nodey  # the document node
        if pub.id in seen_node_ids:
            continue
        seen_node_ids.add(pub.id)
        # build a dictionary with only the metadata attributes actually present
        finalpub = {'id': pub.id}
        for key in ('title', 'publication_date', 'journal', 'authors', 'fields'):
            if key in pub.metadata:
                finalpub[key] = pub.metadata[key]
        tfidf_list.append(finalpub)
        if len(tfidf_list) >= limit:
            break
    data = json.dumps(tfidf_list)
    return JsonHttpResponse(data)
...@@ -12,6 +12,7 @@ from threading import Thread ...@@ -12,6 +12,7 @@ from threading import Thread
from node.admin import CustomForm from node.admin import CustomForm
from gargantext_web.db import * from gargantext_web.db import *
from gargantext_web.settings import DEBUG, MEDIA_ROOT from gargantext_web.settings import DEBUG, MEDIA_ROOT
from gargantext_web.api import JsonHttpResponse
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
...@@ -152,3 +153,32 @@ def project(request, project_id): ...@@ -152,3 +153,32 @@ def project(request, project_id):
'number' : corpora_count, 'number' : corpora_count,
}) })
def tfidf(request, corpus_id, ngram_ids, limit=6):
    """Return a JSON list of the documents of corpus `corpus_id` that are most
    relevant to the given ngrams, ordered by decreasing (summed) TFIDF score.

    `ngram_ids` is a comma-separated list of ngram ids (e.g. "12,34,56"),
    as captured by the /api/tfidf/(\\d+)/(\\d+(?:,\\d+)+) URL pattern.
    At most `limit` documents are returned (default: 6).
    """
    # convert input to integers so the SQL comparison is against the column's
    # native type; the URL pattern only lets digits and commas through
    ngram_ids = [int(ngram_id) for ngram_id in ngram_ids.split(',')]
    # one aggregated query: sum the TFIDF scores of all requested ngrams per
    # document, keep the `limit` best-scoring documents
    nodes_query = (session
        .query(Node, func.sum(NodeNodeNgram.score))
        .join(NodeNodeNgram, NodeNodeNgram.nodey_id == Node.id)
        .filter(NodeNodeNgram.nodex_id == corpus_id)
        .filter(NodeNodeNgram.ngram_id.in_(ngram_ids))
        .group_by(Node)
        .order_by(func.sum(NodeNodeNgram.score).desc())
        .limit(limit)
    )
    # convert query result to a list of dicts, keeping only the metadata
    # attributes actually present on each document
    nodes_list = []
    for node, score in nodes_query:
        node_dict = {
            'id': node.id,
            'score': score,
        }
        for key in ('title', 'publication_date', 'journal', 'authors', 'fields'):
            if key in node.metadata:
                node_dict[key] = node.metadata[key]
        nodes_list.append(node_dict)
    return JsonHttpResponse(nodes_list)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment