Commit be75f405 authored by Administrator's avatar Administrator

[FEAT] List of relevant documents according to their TF-IDF score.

parent 33429e65
from node.models import Language, ResourceType, Resource, \ from node.models import Language, ResourceType, Resource, \
Node, NodeType, Node_Resource, Project, Corpus, \ Node, NodeType, Node_Resource, Project, Corpus, \
Node_Ngram, NodeNgramNgram Node_Ngram, NodeNgramNgram, NodeNodeNgram
from collections import defaultdict from collections import defaultdict
from django.db import connection, transaction from django.db import connection, transaction
from math import log
def create_blacklist(user, corpus): def create_blacklist(user, corpus):
pass pass
...@@ -233,6 +235,41 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150 ...@@ -233,6 +235,41 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
return data return data
def tfidf(corpus, document, ngram):
    """
    Compute the TF-IDF score of an ngram for one document of a corpus.

    tf  = weight of the (document, ngram) row / number of ngram rows
          attached to the document
    idf = log(number of Document children of the corpus
              / number of those documents that carry the ngram)

    Parameters:
        corpus:   the Node whose Document children form the collection
        document: the Node the score is computed for
        ngram:    the ngram to score

    Returns:
        tf * idf, or 0 when the score is undefined: the ngram is not attached
        to the document (or is attached ambiguously), its weight is empty, or
        one of the counts is zero.

    Other errors (database failures in particular) are deliberately not
    swallowed: callers run this inside a transaction, and hiding a failed
    query would only postpone the error.
    """
    try:
        weight = Node_Ngram.objects.get(node=document, ngram=ngram).weight
    except (Node_Ngram.DoesNotExist, Node_Ngram.MultipleObjectsReturned) as error:
        print(error)
        return 0

    ngrams_in_document = Node_Ngram.objects.filter(node=document).count()

    # Both idf terms are measured on the same collection: the Document
    # children of this corpus. Counting the ngram over every node of the
    # database would make the ratio meaningless (and possibly below 1).
    documents = Node.objects.filter(
        parent=corpus,
        type=NodeType.objects.get(name="Document"),
    )
    documents_in_corpus = documents.count()
    documents_with_ngram = Node_Ngram.objects.filter(
        ngram=ngram,
        node__in=documents,
    ).count()

    # Guard the divisions and the logarithm explicitly instead of relying on
    # a catch-all exception handler.
    if not (weight and ngrams_in_document
            and documents_in_corpus and documents_with_ngram):
        return 0

    tf = weight / ngrams_in_document
    idf = log(documents_in_corpus / documents_with_ngram)
    return tf * idf
def do_tfidf(corpus, reset=True):
    """
    Compute and store the TF-IDF score of every (document, ngram) pair of a
    corpus as NodeNodeNgram rows (nodex=corpus, nodey=document).

    Parameters:
        corpus: a Node whose type is named "Corpus"; anything else is
                reported on stdout and left untouched.
        reset:  when true, the rows already stored for this corpus are
                deleted first, so every score is recomputed. When false,
                existing rows are kept and only missing ones are added.

    The deletion and the inserts run in one transaction.
    """
    # Validate before touching the database, so that an unsupported argument
    # never triggers the reset deletion.
    if not (isinstance(corpus, Node) and corpus.type.name == "Corpus"):
        print("Only corpus implemented yet, you put instead:", type(corpus))
        return

    document_type = NodeType.objects.get(name="Document")

    with transaction.atomic():
        if reset:
            NodeNodeNgram.objects.filter(nodex=corpus).delete()

        for document in Node.objects.filter(parent=corpus, type=document_type):
            for node_ngram in Node_Ngram.objects.filter(node=document):
                # Explicit existence test rather than a bare `except:` around
                # .get(), which would also hide unrelated failures.
                already_scored = NodeNodeNgram.objects.filter(
                    nodex=corpus,
                    nodey=document,
                    ngram=node_ngram.ngram,
                ).exists()
                if already_scored:
                    continue

                score = tfidf(corpus, document, node_ngram.ngram)
                NodeNodeNgram(
                    nodex=corpus,
                    nodey=node_ngram.node,
                    ngram=node_ngram.ngram,
                    score=score,
                ).save()
......
...@@ -36,6 +36,7 @@ urlpatterns = patterns('', ...@@ -36,6 +36,7 @@ urlpatterns = patterns('',
url(r'^chart/corpus/(\d+)/data.csv$', views.send_csv), url(r'^chart/corpus/(\d+)/data.csv$', views.send_csv),
url(r'^corpus/(\d+)/node_link.json$', views.node_link), url(r'^corpus/(\d+)/node_link.json$', views.node_link),
url(r'^corpus/(\d+)/adjacency.json$', views.adjacency), url(r'^corpus/(\d+)/adjacency.json$', views.adjacency),
url(r'^api/tfidf/(\d+)/(\d+)$', views.tfidf),
url(r'^api$', gargantext_web.api.Root), url(r'^api$', gargantext_web.api.Root),
url(r'^api/nodes/(\d+)/children/metadata$', gargantext_web.api.NodesChildrenMetatadata.as_view()), url(r'^api/nodes/(\d+)/children/metadata$', gargantext_web.api.NodesChildrenMetatadata.as_view()),
......
...@@ -7,7 +7,7 @@ from django.template import Context ...@@ -7,7 +7,7 @@ from django.template import Context
from node.models import Language, ResourceType, Resource, \ from node.models import Language, ResourceType, Resource, \
Node, NodeType, Node_Resource, Project, Corpus, \ Node, NodeType, Node_Resource, Project, Corpus, \
Node_Ngram, NodeNgramNgram Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram
from node.admin import CorpusForm, ProjectForm, ResourceForm from node.admin import CorpusForm, ProjectForm, ResourceForm
...@@ -470,7 +470,7 @@ def send_csv(request, corpus_id): ...@@ -470,7 +470,7 @@ def send_csv(request, corpus_id):
# To get the data # To get the data
from gargantext_web.api import JsonHttpResponse from gargantext_web.api import JsonHttpResponse
from analysis.functions import get_cooc from analysis.functions import get_cooc
import json
def node_link(request, corpus_id): def node_link(request, corpus_id):
''' '''
...@@ -488,7 +488,6 @@ def adjacency(request, corpus_id): ...@@ -488,7 +488,6 @@ def adjacency(request, corpus_id):
data = get_cooc(request=request, corpus_id=corpus_id, type="adjacency") data = get_cooc(request=request, corpus_id=corpus_id, type="adjacency")
return JsonHttpResponse(data) return JsonHttpResponse(data)
def graph_it(request): def graph_it(request):
'''The new multimodal graph.''' '''The new multimodal graph.'''
t = get_template('graph-it.html') t = get_template('graph-it.html')
...@@ -528,3 +527,21 @@ def ngrams(request): ...@@ -528,3 +527,21 @@ def ngrams(request):
})) }))
return HttpResponse(html) return HttpResponse(html)
def tfidf(request, corpus_id, ngram_id):
    """
    Takes the IDs of a corpus and of an ngram and returns the list of
    relevant documents in JSON format, ordered by decreasing TF-IDF score.

    Each entry of the list is a dict with the keys `id` (the document node id)
    and `title` (read from the document metadata).

    The corpus and the ngram are fetched with `.get()`, so an unknown ID
    raises the corresponding `DoesNotExist` exception.
    """
    corpus = Node.objects.get(id=corpus_id)
    ngram = Ngram.objects.get(id=ngram_id)

    # select_related loads the document nodes in the same query instead of
    # issuing one extra query per result row when `nodey` is accessed below.
    node_node_ngrams = (
        NodeNodeNgram.objects
        .filter(nodex=corpus, ngram=ngram)
        .select_related('nodey')
        .order_by('-score')
    )

    tfidf_list = [
        dict(id=row.nodey.id, title=row.nodey.metadata['title'])
        for row in node_node_ngrams
    ]

    # NOTE(review): the list is serialised here and then handed to
    # JsonHttpResponse; confirm that helper does not encode it a second time.
    data = json.dumps(tfidf_list)
    return JsonHttpResponse(data)
...@@ -239,7 +239,8 @@ class Node(CTENode): ...@@ -239,7 +239,8 @@ class Node(CTENode):
self.parse_resources() self.parse_resources()
type_document = NodeType.objects.get(name='Document') type_document = NodeType.objects.get(name='Document')
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',]) self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
from analysis.functions import do_tfidf
do_tfidf(self)
class Node_Metadata(models.Model): class Node_Metadata(models.Model):
node = models.ForeignKey(Node) node = models.ForeignKey(Node)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment