Commit 022f1c6f authored by delanoe

[FEAT] Adding TFIDF filter for Explorer.

parent e861900f
from gargantext.models import Node, Ngram, NodeNgram, NodeNodeNgram
from gargantext.constants import NODETYPES
from gargantext.util.db import session, delete, func
from gargantext.util.db_cache import cache, or_
from gargantext.util.validation import validate
from gargantext.util.http import ValidationException, APIView \
                                , get_parameters, JsonHttpResponse

@@ -98,6 +98,84 @@ class NodeListResource(APIView):
        return JsonHttpResponse({'deleted': result.rowcount})
class NodeListHaving(APIView):
    '''
    Gives a list of nodes ranked by a score related to some specific ngrams.
    TODO: implement other options (offset)

    Simple implementation:
    Takes the IDs of a corpus and of ngrams, and returns the list of relevant
    documents in JSON format, ordered by decreasing TFIDF score.
    '''
    def get(self, request, corpus_id):
        parameters = get_parameters(request)
        parameters = validate(parameters, {'score': str, 'ngram_ids': list})

        try:
            ngram_ids = [int(n) for n in parameters['ngram_ids'].split(',')]
        except (ValueError, AttributeError):
            raise ValidationException('"ngram_ids" needs integers separated by commas.')

        limit = 5
        nodes_list = []

        corpus = session.query(Node).filter(Node.id == corpus_id).first()

        tfidf_id = (session.query(Node.id)
                           .filter(Node.typename == "TFIDF-CORPUS",
                                   Node.parent_id == corpus.id)
                           .first()
                   )
        tfidf_id = tfidf_id[0]
        print(tfidf_id)
        # request data
        nodes_query = (session
            .query(Node, func.sum(NodeNodeNgram.score))
            .join(NodeNodeNgram, NodeNodeNgram.node2_id == Node.id)
            .filter(NodeNodeNgram.node1_id == tfidf_id)
            .filter(Node.typename == 'DOCUMENT', Node.parent_id == corpus.id)
            .filter(or_(*[NodeNodeNgram.ngram_id == ngram_id for ngram_id in ngram_ids]))
            .group_by(Node)
            .order_by(func.sum(NodeNodeNgram.score).desc())
            .limit(limit)
        )
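        # Annotation (not part of this commit): the SQLAlchemy query above is
        # roughly equivalent to the SQL sketched below. Table and column names
        # are illustrative, inferred from the model classes rather than checked
        # against the actual schema.
        #
        #   SELECT nodes.*, SUM(nnn.score)
        #   FROM nodes
        #   JOIN nodes_nodes_ngrams AS nnn ON nnn.node2_id = nodes.id
        #   WHERE nnn.node1_id = :tfidf_id
        #     AND nodes.typename = 'DOCUMENT'
        #     AND nodes.parent_id = :corpus_id
        #     AND nnn.ngram_id IN (:ngram_ids)
        #   GROUP BY nodes.id
        #   ORDER BY SUM(nnn.score) DESC
        #   LIMIT 5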
        # print("\n")
        # print("in TFIDF:")
        # print("\tcorpus_id:", corpus_id)

        # convert query result to a list of dicts
        # if nodes_query is None:
        #     print("TFIDF error, just take sums")
        #     nodes_query = (session
        #         .query(Node, func.sum(NodeNgram.weight))
        #         .join(NodeNgram, NodeNgram.node_id == Node.id)
        #         .filter(Node.parent_id == corpus_id)
        #         .filter(Node.typename == 'DOCUMENT')
        #         .filter(or_(*[NodeNgram.ngram_id == ngram_id for ngram_id in ngram_ids]))
        #         .group_by(Node)
        #         .order_by(func.sum(NodeNgram.weight).desc())
        #         .limit(limit)
        #     )
        for node, score in nodes_query:
            print(node, score)
            print("\t corpus:", corpus_id, "\t", node.name)
            node_dict = {
                'id': node.id,
                'score': score,
            }
            for key in ('title', 'publication_date', 'journal', 'authors', 'fields'):
                if key in node.hyperdata:
                    node_dict[key] = node.hyperdata[key]
            nodes_list.append(node_dict)

        return JsonHttpResponse(nodes_list)
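# Usage sketch (annotation, not part of this commit): how the Explorer, or any
# HTTP client, could query the new endpoint. The host and the "/api/" prefix
# are assumptions; only the "nodes/<id>/having" pattern comes from the URL
# configuration added below.
#
#     import requests
#
#     response = requests.get(
#         "http://localhost:8000/api/nodes/42/having",
#         params={"score": "tfidf", "ngram_ids": "7,12,35"},
#     )
#     # each item carries the document id, its summed TFIDF score and,
#     # when present, title / publication_date / journal / authors / fields
#     for doc in response.json():
#         print(doc["id"], doc["score"], doc.get("title"))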
class NodeResource(APIView):
@@ -188,44 +266,4 @@ class CorpusFacet(APIView):
        return (xcounts, total)
class CorpusGraph(APIView):
    '''
    Generate a graph
    '''
    def get(self, request, node_id):
        # check that the node is a corpus
        #   ? faster from cache than: corpus = session.query(Node)...
        corpus = cache.Node[node_id]
        if corpus.typename != 'CORPUS':
            raise ValidationException(
                "Only nodes of type CORPUS can accept facet queries" +
                " (but this node has type %s)..." % corpus.typename
            )
        else:
            self.corpus = corpus

        # check that the hyperfield parameter makes sense
        _facet_available_subfields = [
            'journal', 'publication_year', 'rubrique',
            'language_iso2', 'language_iso3', 'language_name'
        ]
        parameters = get_parameters(request)

        # validate() triggers an info message if subfield not in range
        parameters = validate(parameters, {'type': dict, 'items': {
            'hyperfield': {'type': str, 'range': _facet_available_subfields}
        }})
        subfield = parameters['hyperfield']

        # do_cooc
        # do_distance

        # response
        return JsonHttpResponse({
            'doc_count': total,
            'by': {subfield: xcounts}
        })
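# Note (annotation, not in the diff): `total` and `xcounts` are never assigned
# inside CorpusGraph.get(); the do_cooc / do_distance steps are still TODO, so
# this stub would raise a NameError when building the response if called as
# written.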
@@ -9,6 +9,7 @@ urlpatterns = [
    url(r'^nodes/(\d+)$'        , nodes.NodeResource.as_view()),
    url(r'^nodes/(\d+)/facets$' , nodes.CorpusFacet.as_view()),
    url(r'^nodes/(\d+)/having$' , nodes.NodeListHaving.as_view()),

    # get a list of ngram_ids or ngram_infos by list_id
    #
...
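# Note (annotation, not in the diff): in these Django URL patterns the captured
# group (\d+) is passed to the view as a positional argument, so a request to
# e.g. nodes/42/having reaches NodeListHaving.get() with corpus_id set to the
# captured string "42".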
/srv/gargantext_lib/js/extras_explorerjs.js (diff collapsed)