Commit 9a617780 authored by Administrator

[CLEAN] Removing old TFIDF code.

parent edf9a1d7
import sqlalchemy
from gargantext_web import api
from node import models
from sqlalchemy import create_engine
from sqlalchemy.sql import func
import numpy as np
import collections
from math import log

NodeType      = models.NodeType.sa
NodeNgram     = models.Node_Ngram.sa
NodeNodeNgram = models.NodeNgramNgram.sa
Ngram         = models.Ngram.sa
Node          = models.Node.sa
Corpus        = models.Corpus.sa
def get_session():
    import sqlalchemy.orm
    from django.db import connections
    from sqlalchemy.orm import sessionmaker
    from aldjemy.core import get_engine

    alias = 'default'
    connection = connections[alias]
    engine = create_engine("postgresql+psycopg2://gargantua:C8kdcUrAQy66U@localhost/gargandb",
                           use_native_hstore=True)
    Session = sessionmaker(bind=engine)
    return Session()
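# Note: the django.db connections and aldjemy get_engine imports above end up
# unused; the session is bound to a hand-made engine with hard-coded credentials.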
session = get_session()
type_doc = session.query(NodeType).filter(NodeType.name == "Document").first()
def tfidf(corpus, document, ngram):
    '''
    Compute TF-IDF (Term Frequency - Inverse Document Frequency).
    See: http://en.wikipedia.org/wiki/Tf%E2%80%93idf
    '''
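    # A reading of the queries below, for illustration:
    #   tf    = occurrences of the ngram in this document / NodeNgram rows of the document
    #   idf   = log(documents in the corpus / corpus documents containing the ngram)
    #   tfidf = tf * idf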
    try:
        #occurences_of_ngram = Node_Ngram.objects.get(node=document, ngram=ngram).weight
        occurrences_of_ngram = session.query(NodeNgram)\
            .filter(NodeNgram.node_id == document.id)\
            .filter(NodeNgram.ngram_id == ngram.id)\
            .first().weight
        #return(type(occurrences_of_ngram))

        #ngrams_by_document = np.sum([ x.weight for x in Node_Ngram.objects.filter(node=document)])
        ngrams_by_document = session.query(NodeNgram)\
            .filter(NodeNgram.node_id == document.id)\
            .count()
        term_frequency = occurrences_of_ngram / ngrams_by_document
        #return term_frequency

        #xx = Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")).count()
        xx = session.query(Node)\
            .filter(Node.parent_id == corpus.id)\
            .filter(Node.type_id == type_doc.id)\
            .count()

        #yy = Node_Ngram.objects.filter(ngram=ngram).count() # filter: ON node.parent=corpus
        yy = session.query(NodeNgram)\
            .join(Node, NodeNgram.node_id == Node.id)\
            .filter(Node.parent_id == corpus.id)\
            .filter(NodeNgram.ngram_id == ngram.id)\
            .count()

        # print("\t\t\t", "occs:", occurrences_of_ngram, " || ngramsbydoc:", ngrams_by_document, " || TF = occ/ngramsbydoc:", term_frequency, " |||||| x:", xx, " || y:", yy, " || IDF = log(x/y):", log(xx/yy))
        inverse_document_frequency = log(xx / yy)

        # result = tf * idf
        result = term_frequency * inverse_document_frequency
        return result
    except Exception as error:
        print(error)
#corpus = session.query(Node).get(int(102750))
#ngram = session.query(Ngram).get(10885)
##ngram = session.query(Ngram).filter(Ngram.terms == "bayer").first()
#type_doc = session.query(NodeType).filter(NodeType.name == "Document").first()
#doc_id = session.query(NodeNgram.node, NodeNgram.node_id)\
# .join(Node, Node.id == NodeNgram.node_id)\
# .filter(NodeNgram.ngram == ngram)\
# .filter(Node.type_id == type_doc.id)\
# .first()
#document = session.query(Node).get(doc_id[1])
#
#result = tfidf(corpus,document, ngram)
#print(result)
#
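# A minimal sketch of the same TF-IDF arithmetic without the database;
# every number below is hypothetical, for illustration only.
def _tfidf_example():
    occurrences_of_ngram = 3.0   # hypothetical: ngram count in one document
    ngrams_by_document = 150     # hypothetical: NodeNgram rows for that document
    xx = 1000                    # hypothetical: documents in the corpus
    yy = 40                      # hypothetical: corpus documents containing the ngram
    tf = occurrences_of_ngram / ngrams_by_document  # 0.02
    idf = log(xx / yy)                              # log(25) ≈ 3.2189
    return tf * idf                                 # ≈ 0.0644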
@@ -701,73 +701,3 @@ def nodeinfo(request, node_id):
    return HttpResponse(html)
def tfidf2(request, corpus_id, ngram_id):
    """
    Takes the IDs of a corpus and of ngrams, and returns the list of relevant
    documents in JSON format, ordered by decreasing TFIDF score.
    """
    # It will receive something like: api/tfidf/corpus_id/NGRAM1aNGRAM2aNGRAM3aNGRAM4...
    docsids = ngram_id.split("a")

    tfidf_list = []
    for i in docsids:
        pub = Node.objects.get(id=i)
        finalpub = {}
        finalpub["id"] = pub.id
        pubhyperdata = pub.hyperdata
        if "title" in pubhyperdata: finalpub["title"] = pubhyperdata['title']
        if "publication_date" in pubhyperdata: finalpub["publication_date"] = pubhyperdata['publication_date']
        if "journal" in pubhyperdata: finalpub["journal"] = pubhyperdata['journal']
        if "authors" in pubhyperdata: finalpub["authors"] = pubhyperdata['authors']
        if "fields" in pubhyperdata: finalpub["fields"] = pubhyperdata['fields']
        tfidf_list.append(finalpub)  # build a dictionary with only the available attributes
        if len(tfidf_list) == 6: break  # max 6 papers

    data = json.dumps(tfidf_list)
    # data = ["hola","mundo"]
    return JsonHttpResponse(data)
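# Illustration only, with hypothetical IDs: the "a"-separated URL segment
# parses as
#   "102750a10885a10886".split("a")  ->  ['102750', '10885', '10886']
# which works because the IDs are purely numeric.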
def tfidf(request, corpus_id, ngram_id):
    """
    Takes the IDs of a corpus and of ngrams, and returns the list of relevant
    documents in JSON format, ordered by decreasing TFIDF score.
    """
    # It will receive something like: api/tfidf/corpus_id/NGRAM1aNGRAM2aNGRAM3aNGRAM4...
    ngramsids = ngram_id.split("a")
    corpus = Node.objects.get(id=corpus_id)
    ngram = Ngram.objects.get(id=ngramsids[0])  # not used

    print("********web/views.tfidf*******")
    print("first ngram:")
    print(ngram)

    node_node_ngrams = NodeNodeNgram.objects.filter(nodex=corpus, ngram__in=ngramsids).order_by('-score')
    # print(node_node_ngrams)

    goodDict = {}
    for x in node_node_ngrams:
        goodDict[x.nodey.id] = x.nodey

    # print("imma here")
    # print("arguments... nodes ids:")
    # print(ngramsids)
    # print("with tfidf:")
    # print(node_node_ngrams)
    # print("corpus:")
    # print(NodeNodeNgram.objects.filter(nodex=corpus))

    tfidf_list = []
    for x in goodDict:
        pub = goodDict[x]  # the unique publication for this node id
        finalpub = {}
        finalpub["id"] = pub.id
        if "title" in pub.hyperdata: finalpub["title"] = pub.hyperdata['title']
        if "publication_date" in pub.hyperdata: finalpub["publication_date"] = pub.hyperdata['publication_date']
        if "journal" in pub.hyperdata: finalpub["journal"] = pub.hyperdata['journal']
        if "authors" in pub.hyperdata: finalpub["authors"] = pub.hyperdata['authors']
        if "fields" in pub.hyperdata: finalpub["fields"] = pub.hyperdata['fields']
        tfidf_list.append(finalpub)  # build a dictionary with only the available attributes
        if len(tfidf_list) == 6: break  # max 6 papers

    data = json.dumps(tfidf_list)
    return JsonHttpResponse(data)
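# Illustration only: the shape of the JSON that both views return; every value
# below is hypothetical.
#
#   [{"id": 102750,
#     "title": "A hypothetical paper title",
#     "publication_date": "2014-01-01",
#     "journal": "Hypothetical Journal",
#     "authors": "Doe, J.",
#     "fields": "biology"}]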