Commit 2ad6bbf2 authored by Administrator

[BUG FIX] TFIDF computation with sqlalchemy (need optimization with session).

parent 63e9bfb1
...@@ -244,31 +244,31 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150 ...@@ -244,31 +244,31 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
return data return data
def tfidf(corpus, document, ngram): #def tfidf(corpus, document, ngram):
''' # '''
Compute TF-IDF (Term Frequency - Inverse Document Frequency) # Compute TF-IDF (Term Frequency - Inverse Document Frequency)
See: http://en.wikipedia.org/wiki/Tf%E2%80%93idf # See: http://en.wikipedia.org/wiki/Tf%E2%80%93idf
''' # '''
try: # try:
occurences_of_ngram = Node_Ngram.objects.get(node=document, ngram=ngram).weight # occurences_of_ngram = Node_Ngram.objects.get(node=document, ngram=ngram).weight
ngrams_by_document = sum([ x.weight for x in Node_Ngram.objects.filter(node=document)]) # ngrams_by_document = sum([ x.weight for x in Node_Ngram.objects.filter(node=document)])
term_frequency = occurences_of_ngram / ngrams_by_document # term_frequency = occurences_of_ngram / ngrams_by_document
#
xx = Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")).count() # xx = Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")).count()
yy = Node_Ngram.objects.filter(ngram=ngram).count() # filter: ON node.parent=corpus # yy = Node_Ngram.objects.filter(ngram=ngram).count() # filter: ON node.parent=corpus
inverse_document_frequency= log(xx/yy) # inverse_document_frequency= log(xx/yy)
#
# result = tf * idf # # result = tf * idf
result = term_frequency * inverse_document_frequency # result = term_frequency * inverse_document_frequency
except Exception as error: # except Exception as error:
print(error, ngram) # print(error, ngram)
result = 0 # result = 0
return result # return result
from analysis.tfidf import tfidf
def do_tfidf(corpus, reset=True): def do_tfidf(corpus, reset=True):
print("doing tfidf")
with transaction.atomic(): with transaction.atomic():
if reset==True: if reset==True:
NodeNodeNgram.objects.filter(nodex=corpus).delete() NodeNodeNgram.objects.filter(nodex=corpus).delete()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment