Commit 1348a4af authored by PkSM3

[UPDATE] best-method(?) for getting ngram-scores

parent 7eba0f07
...@@ -80,6 +80,7 @@ def create_whitelist(user, corpus_id, size=size, count_min=2, miam_id=None): ...@@ -80,6 +80,7 @@ def create_whitelist(user, corpus_id, size=size, count_min=2, miam_id=None):
%d %d
; ;
""" % (white_list.id, int(corpus_id), int(type_document_id), int(miam_id), count_min, size) """ % (white_list.id, int(corpus_id), int(type_document_id), int(miam_id), count_min, size)
# print("PRINTING QYERY OF WHITELIST:") # print("PRINTING QYERY OF WHITELIST:")
# print(query_whitelist) # print(query_whitelist)
cursor.execute(query_whitelist) cursor.execute(query_whitelist)
......
...@@ -54,9 +54,9 @@ urlpatterns = patterns('', ...@@ -54,9 +54,9 @@ urlpatterns = patterns('',
url(r'^project/(\d+)/corpus/(\d+)/journals/journals.json$', corpus_views.test_journals), url(r'^project/(\d+)/corpus/(\d+)/journals/journals.json$', corpus_views.test_journals),
url(r'^project/(\d+)/corpus/(\d+)/journals', corpus_views.get_journals), url(r'^project/(\d+)/corpus/(\d+)/journals', corpus_views.get_journals),
# # Terms view # # # Terms view
url(r'^project/(\d+)/corpus/(\d+)/terms/ngrams.json$', corpus_views.test_ngrams), # url(r'^project/(\d+)/corpus/(\d+)/terms/ngrams.json$', corpus_views.test_ngrams),
url(r'^project/(\d+)/corpus/(\d+)/terms/?$', corpus_views.get_ngrams), # url(r'^project/(\d+)/corpus/(\d+)/terms/?$', corpus_views.get_ngrams),
# Update corpus # Update corpus
url(r'^project/(\d+)/corpus/(\d+)/(\w+)/update$', views.update_nodes), url(r'^project/(\d+)/corpus/(\d+)/(\w+)/update$', views.update_nodes),
......
...@@ -7,6 +7,8 @@ from django.template.loader import get_template ...@@ -7,6 +7,8 @@ from django.template.loader import get_template
from django.template import Context from django.template import Context
from node import models from node import models
# from node.models import Node_Ngram
from django.db import connection
#from node.models import Language, ResourceType, Resource, \ #from node.models import Language, ResourceType, Resource, \
# Node, NodeType, Node_Resource, Project, Corpus, \ # Node, NodeType, Node_Resource, Project, Corpus, \
# Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram # Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram
...@@ -28,6 +30,7 @@ from collections import defaultdict ...@@ -28,6 +30,7 @@ from collections import defaultdict
from parsing.FileParsers import * from parsing.FileParsers import *
import os import os
import json import json
import math
# SOME FUNCTIONS # SOME FUNCTIONS
...@@ -216,7 +219,8 @@ def test_journals(request , project_id, corpus_id ): ...@@ -216,7 +219,8 @@ def test_journals(request , project_id, corpus_id ):
JournalsDict[journal] += 1 JournalsDict[journal] += 1
return JsonHttpResponse(JournalsDict) return JsonHttpResponse(JournalsDict)
def test_ngrams(request , project_id, corpus_id ): # old function!
def test_ngrams_test(request , project_id, corpus_id ):
results = ["hola" , "mundo"] results = ["hola" , "mundo"]
user_id = request.user.id user_id = request.user.id
...@@ -374,3 +378,102 @@ def test_ngrams(request , project_id, corpus_id ): ...@@ -374,3 +378,102 @@ def test_ngrams(request , project_id, corpus_id ):
return JsonHttpResponse(Metrics) return JsonHttpResponse(Metrics)
def test_ngrams(request, project_id, corpus_id):
    """Return a JSON response holding per-ngram scores for one corpus.

    The corpus' StopList ngrams are excluded.  For every remaining ngram of
    the user's MiamList whose weight reaches the occurrence threshold
    (``min(10, sqrt(average weight))``), the response carries its id, its
    terms and two scores: ``occ_uniq`` (the MiamList weight) and ``tfidf``
    (sum of the corpus' NodeNodeNgram scores divided by ``occ_uniq``).

    Args:
        request: the incoming request; only ``request.user.id`` is read.
        project_id: captured from the URL, not used by this view.
        corpus_id: id of the corpus (anything ``int()`` accepts).

    Returns:
        ``JsonHttpResponse`` wrapping
        ``{"ngrams": [...], "scores": {...summary...}}``.  When the corpus
        has no MiamList node, or the list is empty, the ngram array is empty
        and the reported threshold is 0 instead of the view crashing.
    """
    user_id = request.user.id
    corpus_id = int(corpus_id)

    # [ Get StopList ngram ids ]
    stop_list_id = listIds(user_id=user_id, corpus_id=corpus_id, typeList='StopList')[0][0]
    stop_rows = listNgramIds(corpus_id=corpus_id, list_id=stop_list_id, doc_id=stop_list_id, user_id=user_id)
    stop_ids = {row[0] for row in stop_rows}
    # [ / Get StopList ngram ids ]

    # [ Get Uniq_Occs ]
    miamlist_type_id = cache.NodeType['MiamList'].id
    miamlist = session.query(Node).filter(
        Node.user_id == user_id,
        Node.parent_id == corpus_id,
        Node.type_id == miamlist_type_id
    ).first()

    threshold = 0.0
    occurrences = []
    if miamlist is not None:
        cursor = connection.cursor()
        # Bound parameter instead of string interpolation.
        cursor.execute(
            "SELECT avg(weight) as Average FROM node_node_ngram WHERE node_node_ngram.node_id=%s",
            [miamlist.id]
        )
        avg_result = cursor.fetchone()[0]
        # avg() yields NULL (None) when no row matches; keep threshold at 0 then.
        if avg_result is not None:
            threshold = min(10, math.sqrt(avg_result))
        occurrences = session.query(Node_Ngram).filter(
            Node_Ngram.node_id == miamlist.id,
            Node_Ngram.weight >= threshold
        ).all()
    # [ / Get Uniq_Occs ]

    # [ Initializing the scores with occ_uniq ]
    ngram_scores = {}
    for occurrence in occurrences:
        ngram_id = occurrence.ngram_id
        # The first row seen for an ngram provides its occ_uniq.
        if ngram_id in stop_ids or ngram_id in ngram_scores:
            continue
        ngram_scores[ngram_id] = {
            "scores": {
                "occ_uniq": occurrence.weight,
                "tfidf_sum": 0.0
            }
        }
    # [ / Initializing the scores with occ_uniq ]

    # [ Getting TF-IDF scores (sum per each ngram) ]
    tfidf_rows = session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id == corpus_id).all()
    for row in tfidf_rows:
        # Stop-listed ngrams never entered ngram_scores, so one test suffices.
        if row.ngram_id in ngram_scores:
            ngram_scores[row.ngram_id]["scores"]["tfidf_sum"] += row.score
    # [ / Getting TF-IDF scores ]

    # [ Preparing JSON-Array full of Scores! ]
    Metrics = {
        "ngrams": [],
        "scores": {}
    }
    ngrams_data = []
    if ngram_scores:
        # Skip the lookup entirely rather than issuing an empty IN () query.
        ngrams_data = session.query(Ngram).filter(Ngram.id.in_(list(ngram_scores))).all()
    for ngram in ngrams_data:
        entry = ngram_scores[ngram.id]
        scores = entry["scores"]
        occ_uniq = scores["occ_uniq"]
        tfidf_sum = scores.pop("tfidf_sum")
        entry["name"] = ngram.terms
        entry["id"] = ngram.id
        # A zero weight can pass the filter when the threshold is 0.
        scores["tfidf"] = tfidf_sum / occ_uniq if occ_uniq else 0.0
        Metrics["ngrams"].append(entry)
    Metrics["scores"] = {
        "initial": "occ_uniq",
        "nb_docs": 1,
        "orig_nb_ngrams": 1,
        "nb_ngrams": len(Metrics["ngrams"]),
        "occs_threshold": threshold
    }
    # [ / Preparing JSON-Array full of Scores! ]
    return JsonHttpResponse(Metrics)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment