Commit f4fceb54 authored by PkSM3

[UPDATE] table by occs finished!

parent 30b37b48
...@@ -80,8 +80,8 @@ urlpatterns = patterns('', ...@@ -80,8 +80,8 @@ urlpatterns = patterns('',
url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX), url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX),
url(r'^tests/paginator/corpus/(\d+)/$', views.newpaginatorJSON), url(r'^tests/paginator/corpus/(\d+)/$', views.newpaginatorJSON),
url(r'^tests/move2trash/$' , views.move_to_trash_multiple ), url(r'^tests/move2trash/$' , views.move_to_trash_multiple ),
url(r'^project/(\d+)/corpus/(\d+)/ngrams/ngrams.json$', samtest.test_ngrams) url(r'^project/(\d+)/corpus/(\d+)/ngrams/ngrams.json$', samtest.test_ngrams),
# url(r'^project/(\d+)/corpus/(\d+)/ngrams$', views.get_ngrams), url(r'^project/(\d+)/corpus/(\d+)/ngrams$', samtest.get_ngrams)
) )
......
This diff is collapsed.
...@@ -174,9 +174,11 @@ def test_ngrams(request , project_id, corpus_id ): ...@@ -174,9 +174,11 @@ def test_ngrams(request , project_id, corpus_id ):
# ## Getting the unique number of OCCS /> ## # ## Getting the unique number of OCCS /> ##
Sum = 0
NgramTFIDF = session.query(NodeNodeNgram).filter( NodeNodeNgram.nodex_id==corpus_id ).all() NgramTFIDF = session.query(NodeNodeNgram).filter( NodeNodeNgram.nodex_id==corpus_id ).all()
for ngram in NgramTFIDF: for ngram in NgramTFIDF:
Ngrams_Scores[ngram.ngram_id]["scores"]["tfidf_sum"] += ngram.score Ngrams_Scores[ngram.ngram_id]["scores"]["tfidf_sum"] += ngram.score
Sum += Ngrams_Scores[ngram.ngram_id]["scores"]["occ_uniq"]
# print( "docid:", ngram.nodey_id , ngram.ngram_id , ngram.score) # print( "docid:", ngram.nodey_id , ngram.ngram_id , ngram.score)
...@@ -195,25 +197,34 @@ def test_ngrams(request , project_id, corpus_id ): ...@@ -195,25 +197,34 @@ def test_ngrams(request , project_id, corpus_id ):
ngrams_ids = Ngrams_Scores.keys() ngrams_ids = Ngrams_Scores.keys()
import math
occs_threshold = math.sqrt(Sum / len(ngrams_ids))
print("excluding ngrams with OCCs <",occs_threshold)
Metrics = { Metrics = {
"ngrams":[], "ngrams":[],
"scores": { "scores": {}
"nb_docs":len(documents),
"nb_ngrams":len(ngrams_ids)
}
} }
query = session.query(Ngram).filter(Ngram.id.in_( ngrams_ids )) query = session.query(Ngram).filter(Ngram.id.in_( ngrams_ids ))
ngrams_data = query.all() ngrams_data = query.all()
for ngram in ngrams_data: for ngram in ngrams_data:
if Ngrams_Scores[ngram.id]["scores"]["occ_uniq"] > occs_threshold:
Ngrams_Scores[ngram.id]["name"] = ngram.terms Ngrams_Scores[ngram.id]["name"] = ngram.terms
Ngrams_Scores[ngram.id]["id"] = ngram.id Ngrams_Scores[ngram.id]["id"] = ngram.id
Metrics["ngrams"].append( Ngrams_Scores[ngram.id] ) Metrics["ngrams"].append( Ngrams_Scores[ngram.id] )
Metrics["scores"] = {
"nb_docs":len(documents),
"orig_nb_ngrams":len(ngrams_ids),
"nb_ngrams":len(Metrics["ngrams"]),
"occs_threshold":occs_threshold
}
return JsonHttpResponse(Metrics) return JsonHttpResponse(Metrics)
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment