Commit a4606499 authored by PkSM3

[UPDATE] just delete ngrams

parent 5a08c859
@@ -261,7 +261,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
     #edges_to_remove = [ e for e in G.edges_iter() if
     degree = G.degree()
-    nodes_to_remove = [n for n in degree if degree[n] <= 1]
+    nodes_to_remove = [n for n in degree if degree[n] ==0]
     G.remove_nodes_from(nodes_to_remove)
     uG = G.to_undirected()
     partition = best_partition(uG)
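The pruning change above relaxes the cutoff: only fully disconnected nodes are dropped before community detection, where previously leaf nodes (degree 1) were removed as well. A minimal sketch of the behaviour, assuming networkx and the python-louvain package (which provides `best_partition`), as in the diff:

```python
# Sketch only: networkx and python-louvain are assumed, as in the diff.
import networkx as nx
from community import best_partition  # python-louvain

G = nx.Graph()
G.add_edges_from([("a", "b"), ("b", "c")])
G.add_node("isolated")                     # degree 0

degree = dict(G.degree())                  # dict(...) for newer networkx
# Old rule: [n for n in degree if degree[n] <= 1] would also drop "a" and "c".
nodes_to_remove = [n for n in degree if degree[n] == 0]
G.remove_nodes_from(nodes_to_remove)       # only "isolated" goes

partition = best_partition(G.to_undirected())  # {node: community id}
print(partition)
```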
@@ -44,11 +44,11 @@ def apply_workflow(corpus_id):
     extract_ngrams(corpus, ['title', 'abstract'])
     update_processing(corpus, 3)
-    compute_tfidf(corpus)
-    compute_tfidf_global(corpus, lang='en')
+    # compute_tfidf(corpus)
+    # compute_tfidf_global(corpus, lang='en')
 
-    computeCvalue(corpus)
-    groupNgrams(corpus)
+    # computeCvalue(corpus)
+    # groupNgrams(corpus)
     ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
     update_processing(corpus, 0)
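With the tf-idf, C-value and grouping steps commented out, the workflow reduces to ngram extraction followed by the miam-list sync. A hedged reconstruction of the resulting function; the called helpers are the project's own, but the wrapper and status codes below are inferred from the diff:

```python
# Illustrative only: the helpers exist in the project, but this wrapper
# and the meaning of the status codes are inferred, not confirmed.
def apply_workflow(corpus_id):
    corpus = session.query(Node).get(corpus_id)   # assumed lookup
    extract_ngrams(corpus, ['title', 'abstract'])
    update_processing(corpus, 3)                  # 3: still processing (assumed)
    # compute_tfidf(corpus)                       # disabled by this commit
    # compute_tfidf_global(corpus, lang='en')
    # computeCvalue(corpus)
    # groupNgrams(corpus)
    ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
    update_processing(corpus, 0)                  # 0: done (assumed)
```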
@@ -51,8 +51,8 @@ urlpatterns = patterns('',
     url(r'^project/(\d+)/corpus/(\d+)/documents/?$', views.corpus),
     # Journals view
-    url(r'^project/(\d+)/corpus/(\d+)/journals/journals.json$', corpus_views.test_journals),
-    url(r'^project/(\d+)/corpus/(\d+)/journals', corpus_views.get_journals),
+    url(r'^project/(\d+)/corpus/(\d+)/journals/journals.json$', samtest.get_journals_json),
+    url(r'^project/(\d+)/corpus/(\d+)/journals$', samtest.get_journals),
     # # # Terms view
     # url(r'^project/(\d+)/corpus/(\d+)/terms/ngrams.json$', corpus_views.test_ngrams),
@@ -108,9 +108,9 @@ urlpatterns = patterns('',
     url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX),
     url(r'^tests/paginator/corpus/(\d+)/$', views.newpaginatorJSON),
     url(r'^tests/move2trash/$' , views.move_to_trash_multiple ),
-    url(r'^corpus/(\d+)/document/(\d+)/testpage$', samtest.test_test),
-    url(r'^project/(\d+)/corpus/(\d+)/terms/ngrams.json$', samtest.test_ngrams),
-    url(r'^project/(\d+)/corpus/(\d+)/terms', samtest.get_ngrams)
+    url(r'^project/(\d+)/corpus/(\d+)/terms/ngrams.json$', samtest.get_ngrams_json),
+    url(r'^project/(\d+)/corpus/(\d+)/terms$', samtest.get_ngrams),
+    url(r'^project/(\d+)/corpus/(\d+)/stop_list.json$', samtest.get_stoplist)
 )
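Note the trailing `$` added to the `journals` and `terms` patterns: without the anchor, those routes also match the longer `*.json` paths, so resolution depends on declaration order. A quick check with plain `re`:

```python
# Demonstrates why the new "$" anchors matter for these URL patterns.
import re

unanchored = r'^project/(\d+)/corpus/(\d+)/terms'
anchored = r'^project/(\d+)/corpus/(\d+)/terms$'
path = 'project/1/corpus/2/terms/ngrams.json'

print(bool(re.match(unanchored, path)))  # True: would shadow ngrams.json
print(bool(re.match(anchored, path)))    # False: matches only .../terms
```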
@@ -31,21 +31,21 @@ var action1 = {
     "name": "Delete",
     "color":"red"
 }
-var action2 = {
-    "id":"to_keep",
-    "name": "Keep",
-    "color":"green"
-}
-var action3 = {
-    "id":"to_group",
-    "name": "Group",
-    "color":"blue"
-}
+// var action2 = {
+//     "id":"to_keep",
+//     "name": "Keep",
+//     "color":"green"
+// }
+// var action3 = {
+//     "id":"to_group",
+//     "name": "Group",
+//     "color":"blue"
+// }
 
 PossibleActions.push(action1)
-PossibleActions.push(action2)
-PossibleActions.push(action3)
+// PossibleActions.push(action2)
+// PossibleActions.push(action3)
 
 var FlagsBuffer = {}
 for(var i in PossibleActions) {
@@ -655,6 +655,53 @@ function Main_test( data , initial) {
     return "OK"
 }
 
+function SearchFilters( elem ) {
+    var MODE = elem.value;
+    if( MODE == "filter_all") {
+        var result = Main_test(AjaxRecords , MODE)
+        console.log( result )
+        return ;
+    }
+    // if( MODE == "filter_stoplist") {
+    // }
+    // if( MODE == "filter_miamlist") {
+    // }
+    //     var getDupl_API = "/api/nodes/"+url_mainIDs["corpus"]+"/children/duplicates?keys=title&limit=9999"
+    //     $.ajax({
+    //         url: getDupl_API,
+    //         success: function(data) {
+    //             bisarray = data.data
+    //             for(var i in bisarray) {
+    //                 titlebis = bisarray[i].values
+    //                 BIS_dict[titlebis[0]] = true;
+    //             }
+    //             var Duplicates = []
+    //             for(var r in AjaxRecords) {
+    //                 if ( BIS_dict[AjaxRecords[r].title] )
+    //                     Duplicates.push( AjaxRecords[r] )
+    //             }
+    //             var result = Main_test(Duplicates , MODE)
+    //             console.log( result )
+    //             MyTable.data('dynatable').sorts.clear();
+    //             MyTable.data('dynatable').sorts.add('title', 1) // 1=ASCENDING,
+    //             MyTable.data('dynatable').process();
+    //         }
+    //     });
+}
+
 console.log(window.location.href+"/ngrams.json")
 $.ajax({
     url: window.location.href+"/ngrams.json",
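The table code above fetches `ngrams.json` relative to the current page, which now resolves to `samtest.get_ngrams_json`. A hypothetical smoke test of the renamed endpoints from Python; the base URL and ids are placeholders and authentication is omitted:

```python
# Hypothetical check of the renamed endpoints; host, project and corpus
# ids are placeholders, and login/session handling is omitted.
import requests

base = "http://localhost:8000/project/1/corpus/2"
ngrams = requests.get(base + "/terms/ngrams.json").json()
stops = requests.get(base + "/stop_list.json").json()
print(type(ngrams), len(stops))
```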
@@ -180,6 +180,19 @@ input[type=radio]:checked + label {
         </div>
     </div>
     <div id="filter_search" style="visibility:hidden">
+        <select id="example-single-optgroups" onchange="SearchFilters(this);">
+            <!-- <optgroup label=""> -->
+                <option id="filter_all" value="filter_all">All</option>
+                <!-- <option id="filter_title" value="filter_title">Title</option> -->
+                <!-- <option id="filter_date" value="filter_date">Date</option> -->
+            <!-- </optgroup> -->
+            <!-- <optgroup label="Duplicates"> -->
+                <!-- <option value="filter_doi">By DOI</option> -->
+                <option id="filter_stoplist" value="filter_dupl-titles">StopList</option>
+                <option id="filter_miamlist" value="filter_dupl-titles">MiamList</option>
+            <!-- </optgroup> -->
+        </select>
     <div id="savemodal" class="modal fade">
         <div class="modal-dialog">
@@ -138,30 +138,29 @@ def get_ngrams(request , project_id , corpus_id ):
     return HttpResponse(html)
 
-def test_test(request , corpus_id , doc_id):
+def get_stoplist(request , corpus_id , doc_id):
     """Get All for a doc id"""
     user_id = request.user.id
     whitelist_type_id = cache.NodeType['WhiteList'].id
     document_type_id = cache.NodeType['Document'].id
     miam_id = listIds(typeList='MiamList', user_id=request.user.id, corpus_id=corpus_id)[0][0]
     count_min = 2
     size = 1000
     corpus_id = int(corpus_id)
     doc_id = int(doc_id)
     lists = dict()
     for list_type in ['StopList']:
         list_id = list()
         list_id = listIds(user_id=request.user.id, corpus_id=int(corpus_id), typeList=list_type)
         lists["%s" % list_id[0][0]] = list_type
         print(list_id[0][0])
     # # # ngrams of list_id of corpus_id:
     # commeca = "StopList"
     doc_ngram_list = listNgramIds(corpus_id=corpus_id, list_id=list_id[0][0], doc_id=list_id[0][0], user_id=request.user.id)
-    to_del = {}
+    StopList = {}
     for n in doc_ngram_list:
-        to_del[ n[0] ] = True
-        print( to_del.keys() )
+        StopList[ n[0] ] = True
-    results = [ "hola" , "mundo" ]
-    return JsonHttpResponse(results)
+    results = StopList.keys() #[ "hola" , "mundo" ]
+    return JsonHttpResponse(StopList)
 
 def get_journals(request , project_id , corpus_id ):
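After the rename, `/stop_list.json` answers with the stoplist as a JSON object keyed by ngram id (the recomputed `results` variable is left unused). A sketch of the shape a client can expect, with illustrative ids borrowed from the comments in the deleted function below:

```python
# Illustrative response shape for get_stoplist: {ngram_id: True, ...}.
stoplist = {"13099": True, "7492": True}   # example ids only
stop_ids = {int(k) for k in stoplist}      # JSON object keys arrive as strings
print(7492 in stop_ids)                    # True
```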
@@ -202,7 +201,7 @@ def get_journals(request , project_id , corpus_id ):
     return HttpResponse(html)
 
-def test_journals(request , project_id, corpus_id ):
+def get_journals_json(request , project_id, corpus_id ):
     results = ["hola" , "mundo"]
     JournalsDict = {}
@@ -218,171 +217,7 @@ def test_journals(request , project_id, corpus_id ):
             JournalsDict[journal] += 1
     return JsonHttpResponse(JournalsDict)
 
-# old function!
-def test_ngrams_test(request , project_id, corpus_id ):
-    results = ["hola" , "mundo"]
-    user_id = request.user.id
-    whitelist_type_id = cache.NodeType['WhiteList'].id
-    document_type_id = cache.NodeType['Document'].id
-    corpus_id = int(corpus_id)
-    lists = dict()
-    for list_type in ['StopList']:
-        list_id = list()
-        list_id = listIds(user_id=request.user.id, corpus_id=int(corpus_id), typeList=list_type)
-        lists["%s" % list_id[0][0]] = list_type
-    doc_ngram_list = listNgramIds(corpus_id=corpus_id, list_id=list_id[0][0], doc_id=list_id[0][0], user_id=request.user.id)
-    StopList = {}
-    for n in doc_ngram_list:
-        StopList[ n[0] ] = True
-
-    # # 13099 clinical benefits
-    # # 7492 recent data
-    # # 14279 brain development
-    # # 50681 possible cause
-    # # 47111 psychological symptoms
-    # # 3944 common form
-    # ngram_of_interest = 14279
-    # documents = session.query(Node).filter(Node.user_id == user_id , Node.parent_id==corpus_id , Node.type_id == document_type_id ).all()
-    # to_print = []
-    # for doc in documents:
-    #     NgramOccs = session.query(Node_Ngram).filter( Node_Ngram.node_id==doc.id).all()
-    #     # print( len(NgramOccs) )
-    #     for ngram in NgramOccs:
-    #         if ngram.ngram_id == ngram_of_interest:
-    #             to_print.append( [doc.id,doc.name] )
-    #             break
-    # if len(to_print)>0:
-    #     for doc in to_print:
-    #         doc_id = doc[0]
-    #         doc_name = doc[1]
-    #         print("doc_id:",doc_id)
-    #         NgramOccs = session.query(Node_Ngram).filter( Node_Ngram.node_id==doc_id).all()
-    #         for ngram in NgramOccs:
-    #             if ngram.ngram_id == ngram_of_interest:
-    #                 print("\t" , ngram.ngram_id , "\t" , ngram.weight )
-    # print (" - - - - -- - - - ")
-    # print("Calculation using the DB:")
-    # white_list = session.query(Node).filter( Node.parent_id==corpus_id , Node.type_id==whitelist_type_id).first()
-    # NgramOccs = session.query(Node_Ngram).filter( Node_Ngram.node_id==white_list.id).all()
-    # for ngram in NgramOccs:
-    #     if ngram.ngram_id == ngram_of_interest:
-    #         print( ngram.weight, "\t" , ngram.ngram_id)
-    # print( "= = = = = = = = == = = ")
-    # NgramTFIDF = session.query(NodeNodeNgram).filter( NodeNodeNgram.nodex_id==corpus_id ).all()
-    # for ngram in NgramTFIDF:
-    #     print( "docid:", ngram.nodey_id , ngram.ngram_id , ngram.score)
-
-    Ngrams_Scores = {}
-    ## < Getting the Effective nro de OCCS ##
-    documents = (session.query(Node).filter(
-        Node.user_id == user_id
-        , Node.parent_id==corpus_id
-        , Node.type_id == document_type_id ).all()
-    )
-    for doc in documents:
-        NgramOccs = session.query(Node_Ngram).filter( Node_Ngram.node_id==doc.id).all()
-        for ngram in NgramOccs:
-            if ngram.ngram_id not in StopList:
-                if ngram.ngram_id not in Ngrams_Scores:
-                    Ngrams_Scores[ngram.ngram_id] = {}
-                    Ngrams_Scores[ngram.ngram_id]["scores"] = {
-                        "occ_sum": 0.0,
-                        "occ_uniq": 0.0,
-                        "tfidf_sum": 0.0
-                    }
-                Ngrams_Scores[ngram.ngram_id]["scores"]["occ_sum"]+=ngram.weight
-                Ngrams_Scores[ngram.ngram_id]["scores"]["occ_uniq"]+=1
-                # print("\t" , ngram.ngram_id , "\t" , ngram.weight )
-    ## Getting the Effective nro de OCCS / >##
-
-    # # CA MARCHE PAS POUR TOUT LES NGRAMS!!
-    # ## < Getting the unique number of OCCS ##
-    # summ1 = len(Ngrams_Scores.keys())
-    # white_list = session.query(Node).filter( Node.parent_id==corpus_id , Node.type_id==whitelist_type_id).first()# get whitelist id from corpus
-    # NgramOccs = session.query(Node_Ngram).filter( Node_Ngram.node_id==white_list.id).all()
-    # summ2 = 0
-    # for ngram in NgramOccs:
-    #     Ngrams_Scores[ngram.ngram_id]["occ_uniq"] = ngram.weight
-    #     summ2+=1
-    #     # print("\t" , ngram.ngram_id , "\t" , ngram.weight )
-    # print (" - - -- - - - - - ")
-    # print ("Sum numero 01:",summ1)
-    # print ("Sum numero 02:",summ2)
-    # ## Getting the unique number of OCCS /> ##
-
-    Sum = 0
-    NgramTFIDF = session.query(NodeNodeNgram).filter( NodeNodeNgram.nodex_id==corpus_id ).all()
-    for ngram in NgramTFIDF:
-        if ngram.ngram_id not in StopList:
-            Ngrams_Scores[ngram.ngram_id]["scores"]["tfidf_sum"] += ngram.score
-            Sum += Ngrams_Scores[ngram.ngram_id]["scores"]["occ_uniq"]
-            # print( "docid:", ngram.nodey_id , ngram.ngram_id , ngram.score)
-
-    # import pprint
-    # pprint.pprint( Ngrams_Scores )
-    # # select * from node_nodenodengram where ngram_id=14279;
-    # NodeNodeNgram
-    #     nodex_id = real corpus id
-    #     nodey_id = document id
-    #     ngram_id = duh
-    #     id | nodex_id | nodey_id | ngram_id | score
-
-    ngrams_ids = Ngrams_Scores.keys()
-    import math
-    occs_threshold = min ( 10 , math.sqrt(Sum / len(ngrams_ids)) )
-    Metrics = {
-        "ngrams":[],
-        "scores": {}
-    }
-    query = session.query(Ngram).filter(Ngram.id.in_( ngrams_ids ))
-    ngrams_data = query.all()
-    for ngram in ngrams_data:
-        if ngram.id not in StopList:
-            occ_uniq = Ngrams_Scores[ngram.id]["scores"]["occ_uniq"]
-            if occ_uniq > occs_threshold:
-                Ngrams_Scores[ngram.id]["name"] = ngram.terms
-                Ngrams_Scores[ngram.id]["id"] = ngram.id
-                Ngrams_Scores[ngram.id]["scores"]["tfidf"] = Ngrams_Scores[ngram.id]["scores"]["tfidf_sum"] / occ_uniq
-                del Ngrams_Scores[ngram.id]["scores"]["tfidf_sum"]
-                Metrics["ngrams"].append( Ngrams_Scores[ngram.id] )
-    Metrics["scores"] = {
-        "initial":"occ_uniq",
-        "nb_docs":len(documents),
-        "orig_nb_ngrams":len(ngrams_ids),
-        "nb_ngrams":len(Metrics["ngrams"]),
-        "occs_threshold":occs_threshold
-    }
-
-    return JsonHttpResponse(Metrics)
-
-def test_ngrams(request , project_id, corpus_id ):
+def get_ngrams_json(request , project_id, corpus_id ):
     results = ["holaaaa" , "mundo"]
     user_id = request.user.id
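Most of the deletion is the old `test_ngrams_test`, which scored ngrams by summed and per-document occurrences plus tf-idf, then kept only terms whose unique-occurrence count exceeded `min(10, sqrt(Sum / n_ngrams))`. A condensed, ORM-free restatement of that scoring rule; the function name and input shapes are illustrative, not the project's API:

```python
# Condensed sketch of the deleted scoring logic; inputs are illustrative.
import math

def score_ngrams(doc_occurrences, tfidf_rows, stoplist):
    """doc_occurrences: (ngram_id, weight) pairs over all documents;
    tfidf_rows: (ngram_id, score) pairs; stoplist: set of ngram ids."""
    scores = {}
    for ngram_id, weight in doc_occurrences:
        if ngram_id in stoplist:
            continue
        s = scores.setdefault(ngram_id,
                              {"occ_sum": 0.0, "occ_uniq": 0.0, "tfidf_sum": 0.0})
        s["occ_sum"] += weight        # total occurrences
        s["occ_uniq"] += 1            # documents containing the ngram
    total = 0.0
    for ngram_id, score in tfidf_rows:
        if ngram_id in scores:
            scores[ngram_id]["tfidf_sum"] += score
            total += scores[ngram_id]["occ_uniq"]
    threshold = min(10, math.sqrt(total / max(len(scores), 1)))
    kept = []
    for ngram_id, s in scores.items():
        if s["occ_uniq"] > threshold:
            s["tfidf"] = s["tfidf_sum"] / s["occ_uniq"]   # mean tf-idf
            kept.append({"id": ngram_id, "scores": s})
    return kept, threshold
```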