Commit 2f5136ce authored by delanoe

[FIX] merge with samuel's changes (use nx.isolates instead of the degree <= 1 filter)

parents bd24b8a3 2a9c5311
......@@ -115,8 +115,9 @@ def do_distance(cooc_id):
#edges_to_remove = [ e for e in G.edges_iter() if
degree = G.degree()
nodes_to_remove = [n for n in degree if degree[n] <= 1]
G.remove_nodes_from(nodes_to_remove)
G.remove_nodes_from(nx.isolates(G))
#nodes_to_remove = [n for n in degree if degree[n] <= 1]
#G.remove_nodes_from(nodes_to_remove)
partition = best_partition(G.to_undirected())
print("Density of the graph:", nx.density(G))
return(G,partition,ids,weight)
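For reference, nx.isolates only removes degree-0 nodes, whereas the commented-out filter above also dropped leaves of degree 1. A minimal sketch of the difference, assuming networkx 1.x (where G.degree() returns a dict and nx.isolates() returns a list):

import networkx as nx

G = nx.Graph()
G.add_edges_from([(1, 2), (2, 3), (3, 1), (3, 4)])  # node 4 is a leaf (degree 1)
G.add_node(5)                                        # node 5 is isolated (degree 0)

degree = G.degree()
old_filter = [n for n in degree if degree[n] <= 1]   # [4, 5] : leaves and isolates
new_filter = nx.isolates(G)                          # [5]    : isolates only

G.remove_nodes_from(new_filter)                      # node 4 stays in the graph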
......@@ -150,7 +151,8 @@ def get_cooc(request=None, corpus=None
G.node[node]['pk'] = ids[node]
G.node[node]['label'] = session.query(Ngram.terms).filter(Ngram.id==node).first()
G.node[node]['size'] = weight[ids[node]]
G.node[node]['group'] = partition[node]
G.node[node]['type'] = "NGrams"
G.node[node]['attributes'] = { "clust_default": partition[node]} # new format
# G.add_edge(node, "cluster " + str(partition[node]), weight=3)
except Exception as error:
pass #PrintException()
......
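With the new 'attributes' field, the cluster id from best_partition is stored in a nested dict instead of a top-level 'group' key. Purely illustrative values for one node, assuming the assignments above have run (note that .first() on a single-column query returns a 1-tuple):

G.node[node] == {
    'pk': 42,
    'label': ('climate change',),
    'size': 12.5,
    'type': "NGrams",
    'attributes': {"clust_default": 3},   # new format
}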
......@@ -78,6 +78,8 @@ urlpatterns = patterns('',
url(r'^ngrams$', views.ngrams), # to be removed
url(r'^nodeinfo/(\d+)$', views.nodeinfo), # to be removed ?
url(r'^tfidf/(\d+)/(\w+)$', views_optimized.tfidf),
url(r'^api/corpusintersection/(\w+)$', views_optimized.getCorpusIntersection),
url(r'^api/userportfolio/project/(\d+)/corpuses$', views_optimized.getUserPortfolio),
url(r'^project/(\d+)/corpus/(\d+)/(\w+)/update$', views.update_nodes),
# TODO rest to update corpus and information for progress bar
......
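The two new routes point at the views added below (getCorpusIntersection and getUserPortfolio). A hedged usage sketch with the requests library; the host, project and corpus ids, and ngram ids are made up, and an authenticated session is assumed since both views rely on request.user:

import requests

BASE = "http://localhost:8000"   # assumption: local dev server, routes mounted at the root
s = requests.Session()           # assumption: already carries a logged-in session cookie

# Projects of the current user with their corpora (only corpora holding >= 10 documents)
portfolio = s.get(BASE + "/api/userportfolio/project/1/corpuses").json()

# Average cooccurrence of the visible ngrams against corpus 2 (ids joined with 'a', as in the tfidf route)
resp = s.post(BASE + "/api/corpusintersection/1a2",
              data={"nodeids": "[101, 102, 103]"})   # parsed server-side with ast.literal_eval
print(resp.json())   # {ngram_id: average cooccurrence weight}, or false if nothing matched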
......@@ -566,33 +566,14 @@ def graph(request, project_id, corpus_id, generic=100, specific=100):
project_type_id = cache.NodeType['Project'].id
corpus_type_id = cache.NodeType['Corpus'].id
results = {}
projs = session.query(Node).filter(Node.user_id == user_id,Node.type_id==project_type_id).all()
for i in projs:
# print(i.id , i.name)
if i.id not in results: results[i.id] = {}
results[i.id]["proj_name"] = i.name
results[i.id]["corpuses"] = []
corpuses = session.query(Node).filter(Node.parent_id==i.id , Node.type_id==corpus_type_id).all()
for j in corpuses:
if int(j.id)!=int(corpus_id):
info = { "id":j.id , "name":j.name }
results[i.id]["corpuses"].append(info)
# print("\t",j.id , j.name)
# import pprint
# pprint.pprint(results)
# if specific != None and generic != None :
graphurl = "corpus/"+str(corpus_id)+"/node_link.json"
html = t.render(Context({\
'debug': settings.DEBUG,
'user' : user,\
'date' : date,\
'corpus' : corpus,\
'project' : project,\
'corpusinfo' : results,\
'graphfile' : graphurl,\
}))
......
......@@ -195,7 +195,7 @@ def tfidf(request, corpus_id, ngram_ids):
"""Takes IDs of corpus and ngram and returns list of relevent documents in json format
according to TFIDF score (order is decreasing).
"""
limit=6
limit=5
nodes_list = []
# filter input
ngram_ids = ngram_ids.split('a')
......@@ -219,7 +219,7 @@ def tfidf(request, corpus_id, ngram_ids):
# print("\tcorpus_id:",corpus_id)
# convert query result to a list of dicts
for node, score in nodes_query:
print("\t corpus:",corpus_id,"\t",node.name)
# print("\t corpus:",corpus_id,"\t",node.name)
node_dict = {
'id': node.id,
'score': score,
......@@ -229,6 +229,85 @@ def tfidf(request, corpus_id, ngram_ids):
node_dict[key] = node.hyperdata[key]
nodes_list.append(node_dict)
# print("= = = = = = = = \n")
data = json.dumps(nodes_list)
return JsonHttpResponse(data)
return JsonHttpResponse(nodes_list)
def getCorpusIntersection(request , corpuses_ids):
FinalDict = False
if request.method == 'POST' and "nodeids" in request.POST and len(request.POST["nodeids"])>0:
import ast
node_ids = [int(i) for i in (ast.literal_eval( request.POST["nodeids"] )) ]
# Here are the visible nodes of the initial semantic map.
corpuses_ids = corpuses_ids.split('a')
corpuses_ids = [int(i) for i in corpuses_ids] # corpus[1] will be the corpus to compare
cooc_type_id = cache.NodeType['Cooccurrence'].id
cooc_ids = session.query(Node.id).filter(Node.user_id == request.user.id , Node.parent_id==corpuses_ids[1] , Node.type_id == cooc_type_id ).first()
if cooc_ids is None: # .first() returns None when corpus[1] has no Cooccurrence node
return JsonHttpResponse(FinalDict)
# If corpus[1] has a cooccurrence node id, then let's continue
Cooc_Avg = {}
import networkx as nx
G = nx.Graph() # Undirected graph: direction doesn't matter here, and the cooccurrence matrix should be triangular anyway
ngrams_data1 = session.query(NodeNgramNgram).filter( NodeNgramNgram.node_id==cooc_ids[0], NodeNgramNgram.ngramx_id.in_( node_ids )).all()
for ngram in ngrams_data1: # are there visible nodes on the X axis of the corpus to compare?
G.add_edge( ngram.ngramx_id , ngram.ngramy_id , weight=ngram.score)
ngrams_data2 = session.query(NodeNgramNgram).filter( NodeNgramNgram.node_id==cooc_ids[0], NodeNgramNgram.ngramy_id.in_( node_ids )).all()
for ngram in ngrams_data2: # are there visible nodes on the Y axis of the corpus to compare?
if not G.has_edge(ngram.ngramx_id,ngram.ngramy_id):
G.add_edge( ngram.ngramx_id , ngram.ngramy_id , weight=ngram.score)
for e in G.edges_iter():
n1 = e[0]
n2 = e[1]
# print( G[n1][n2]["weight"] , "\t", n1,",",n2 )
if n1 not in Cooc_Avg:
Cooc_Avg[n1]=0
if n2 not in Cooc_Avg:
Cooc_Avg[n2]=0
Cooc_Avg[n1]+=G[n1][n2]["weight"]
Cooc_Avg[n2]+=G[n1][n2]["weight"]
FinalDict = {}
for node in node_ids:
if node in Cooc_Avg:
FinalDict[node] = Cooc_Avg[node]/G.degree(node)
# Average cooccurrence weight of each ngram that also exists in the cooc matrix of the compared corpus.
return JsonHttpResponse(FinalDict)
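The loop above sums, for each node, the weights of its incident edges and then divides by the node's degree, i.e. the average cooccurrence weight per neighbour. A standalone sketch of the same computation, assuming networkx 1.x for edges_iter():

import networkx as nx

G = nx.Graph()
G.add_edge("a", "b", weight=4)
G.add_edge("a", "c", weight=2)

totals = {}
for n1, n2 in G.edges_iter():
    w = G[n1][n2]["weight"]
    totals[n1] = totals.get(n1, 0) + w
    totals[n2] = totals.get(n2, 0) + w

avg = {n: totals[n] / G.degree(n) for n in totals}
# avg == {"a": 3.0, "b": 4.0, "c": 2.0}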
def getUserPortfolio(request , project_id):
user = request.user
user_id = cache.User[request.user.username].id
project_type_id = cache.NodeType['Project'].id
corpus_type_id = cache.NodeType['Corpus'].id
results = {}
projs = session.query(Node).filter(Node.user_id == user_id,Node.type_id==project_type_id ).all()
for i in projs:
# print (i.id,i.name)
if i.id not in results:
results[i.id] = {}
results[i.id]["proj_name"] = i.name
results[i.id]["corpuses"] = []
corpuses = session.query(Node).filter(Node.parent_id==i.id , Node.type_id==corpus_type_id).all()
for j in corpuses:
doc_count = session.query(func.count(Node.id)).filter(Node.parent_id==j.id).all()[0][0]
if doc_count >= 10:
# print(session.query(Node).filter(Node.id==j.id).first())
info = {
"id":j.id ,
"name":j.name ,
"c":doc_count
}
results[i.id]["corpuses"].append(info)
# print("\t\t",j.id , j.name , doc_count)
if len(results[i.id]["corpuses"])==0:
del results[i.id]
return JsonHttpResponse( results )
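getUserPortfolio returns, per project of the logged-in user, the project name and the corpora that hold at least 10 documents; projects with no qualifying corpus are removed from the result. An illustrative response (ids, names and counts are made up):

{
    "3": {
        "proj_name": "My project",
        "corpuses": [
            {"id": 7, "name": "PubMed sample", "c": 125},
            {"id": 9, "name": "Europresse sample", "c": 48}
        ]
    }
}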
......@@ -5,6 +5,7 @@ from lxml.html import html5parser
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser
from .FileParser import FileParser
from ..NgramsExtractors import *
......@@ -140,7 +141,10 @@ class EuropressFileParser(FileParser):
if format_europresse == 50.2:
# TODO here check the split if needed: 'Brest Ville, mercredi 26 novembre 2014'
try: # 2015-oct-08: exception handling added
text = text.split(', ')[1]
except:
pass
format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
format_date_fr_v2 = re.compile('\s*\w+\s+\d+\s+\w+\s+\d{4}', re.UNICODE)
......@@ -166,12 +170,15 @@ class EuropressFileParser(FileParser):
if test_date_fr is not None or test_date_fr_v2 is not None:
self.localeEncoding = "fr_FR"
locale.setlocale(locale.LC_ALL, localeEncoding)
locale.setlocale(locale.LC_ALL, "fr_FR.utf-8")
if encoding != "utf-8":
text = text.replace('י', 'é')
text = text.replace('ű', 'û')
text = text.replace(' aot ', ' août ')
try:
hyperdata['publication_date'] = dateparser.parse(text, languages=['fr'])
except:
try :
hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
except :
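The dateparser.parse call handles French month and day names without touching the process-wide locale; strptime with '%d %B %Y' stays as a fallback. A minimal sketch, assuming the dateparser package is installed:

import dateparser
from datetime import datetime

text = "mercredi 26 novembre 2014"          # as produced by the split above
date = dateparser.parse(text, languages=['fr'])
if date is None:                            # fall back to the locale-dependent strptime
    date = datetime.strptime(text, '%d %B %Y')
print(date)                                 # e.g. 2014-11-26 00:00:00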
......@@ -278,10 +285,10 @@ class EuropressFileParser(FileParser):
else:
hyperdata['doi'] = "not found"
try:
hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
except:
PrintException()
# try:
# hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
# except:
# PrintException()
hyperdata['length_letters'] = len(hyperdata['abstract'])
......