Commit 2f5136ce authored by delanoe

[FIX] merge with Samuel's changes (use of nx.isolates for degree <= 1)

parents bd24b8a3 2a9c5311
@@ -115,8 +115,9 @@ def do_distance(cooc_id):
     #edges_to_remove = [ e for e in G.edges_iter() if
     degree = G.degree()
-    nodes_to_remove = [n for n in degree if degree[n] <= 1]
-    G.remove_nodes_from(nodes_to_remove)
+    G.remove_nodes_from(nx.isolates(G))
+    #nodes_to_remove = [n for n in degree if degree[n] <= 1]
+    #G.remove_nodes_from(nodes_to_remove)
     partition = best_partition(G.to_undirected())
     print("Density of the graph:", nx.density(G))
     return(G,partition,ids,weight)
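
Note on the hunk above: nx.isolates(G) only yields nodes of degree 0, so it is not a strict replacement for the old degree <= 1 filter (which the new code keeps around as comments). A minimal sketch of the two variants, assuming NetworkX 1.x as used elsewhere in this diff (degree() returning a dict, isolates() returning a list):

import networkx as nx

G = nx.Graph()
G.add_edges_from([(1, 2), (2, 3)])
G.add_node(4)                                      # isolated node, degree 0

# Old approach: drop every node of degree <= 1 (here nodes 1, 3 and 4).
degree = G.degree()                                # dict in NetworkX 1.x
nodes_to_remove = [n for n in degree if degree[n] <= 1]

# New approach: drop only isolated nodes, i.e. degree == 0 (here node 4).
G.remove_nodes_from(nx.isolates(G))                # wrap in list() on NetworkX 2.x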
@@ -150,7 +151,8 @@ def get_cooc(request=None, corpus=None
         G.node[node]['pk'] = ids[node]
         G.node[node]['label'] = session.query(Ngram.terms).filter(Ngram.id==node).first()
         G.node[node]['size'] = weight[ids[node]]
-        G.node[node]['group'] = partition[node]
+        G.node[node]['type'] = "NGrams"
+        G.node[node]['attributes'] = { "clust_default": partition[node]} # new format
         # G.add_edge(node, "cluster " + str(partition[node]), weight=3)
     except Exception as error:
         pass #PrintException()
...
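
For reference, a sketch of how a graph node could look in the exported corpus/<id>/node_link.json once the new attribute format above is in place; the values below are made up, only the keys come from the diff:

node = {
    "pk": 1234,                          # Ngram id
    "label": "climate change",           # Ngram.terms
    "size": 42,                          # weight[ids[node]]
    "type": "NGrams",
    "attributes": {"clust_default": 3},  # community index from best_partition
}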
@@ -78,6 +78,8 @@ urlpatterns = patterns('',
     url(r'^ngrams$', views.ngrams), # to be removed
     url(r'^nodeinfo/(\d+)$', views.nodeinfo), # to be removed ?
     url(r'^tfidf/(\d+)/(\w+)$', views_optimized.tfidf),
+    url(r'^api/corpusintersection/(\w+)$', views_optimized.getCorpusIntersection),
+    url(r'^api/userportfolio/project/(\d+)/corpuses$', views_optimized.getUserPortfolio),
     url(r'^project/(\d+)/corpus/(\d+)/(\w+)/update$', views.update_nodes),
     # TODO rest to update corpus and information for progress bar
...
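
Both new routes follow the same conventions as the existing ones in views_optimized.py: ids concatenated with the letter 'a' in the URL (cf. ngram_ids.split('a') and corpuses_ids.split('a')) and, for the intersection, the visible node ids POSTed as a Python list literal. A hypothetical client-side sketch, assuming an authenticated session; host and ids are made up:

import requests

base = "http://localhost:8000"

# Projects and corpora for the current user (only corpora with at least 10 documents).
requests.get(base + "/api/userportfolio/project/5/corpuses")

# Average co-occurrence of the visible ngrams inside corpus 200; corpus ids joined by 'a'.
requests.post(base + "/api/corpusintersection/100a200", data={"nodeids": "[1, 2, 3]"})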
@@ -566,33 +566,14 @@ def graph(request, project_id, corpus_id, generic=100, specific=100):
     project_type_id = cache.NodeType['Project'].id
     corpus_type_id = cache.NodeType['Corpus'].id
-    results = {}
-    projs = session.query(Node).filter(Node.user_id == user_id,Node.type_id==project_type_id).all()
-    for i in projs:
-        # print(i.id , i.name)
-        if i.id not in results: results[i.id] = {}
-        results[i.id]["proj_name"] = i.name
-        results[i.id]["corpuses"] = []
-        corpuses = session.query(Node).filter(Node.parent_id==i.id , Node.type_id==corpus_type_id).all()
-        for j in corpuses:
-            if int(j.id)!=int(corpus_id):
-                info = { "id":j.id , "name":j.name }
-                results[i.id]["corpuses"].append(info)
-            # print("\t",j.id , j.name)
-    # import pprint
-    # pprint.pprint(results)
-    # if specific != None and generic != None :
     graphurl = "corpus/"+str(corpus_id)+"/node_link.json"
     html = t.render(Context({\
             'debug': settings.DEBUG,
             'user' : user,\
             'date' : date,\
             'corpus' : corpus,\
             'project' : project,\
-            'corpusinfo' : results,\
             'graphfile' : graphurl,\
             }))
...
@@ -195,7 +195,7 @@ def tfidf(request, corpus_id, ngram_ids):
     """Takes IDs of corpus and ngram and returns list of relevent documents in json format
     according to TFIDF score (order is decreasing).
     """
-    limit=6
+    limit=5
     nodes_list = []
     # filter input
     ngram_ids = ngram_ids.split('a')
@@ -219,7 +219,7 @@ def tfidf(request, corpus_id, ngram_ids):
     # print("\tcorpus_id:",corpus_id)
     # convert query result to a list of dicts
     for node, score in nodes_query:
-        print("\t corpus:",corpus_id,"\t",node.name)
+        # print("\t corpus:",corpus_id,"\t",node.name)
         node_dict = {
             'id': node.id,
             'score': score,
@@ -229,6 +229,85 @@ def tfidf(request, corpus_id, ngram_ids):
             node_dict[key] = node.hyperdata[key]
         nodes_list.append(node_dict)
-    # print("= = = = = = = = \n")
-    data = json.dumps(nodes_list)
-    return JsonHttpResponse(data)
+    return JsonHttpResponse(nodes_list)
+
+def getCorpusIntersection(request , corpuses_ids):
+    FinalDict = False
+    if request.method == 'POST' and "nodeids" in request.POST and len(request.POST["nodeids"])>0:
+        import ast
+        node_ids = [int(i) for i in (ast.literal_eval( request.POST["nodeids"] )) ]
+        # Here are the visible nodes of the initial semantic map.
+        corpuses_ids = corpuses_ids.split('a')
+        corpuses_ids = [int(i) for i in corpuses_ids] # corpus[1] will be the corpus to compare
+        cooc_type_id = cache.NodeType['Cooccurrence'].id
+        cooc_ids = session.query(Node.id).filter(Node.user_id == request.user.id , Node.parent_id==corpuses_ids[1] , Node.type_id == cooc_type_id ).first()
+        if len(cooc_ids)==0:
+            return JsonHttpResponse(FinalDict)
+        # If corpus[1] has a coocurrence.id then lets continue
+        Cooc_Avg = {}
+        import networkx as nx
+        G = nx.Graph() # I use an undirected graph, because direction doesnt matter here, coocs should be a triangular matrix, so...
+        ngrams_data1 = session.query(NodeNgramNgram).filter( NodeNgramNgram.node_id==cooc_ids[0], NodeNgramNgram.ngramx_id.in_( node_ids )).all()
+        for ngram in ngrams_data1: # are there visible nodes in the X-axis of corpus to compare ?
+            G.add_edge( ngram.ngramx_id , ngram.ngramy_id , weight=ngram.score)
+        ngrams_data2 = session.query(NodeNgramNgram).filter( NodeNgramNgram.node_id==cooc_ids[0], NodeNgramNgram.ngramy_id.in_( node_ids )).all()
+        for ngram in ngrams_data2: # are there visible nodes in the Y-axis of corpus to compare ?
+            if not G.has_edge(ngram.ngramx_id,ngram.ngramy_id):
+                G.add_edge( ngram.ngramx_id , ngram.ngramy_id , weight=ngram.score)
+        for e in G.edges_iter():
+            n1 = e[0]
+            n2 = e[1]
+            # print( G[n1][n2]["weight"] , "\t", n1,",",n2 )
+            if n1 not in Cooc_Avg:
+                Cooc_Avg[n1]=0
+            if n2 not in Cooc_Avg:
+                Cooc_Avg[n2]=0
+            Cooc_Avg[n1]+=G[n1][n2]["weight"]
+            Cooc_Avg[n2]+=G[n1][n2]["weight"]
+        FinalDict = {}
+        for node in node_ids:
+            if node in Cooc_Avg:
+                FinalDict[node] = Cooc_Avg[node]/G.degree(node)
+        # Getting AVG-COOC of each ngram that exists in the cooc-matrix of the compared-corpus.
+    return JsonHttpResponse(FinalDict)
+
+def getUserPortfolio(request , project_id):
+    user = request.user
+    user_id = cache.User[request.user.username].id
+    project_type_id = cache.NodeType['Project'].id
+    corpus_type_id = cache.NodeType['Corpus'].id
+    results = {}
+    projs = session.query(Node).filter(Node.user_id == user_id,Node.type_id==project_type_id ).all()
+    for i in projs:
+        # print (i.id,i.name)
+        if i.id not in results:
+            results[i.id] = {}
+        results[i.id]["proj_name"] = i.name
+        results[i.id]["corpuses"] = []
+        corpuses = session.query(Node).filter(Node.parent_id==i.id , Node.type_id==corpus_type_id).all()
+        for j in corpuses:
+            doc_count = session.query(func.count(Node.id)).filter(Node.parent_id==j.id).all()[0][0]
+            if doc_count >= 10:
+                # print(session.query(Node).filter(Node.id==j.id).first())
+                info = {
+                    "id":j.id ,
+                    "name":j.name ,
+                    "c":doc_count
+                }
+                results[i.id]["corpuses"].append(info)
+            # print("\t\t",j.id , j.name , doc_count)
+        if len(results[i.id]["corpuses"])==0:
+            del results[i.id]
+    return JsonHttpResponse( results )
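
The value getCorpusIntersection returns for each visible ngram is its average co-occurrence weight in the compared corpus: the sum of the weights of its incident edges divided by its degree. A small self-contained illustration of that computation with toy weights (edges_iter() is the NetworkX 1.x spelling used above):

import networkx as nx

G = nx.Graph()
G.add_edge("a", "b", weight=4)
G.add_edge("a", "c", weight=2)

Cooc_Avg = {}
for n1, n2 in G.edges_iter():                      # G.edges() in NetworkX 2.x
    w = G[n1][n2]["weight"]
    Cooc_Avg[n1] = Cooc_Avg.get(n1, 0) + w
    Cooc_Avg[n2] = Cooc_Avg.get(n2, 0) + w

averages = {n: Cooc_Avg[n] / G.degree(n) for n in G}
# averages == {"a": 3.0, "b": 4.0, "c": 2.0}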
@@ -5,6 +5,7 @@ from lxml.html import html5parser
 from datetime import datetime, date
 from django.utils import timezone
 import dateutil.parser
+import dateparser
 from .FileParser import FileParser
 from ..NgramsExtractors import *
@@ -140,7 +141,10 @@ class EuropressFileParser(FileParser):
             if format_europresse == 50.2:
                 # TODO here check the split if needed: 'Brest Ville, mercredi 26 novembre 2014'
-                text = text.split(', ')[1]
+                try: # 2015-oct-08 exception added
+                    text = text.split(', ')[1]
+                except:
+                    pass
                 format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
                 format_date_fr_v2 = re.compile('\s*\w+\s+\d+\s+\w+\s+\d{4}', re.UNICODE)
@@ -166,29 +170,32 @@ class EuropressFileParser(FileParser):
             if test_date_fr is not None or test_date_fr_v2 is not None:
                 self.localeEncoding = "fr_FR"
-                locale.setlocale(locale.LC_ALL, localeEncoding)
+                locale.setlocale(locale.LC_ALL, "fr_FR.utf-8")
                 if encoding != "utf-8":
                     text = text.replace('י', 'é')
                     text = text.replace('ű', 'û')
                     text = text.replace(' aot ', ' août ')
-                try :
-                    hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
-                except :
-                    try:
-                        hyperdata['publication_date'] = datetime.strptime(text, '%B %Y')
-                    except :
-                        try:
-                            locale.setlocale(locale.LC_ALL, "fr_FR")
-                            hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
-                            # hyperdata['publication_date'] = dateutil.parser.parse(text)
-                        except :
-                            # TODO format to parse: ' mercredi 26 novembre 2014'
-                            try :
-                                hyperdata['publication_date'] = datetime.strptime(text, ' %A %d %B %Y')
-                            except Exception as error:
-                                print(error, text)
-                                pass
+                try:
+                    hyperdata['publication_date'] = dateparser.parse(text, languages=['fr'])
+                except:
+                    try :
+                        hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
+                    except :
+                        try:
+                            hyperdata['publication_date'] = datetime.strptime(text, '%B %Y')
+                        except :
+                            try:
+                                locale.setlocale(locale.LC_ALL, "fr_FR")
+                                hyperdata['publication_date'] = datetime.strptime(text, '%d %B %Y')
+                                # hyperdata['publication_date'] = dateutil.parser.parse(text)
+                            except :
+                                # TODO format to parse: ' mercredi 26 novembre 2014'
+                                try :
+                                    hyperdata['publication_date'] = datetime.strptime(text, ' %A %d %B %Y')
+                                except Exception as error:
+                                    print(error, text)
+                                    pass
 
             if test_date_en is not None:
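
The rework above makes dateparser the first attempt and keeps the locale-dependent strptime chain only as a fallback. A quick sketch of the kind of call involved, using the date string from the TODO comment (the result shown assumes the dateparser package behaves as documented):

import dateparser

# French dates can be parsed without touching the process-wide locale:
dateparser.parse("mercredi 26 novembre 2014", languages=['fr'])
# -> datetime.datetime(2014, 11, 26, 0, 0)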
@@ -278,10 +285,10 @@ class EuropressFileParser(FileParser):
             else:
                 hyperdata['doi'] = "not found"
-            try:
-                hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
-            except:
-                PrintException()
+            # try:
+            #     hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
+            # except:
+            #     PrintException()
             hyperdata['length_letters'] = len(hyperdata['abstract'])
...