Commit e848d4cc authored by delanoe's avatar delanoe

[FEAT] Add distributional distance to the graph (the result still needs to be filtered).

parent 7d8c854d
......@@ -29,11 +29,10 @@ from sqlalchemy.orm import aliased
def diag_null(x):
    """Return ``x`` minus its element-wise product with an identity matrix.

    An identity matrix with ``x.shape[0]`` rows and columns is built,
    multiplied by ``x`` and the product is subtracted from ``x``.  When
    ``*`` is element-wise (ndarray, DataFrame) and ``x`` is square, the
    result is ``x`` with its main diagonal set to zero.  ``x`` itself is
    not modified.

    NOTE(review): for ``np.matrix`` inputs ``*`` is a matrix product, so
    the result would be all zeros -- confirm callers never pass one.
    """
    # np.eye is the same function the deprecated root-namespace alias
    # ``scipy.eye`` pointed to; calling NumPy directly keeps this working
    # on SciPy releases that no longer export the alias.
    return x - x * np.eye(x.shape[0])
def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True, distance='conditional'):
'''
do_distance :: Int -> (Graph, Partition, {ids}, {weight})
'''
#print([n for n in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).all()])
matrix = defaultdict(lambda : defaultdict(float))
ids = defaultdict(lambda : defaultdict(int))
......@@ -55,30 +54,19 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
weight[cooc.ngramy_id] = weight.get(cooc.ngramy_id, 0) + cooc.score
x = pd.DataFrame(matrix).fillna(0)
y = pd.DataFrame(matrix).fillna(0)
#xo = diag_null(x)
#y = diag_null(y)
distance = 'conditional'
if distance == 'conditional':
x = x / x.sum(axis=1)
y = y / y.sum(axis=0)
#y = y / y.sum(axis=0)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top inclus ou exclus
n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific
m = ( xs - ys) / (2 * (x.shape[0] - 1))
elif distance == 'cosine':
xs = x / np.sqrt((x**2).sum(axis=1) * (x**2).sum(axis=0))
n = np.max(xs.sum(axis=1))
m = np.min(xs.sum(axis=1))
n = n.sort(inplace=False)
m = m.sort(inplace=False)
......@@ -106,16 +94,64 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
#matrix_filtered = matrix_filtered.resize((90,90))
G = nx.from_numpy_matrix(np.matrix(matrix_filtered))
#G = nx.from_numpy_matrix(matrix_filtered, create_using=nx.MultiDiGraph())
G = nx.relabel_nodes(G, dict(enumerate([ ids[id_][1] for id_ in list(xx.columns)])))
elif distance == 'cosine':
xs = x / np.sqrt((x**2).sum(axis=1) * (x**2).sum(axis=0))
n = np.max(xs.sum(axis=1))
m = np.min(xs.sum(axis=1))
elif distance == 'distributional':
mi = defaultdict(lambda : defaultdict(int))
total_cooc = x.sum().sum()
for i in matrix.keys():
si = sum([matrix[i][j] for j in matrix[i].keys() if i != j])
for j in matrix[i].keys():
sj = sum([matrix[j][k] for k in matrix[j].keys() if j != k])
if i!=j :
mi[i][j] = log( matrix[i][j] / ((si * sj) / total_cooc) )
# r = result
r = defaultdict(lambda : defaultdict(int))
for i in matrix.keys():
for j in matrix.keys():
sumMin = sum(
[
min(mi[i][k], mi[j][k])
for k in matrix.keys()
if i != j and k != i and k != j and mi[i][k] > 0
]
)
sumMi = sum(
[
mi[i][k] for k in matrix.keys()
if k != i and k != j and mi[i][k] > 0
]
)
try:
r[i][j] = sumMin / sumMi
except Exception as error:
r[i][j] = 0
G = nx.DiGraph()
G.add_edges_from(
[
(i, j, {'weight': r[i][j]})
for i in r.keys() for j in r.keys()
if i != j and r[i][j] > 0 and r[i][j] > r[j][i]
]
)
# Removing too connected nodes (find automatic way to do it)
#edges_to_remove = [ e for e in G.edges_iter() if
# nodes_to_remove = [n for n in degree if degree[n] <= 1]
# G.remove_nodes_from(nodes_to_remove)
def getWeight(item):
    """Return the element stored at index 1 of ``item``.

    NOTE(review): the name suggests ``item`` is an ``(id, weight)`` pair
    and that this is used as a sort key -- confirm against the caller.
    """
    second = item[1]
    return second
#
......@@ -144,6 +180,7 @@ def get_cooc(request=None, corpus=None
, cooc_id=None, type='node_link', size=1000
, start=None, end=None
, hapax=1
, distance='conditional'
):
'''
get_cooc : to compute the graph.
......@@ -168,7 +205,8 @@ def get_cooc(request=None, corpus=None
, miam_id=miam_id, group_id=group_id, stop_id=stop_id, limit=size
, isMonopartite=True, start=start , end=end , hapax=hapax)
G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams", isMonopartite=True)
G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams"
, isMonopartite=True, distance=distance)
if type == "node_link":
nodesB_dict = {}
......
......@@ -22,20 +22,24 @@ class Graph(APIView):
format_ = request.GET.get('format', 'json')
type_ = request.GET.get('type', 'node_link')
hapax = request.GET.get('hapax', 1)
distance = request.GET.get('distance', 'conditional')
corpus = session.query(Node).filter(Node.id==corpus_id).first()
accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
accepted_field2 = ['ngrams',]
options = ['start', 'end', 'hapax']
options = ['start', 'end', 'hapax', 'distance']
if field1 in accepted_field1 :
if field2 in accepted_field2 :
if start is not None and end is not None :
data = get_cooc(corpus=corpus,field1=field1, field2=field2, start=start, end=end, hapax=hapax)
data = get_cooc(corpus=corpus,field1=field1, field2=field2
, start=start, end=end
, hapax=hapax, distance=distance)
else:
data = get_cooc(corpus=corpus,field1=field1, field2=field2, hapax=hapax)
data = get_cooc(corpus=corpus,field1=field1, field2=field2
, hapax=hapax, distance = distance)
if format_ == 'json':
return JsonHttpResponse(data)
else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment