Commit e848d4cc authored by delanoe

[FEAT] Add distributional distance to graph (results still need to be filtered).

parent 7d8c854d
...@@ -29,11 +29,10 @@ from sqlalchemy.orm import aliased ...@@ -29,11 +29,10 @@ from sqlalchemy.orm import aliased
def diag_null(x): def diag_null(x):
return x - x * scipy.eye(x.shape[0]) return x - x * scipy.eye(x.shape[0])
def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True): def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True, distance='conditional'):
''' '''
do_distance :: Int -> (Graph, Partition, {ids}, {weight}) do_distance :: Int -> (Graph, Partition, {ids}, {weight})
''' '''
#print([n for n in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).all()])
matrix = defaultdict(lambda : defaultdict(float)) matrix = defaultdict(lambda : defaultdict(float))
ids = defaultdict(lambda : defaultdict(int)) ids = defaultdict(lambda : defaultdict(int))
...@@ -55,66 +54,103 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True): ...@@ -55,66 +54,103 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
weight[cooc.ngramy_id] = weight.get(cooc.ngramy_id, 0) + cooc.score weight[cooc.ngramy_id] = weight.get(cooc.ngramy_id, 0) + cooc.score
x = pd.DataFrame(matrix).fillna(0) x = pd.DataFrame(matrix).fillna(0)
y = pd.DataFrame(matrix).fillna(0)
#xo = diag_null(x)
#y = diag_null(y)
distance = 'conditional'
if distance == 'conditional': if distance == 'conditional':
x = x / x.sum(axis=1) x = x / x.sum(axis=1)
y = y / y.sum(axis=0) #y = y / y.sum(axis=0)
xs = x.sum(axis=1) - x xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x ys = x.sum(axis=0) - x
# top inclus ou exclus # top inclus ou exclus
n = ( xs + ys) / (2 * (x.shape[0] - 1)) n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific # top generic or specific
m = ( xs - ys) / (2 * (x.shape[0] - 1)) m = ( xs - ys) / (2 * (x.shape[0] - 1))
elif distance == 'cosine': n = n.sort(inplace=False)
xs = x / np.sqrt((x**2).sum(axis=1) * (x**2).sum(axis=0)) m = m.sort(inplace=False)
n = np.max(xs.sum(axis=1))
m = np.min(xs.sum(axis=1))
n = n.sort(inplace=False) nodes_included = 500 #int(round(size/20,0))
m = m.sort(inplace=False) #nodes_excluded = int(round(size/10,0))
nodes_included = 500 #int(round(size/20,0)) nodes_specific = 500 #int(round(size/10,0))
#nodes_excluded = int(round(size/10,0)) #nodes_generic = int(round(size/10,0))
nodes_specific = 500 #int(round(size/10,0)) # TODO use the included score for the node size
#nodes_generic = int(round(size/10,0)) n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
# Generic:
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
# Specific:
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
#m_index = pd.Index.intersection(x.index, n.index[:nodes_included])
# TODO use the included score for the node size x_index = pd.Index.union(n_index, m_index)
n_index = pd.Index.intersection(x.index, n.index[:nodes_included]) xx = x[list(x_index)].T[list(x_index)]
# Generic:
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
# Specific:
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
#m_index = pd.Index.intersection(x.index, n.index[:nodes_included])
x_index = pd.Index.union(n_index, m_index) # Removing unconnected nodes
xx = x[list(x_index)].T[list(x_index)] xxx = xx.values
threshold = min(xxx.max(axis=1))
matrix_filtered = np.where(xxx >= threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
# Removing unconnected nodes G = nx.from_numpy_matrix(np.matrix(matrix_filtered))
xxx = xx.values G = nx.relabel_nodes(G, dict(enumerate([ ids[id_][1] for id_ in list(xx.columns)])))
threshold = min(xxx.max(axis=1))
matrix_filtered = np.where(xxx >= threshold, xxx, 0) elif distance == 'cosine':
#matrix_filtered = matrix_filtered.resize((90,90)) xs = x / np.sqrt((x**2).sum(axis=1) * (x**2).sum(axis=0))
n = np.max(xs.sum(axis=1))
m = np.min(xs.sum(axis=1))
G = nx.from_numpy_matrix(np.matrix(matrix_filtered)) elif distance == 'distributional':
#G = nx.from_numpy_matrix(matrix_filtered, create_using=nx.MultiDiGraph()) mi = defaultdict(lambda : defaultdict(int))
total_cooc = x.sum().sum()
for i in matrix.keys():
si = sum([matrix[i][j] for j in matrix[i].keys() if i != j])
for j in matrix[i].keys():
sj = sum([matrix[j][k] for k in matrix[j].keys() if j != k])
if i!=j :
mi[i][j] = log( matrix[i][j] / ((si * sj) / total_cooc) )
# r = result
r = defaultdict(lambda : defaultdict(int))
for i in matrix.keys():
for j in matrix.keys():
sumMin = sum(
[
min(mi[i][k], mi[j][k])
for k in matrix.keys()
if i != j and k != i and k != j and mi[i][k] > 0
]
)
sumMi = sum(
[
mi[i][k] for k in matrix.keys()
if k != i and k != j and mi[i][k] > 0
]
)
try:
r[i][j] = sumMin / sumMi
except Exception as error:
r[i][j] = 0
G = nx.DiGraph()
G.add_edges_from(
[
(i, j, {'weight': r[i][j]})
for i in r.keys() for j in r.keys()
if i != j and r[i][j] > 0 and r[i][j] > r[j][i]
]
)
G = nx.relabel_nodes(G, dict(enumerate([ ids[id_][1] for id_ in list(xx.columns)])))
# Removing too connected nodes (find automatic way to do it) # Removing too connected nodes (find automatic way to do it)
#edges_to_remove = [ e for e in G.edges_iter() if #edges_to_remove = [ e for e in G.edges_iter() if
# nodes_to_remove = [n for n in degree if degree[n] <= 1] # nodes_to_remove = [n for n in degree if degree[n] <= 1]
# G.remove_nodes_from(nodes_to_remove) # G.remove_nodes_from(nodes_to_remove)
def getWeight(item): def getWeight(item):
return item[1] return item[1]
...@@ -144,6 +180,7 @@ def get_cooc(request=None, corpus=None ...@@ -144,6 +180,7 @@ def get_cooc(request=None, corpus=None
, cooc_id=None, type='node_link', size=1000 , cooc_id=None, type='node_link', size=1000
, start=None, end=None , start=None, end=None
, hapax=1 , hapax=1
, distance='conditional'
): ):
''' '''
get_ccoc : to compute the graph. get_ccoc : to compute the graph.
...@@ -168,7 +205,8 @@ def get_cooc(request=None, corpus=None ...@@ -168,7 +205,8 @@ def get_cooc(request=None, corpus=None
, miam_id=miam_id, group_id=group_id, stop_id=stop_id, limit=size , miam_id=miam_id, group_id=group_id, stop_id=stop_id, limit=size
, isMonopartite=True, start=start , end=end , hapax=hapax) , isMonopartite=True, start=start , end=end , hapax=hapax)
G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams", isMonopartite=True) G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams"
, isMonopartite=True, distance=distance)
if type == "node_link": if type == "node_link":
nodesB_dict = {} nodesB_dict = {}
......
...@@ -19,23 +19,27 @@ class Graph(APIView): ...@@ -19,23 +19,27 @@ class Graph(APIView):
start = request.GET.get('start', None) start = request.GET.get('start', None)
end = request.GET.get('end' , None) end = request.GET.get('end' , None)
format_ = request.GET.get('format', 'json') format_ = request.GET.get('format', 'json')
type_ = request.GET.get('type', 'node_link') type_ = request.GET.get('type', 'node_link')
hapax = request.GET.get('hapax', 1) hapax = request.GET.get('hapax', 1)
distance = request.GET.get('distance', 'conditional')
corpus = session.query(Node).filter(Node.id==corpus_id).first() corpus = session.query(Node).filter(Node.id==corpus_id).first()
accepted_field1 = ['ngrams', 'journal', 'source', 'authors'] accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
accepted_field2 = ['ngrams',] accepted_field2 = ['ngrams',]
options = ['start', 'end', 'hapax'] options = ['start', 'end', 'hapax', 'distance']
if field1 in accepted_field1 : if field1 in accepted_field1 :
if field2 in accepted_field2 : if field2 in accepted_field2 :
if start is not None and end is not None : if start is not None and end is not None :
data = get_cooc(corpus=corpus,field1=field1, field2=field2, start=start, end=end, hapax=hapax) data = get_cooc(corpus=corpus,field1=field1, field2=field2
, start=start, end=end
, hapax=hapax, distance=distance)
else: else:
data = get_cooc(corpus=corpus,field1=field1, field2=field2, hapax=hapax) data = get_cooc(corpus=corpus,field1=field1, field2=field2
, hapax=hapax, distance = distance)
if format_ == 'json': if format_ == 'json':
return JsonHttpResponse(data) return JsonHttpResponse(data)
else: else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment