Commit 2b07ba58 authored by delanoe

[FEAT] Adding some specific monograms to the maplist.

parent 4f12f4bc
@@ -14,7 +14,7 @@ def do_cooc(corpus=None
         , field1='ngrams', field2='ngrams'
         , miam_id=None, stop_id=None, group_id=None
         , cvalue_id=None
-        , n_min=2, n_max=None
+        , n_min=1, n_max=None
         , start=None, end=None
         , limit=1000
         , isMonopartite=True
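
The only functional change in this first hunk is the default `n_min` dropping from 2 to 1, which is what lets monograms reach the cooccurrence computation (and, downstream, the map list) at all. As a hypothetical illustration, assuming `n_min`/`n_max` bound the word count `n` of an ngram (the real `do_cooc` applies the equivalent filter inside its database query):

```python
# Hypothetical sketch only: the real do_cooc filters by ngram length in SQL,
# not on an in-memory list like this.
def filter_by_length(ngrams, n_min=1, n_max=None):
    """Keep (form, n) pairs whose word count n satisfies n_min <= n <= n_max."""
    return [(form, n) for (form, n) in ngrams
            if n >= n_min and (n_max is None or n <= n_max)]

sample = [("graph", 1), ("knowledge graph", 2), ("graph database", 2)]
print(filter_by_length(sample, n_min=2))  # old default: monograms excluded
print(filter_by_length(sample, n_min=1))  # new default: monograms included
```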
@@ -62,7 +62,6 @@ def do_cooc(corpus=None
     session.commit()
     # END
     session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==node_cooc.id).delete()
     session.commit()
@@ -59,17 +59,25 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
     #xo = diag_null(x)
     #y = diag_null(y)
+    distance = 'conditional'
-    x = x / x.sum(axis=1)
-    y = y / y.sum(axis=0)
+    if distance == 'conditional':
+        x = x / x.sum(axis=1)
+        y = y / y.sum(axis=0)
-    xs = x.sum(axis=1) - x
-    ys = x.sum(axis=0) - x
+        xs = x.sum(axis=1) - x
+        ys = x.sum(axis=0) - x
-    # top inclus ou exclus
-    n = ( xs + ys) / (2 * (x.shape[0] - 1))
-    # top generic or specific
-    m = ( xs - ys) / (2 * (x.shape[0] - 1))
+        # top inclus ou exclus
+        n = ( xs + ys) / (2 * (x.shape[0] - 1))
+        # top generic or specific
+        m = ( xs - ys) / (2 * (x.shape[0] - 1))
+    elif distance == 'cosine':
+        xs = x / np.sqrt((x**2).sum(axis=1) * (x**2).sum(axis=0))
+        n = np.max(xs.sum(axis=1))
+        m = np.min(xs.sum(axis=1))
     n = n.sort(inplace=False)
     m = m.sort(inplace=False)
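
The conditional branch keeps the existing logic (n ranks terms as included/excluded, m as generic/specific, per the comments), now just guarded by a flag; the new cosine branch is not reachable yet since `distance` is hardcoded to `'conditional'`. To see what those cosine lines compute, here is a tiny self-contained run of exactly that normalisation on toy data, assuming `x` is a square pandas DataFrame of cooccurrence weights indexed by terms, which is how `do_distance` appears to build it upstream:

```python
# Toy run of the new cosine branch; the matrix values are made up.
import numpy as np
import pandas as pd

terms = ['graph', 'network', 'node']
x = pd.DataFrame(np.array([[0., 4., 1.],
                           [4., 0., 2.],
                           [1., 2., 0.]]),
                 index=terms, columns=terms)

# Cosine-style normalisation, exactly as in the elif branch above.
xs = x / np.sqrt((x**2).sum(axis=1) * (x**2).sum(axis=0))
n = np.max(xs.sum(axis=1))   # highest normalised "connectivity"
m = np.min(xs.sum(axis=1))   # lowest normalised "connectivity"
print(xs, n, m, sep='\n')
```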
@@ -110,21 +118,21 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
     def getWeight(item):
         return item[1]
-    node_degree = sorted(G.degree().items(), key=getWeight, reverse=True)
-    #print(node_degree)
-    nodes_too_connected = [n[0] for n in node_degree[0:(round(len(node_degree)/5))]]
-    for n in nodes_too_connected:
-        n_edges = list()
-        for v in nx.neighbors(G,n):
-            #print((n, v), G[n][v]['weight'], ":", (v,n), G[v][n]['weight'])
-            n_edges.append(((n, v), G[n][v]['weight']))
-        n_edges_sorted = sorted(n_edges, key=getWeight, reverse=True)
-        #G.remove_edges_from([ e[0] for e in n_edges_sorted[round(len(n_edges_sorted)/2):]])
-        #G.remove_edges_from([ e[0] for e in n_edges_sorted[(round(len(nx.neighbors(G,n))/3)):]])
-        G.remove_edges_from([ e[0] for e in n_edges_sorted[10:]])
+#
+#    node_degree = sorted(G.degree().items(), key=getWeight, reverse=True)
+#    #print(node_degree)
+#    nodes_too_connected = [n[0] for n in node_degree[0:(round(len(node_degree)/5))]]
+#
+#    for n in nodes_too_connected:
+#        n_edges = list()
+#        for v in nx.neighbors(G,n):
+#            #print((n, v), G[n][v]['weight'], ":", (v,n), G[v][n]['weight'])
+#            n_edges.append(((n, v), G[n][v]['weight']))
+#
+#        n_edges_sorted = sorted(n_edges, key=getWeight, reverse=True)
+#        #G.remove_edges_from([ e[0] for e in n_edges_sorted[round(len(n_edges_sorted)/2):]])
+#        #G.remove_edges_from([ e[0] for e in n_edges_sorted[(round(len(nx.neighbors(G,n))/3)):]])
+#        G.remove_edges_from([ e[0] for e in n_edges_sorted[10:]])
     G.remove_nodes_from(nx.isolates(G))
     partition = best_partition(G.to_undirected())
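
With the pruning of over-connected nodes commented out by this hunk, the only clean-up left before community detection is dropping isolated nodes. A minimal standalone sketch of those two remaining calls, assuming `best_partition` comes from the python-louvain package (imported as `community`), which is what its use here suggests:

```python
# Minimal sketch of the steps that stay active: drop isolates, then run
# Louvain community detection. Toy graph; assumes python-louvain is installed.
import networkx as nx
from community import best_partition   # pip install python-louvain

G = nx.Graph()
G.add_weighted_edges_from([
    ('graph', 'network', 3.0),
    ('network', 'node', 2.0),
    ('graph', 'node', 1.0),
    ('corpus', 'ngram', 2.5),
])
G.add_node('orphan')                    # isolated node, removed below

# list() keeps this safe on newer networkx, where isolates() is a generator.
G.remove_nodes_from(list(nx.isolates(G)))
partition = best_partition(G.to_undirected())
print(partition)                        # node -> community id
```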
@@ -15,10 +15,15 @@ from sqlalchemy.orm import aliased
 from ngram.tools import insert_ngrams
 import csv
 
-def compute_mapList(corpus,limit=500):
+def compute_mapList(corpus,limit=500,n=1):
     '''
     According to Specificities and stoplist,
     '''
+
+    monograms_part = 0.005
+    monograms_limit = round(limit * monograms_part)
+    multigrams_limit = limit - monograms_limit
+
     dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
 
     node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
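
With the defaults, the split is heavily weighted toward multigrams: `monograms_part = 0.005` of `limit = 500` is 2.5, which `round()` turns into 2 under Python 3's round-half-to-even, leaving 498 slots for multigrams, i.e. only "some specific monograms" make it into the map list, as the commit message says. A quick check of that arithmetic:

```python
# Worked example of the split above with the default limit.
limit = 500
monograms_part = 0.005
monograms_limit = round(limit * monograms_part)   # round(2.5) == 2 in Python 3 (round-half-to-even)
multigrams_limit = limit - monograms_limit        # 498
print(monograms_limit, multigrams_limit)          # 2 498
```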
@@ -33,18 +38,30 @@ def compute_mapList(corpus,limit=500)
     Spec=aliased(NodeNodeNgram)
-    top_ngrams = (session.query(Spec.ngram_id, Spec.score)
+    query = (session.query(Spec.ngram_id, Spec.score)
         .join(Miam, Spec.ngram_id == Miam.ngram_id)
+        .join(Ngram, Ngram.id == Spec.ngram_id)
         #.outerjoin(Group, Group.ngramy_id == Spec.ngram_id)
         #.outerjoin(Stop, Stop.ngram_id == Spec.ngram_id)
         .filter(Miam.node_id == node_miam.id)
         #.filter(Group.node_id == node_group.id)
         #.filter(Stop.node_id == node_stop.id)
         .filter(Spec.nodex_id == node_spec.id)
+        )
+
+    top_monograms = (query
+        .filter(Ngram.n == 1)
+        .order_by(desc(Spec.score))
+        .limit(monograms_limit)
+        )
+
+    top_multigrams = (query
+        .filter(Ngram.n >= 2)
         .order_by(desc(Spec.score))
-        .limit(limit)
+        .limit(multigrams_limit)
         )
 
     #print([t for t in top_ngrams])
     node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus)
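
Reusing `query` for both `top_monograms` and `top_multigrams` works because SQLAlchemy Query objects are generative: each `.filter()`, `.order_by()` or `.limit()` call returns a new Query and leaves the shared base untouched. A self-contained sketch of that pattern against a hypothetical single-table model (a stand-in, not the real gargantext `Ngram`/`NodeNodeNgram` models):

```python
# Sketch of the generative-query pattern used above, on a stand-in table.
from sqlalchemy import Column, Integer, String, create_engine, desc
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Ngram(Base):                      # stand-in model for illustration only
    __tablename__ = 'ngram'
    id = Column(Integer, primary_key=True)
    terms = Column(String)
    n = Column(Integer)                 # number of words in the ngram
    score = Column(Integer)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
session = Session(engine)
session.add_all([Ngram(terms='graph', n=1, score=9),
                 Ngram(terms='knowledge graph', n=2, score=7),
                 Ngram(terms='graph database', n=2, score=5)])
session.commit()

base = session.query(Ngram.id, Ngram.score)                               # shared base query
top_monograms = base.filter(Ngram.n == 1).order_by(desc(Ngram.score)).limit(1)
top_multigrams = base.filter(Ngram.n >= 2).order_by(desc(Ngram.score)).limit(2)

print(list(top_monograms), list(top_multigrams))   # base query itself is unchanged
```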
@@ -53,7 +70,7 @@ def compute_mapList(corpus,limit=500):
     data = zip(
         [node_mapList.id for i in range(1,limit)]
-        , [n[0] for n in top_ngrams]
+        , [n[0] for n in list(top_multigrams) + list(top_monograms)]
         , [1 for i in range(1,limit)]
         )
 
     #print([d for d in data])
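
One detail worth noting in this last change: the id and flag columns are built with `range(1, limit)`, i.e. `limit - 1` entries, and `zip()` truncates to its shortest input, so the number of rows assembled is `min(limit - 1, len(top_multigrams) + len(top_monograms))`. A small stand-in illustration (the real ngram ids come from the two queries above):

```python
# Illustration with stand-in values of how the data rows are assembled.
node_map_list_id = 42                          # stand-in for node_mapList.id
limit = 10
top_multigrams = [(101,), (102,), (103,)]      # (ngram_id,) rows, as the query returns
top_monograms = [(7,)]

data = zip(
    [node_map_list_id for i in range(1, limit)],
    [n[0] for n in list(top_multigrams) + list(top_monograms)],
    [1 for i in range(1, limit)],
)
print(list(data))
# [(42, 101, 1), (42, 102, 1), (42, 103, 1), (42, 7, 1)]
```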
@@ -100,37 +117,3 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
 #compute_mapList(corpus)
 #insert_miam(corpus=corpus, path_file_csv="Thesaurus_tag.csv")
 
-#def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
-#    '''
-#    getNgrams :: Corpus -> [(Int, String)] -> [(Int, String)]
-#    For a corpus, gives list of highest Cvalue ngrams and highest TFIDF (global)
-#    ngrams that have to be grouped with
-#    '''
-#    #tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
-#    cvalue_node = get_or_create_node(nodetype='Cvalue', corpus=corpus)
-#    spec_node = get_or_create_node(nodetype='Specificity', corpus=corpus)
-#
-#
-#    #tfidf_ngrams = queryNodeNodeNgram(nodeMeasure_id=tfidf_node.id, corpus_id=corpus.id)
-#    cvalue_ngrams = queryNodeNodeNgram(nodeMeasure_id=cvalue_node.id, corpus_id=corpus.id, limit=limit_sup)
-#    spec_ngrams = queryNodeNodeNgram(nodeMeasure_id=spec_node.id, corpus_id=corpus.id, limit=limit_inf)
-#
-#    #print([n for n in tfidf_ngrams])
-#
-#    def list2set(_list):
-#        _set = set()
-#        for n in _list:
-#            _set.add((n[0],n[1]))
-#        return(_set)
-#
-#    cvalue_set = set()
-#    spec_set = set()
-#
-#    cvalue_set = list2set(cvalue_ngrams)
-#    spec_set = list2set(spec_ngrams)
-#
-#    cvalue_setDiff = cvalue_set.difference(spec_set)
-#
-#    return(spec_set,cvalue_setDiff)
-#