Commit be968d2c authored by Administrator

[FIX] fix conflicts.

parent 04b49e8a
......@@ -28,3 +28,4 @@ def PrintException():
line = linecache.getline(filename, lineno, f.f_globals)
print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj))
......@@ -17,23 +17,23 @@ def create_blacklist(user, corpus):
def create_synonymes(user, corpus):
    """Create a synonym list for *corpus* on behalf of *user*.

    Placeholder: no synonym handling is implemented yet, so this is a
    no-op that returns None.
    """
    return None
size = 1000
size = 1000
def create_whitelist(user, corpus_id, size=size, count_min=2):
cursor = connection.cursor()
whitelist_type_id = cache.NodeType['WhiteList'].id
blacklist_type_id = cache.NodeType['BlackList'].id
type_document_id = cache.NodeType['Document'].id
white_list = Node(name='WhiteList Corpus ' + str(corpus_id), user_id=user.id, parent_id=corpus_id, type_id=whitelist_type_id)
black_list = Node(name='BlackList Corpus ' + str(corpus_id), user_id=user.id, parent_id=corpus_id, type_id=blacklist_type_id)
session.add(white_list)
session.add(black_list)
session.commit()
# delete avant pour éviter les doublons
# try:
......@@ -105,21 +105,21 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start
COUNT(*) AS score
FROM
node_node AS n -- the nodes who are direct children of the corpus
INNER JOIN
node_node_ngram AS nngX ON nngX.node_id = n.id -- list of ngrams contained in the node
INNER JOIN
node_node_ngram AS whitelistX ON whitelistX.ngram_id = nngX.ngram_id -- list of ngrams contained in the whitelist and in the node
INNER JOIN
node_ngram AS ngX ON ngX.id = whitelistX.ngram_id -- ngrams which are in both
INNER JOIN
node_node_ngram AS nngY ON nngY.node_id = n.id
INNER JOIN
node_node_ngram AS whitelistY ON whitelistY.ngram_id = nngY.ngram_id
INNER JOIN
node_ngram AS ngY ON ngY.id = whitelistY.ngram_id
WHERE
n.parent_id = %s
AND
......@@ -128,13 +128,13 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start
whitelistY.node_id = %s
AND
nngX.ngram_id < nngY.ngram_id -- so we only get distinct pairs of ngrams
GROUP BY
ngX.id,
ngX.terms,
ngY.id,
ngY.terms
ORDER BY
score DESC
LIMIT
......@@ -153,9 +153,9 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
import networkx as nx
from networkx.readwrite import json_graph
from gargantext_web.api import JsonHttpResponse
from analysis.louvain import best_partition
#print(corpus_id, cooc_id)
try:
......@@ -172,7 +172,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=size)
else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
......@@ -192,41 +192,41 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
x = pd.DataFrame(matrix).fillna(0)
y = pd.DataFrame(matrix).fillna(0)
#xo = diag_null(x)
#y = diag_null(y)
x = x / x.sum(axis=1)
y = y / y.sum(axis=0)
#print(x)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top inclus ou exclus
n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific
m = ( xs - ys) / (2 * (x.shape[0] - 1))
n = n.sort(inplace=False)
m = m.sort(inplace=False)
print(n)
print(m)
nodes_included = 300 #int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific = 300 #int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO user the included score for the node size
n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
# Generic:
# Generic:
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
# Specific:
# Specific:
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
x_index = pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)]
......@@ -236,26 +236,28 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
# Removing unconnected nodes
xxx = xx.values
threshold = min(xxx.max(axis=1))
matrix_filtered = np.where(xxx > threshold, xxx, 0)
matrix_filtered = np.where(xxx >= threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
except:
PrintException()
try:
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.from_numpy_matrix(matrix_filtered, create_using=nx.MultiDiGraph())
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(xx.columns)])))
#print(G)
# Removing too connected nodes (find automatic way to do it)
#edges_to_remove = [ e for e in G.edges_iter() if
degree = G.degree()
to_remove = [n for n in degree if degree[n] <= 1]
G.remove_nodes_from(to_remove)
nodes_to_remove = [n for n in degree if degree[n] <= 1]
G.remove_nodes_from(nodes_to_remove)
partition = best_partition(G)
except:
PrintException()
if type == "node_link":
......@@ -270,7 +272,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
# G.add_edge(node, "cluster " + str(partition[node]), weight=3)
except Exception as error:
print("error01: ",error)
data = json_graph.node_link_data(G)
links = []
......@@ -285,7 +287,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
# print(data)
data["links"] = []
data["links"] = links
elif type == "adjacency":
for node in G.nodes():
try:
......@@ -298,7 +300,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
except Exception as error:
print("error02: ",error)
data = json_graph.node_link_data(G)
# data = json_graph.node_link_data(G, attrs={\
# 'source':'source',\
......
......@@ -82,9 +82,8 @@ print('Initialize node types...')
node_types = [
'Root', 'Trash',
'Project', 'Corpus', 'Document',
'Stem', 'Lem', 'Tfidf',
'Synonym',
'MiamList', 'StopList',
'MiamList', 'StopList', 'MainList',
'Stem', 'Lem', 'Group', 'Tfidf',
'Cooccurrence', 'WhiteList', 'BlackList'
]
......@@ -93,6 +92,20 @@ for node_type in node_types:
# Integration: resource types
print('Initialize users...')
me = session.query(User).filter(User.username=='alexandre').first()
gargantua = session.query(User).filter(User.username=='gargantua').first()
node_root = Node(user_id=gargantua.id, type_id=cache.NodeType['Root'].id, name='Root')
node_stem = Node(user_id=gargantua.id, type_id=cache.NodeType['Stem'].id, name='Stem', parent_id=node_root.id)
node_lem = Node(user_id=gargantua.id, type_id=cache.NodeType['Lem'].id, name='Lem', parent_id=node_root.id)
session.add(node_root)
session.add(node_stem)
session.add(node_lem)
session.commit()
print('Initialize resource...')
from parsing.parsers_config import parsers
......
......@@ -35,7 +35,6 @@ extract_ngrams(corpus, ('title', ))
# print(corpus)
# corpus = session.query(Node).filter(Node.id == 72771).first()
# corpus = session.query(Node).filter(Node.id == 73017).first()
compute_tfidf(corpus)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment