Commit be968d2c authored by Administrator

[FIX] fix conflicts.

parent 04b49e8a
...@@ -28,3 +28,4 @@ def PrintException(): ...@@ -28,3 +28,4 @@ def PrintException():
line = linecache.getline(filename, lineno, f.f_globals) line = linecache.getline(filename, lineno, f.f_globals)
print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj)) print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj))
...@@ -17,23 +17,23 @@ def create_blacklist(user, corpus): ...@@ -17,23 +17,23 @@ def create_blacklist(user, corpus):
def create_synonymes(user, corpus): def create_synonymes(user, corpus):
pass pass
size = 1000
size = 1000
def create_whitelist(user, corpus_id, size=size, count_min=2): def create_whitelist(user, corpus_id, size=size, count_min=2):
cursor = connection.cursor() cursor = connection.cursor()
whitelist_type_id = cache.NodeType['WhiteList'].id whitelist_type_id = cache.NodeType['WhiteList'].id
blacklist_type_id = cache.NodeType['BlackList'].id blacklist_type_id = cache.NodeType['BlackList'].id
type_document_id = cache.NodeType['Document'].id type_document_id = cache.NodeType['Document'].id
white_list = Node(name='WhiteList Corpus ' + str(corpus_id), user_id=user.id, parent_id=corpus_id, type_id=whitelist_type_id) white_list = Node(name='WhiteList Corpus ' + str(corpus_id), user_id=user.id, parent_id=corpus_id, type_id=whitelist_type_id)
black_list = Node(name='BlackList Corpus ' + str(corpus_id), user_id=user.id, parent_id=corpus_id, type_id=blacklist_type_id) black_list = Node(name='BlackList Corpus ' + str(corpus_id), user_id=user.id, parent_id=corpus_id, type_id=blacklist_type_id)
session.add(white_list) session.add(white_list)
session.add(black_list) session.add(black_list)
session.commit() session.commit()
# delete avant pour éviter les doublons # delete avant pour éviter les doublons
# try: # try:
...@@ -105,21 +105,21 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start ...@@ -105,21 +105,21 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start
COUNT(*) AS score COUNT(*) AS score
FROM FROM
node_node AS n -- the nodes who are direct children of the corpus node_node AS n -- the nodes who are direct children of the corpus
INNER JOIN INNER JOIN
node_node_ngram AS nngX ON nngX.node_id = n.id -- list of ngrams contained in the node node_node_ngram AS nngX ON nngX.node_id = n.id -- list of ngrams contained in the node
INNER JOIN INNER JOIN
node_node_ngram AS whitelistX ON whitelistX.ngram_id = nngX.ngram_id -- list of ngrams contained in the whitelist and in the node node_node_ngram AS whitelistX ON whitelistX.ngram_id = nngX.ngram_id -- list of ngrams contained in the whitelist and in the node
INNER JOIN INNER JOIN
node_ngram AS ngX ON ngX.id = whitelistX.ngram_id -- ngrams which are in both node_ngram AS ngX ON ngX.id = whitelistX.ngram_id -- ngrams which are in both
INNER JOIN INNER JOIN
node_node_ngram AS nngY ON nngY.node_id = n.id node_node_ngram AS nngY ON nngY.node_id = n.id
INNER JOIN INNER JOIN
node_node_ngram AS whitelistY ON whitelistY.ngram_id = nngY.ngram_id node_node_ngram AS whitelistY ON whitelistY.ngram_id = nngY.ngram_id
INNER JOIN INNER JOIN
node_ngram AS ngY ON ngY.id = whitelistY.ngram_id node_ngram AS ngY ON ngY.id = whitelistY.ngram_id
WHERE WHERE
n.parent_id = %s n.parent_id = %s
AND AND
...@@ -128,13 +128,13 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start ...@@ -128,13 +128,13 @@ def create_cooc(user=None, corpus_id=None, whitelist=None, size=size, year_start
whitelistY.node_id = %s whitelistY.node_id = %s
AND AND
nngX.ngram_id < nngY.ngram_id -- so we only get distinct pairs of ngrams nngX.ngram_id < nngY.ngram_id -- so we only get distinct pairs of ngrams
GROUP BY GROUP BY
ngX.id, ngX.id,
ngX.terms, ngX.terms,
ngY.id, ngY.id,
ngY.terms ngY.terms
ORDER BY ORDER BY
score DESC score DESC
LIMIT LIMIT
...@@ -153,9 +153,9 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size= ...@@ -153,9 +153,9 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
import networkx as nx import networkx as nx
from networkx.readwrite import json_graph from networkx.readwrite import json_graph
from gargantext_web.api import JsonHttpResponse from gargantext_web.api import JsonHttpResponse
from analysis.louvain import best_partition from analysis.louvain import best_partition
#print(corpus_id, cooc_id) #print(corpus_id, cooc_id)
try: try:
...@@ -172,7 +172,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size= ...@@ -172,7 +172,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=size) cooccurrence_node_id = create_cooc(user=request.user, corpus_id=corpus_id, whitelist=whitelist, size=size)
else: else:
cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() cooccurrence_node_id = session.query(Node.id).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first()
for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all(): for cooccurrence in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooccurrence_node_id).all():
...@@ -192,41 +192,41 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size= ...@@ -192,41 +192,41 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
x = pd.DataFrame(matrix).fillna(0) x = pd.DataFrame(matrix).fillna(0)
y = pd.DataFrame(matrix).fillna(0) y = pd.DataFrame(matrix).fillna(0)
#xo = diag_null(x) #xo = diag_null(x)
#y = diag_null(y) #y = diag_null(y)
x = x / x.sum(axis=1) x = x / x.sum(axis=1)
y = y / y.sum(axis=0) y = y / y.sum(axis=0)
#print(x) #print(x)
xs = x.sum(axis=1) - x xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x ys = x.sum(axis=0) - x
# top inclus ou exclus # top inclus ou exclus
n = ( xs + ys) / (2 * (x.shape[0] - 1)) n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific # top generic or specific
m = ( xs - ys) / (2 * (x.shape[0] - 1)) m = ( xs - ys) / (2 * (x.shape[0] - 1))
n = n.sort(inplace=False) n = n.sort(inplace=False)
m = m.sort(inplace=False) m = m.sort(inplace=False)
print(n) print(n)
print(m) print(m)
nodes_included = 300 #int(round(size/20,0)) nodes_included = 300 #int(round(size/20,0))
#nodes_excluded = int(round(size/10,0)) #nodes_excluded = int(round(size/10,0))
nodes_specific = 300 #int(round(size/10,0)) nodes_specific = 300 #int(round(size/10,0))
#nodes_generic = int(round(size/10,0)) #nodes_generic = int(round(size/10,0))
# TODO user the included score for the node size # TODO user the included score for the node size
n_index = pd.Index.intersection(x.index, n.index[:nodes_included]) n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
# Generic: # Generic:
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic]) #m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
# Specific: # Specific:
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:]) m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
x_index = pd.Index.union(n_index, m_index) x_index = pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)] xx = x[list(x_index)].T[list(x_index)]
...@@ -236,26 +236,28 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size= ...@@ -236,26 +236,28 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
# Removing unconnected nodes # Removing unconnected nodes
xxx = xx.values xxx = xx.values
threshold = min(xxx.max(axis=1)) threshold = min(xxx.max(axis=1))
matrix_filtered = np.where(xxx > threshold, xxx, 0) matrix_filtered = np.where(xxx >= threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90)) #matrix_filtered = matrix_filtered.resize((90,90))
except: except:
PrintException() PrintException()
try: try:
G = nx.from_numpy_matrix(matrix_filtered) G = nx.from_numpy_matrix(matrix_filtered, create_using=nx.MultiDiGraph())
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(xx.columns)]))) G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(xx.columns)])))
#print(G) #print(G)
# Removing too connected nodes (find automatic way to do it) # Removing too connected nodes (find automatic way to do it)
#edges_to_remove = [ e for e in G.edges_iter() if
degree = G.degree() degree = G.degree()
to_remove = [n for n in degree if degree[n] <= 1] nodes_to_remove = [n for n in degree if degree[n] <= 1]
G.remove_nodes_from(to_remove) G.remove_nodes_from(nodes_to_remove)
partition = best_partition(G) partition = best_partition(G)
except: except:
PrintException() PrintException()
if type == "node_link": if type == "node_link":
...@@ -270,7 +272,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size= ...@@ -270,7 +272,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
# G.add_edge(node, "cluster " + str(partition[node]), weight=3) # G.add_edge(node, "cluster " + str(partition[node]), weight=3)
except Exception as error: except Exception as error:
print("error01: ",error) print("error01: ",error)
data = json_graph.node_link_data(G) data = json_graph.node_link_data(G)
links = [] links = []
...@@ -285,7 +287,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size= ...@@ -285,7 +287,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
# print(data) # print(data)
data["links"] = [] data["links"] = []
data["links"] = links data["links"] = links
elif type == "adjacency": elif type == "adjacency":
for node in G.nodes(): for node in G.nodes():
try: try:
...@@ -298,7 +300,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size= ...@@ -298,7 +300,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', size=
except Exception as error: except Exception as error:
print("error02: ",error) print("error02: ",error)
data = json_graph.node_link_data(G) data = json_graph.node_link_data(G)
# data = json_graph.node_link_data(G, attrs={\ # data = json_graph.node_link_data(G, attrs={\
# 'source':'source',\ # 'source':'source',\
......
...@@ -82,9 +82,8 @@ print('Initialize node types...') ...@@ -82,9 +82,8 @@ print('Initialize node types...')
node_types = [ node_types = [
'Root', 'Trash', 'Root', 'Trash',
'Project', 'Corpus', 'Document', 'Project', 'Corpus', 'Document',
'Stem', 'Lem', 'Tfidf', 'MiamList', 'StopList', 'MainList',
'Synonym', 'Stem', 'Lem', 'Group', 'Tfidf',
'MiamList', 'StopList',
'Cooccurrence', 'WhiteList', 'BlackList' 'Cooccurrence', 'WhiteList', 'BlackList'
] ]
...@@ -93,6 +92,20 @@ for node_type in node_types: ...@@ -93,6 +92,20 @@ for node_type in node_types:
# Integration: resource types # Integration: resource types
print('Initialize users...')
me = session.query(User).filter(User.username=='alexandre').first()
gargantua = session.query(User).filter(User.username=='gargantua').first()
node_root = Node(user_id=gargantua.id, type_id=cache.NodeType['Root'].id, name='Root')
node_stem = Node(user_id=gargantua.id, type_id=cache.NodeType['Stem'].id, name='Stem', parent_id=node_root.id)
node_lem = Node(user_id=gargantua.id, type_id=cache.NodeType['Lem'].id, name='Lem', parent_id=node_root.id)
session.add(node_root)
session.add(node_stem)
session.add(node_lem)
session.commit()
print('Initialize resource...') print('Initialize resource...')
from parsing.parsers_config import parsers from parsing.parsers_config import parsers
......
...@@ -35,7 +35,6 @@ extract_ngrams(corpus, ('title', )) ...@@ -35,7 +35,6 @@ extract_ngrams(corpus, ('title', ))
# print(corpus)
# corpus = session.query(Node).filter(Node.id == 72771).first() # corpus = session.query(Node).filter(Node.id == 72771).first()
# corpus = session.query(Node).filter(Node.id == 73017).first() # corpus = session.query(Node).filter(Node.id == 73017).first()
compute_tfidf(corpus) compute_tfidf(corpus)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment