Commit 20061ba9 authored by delanoe

[FEAT] Adding bipartite cooccurrences as a REST API:

To run it:
1) ./manage syncdb
2) Add and parse a new corpus
3) Test it:
http://localhost:8000/api/corpus/$corpus_id/graph?field1=journal&field2=ngrams
http://localhost:8000/api/corpus/$corpus_id/graph?field1=authors&field2=ngrams

Next step: update the JSON format for the explorer.
parent 8a294673
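For a quick smoke test of the new endpoint, a client call along these lines should work (the corpus id 42 and the dev-server address are placeholders for a local setup):

```python
# Minimal client-side check of the new graph endpoint.
# Corpus id (42) and host are placeholders.
import requests

resp = requests.get(
    "http://localhost:8000/api/corpus/42/graph",
    params={"field1": "journal", "field2": "ngrams"},
)
resp.raise_for_status()
print(resp.json())  # node_link JSON describing the bipartite graph
```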
@@ -4,15 +4,13 @@ from sqlalchemy.orm import aliased
from sqlalchemy.sql import func
from gargantext_web.db import Node, Ngram, NodeNgram, NodeNgramNgram, \
NodeNodeNgram, NodeHyperdata, Hyperdata
NodeNodeNgram, NodeHyperdataNgram, NodeHyperdata, Hyperdata
from gargantext_web.db import session, cache, get_or_create_node, bulk_insert
from analysis.lists import WeightedMatrix, UnweightedList, Translations
import inspect
# keep list
def do_cooc(corpus=None
, field_X=None, field_Y=None
, field1='ngrams', field2='ngrams'
, miam_id=None, stop_id=None, group_id=None
, cvalue_id=None
, n_min=2, n_max=None
@@ -34,59 +32,103 @@ def do_cooc(corpus=None
limit :: Int
'''
# TODO : add hyperdata here
# Sanity check: coerce field names to strings
field1,field2 = str(field1), str(field2)
# Get node
node_cooc = get_or_create_node(nodetype='Cooccurrence', corpus=corpus
, name_str="Cooccurrences corpus " + str(corpus.id) + "list_id: " + str(miam_id)
, name_str="Cooccurrences corpus " \
+ str(corpus.id) + "list_id: " + str(miam_id)
#, hyperdata={'field1': field1, 'field2':field2}
)
# BEGIN
# Saving the parameters of the analysis in the Node JSONB hyperdata field
args, _, _, parameters = inspect.getargvalues(inspect.currentframe())
hyperdata = dict()
for parameter in parameters.keys():
if parameter != 'corpus' and parameter != 'node_cooc':
hyperdata[parameter] = parameters[parameter]
node_cooc.hyperdata = hyperdata
session.add(node_cooc)
session.commit()
# END
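The BEGIN/END block above snapshots the call's parameters into the node's JSONB hyperdata via inspect. A minimal standalone sketch of that idiom (the function name and parameters here are illustrative, not the real signature):

```python
import inspect

def analyse(corpus=None, n_min=2, n_max=None):
    # getargvalues(frame) returns (args, varargs, keywords, locals);
    # at function entry, locals holds exactly the declared parameters.
    args, _, _, values = inspect.getargvalues(inspect.currentframe())
    # Keep everything except objects we do not want serialized.
    return {name: values[name] for name in args if name != 'corpus'}

print(analyse(n_min=3))  # {'n_min': 3, 'n_max': None}
```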
session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==node_cooc.id).delete()
session.commit()
NodeNgramX = aliased(NodeNgram)
NodeNgramY = aliased(NodeNgram)
cooc_score = func.sqrt(func.sum(NodeNgramX.weight) * func.sum(NodeNgramY.weight)).label('cooc_score')
doc_id = cache.NodeType['Document'].id
cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
.join(Node, Node.id == NodeNgramX.node_id)
.join(NodeNgramY, NodeNgramY.node_id == Node.id)
.filter(Node.parent_id==corpus.id, Node.type_id==doc_id)
)
# Size of the ngrams between n_min and n_max
if field1 == field2 == 'ngrams' :
isMonopartite = True
else:
isMonopartite = False
hyperdata_id = session.query(Hyperdata).filter(Hyperdata.name=='source').first().id
test_query = (session.query(NodeHyperdataNgram)
.join(Node, Node.id == NodeHyperdataNgram.node_id)
.filter(Node.parent_id==corpus.id, Node.type_id==doc_id)
.filter(NodeHyperdataNgram.hyperdata_id==hyperdata_id)
)
#print([n for n in test_query])
if isMonopartite :
NodeNgramX = aliased(NodeNgram)
NodeNgramY = aliased(NodeNgram)
cooc_score = func.sqrt(func.sum(NodeNgramX.weight) * func.sum(NodeNgramY.weight)).label('cooc_score')
cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
.join(Node, Node.id == NodeNgramX.node_id)
.join(NodeNgramY, NodeNgramY.node_id == Node.id)
.filter(Node.parent_id==corpus.id, Node.type_id==doc_id)
)
else :
NodeNgramY = aliased(NodeNgram)
cooc_score = func.sqrt(func.sum(NodeHyperdataNgram.score) * func.sum(NodeNgramY.weight)).label('cooc_score')
cooc_query = (session.query(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id, cooc_score)
.join(Node, Node.id == NodeHyperdataNgram.node_id)
.join(NodeNgramY, NodeNgramY.node_id == Node.id)
.join(NodeHyperdata, NodeHyperdata.node_id==Node.id)
.join(Hyperdata, Hyperdata.id == NodeHyperdata.hyperdata_id)
.filter(Node.parent_id==corpus.id, Node.type_id==doc_id)
.filter(Hyperdata.name == field1)
)
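Both branches build the same kind of score: for a pair (x, y), the geometric mean of the two weight sums over the joined documents. A hedged reading of the cooc_score expression above, with w standing for NodeNgram.weight (or NodeHyperdataNgram.score for the first factor in the bipartite branch):

```latex
\mathrm{cooc}(x, y) \;=\; \sqrt{\Big(\sum_{d \,\ni\, x,\,y} w_{d,x}\Big)\Big(\sum_{d \,\ni\, x,\,y} w_{d,y}\Big)}
```

where d ranges over the corpus documents matched by the join, i.e. those containing both items.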
# Size of the ngrams between n_min and n_max
if n_min is not None or n_max is not None:
NgramX = aliased(Ngram)
if isMonopartite:
NgramX = aliased(Ngram)
cooc_query = cooc_query.join(NgramX, NgramX.id == NodeNgramX.ngram_id)
NgramY = aliased(Ngram)
cooc_query = (cooc_query
.join(NgramX, NgramX.id == NodeNgramX.ngram_id)
.join(NgramY, NgramY.id == NodeNgramY.ngram_id)
)
if n_min is not None:
cooc_query = (cooc_query
.filter(NgramX.n >= n_min)
.filter(NgramY.n >= n_min)
)
if isMonopartite:
cooc_query = cooc_query.filter(NgramX.n >= n_min)
if n_max is not None:
cooc_query = (cooc_query
.filter(NgramX.n <= n_max)
.filter(NgramY.n <= n_max)
)
if isMonopartite:
cooc_query = cooc_query.filter(NgramX.n <= n_max)
# Cooc between the dates start and end
if start is not None:
Start=aliased(NodeHyperdata)
StartFormat = aliased(Hyperdata)
@@ -107,49 +149,56 @@ def do_cooc(corpus=None
)
# Cooc is symmetric, take only the main cooccurrences and cut at the limit
cooc_query = (cooc_query
.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
.having(cooc_score > 4)
#.having(cooc_score > 1)
if isMonopartite:
# Cooc is symmetric, take only the main cooccurrences and cut at the limit
cooc_query = cooc_query.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
cooc_query = cooc_query.having(cooc_score > 1)
#.having(cooc_score > 1)
.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
.order_by(desc('cooc_score'))
if isMonopartite:
cooc_query = cooc_query.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
else:
cooc_query = cooc_query.group_by(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id)
#.limit(50)
)
cooc_query = cooc_query.order_by(desc('cooc_score'))
# END of the query
matrix = WeightedMatrix(cooc_query)
#print(matrix)
# Select according to some scores
if cvalue_id is not None :
#miam = get_or_create_node(nodetype='Cvalue', corpus=corpus)
cvalue_list = UnweightedList(session.query(NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.nodex_id == cvalue_id).all()
)
if miam_id is not None :
miam_list = UnweightedList(miam_id)
if stop_id is not None :
stop_list = UnweightedList(stop_id)
if group_id is not None :
group_list = Translations(group_id)
if miam_id is not None and stop_id is None and group_id is None :
cooc = matrix & miam_list
elif miam_id is not None and stop_id is not None and group_id is None :
cooc = matrix & (miam_list - stop_list)
elif miam_id is not None and stop_id is not None and group_id is not None :
print("miam_id is not None and stop_id is not None and group_id is not None")
#cooc = matrix & (miam_list * group_list - stop_list)
cooc = matrix & (miam_list - stop_list)
elif miam_id is not None and stop_id is None and group_id is not None :
cooc = matrix & (miam_list * group_list)
else :
cooc = matrix
if isMonopartite:
if miam_id is not None :
miam_list = UnweightedList(miam_id)
if stop_id is not None :
stop_list = UnweightedList(stop_id)
if group_id is not None :
group_list = Translations(group_id)
if miam_id is not None and stop_id is None and group_id is None :
cooc = matrix & miam_list
elif miam_id is not None and stop_id is not None and group_id is None :
cooc = matrix & (miam_list - stop_list)
elif miam_id is not None and stop_id is not None and group_id is not None :
print("miam_id is not None and stop_id is not None and group_id is not None")
#cooc = matrix & (miam_list * group_list - stop_list)
cooc = matrix & (miam_list - stop_list)
elif miam_id is not None and stop_id is None and group_id is not None :
cooc = matrix & (miam_list * group_list)
else :
cooc = matrix
else:
cooc = matrix
#print(cooc)
#print(" x " * 30)
cooc.save(node_cooc.id)
return(node_cooc.id)
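The miam/stop/group combinations above rely on the set-like algebra of WeightedMatrix, UnweightedList and Translations from analysis.lists. A rough dict-based sketch of the intended semantics; the real classes are project-specific and operate on database nodes, so everything below is illustrative only:

```python
# Illustrative semantics of `matrix & (miam_list - stop_list)`.
matrix = {(1, 2): 3.0, (1, 3): 2.0, (4, 2): 5.0}  # (x, y) -> cooc weight
miam = {1, 2, 3}                                   # main list ngram ids
stop = {3}                                         # stop list ngram ids

keep = miam - stop                                 # miam_list - stop_list
cooc = {pair: w for pair, w in matrix.items()
        if pair[0] in keep and pair[1] in keep}    # matrix & list
print(cooc)                                        # {(1, 2): 3.0}
```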
@@ -24,36 +24,28 @@ from rest_v1_0.api import JsonHttpResponse
from analysis.louvain import best_partition, generate_dendogram, partition_at_level
from ngram.lists import listIds
from sqlalchemy.orm import aliased
def diag_null(x):
return x - x * scipy.eye(x.shape[0])
size = 1000
def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=size):
def do_distance(cooc_id):
'''
get_cooc : to compute the graph.
do_distance :: Int -> (Graph, Partition, {ids}, {weight})
'''
#print([n for n in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).all()])
matrix = defaultdict(lambda : defaultdict(float))
ids = dict()
labels = dict()
weight = dict()
#if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
print("Coocurrences do not exist yet, create it.")
miam_id = get_or_create_node(nodetype='MiamList', corpus=corpus).id
stop_id = get_or_create_node(nodetype='StopList', corpus=corpus).id
group_id = get_or_create_node(nodetype='Group', corpus=corpus).id
cooc_id = get_or_create_node(nodetype='Cooccurrence', corpus=corpus).id
# data deleted each time
session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).delete()
cooc_id = do_cooc(corpus=corpus, miam_id=miam_id, group_id=group_id, stop_id=stop_id, limit=size)
Cooc = aliased(NodeNgramNgram)
#print([n for n in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).all()])
for cooc in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).all():
query = session.query(Cooc).filter(Cooc.node_id==cooc_id).all()
#print(query)
for cooc in query:
labels[cooc.ngramx_id] = cooc.ngramx_id
labels[cooc.ngramy_id] = cooc.ngramy_id
@@ -125,12 +117,31 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
degree = G.degree()
nodes_to_remove = [n for n in degree if degree[n] <= 1]
G.remove_nodes_from(nodes_to_remove)
uG = G.to_undirected()
partition = best_partition(uG)
print(partition)
partition = best_partition(G.to_undirected())
print("Density of the graph:", nx.density(G))
return(G,partition,ids,weight)
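do_distance loads the stored cooccurrence rows into a networkx graph, prunes leaves, and partitions with Louvain. A compact sketch of that pipeline under networkx 2.x conventions (best_partition stands in for analysis.louvain.best_partition, which is assumed to share python-louvain's shape):

```python
import networkx as nx

# Toy cooccurrence matrix: (ngram_x, ngram_y) -> weight.
matrix = {(1, 2): 5.0, (2, 3): 4.0, (1, 3): 2.0, (4, 1): 1.0}

G = nx.Graph()
for (x, y), w in matrix.items():
    G.add_edge(x, y, weight=w)

# Drop nodes of degree <= 1, as do_distance does before partitioning.
G.remove_nodes_from([n for n, d in list(G.degree()) if d <= 1])

# partition = best_partition(G.to_undirected())  # {node_id: community_id}
print("Density of the graph:", nx.density(G))
```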
def get_cooc(request=None, corpus=None
, field1='ngrams', field2='ngrams'
, cooc_id=None, type='node_link', size=1000):
'''
get_cooc : to compute the graph.
'''
#if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
print("Coocurrences do not exist yet, create it.")
miam_id = get_or_create_node(nodetype='MiamList', corpus=corpus).id
stop_id = get_or_create_node(nodetype='StopList', corpus=corpus).id
group_id = get_or_create_node(nodetype='Group', corpus=corpus).id
# data deleted each time
#cooc_id = get_or_create_node(nodetype='Cooccurrence', corpus=corpus).id
#session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).delete()
cooc_id = do_cooc(corpus=corpus, field1=field1, field2=field2
, miam_id=miam_id, group_id=group_id, stop_id=stop_id, limit=size)
G, partition, ids, weight = do_distance(cooc_id)
if type == "node_link":
for node in G.nodes():
......
@@ -252,7 +252,7 @@ def get_or_create_node(nodetype=None,corpus=None,corpus_id=None,name_str=None,hy
)
if name_str is not None:
node = node.filter(Node.name==name_str)
if parameters is not None:
if hyperdata is not None:
for k,v in hyperdata.items():
node = node.filter(Node.hyperdata[k] == v)
node = node.first()
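The one-line fix above replaces the stale `parameters` name with `hyperdata`. For reference, a hedged standalone version of that JSONB key filter, assuming Node.hyperdata is a PostgreSQL JSONB column (`.astext` is the usual way to compare against plain strings):

```python
# Hypothetical helper equivalent to the corrected filter above.
from gargantext_web.db import Node

def filter_by_hyperdata(query, hyperdata):
    for k, v in hyperdata.items():
        # JSONB key access; .astext compares the stored value as text.
        query = query.filter(Node.hyperdata[k].astext == str(v))
    return query
```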
......
@@ -68,6 +68,7 @@ urlpatterns = patterns('',
# Visualizations
url(r'^project/(\d+)/corpus/(\d+)/chart$', views.chart),
url(r'^project/(\d+)/corpus/(\d+)/explorer$', views.graph),
url(r'^project/(\d+)/corpus/(\d+)/explorer/(\d+)/(\d+)$', views.graph),
url(r'^project/(\d+)/corpus/(\d+)/matrix$', views.matrix),
url(r'^chart/corpus/(\d+)/data.csv$', views.send_csv), # => api.node.children('type' : 'data', 'format' : 'csv')
@@ -81,6 +82,8 @@ urlpatterns = patterns('',
url(r'^project/(\d+)/corpus/(\d+)/(\w+)/update$', views.update_nodes),
# TODO rest to update corpus and information for progress bar
url(r'^corpus/(\d+)/sankey$', views.sankey),
url(r'^corpus/(\d+)/sankey.csv$', views.sankey_csv),
############################################################################
url(r'^tests/', include('tests.urls')),
......
@@ -9,7 +9,7 @@ from admin.utils import DebugTime
from gargantext_web.db import *
from .parsers_config import parsers as _parsers
from ngram.tools import insert_ngrams
# keep all the parsers in a cache
class Parsers(defaultdict):
@@ -144,9 +144,32 @@ def parse_resources(corpus, user=None, user_id=None):
#print('I am here', node_hyperdata_lists.items())
hyperdata_set = set()
hyperdata_ngrams = set()
node_hyperdata_ngrams = set()
for field in ['source', 'authors', 'journal']:
hyperdata_set.add(session.query(Hyperdata.id).filter(Hyperdata.name==field).first()[0])
#print("hyperdata_set", hyperdata_set)
for key, values in node_hyperdata_lists.items():
#print('here', key, values)
bulk_insert(Node_Hyperdata, ['node_id', 'hyperdata_id', 'value_'+key], values)
if key == 'string':
for value in values:
if value[1] in hyperdata_set:
for val in value[2].split(', '):
hyperdata_ngrams.add((val, len(val.split(' '))))
node_hyperdata_ngrams.add((value[0], value[1], val))
#print(hyperdata_ngrams)
terms_id = insert_ngrams(list(hyperdata_ngrams))
bulk_insert(NodeHyperdataNgram
, ['node_id', 'hyperdata_id', 'ngram_id', 'score']
, [(node_id, hyperdata_id, terms_id[terms], 1)
for node_id, hyperdata_id, terms in list(node_hyperdata_ngrams)])
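In miniature, the splitting above turns a comma-separated hyperdata string (authors, journal, source) into one ngram per item plus a (node, hyperdata, ngram) link; all values below are made up:

```python
# Miniature version of the hyperdata-to-ngram splitting; values are made up.
node_id, hyperdata_id = 7, 3
value = "Smith J, Doe A"

hyperdata_ngrams = set()
node_hyperdata_ngrams = set()
for val in value.split(', '):
    hyperdata_ngrams.add((val, len(val.split(' '))))        # (terms, n)
    node_hyperdata_ngrams.add((node_id, hyperdata_id, val))

print(hyperdata_ngrams)  # {('Smith J', 2), ('Doe A', 2)}
```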
# mark the corpus as parsed
corpus.parsed = True
@@ -154,7 +177,6 @@ def parse_resources(corpus, user=None, user_id=None):
# ngrams extraction
from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
class NgramsExtractors(defaultdict):
def __init__(self):
# English
self['en'] = EnglishNgramsExtractor()
@@ -230,66 +252,15 @@ def extract_ngrams(corpus, keys):
#tag_id = 14
#print('tag_id_2', tag_id)
node_ngram_list[node_id][terms] += 1
ngrams_data.add((n, terms[:255]))
ngrams_data.add((terms[:255],n))
ngrams_language_data.add((terms, language_id))
ngrams_tag_data.add((terms, tag_id))
# insert ngrams to temporary table
dbg.show('find ids for the %d ngrams' % len(ngrams_data))
db, cursor = get_cursor()
cursor.execute('''
CREATE TEMPORARY TABLE tmp__ngrams (
id INT,
n INT NOT NULL,
terms VARCHAR(255) NOT NULL
)
''')
bulk_insert('tmp__ngrams', ['n', 'terms'], ngrams_data, cursor=cursor)
# retrieve ngram ids from already inserted stuff
cursor.execute('''
UPDATE
tmp__ngrams
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngrams.terms
''' % (Ngram.__table__.name, ))
# insert, then get the ids back
cursor.execute('''
INSERT INTO
%s (n, terms)
SELECT
n, terms
FROM
tmp__ngrams
WHERE
id IS NULL
''' % (Ngram.__table__.name, ))
ngram_ids = insert_ngrams(ngrams_data)
cursor.execute('''
UPDATE
tmp__ngrams
SET
id = ngram.id
FROM
%s AS ngram
WHERE
ngram.terms = tmp__ngrams.terms
AND
tmp__ngrams.id IS NULL
''' % (Ngram.__table__.name, ))
# get all ids
ngram_ids = dict()
cursor.execute('SELECT id, terms FROM tmp__ngrams')
for row in cursor.fetchall():
ngram_ids[row[1]] = row[0]
#
dbg.show('insert associations')
node_ngram_data = list()
for node_id, ngrams in node_ngram_list.items():
@@ -304,7 +275,3 @@ def extract_ngrams(corpus, keys):
# commit to database
db.commit()
from rest_v1_0.api import APIView, APIException, JsonHttpResponse, CsvHttpResponse
from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from gargantext_web.db import session, Node
from analysis.functions import get_cooc
class Graph(APIView):
authentication_classes = (SessionAuthentication, BasicAuthentication)
def get(self, request, corpus_id):
'''
Graph.get :: Get graph data via the REST API.
Parameters are read from the query string, e.g.:
graph?field1=ngrams&field2=ngrams
'''
field1 = request.GET.get('field1', 'ngrams')
field2 = request.GET.get('field2', 'ngrams')
format_ = request.GET.get('format', 'json')
type_ = request.GET.get('type', 'node_link')
corpus = session.query(Node).filter(Node.id==corpus_id).first()
accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
accepted_field2 = ['ngrams',]
if field1 in accepted_field1 :
if field2 in accepted_field2 :
data = get_cooc(corpus=corpus,field1=field1, field2=field2)
if format_ == 'json':
return JsonHttpResponse(data)
else:
return JsonHttpResponse({
'Warning USAGE' : 'One value for each field, among:'
, 'field1' : accepted_field1
, 'field2' : accepted_field2
})
@@ -2,7 +2,7 @@ from django.conf.urls import patterns, url
from gargantext_web import views_optimized
from rest_v1_0 import api, ngrams
from rest_v1_0 import api, ngrams, graph
urlpatterns = patterns('',
# REST URLS
@@ -34,6 +34,9 @@ urlpatterns = patterns('',
url(r'nodes/(\d+)/ngrams$', api.CorpusController.ngrams),
url(r'nodes/(\d+)/graph$', graph.Graph.as_view()),
url(r'corpus/(\d+)/graph$', graph.Graph.as_view()),
url(r'tfidf/(\d+)/(\w+)$', views_optimized.tfidf),
)