Commit cb3e51d9 authored by Romain Loth's avatar Romain Loth

Merge testing (commit 'd9d93ae0') into romain-testing

(importing the graph advances in my branch)
parents d5dfc9c4 d9d93ae0
......@@ -395,7 +395,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5
# Graph constraints to compute the graph:
# Modes: live graph generation, graph asynchronously computed or errors detected
# here are the maximum size of corpus and maplist required to compute the graph
graph_constraints = {'corpusMax' : 599
graph_constraints = {'corpusMax' : 100
,'corpusMin' : 40
,'mapList' : 50
}
......@@ -6,17 +6,14 @@ from rest_framework.views import APIView
from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from rest_framework.permissions import IsAuthenticated
from gargantext.models import Node, Ngram, NodeNgram, NodeNodeNgram, NodeNode
from gargantext.constants import RESOURCETYPES, NODETYPES, get_resource
from gargantext.models import Node, Ngram, NodeNgram, NodeNodeNgram, NodeNode
from gargantext.util.db import session, delete, func, bulk_insert
from gargantext.util.db_cache import cache, or_
from gargantext.util.validation import validate
from gargantext.models import Node, Ngram, NodeNgram, NodeNodeNgram, NodeNode
from gargantext.constants import RESOURCETYPES, NODETYPES, get_resource
from gargantext.util.http import ValidationException, APIView, JsonHttpResponse, get_parameters
from gargantext.util.files import upload
from gargantext.util.db import session, delete, func, bulk_insert
from gargantext.util.http import ValidationException, APIView, JsonHttpResponse, get_parameters
from gargantext.util.scheduling import scheduled
from gargantext.util.validation import validate
#import
......
Module Graph Explorer: from text to graph
=========================================
## How to contribute ?
Some solutions:
1) please report to dev@gargantext.org
2) fix with git repo and pull request
## Graph Explorer main
0) All urls.py of the Graph Explorer
1) Main view of the graph explorer: views.py
2) Data are retrieved as REST: rest.py
3) Graph is generated (graph.py) through different steps
-> Graph Explorer
-> My graph View
-> REST API to get Data
2) Graph is generated (graph.py) through different steps
a) check the constraints (graph_constraints) in gargantext/constants.py
b) Cooccurrences are computed (in live or asynchronously): cooccurrences.py
c) Threshold and distances : distances.py
d) clustering: louvain.py
c) links between communities: bridgeness.py
b) Data are retrieved as REST
rest.py: check REST parameters
c) graph.py:
get_graph: check Graph parameters
compute_graph: compute graph
1) Cooccurrences are computed (in live or asynchronously): cooccurrences.py
2) Threshold and distances : distances.py
3) clustering: louvain.py
4) links between communities: bridgeness.py
d) compress graph before returning it: utils.py
4) Additional features:
a) intersection of graphs: intersection.py
## How to contribute ?
Some solutions:
1) please report to dev@gargantext.org
2) fix with git repo and pull request
## TODO
1) save parameters in hyperdata
2) graph explorer:
* save current graph
2) myGraphs view:
myGraphs view:
* progress bar
* Show already computed graphs vs to be computed with parameters
* show parameters
* copy / paste and change some parameters to generate new graph
......@@ -8,22 +8,27 @@ from networkx.readwrite import json_graph
def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
'''
What is bridgeness ?
Bridgeness = measure to control links (bridges) between communities.
'''
# Data are stored in a dict(), (== hashmap by default for Python)
# Data are stored in a dict(), (== hashmap by default with Python)
data = dict()
if type == "node_link":
nodesB_dict = {}
for node_id in G.nodes():
#node,type(labels[node])
nodesB_dict [ ids[node_id][1] ] = True
# TODO the query below is not optimized (do it do_distance).
the_label = session.query(Ngram.terms).filter(Ngram.id==node_id).first()
the_label = ", ".join(the_label)
G.node[node_id]['label'] = the_label
G.node[node_id]['size'] = weight[node_id]
G.node[node_id]['type'] = ids[node_id][0].replace("ngrams","terms")
G.node[node_id]['attributes'] = { "clust_default": partition[node_id]} # new format
# G.add_edge(node, "cluster " + str(partition[node]), weight=3)
......@@ -65,12 +70,20 @@ def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
if bridgeness > 0:
for c1 in com_link.keys():
for c2 in com_link[c1].keys():
index = round(bridgeness*len(com_link[c1][c2]) / (len(com_ids[c1]) + len(com_ids[c2])))
index = round(
bridgeness * len( com_link[c1][c2] )
/ #----------------------------------#
( len(com_ids[c1]) + len(com_ids[c2] ))
)
#print((c1,len(com_ids[c1])), (c2,len(com_ids[c2])), index)
if index > 0:
for link in sorted(com_link[c1][c2], key=lambda x: x[2], reverse=True)[:index]:
for link in sorted( com_link[c1][c2]
, key=lambda x: x[2]
, reverse=True)[:index]:
#print(c1, c2, link[2])
info = {"s": link[0], "t": link[1], "w": link[2]}
links.append(info)
......
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
NodeHyperdata, HyperdataKey
from gargantext.util.db import session, aliased, bulk_insert, func
from gargantext.util.db import session, aliased, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from sqlalchemy import desc, asc, or_, and_
from datetime import datetime
#import inspect
import datetime
from celery import shared_task
def filterMatrix(matrix, mapList_id, groupList_id):
    '''
    Filter a cooccurrence matrix with a map list and a group list.

    matrix       :: the matrix to filter (countCooccurrences passes a
                    WeightedMatrix built from its cooccurrence query)
    mapList_id   :: id used to build the UnweightedList (map list)
    groupList_id :: id used to build the Translations (group list)

    Returns the result of `matrix & (mapList * group_list)`.
    NOTE(review): the exact semantics of `*` and `&` are defined by the
    classes of gargantext.util.lists -- presumably "apply the groupings to
    the map list, then keep only the matching ngrams"; confirm there.
    '''
    # Build each list exactly once (the map list used to be built twice,
    # the first instance being thrown away).
    mapList    = UnweightedList(mapList_id)
    group_list = Translations(groupList_id)

    return matrix & (mapList * group_list)
@shared_task
def countCooccurrences( corpus_id=None , test= False
def countCooccurrences( corpus_id=None , cooc_id=None
, field1='ngrams' , field2='ngrams'
, start=None , end=None
, mapList_id=None , groupList_id=None
, distance=None , bridgeness=None
, n_min=1, n_max=None , limit=1000
, coocNode_id=None , reset=True
, isMonopartite=True , threshold = 3
, save_on_db= False, # just return the WeightedMatrix,
# (don't write to DB)
, save_on_db= True , reset=True
):
'''
Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
For the moment list of paramters are not supported because, lists need to
For the moment list of parameters are not supported because, lists need to
be merged before.
corpus :: Corpus
mapList_id :: Int
groupList_id :: Int
For the moment, start and end are simple, only year is implemented yet
start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
end :: TimeStamp
limit :: Int
'''
# TODO : add hyperdata here
# Security test
field1,field2 = str(field1), str(field2)
# Parameters to save in hyperdata of the Node Cooc
# FIXME remove the lines below after factorization of parameters
parameters = dict()
parameters['field1'] = field1
parameters['field2'] = field2
......@@ -57,17 +47,17 @@ def countCooccurrences( corpus_id=None , test= False
# Get corpus as Python object
corpus = session.query(Node).filter(Node.id==corpus_id).first()
# Get node
if not coocNode_id:
coocNode_id0 = ( session.query( Node.id )
# Get node of the Graph
if not cooc_id:
cooc_id = ( session.query( Node.id )
.filter( Node.typename == "COOCCURRENCES"
, Node.name == "GRAPH EXPLORER"
, Node.parent_id == corpus.id
)
.first()
)
if not coocNode_id:
if not cooc_id:
coocNode = corpus.add_child(
typename = "COOCCURRENCES",
name = "GRAPH (in corpus %s)" % corpus.id
......@@ -75,12 +65,16 @@ def countCooccurrences( corpus_id=None , test= False
session.add(coocNode)
session.commit()
coocNode_id = coocNode.id
cooc_id = coocNode.id
else :
coocNode_id = coocNode_id[0]
cooc_id = int(cooc_id[0])
else:
print("GRAPH #%s ... Loading cooccurrences computed already." % cooc_id)
cooc = session.query( NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.weight ).filter( NodeNgramNgram.node_id == cooc_id ).all()
return(int(cooc_id),WeightedMatrix(cooc))
if reset == True :
session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == coocNode_id ).delete()
session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == cooc_id ).delete()
session.commit()
......@@ -161,8 +155,8 @@ def countCooccurrences( corpus_id=None , test= False
# Cooc between the dates start and end
if start is not None:
#date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
# TODO : more complexe date format here.
date_start = datetime.datetime.strptime (str(start), "%Y-%m-%d")
# TODO : more precise date format here (day is smaller grain actually).
date_start = datetime.strptime (str(start), "%Y-%m-%d")
date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")
Start=aliased(NodeHyperdata)
......@@ -177,8 +171,8 @@ def countCooccurrences( corpus_id=None , test= False
if end is not None:
# TODO : more complexe date format here.
date_end = datetime.datetime.strptime (str(end), "%Y-%m-%d")
# TODO : more precise date format here (day is smaller grain actually).
date_end = datetime.strptime (str(end), "%Y-%m-%d")
date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")
End=aliased(NodeHyperdata)
......@@ -208,22 +202,30 @@ def countCooccurrences( corpus_id=None , test= False
#cooc_query = cooc_query.order_by(desc('cooc_score'))
matrix = WeightedMatrix(cooc_query)
print("GRAPH #%s Filtering the matrix with Map and Group Lists." % cooc_id)
cooc = filterMatrix(matrix, mapList_id, groupList_id)
parameters['MapList_id'] = str(mapList_id)
parameters['GroupList_id'] = str(mapList_id)
parameters['MapList_id'] = str(mapList_id)
parameters['GroupList_id'] = str(groupList_id)
# TODO factorize savings on db
if save_on_db:
# Saving cooc Matrix
cooc.save(coocNode_id)
# Saving the cooccurrences
cooc.save(cooc_id)
print("GRAPH #%s ... Node Cooccurrence Matrix saved" % cooc_id)
# Saving the parameters
coocNode = session.query(Node).filter(Node.id==coocNode_id).first()
coocNode.hyperdata = parameters
print("GRAPH #%s ... Parameters saved in Node." % cooc_id)
coocNode = session.query(Node).filter(Node.id==cooc_id).first()
coocNode.hyperdata[distance] = dict()
coocNode.hyperdata[distance]["parameters"] = parameters
session.add(coocNode)
session.commit()
# Log message
print("Cooccurrence Matrix saved")
#data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
#return data
return cooc
return(coocNode.id, cooc)
......@@ -16,16 +16,16 @@ import networkx as nx
def clusterByDistances( cooc_matrix
, field1=None, field2=None
, distance='conditional'):
, distance=None):
'''
do_distance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
clusterByDistance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
'''
# implicit global session
authorized = ['conditional', 'distributional', 'cosine']
if distance not in authorized:
distance = 'conditional'
raise ValueError("Distance must be in %s" % str(authorized))
matrix = defaultdict(lambda : defaultdict(float))
ids = defaultdict(lambda : defaultdict(int))
......
This diff is collapsed.
#from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from gargantext.util.db import session
from gargantext.models.nodes import Node
from graph.graph import get_graph
from graph.utils import compress_graph, format_html
from gargantext.util.http import APIView, APIException\
, JsonHttpResponse, requires_auth
from gargantext.constants import graph_constraints
from traceback import format_tb
def compress_graph(graphdata):
"""
Shrink a graph dict in place (shorter keys, fewer decimals) and return it.

graph data is usually a dict with 2 slots:
"nodes": [{"id":4103, "type":"terms", "attributes":{"clust_default": 0}, "size":29, "label":"regard"},...]
"links": [{"t": 998,"s": 768,"w": 0.0425531914893617},...]
To send this data over the net, this function can reduce a lot of its size:
- keep less decimals for float value of each link's weight
(the weight is replaced by a string formatted with '.3f')
- use shorter names for node properties (eg: s/clust_default/cl/)
- drop the node "type" when it is "terms", otherwise rename it to "t"
result format:
"nodes": [{"id":4103, "at":{"cl": 0}, "s":29, "lb":"regard"},...]
"links": [{"t": 998,"s": 768,"w": "0.043"},...]
"""
for link in graphdata['links']:
link['w'] = format(link['w'], '.3f') # keep only 3 decimals
for node in graphdata['nodes']:
# label -> lb
node['lb'] = node['label']
del node['label']
# attributes -> at
node['at'] = node['attributes']
del node['attributes']
# attributes.clust_default -> at.cl
node['at']['cl'] = node['at']['clust_default']
del node['at']['clust_default']
# size -> s
node['s'] = node['size']
del node['size']
# NOTE(review): this import looks like a diff line interleaved into the
# loop body rather than part of this function -- confirm before moving it.
from traceback import format_tb
if node['type'] == "terms":
# its the default type for our format: so we don't need it
del node['type']
else:
# any other type is kept under the short key "t"
node['t'] = node['type']
del node['type']
# the very same (mutated) dict is handed back
return graphdata
def format_html(link):
    """
    Wrap a link in an <a class='msglink'> tag for our json messages.

    The link is used both as the href and as the visible text.
    It is inserted as-is (no HTML escaping is done here).
    """
    anchor_template = "<a class='msglink' href='%s'>%s</a>"
    return anchor_template % (link, link)
# TODO check authentication
class Graph(APIView):
'''
REST part for graphs.
......@@ -75,6 +27,16 @@ class Graph(APIView):
# Get the node we are working with
corpus = session.query(Node).filter(Node.id==corpus_id).first()
# TODO Parameters to save in hyperdata of the Node Cooc
# WARNING: we could factorize the parameters as dict but ...
# ... it causes a bug in asynchronous function !
# Check celery upgrades before.
# Example (for the future):
# parameters = dict()
# parameters['field1'] = field1
# parameters['field2'] = field2
# Get all the parameters in the URL
cooc_id = request.GET.get ('cooc_id' , None )
saveOnly = request.GET.get ('saveOnly' , None )
......@@ -94,8 +56,8 @@ class Graph(APIView):
type_ = str(request.GET.get ('type' , 'node_link' ))
distance = str(request.GET.get ('distance' , 'conditional'))
# Get default value if no map list
# Get default map List of corpus
if mapList_id == 0 :
mapList_id = ( session.query ( Node.id )
.filter( Node.typename == "MAPLIST"
......@@ -107,7 +69,6 @@ class Graph(APIView):
mapList_id = mapList_id[0]
if mapList_id == None :
# todo add as an error msg ?
raise ValueError("MAPLIST node needed for cooccurrences")
......@@ -123,36 +84,26 @@ class Graph(APIView):
groupList_id = groupList_id[0]
if groupList_id == None :
# todo add as an error msg ?
raise ValueError("GROUPLIST node needed for cooccurrences")
# Check the options
# Declare accepted fields
accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
accepted_field2 = ['ngrams', ]
options = ['start', 'end', 'threshold', 'distance', 'cooc_id' ]
try:
# Test params
# Check if parameters are accepted
if (field1 in accepted_field1) and (field2 in accepted_field2):
if start is not None and end is not None :
data = get_graph( corpus=corpus, cooc_id = cooc_id
#, field1=field1 , field2=field2
, mapList_id = mapList_id , groupList_id = groupList_id
, start=start , end=end
, threshold =threshold , distance=distance
, saveOnly=saveOnly
)
else:
data = get_graph( corpus = corpus, cooc_id = cooc_id
#, field1=field1, field2=field2
, mapList_id = mapList_id , groupList_id = groupList_id
, threshold = threshold
, distance = distance
, bridgeness = bridgeness
, saveOnly=saveOnly
)
data = get_graph( corpus=corpus, cooc_id = cooc_id
, field1=field1 , field2=field2
, mapList_id = mapList_id , groupList_id = groupList_id
, start=start , end=end
, threshold =threshold
, distance=distance , bridgeness=bridgeness
, saveOnly=saveOnly
)
# data :: Either (Dic Nodes Links) (Dic State Length)
......
def compress_graph(graphdata):
    """
    Shrink a graph payload in place before it is sent over the net.

    Expected input, a dict with 2 slots:
      "nodes": [{"id":4103, "type":"terms", "attributes":{"clust_default": 0}, "size":29, "label":"regard"},...]
      "links": [{"t": 998,"s": 768,"w": 0.0425531914893617},...]

    What is done:
      - each link weight "w" is replaced by a string with 3 decimals
      - node properties get shorter names:
            label -> lb, attributes -> at, clust_default -> cl, size -> s
      - the node "type" is removed when it is "terms" (the default type
        of our format), otherwise it is renamed to "t"

    Result format:
      "nodes": [{"id":4103, "at":{"cl": 0}, "s":29, "lb":"regard"},...]
      "links": [{"t": 998,"s": 768,"w": "0.043"},...]

    The same (mutated) graphdata object is returned.
    A node lacking one of the expected keys raises KeyError.
    """
    # links: keep only 3 decimals
    for edge in graphdata['links']:
        edge['w'] = format(edge['w'], '.3f')

    # nodes: shorter property names
    for vertex in graphdata['nodes']:
        vertex['lb'] = vertex.pop('label')

        attrs = vertex.pop('attributes')
        vertex['at'] = attrs
        attrs['cl'] = attrs.pop('clust_default')

        vertex['s'] = vertex.pop('size')

        # "terms" is the default type: only the other types are kept
        kind = vertex.pop('type')
        if kind != "terms":
            vertex['t'] = kind

    return graphdata
def format_html(link):
    """
    Wrap a link in an <a class='msglink'> tag for our json messages.

    The link is used both as the href and as the visible text.
    It is inserted as-is (no HTML escaping is done here).
    """
    anchor_template = "<a class='msglink' href='%s'>%s</a>"
    return anchor_template % (link, link)
......@@ -14,6 +14,8 @@ def explorer(request, project_id, corpus_id):
Graph explorer, also known as TinaWebJS, using SigmaJS.
Nodes are ngrams (from title or abstract or journal name).
Links represent proximity measures.
Data are received in RESTful mode (see rest.py).
'''
# we pass our corpus
......@@ -46,7 +48,10 @@ def explorer(request, project_id, corpus_id):
@requires_auth
def myGraphs(request, project_id, corpus_id):
'''
List all of my Graphs
List all of my Graphs.
Each Graph has one Node of Cooccurrences.
Each Graph is saved in the hyperdata of its Node.
'''
user = cache.User[request.user.id]
......
......@@ -26,7 +26,7 @@
<div class="col-md-5 content">
<li>
<h5>{{cooc.name}}</h5>
<h4>{{cooc.name}}</h4>
{{cooc.date}}
{% for key, value in coocs_count.items %}
{% if key == cooc.id %}
......@@ -40,7 +40,7 @@
<li> ~{{ value }} nodes with distances:
<ul>
<li>
<a href="/projects/{{project.id}}/corpora/{{corpus.id}}/explorer?cooc_id={{cooc.id}}&distance=distributional&bridgeness=5">
<a href="/projects/{{project.id}}/corpora/{{corpus.id}}/explorer?cooc_id={{cooc.id}}&distance=conditional&bridgeness=5">
<span class="glyphicon glyphicon-eye-open" aria-hidden="true"></span>
Conditional
</a>
......
......@@ -102,10 +102,13 @@
<div class="panel-body">
<div class="container">
<ul>
<li>
Newsletters : <a target="blank" href="https://phplist.iscpif.fr/?p=subscribe&id=2">subscribe</a>
</li>
<li>
Mailing-lists
<ul>
<li>User mailing-list: soon</li>
<li>User mailing-list: soon </li>
<li>Devel mailing-list: soon</li>
</ul>
</li>
......
......@@ -294,7 +294,7 @@
<p>
Gargantext
<span class="glyphicon glyphicon-registration-mark" aria-hidden="true"></span>
, version 3.0.5.4,
, version 3.0.5.5,
<a href="http://www.cnrs.fr" target="blank" title="Institution that enables this project.">
Copyrights
<span class="glyphicon glyphicon-copyright-mark" aria-hidden="true"></span>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment