Commit b1ac2efb authored by delanoe's avatar delanoe

[FACTO] split graph function into 4 main functions.

parent a40f95bb
......@@ -114,9 +114,9 @@ from gargantext.util.taggers import *
LANGUAGES = {
'en': {
'tagger': TurboTagger,
# 'tagger': EnglishMeltTagger,
# 'tagger': NltkTagger,
#'tagger': TurboTagger,
'tagger': EnglishMeltTagger,
#'tagger': NltkTagger,
},
'fr': {
'tagger': FrenchMeltTagger,
......
Module Graph Explorer: from text to graph.
Maintainer: If you see bugs, please report to team@gargantext.org
# Gargantext lib
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
# Article coming soon
#from gargantext.util.toolchain.ngram_coocs import compute_coocs
from graphExplorer.distances import do_distance
from graphExplorer.cooccurrences import do_cooc
from gargantext.util.db import session
from gargantext.models.ngrams import Ngram
from collections import defaultdict
# Prelude lib
from copy import copy, deepcopy
from collections import defaultdict
from sqlalchemy.orm import aliased
# Math/Graph lib
import math
import pandas as pd
import numpy as np
import networkx as nx
from networkx.readwrite import json_graph
def get_cooc( request=None, corpus=None
, field1='ngrams', field2='ngrams'
, cooc_id=None , type='node_link'
, start=None , end=None
, threshold=1
, distance='conditional'
, isMonopartite=True # By default, we compute terms/terms graph
, size=1000
, bridgeness=5
, mapList_id = None , groupList_id = None
):
'''
get_cooc : to compute the graph.
'''
if mapList_id == None :
mapList_id = ( session.query ( Node.id )
.filter( Node.typename == "MAPLIST"
, Node.parent_id == corpus.id
)
.first()
)
if mapList_id == None :
raise ValueError("MAPLIST node needed for cooccurrences")
if groupList_id == None :
groupList_id = ( session.query ( Node.id )
.filter( Node.typename == "GROUPLIST"
, Node.parent_id == corpus.id
)
.first()
)
if groupList_id == None :
raise ValueError("GROUPLIST node needed for cooccurrences")
if corpus is None:
corpus = session.query(Node).filter(Node.id==corpus_id).first()
cooc_id = do_cooc( corpus=corpus
#, field1="ngrams", field2="ngrams"
, mapList_id=int(mapList_id[0]), groupList_id=int(groupList_id[0])
#, isMonopartite=True
, start=start , end =end
, threshold = threshold #, limit=size
)
G, partition, ids, weight = do_distance ( cooc_id
, field1="ngrams", field2="ngrams"
, isMonopartite=True
, distance=distance
)
def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
# Data are stored in a dict(), (== hashmap by default for Python)
data = dict()
if type == "node_link":
nodesB_dict = {}
for node_id in G.nodes():
#node,type(labels[node])
G.node[node_id]['pk'] = ids[node_id][1]
G.node[node_id]['pk'] = ids[node_id][1]
nodesB_dict [ ids[node_id][1] ] = True
# TODO the query below is not optimized (do it do_distance).
the_label = session.query(Ngram.terms).filter(Ngram.id==node_id).first()
the_label = ", ".join(the_label)
G.node[node_id]['label'] = the_label
G.node[node_id]['label'] = the_label
G.node[node_id]['size'] = weight[node_id]
G.node[node_id]['type'] = ids[node_id][0].replace("ngrams","terms")
G.node[node_id]['attributes'] = { "clust_default": partition[node_id]} # new format
G.node[node_id]['size'] = weight[node_id]
G.node[node_id]['type'] = ids[node_id][0].replace("ngrams","terms")
G.node[node_id]['attributes'] = { "clust_default": partition[node_id]} # new format
# G.add_edge(node, "cluster " + str(partition[node]), weight=3)
links = []
i=1
if bridgeness > 0:
com_link = defaultdict(lambda: defaultdict(list))
......@@ -107,7 +34,6 @@ def get_cooc( request=None, corpus=None
for k, v in partition.items():
com_ids[v].append(k)
for e in G.edges_iter():
s = e[0]
......@@ -180,5 +106,3 @@ def get_cooc( request=None, corpus=None
return(partition)
return(data)
......@@ -9,13 +9,13 @@ from sqlalchemy import desc, asc, or_, and_
#import inspect
import datetime
def do_cooc( corpus=None
, field1='ngrams' , field2='ngrams'
, start=None , end=None
, mapList_id=None , groupList_id=None
, n_min=1, n_max=None , limit=1000
, coocNode_id=None , reset=True
, isMonopartite=True , threshold = 3):
def countCooccurrences( corpus=None
, field1='ngrams' , field2='ngrams'
, start=None , end=None
, mapList_id=None , groupList_id=None
, n_min=1, n_max=None , limit=1000
, coocNode_id=None , reset=True
, isMonopartite=True , threshold = 3):
'''
Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id
For the moment, lists of parameters are not supported because lists need to
......
......@@ -14,9 +14,9 @@ import numpy as np
import pandas as pd
import networkx as nx
def do_distance( cooc_id
def clusterByDistances( cooc_id
, field1=None, field2=None
, isMonopartite=True, distance='conditional'):
, distance='conditional'):
'''
do_distance :: Int -> (Graph, Partition, {ids}, {weight})
'''
......
# Gargantext lib
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
#from gargantext.util.toolchain.ngram_coocs import compute_coocs
from graphExplorer.cooccurrences import countCooccurrences
from graphExplorer.distances import clusterByDistances
from graphExplorer.bridgeness import filterByBridgeness
# Prelude lib
from copy import copy, deepcopy
from collections import defaultdict
from sqlalchemy.orm import aliased
# Math/Graph lib
import math
import pandas as pd
import numpy as np
import networkx as nx
def get_graph( request=None         , corpus=None
             , field1='ngrams'      , field2='ngrams'
             , mapList_id = None    , groupList_id = None
             , cooc_id=None         , type='node_link'
             , start=None           , end=None
             , threshold=1
             , distance='conditional'
             , isMonopartite=True   # By default, we compute terms/terms graph
             , bridgeness=5
             #, size=1000
             ):
    '''
    get_graph : build the graph data of a corpus. Main steps:
    1) count cooccurrences (function countCooccurrences)
        main parameter: threshold
        (skipped when an existing cooc_id is given)
    2) filter and cluster by distances (function clusterByDistances)
        main parameter: distance
    3) filter by bridgeness (function filterByBridgeness)
        main parameter: bridgeness
    4) format the graph (formatGraph)
        main parameter: format_
        NOTE(review): no formatting step is called in this function; the
        value returned is the result of step 3 -- confirm where step 4 is
        meant to happen.

    Parameters:
        request       : accepted for compatibility, not read by this function.
        corpus        : corpus node forwarded to countCooccurrences.
        field1/field2 : only forwarded to filterByBridgeness; the clustering
                        step always uses "ngrams" for both fields.
        mapList_id    : forwarded to countCooccurrences.
        groupList_id  : forwarded to countCooccurrences.
        cooc_id       : id of an already computed cooccurrence node; when it
                        is None the cooccurrences are counted first.
        type          : forwarded to filterByBridgeness.
        start/end     : forwarded to countCooccurrences.
        threshold     : forwarded to countCooccurrences.
        distance      : forwarded to clusterByDistances.
        isMonopartite : accepted for compatibility, not read; the counting
                        step is always called with isMonopartite=True.
        bridgeness    : forwarded to filterByBridgeness.

    Returns:
        whatever filterByBridgeness returns for the clustered graph.
    '''
    # Identity test: None is a singleton, "== None" can be fooled by
    # objects overriding __eq__.
    if cooc_id is None:
        cooc_id = countCooccurrences( corpus=corpus
                                    #, field1="ngrams", field2="ngrams"
                                    , start=start           , end=end
                                    , mapList_id=mapList_id , groupList_id=groupList_id
                                    , isMonopartite=True    , threshold=threshold
                                    #, limit=size
                                    )

    G, partition, ids, weight = clusterByDistances( cooc_id
                                                  , field1="ngrams", field2="ngrams"
                                                  , distance=distance
                                                  )

    data = filterByBridgeness(G, partition, ids, weight, bridgeness, type, field1, field2)

    return data
......@@ -2,7 +2,7 @@
from gargantext.util.db import session
from gargantext.models.nodes import Node
from graphExplorer.functions import get_cooc
from graphExplorer.graph import get_graph
from gargantext.util.http import APIView, APIException\
, JsonHttpResponse, requires_auth
......@@ -19,38 +19,74 @@ class Graph(APIView):
graph?field1=ngrams&field2=ngrams&
graph?field1=ngrams&field2=ngrams&start=''&end=''
'''
# implicit global session
field1 = str(request.GET.get ('field1' , 'ngrams' ))
field2 = str(request.GET.get ('field2' , 'ngrams' ))
# Get the node we are working with
corpus = session.query(Node).filter(Node.id==corpus_id).first()
start = request.GET.get ('start' , None )
end = request.GET.get ('end' , None )
# Get all the parameters in the URL
field1 = str(request.GET.get ('field1' , 'ngrams' ))
field2 = str(request.GET.get ('field2' , 'ngrams' ))
threshold = int(request.GET.get ('threshold' , 1 ))
bridgeness = int(request.GET.get ('bridgeness', -1 ))
format_ = str(request.GET.get ('format' , 'json' ))
type_ = str(request.GET.get ('type' , 'node_link' ))
distance = str(request.GET.get ('distance' , 'conditional'))
start = request.GET.get ('start' , None )
end = request.GET.get ('end' , None )
corpus = session.query(Node).filter(Node.id==corpus_id).first()
mapList_id = int(request.GET.get ('mapList' , 0 ))
groupList_id = int(request.GET.get ('groupList' , 0 ))
threshold = int(request.GET.get ('threshold' , 1 ))
bridgeness = int(request.GET.get ('bridgeness', -1 ))
format_ = str(request.GET.get ('format' , 'json' ))
type_ = str(request.GET.get ('type' , 'node_link' ))
distance = str(request.GET.get ('distance' , 'conditional'))
# Get default value if no map list
if mapList_id == 0 :
mapList_id = ( session.query ( Node.id )
.filter( Node.typename == "MAPLIST"
, Node.parent_id == corpus.id
)
.first()
)
mapList_id = mapList_id[0]
if mapList_id == None :
raise ValueError("MAPLIST node needed for cooccurrences")
# Get default value if no group list
if groupList_id == 0 :
groupList_id = ( session.query ( Node.id )
.filter( Node.typename == "GROUPLIST"
, Node.parent_id == corpus.id
)
.first()
)
groupList_id = groupList_id[0]
if groupList_id == None :
raise ValueError("GROUPLIST node needed for cooccurrences")
# Check the options
accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
accepted_field2 = ['ngrams',]
options = ['start', 'end', 'threshold', 'distance']
accepted_field2 = ['ngrams', ]
options = ['start', 'end', 'threshold', 'distance' ]
if field1 in accepted_field1 :
if field2 in accepted_field2 :
if start is not None and end is not None :
data = get_cooc( corpus=corpus
#, field1=field1 , field2=field2
, start=start , end=end
, threshold =threshold , distance=distance
data = get_graph( corpus=corpus
#, field1=field1 , field2=field2
, mapList_id = mapList_id , groupList_id = groupList_id
, start=start , end=end
, threshold =threshold , distance=distance
)
else:
data = get_cooc( corpus = corpus
data = get_graph( corpus = corpus
#, field1=field1, field2=field2
, mapList_id = mapList_id , groupList_id = groupList_id
, threshold = threshold
, distance = distance
, bridgeness = bridgeness
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment