Commit 082984a9 authored by PkSM3

Merge branch 'unstable' of ssh://delanoe.org:1979/gargantext into samuel

parents 6d73d2de 83d70c45
@@ -8,6 +8,8 @@ from gargantext_web.db import Node, Ngram, NodeNgram, NodeNgramNgram, \
 from gargantext_web.db import session, cache, get_or_create_node, bulk_insert
 from analysis.lists import WeightedMatrix, UnweightedList, Translations
+# keep list
 def cooc(corpus=None
         , field_X=None, field_Y=None
         , miam_id=None, stop_id=None, group_id=None
@@ -51,10 +53,11 @@ def cooc(corpus=None
     NodeNgramX = aliased(NodeNgram)
     NodeNgramY = aliased(NodeNgram)
+    cooc_score = func.sqrt(func.sum(NodeNgramX.weight) * func.sum(NodeNgramY.weight)).label('cooc_score')
     doc_id = cache.NodeType['Document'].id
-    cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, func.count())
+    cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
                   .join(Node, Node.id == NodeNgramX.node_id)
                   .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                   .filter(Node.parent_id==corpus.id, Node.type_id==doc_id)
     # Cooc is symmetric, take only the main cooccurrences and cut at the limit
-    cooc_query = (cooc_query.filter(Node.parent_id == corpus.id, Node.type_id == doc_id)
+    cooc_query = (cooc_query
                   .filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
+                  .having(cooc_score > 1)
                   .group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
-                  .order_by(desc(func.count()))
+                  .order_by(desc('cooc_score'))
-                  .limit(limit)
+                  #.limit(50)
                   )
     matrix = WeightedMatrix(cooc_query)
...
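The query now ranks pairs with a dedicated `cooc_score` instead of a raw `func.count()`: the square root of the product of the two ngrams' summed weights, i.e. their geometric mean over the shared documents, with `.having(cooc_score > 1)` pruning the weakest pairs. A minimal pure-Python sketch of the formula (toy weights, illustrative names):

```python
# Sketch of the new cooc_score, assuming NodeNgram.weight holds one
# per-document weight per ngram; the function name is illustrative only.
from math import sqrt

def cooc_score(weights_x, weights_y):
    """Geometric mean of the two ngrams' summed weights."""
    return sqrt(sum(weights_x) * sum(weights_y))

# Two ngrams co-occurring in three shared documents:
print(cooc_score([1, 2, 1], [2, 1, 1]))  # sqrt(4 * 4) = 4.0
```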
@@ -229,10 +229,10 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
     #print(n)
     #print(m)
-    nodes_included = 300 #int(round(size/20,0))
+    nodes_included = 200 #int(round(size/20,0))
     #nodes_excluded = int(round(size/10,0))
-    nodes_specific = 300 #int(round(size/10,0))
+    nodes_specific = 200 #int(round(size/10,0))
     #nodes_generic = int(round(size/10,0))
     # TODO use the included score for the node size
@@ -267,6 +267,8 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
         G.remove_nodes_from(nodes_to_remove)
         uG = G.to_undirected()
         partition = best_partition(uG)
+        print("Density of the graph:", nx.density(G))
     except:
         print("-" * 30)
         PrintException()
...
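The added `print` reports the density of the cooccurrence graph just after community detection. A short sketch of the two calls, assuming `networkx` is imported as `nx` and `best_partition` comes from the python-louvain `community` package, as the surrounding code suggests:

```python
import networkx as nx
from community import best_partition  # python-louvain

G = nx.DiGraph()
G.add_edges_from([(1, 2), (2, 3), (3, 1), (3, 4)])
print("Density of the graph:", nx.density(G))  # 4 / (4*3) = 0.333...
partition = best_partition(G.to_undirected())  # {node: community_id}
```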
@@ -2,13 +2,15 @@
 from celery import shared_task
 from node import models
+from django.db import transaction
 import cProfile
 #@app.task(bind=True)
 @shared_task
 def debug_task(request):
     print('Request: {0!r}'.format(request))
-from gargantext_web.db import session, Node
+from gargantext_web.db import session, cache, Node
 from ngram.workflow import ngram_workflow
@@ -48,3 +50,18 @@ def apply_workflow(corpus_id):
     #ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
     update_processing(corpus, 0)
+#@transaction.commit_manually
+@shared_task
+def empty_trash(corpus_id):
+    nodes = models.Node.objects.filter(type_id=cache.NodeType['Trash'].id).all()
+    with transaction.atomic():
+        for node in nodes:
+            try:
+                node.children.delete()
+            except Exception as error:
+                print(error)
+            node.delete()
+    print("Nodes deleted")
@@ -29,6 +29,7 @@ import json
 # SOME FUNCTIONS
 from gargantext_web import settings
+from gargantext_web.settings import DEBUG
 from django.http import *
 from django.shortcuts import render_to_response,redirect
@@ -43,8 +44,9 @@ from gargantext_web.db import *
 from sqlalchemy import or_, func
 from gargantext_web import about
+from gargantext_web.celery import empty_trash
-from gargantext_web.db import NodeNgram, NodeNgramNgram
+from gargantext_web.db import cache, NodeNgram, NodeNgramNgram
 def login_user(request):
     logout(request)
@@ -416,17 +418,6 @@ def newpaginatorJSON(request , corpus_id):
     return JsonHttpResponse(finaldict)
-def empty_trash():
-    nodes = models.Node.objects.filter(type_id=cache.NodeType['Trash'].id).all()
-    with transaction.atomic():
-        for node in nodes:
-            try:
-                node.children.delete()
-            except Exception as error:
-                print(error)
-            node.delete()
 def move_to_trash(node_id):
     try:
         node = session.query(Node).filter(Node.id == node_id).first()
@@ -436,9 +427,16 @@ def move_to_trash(node_id):
         session.add(node)
         session.commit()
+        if DEBUG is False:
+            # TODO for the future maybe add id of node
+            empty_trash.apply_async(args=("corpus_id",))
+        else:
+            empty_trash("corpus_id")
         return(previous_type_id)
     except Exception as error:
-        print("can not move to trash Node" + node_id + ":" + error)
+        print("cannot move to trash Node " + str(node_id) + ":" + str(error))
 def move_to_trash_multiple(request):
     user = request.user
@@ -521,6 +519,24 @@ def chart(request, project_id, corpus_id):
     }))
     return HttpResponse(html)
+def sankey(request, corpus_id):
+    t = get_template('sankey.html')
+    user = request.user
+    date = datetime.datetime.now()
+    corpus = session.query(Node).filter(Node.id==corpus_id).first()
+    html = t.render(Context({\
+        'debug': settings.DEBUG,
+        'user' : user,\
+        'date' : date,\
+        'corpus' : corpus,\
+    }))
+    return HttpResponse(html)
 def matrix(request, project_id, corpus_id):
     t = get_template('matrix.html')
     user = request.user
@@ -539,7 +555,7 @@ def matrix(request, project_id, corpus_id):
     return HttpResponse(html)
-def graph(request, project_id, corpus_id):
+def graph(request, project_id, corpus_id, generic=100, specific=100):
     t = get_template('explorer.html')
     user = request.user
     date = datetime.datetime.now()
@@ -569,6 +585,8 @@ def graph(request, project_id, corpus_id):
     # import pprint
     # pprint.pprint(results)
+    # if specific != None and generic != None :
     graphurl = "corpus/"+str(corpus_id)+"/node_link.json"
     html = t.render(Context({\
         'debug': settings.DEBUG,
@@ -684,7 +702,7 @@ def send_csv(request, corpus_id):
     return response
 # To get the data
-from rest_v1_0.api import JsonHttpResponse
+from rest_v1_0.api import JsonHttpResponse,CsvHttpResponse
 from analysis.functions import get_cooc
 def node_link(request, corpus_id):
     '''
@@ -692,18 +710,22 @@ def node_link(request, corpus_id):
     '''
     data = []
     corpus = session.query(Node).filter(Node.id==corpus_id).first()
-    # filename = settings.MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (request.user , corpus.parent_id, corpus_id)
-    # print("file exists?:",os.path.isfile(filename))
-    # if os.path.isfile(filename):
-    #     json_data = open(filename,"r")
-    #     data = json.load(json_data)
-    #     json_data.close()
-    # else:
     data = get_cooc(request=request, corpus=corpus, type="node_link")
     return JsonHttpResponse(data)
+def sankey_csv(request, corpus_id):
+    data = []
+    corpus = session.query(Node).filter(Node.id==corpus_id).first()
+    data = [
+        ["source", "target", "value"]
+        , ["Elvis_1", "Elvis_2", 1]
+        , ["Elvis_2", "Elvis_3", 2]
+        , ["Barry", "Elvis_3", 2]
+    ]
+    return(CsvHttpResponse(data))
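`sankey_csv` is a stub feeding the new sankey view: a header row plus hard-coded links. Assuming `CsvHttpResponse` (imported above from `rest_v1_0.api`, implementation not shown here) writes one CSV line per list, the payload is equivalent to:

```python
import csv, io

rows = [["source", "target", "value"],
        ["Elvis_1", "Elvis_2", 1],
        ["Elvis_2", "Elvis_3", 2],
        ["Barry", "Elvis_3", 2]]
buf = io.StringIO()
csv.writer(buf).writerows(rows)
print(buf.getvalue())
```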
 def adjacency(request, corpus_id):
     '''
     Create the HttpResponse object with the adjacency dataset.
...
@@ -36,7 +36,7 @@ ngrams = {'adenoic cystic basal cell carcinoma' : 5
 }
 '''
-def getNgrams(corpus=None, limit=160):
+def getNgrams(corpus=None, limit=1000):
     '''
     getNgrams :: Corpus -> [(Int, String, String, Float)]
     '''
@@ -63,7 +63,7 @@ def getNgrams(corpus=None, limit=160):
     PrintException()
     return(terms)
-def compute_cvalue(corpus=None, limit=160):
+def compute_cvalue(corpus=None, limit=1000):
     '''
     computeCvalue :: Corpus
     frequency :: String -> Int -> Int
...
@@ -137,9 +137,10 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
             miam_to_insert.add((miam_node.id, n[0],1))
             #print([n for n in group])
             for g in group:
+                if (miam_node.id, g[0],1) not in miam_to_insert:
                     #list_to_check.remove(g)
                     group_to_insert.append((node_group.id, n[0], g[0], 1))
                     print(n[1], "=", g[1])
     # Deleting previous groups
     session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_group.id).delete()
...
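The new membership test keeps an ngram out of the group table when it is already queued for the miam list, so the same term is never both kept and grouped. A toy sketch of the guard (all ids made up):

```python
miam_to_insert = {(1, 10, 1), (1, 11, 1)}  # (miam_node.id, ngram_id, weight)
group_to_insert = []
n, group = (10, 'main form'), [(11, 'kept variant'), (12, 'grouped variant')]
for g in group:
    if (1, g[0], 1) not in miam_to_insert:
        group_to_insert.append((99, n[0], g[0], 1))  # (node_group.id, ...)
print(group_to_insert)  # [(99, 10, 12, 1)] -- only the ungrouped variant
```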
@@ -127,7 +127,6 @@ def compute_tfidf_global(corpus):
     tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
-
     # compute terms frequency sum
     db, cursor = get_cursor()
@@ -240,7 +239,7 @@ def compute_tfidf_global(corpus):
     lnD = log(D)
     cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
     # show off
-    dbg.show('insert tfidf for %d documents' % D)
+    dbg.show('insert tfidf for %d documents' % (D, ))
     cursor.execute('''
         INSERT INTO
             %s (nodex_id, nodey_id, ngram_id, score)
...
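Wrapping `D` in a one-element tuple matches the `% (lnD, )` call above and guards against `D` itself being a tuple (e.g. an unfetched cursor row), which the bare form silently reinterprets as the whole argument list:

```python
D = 12
print('insert tfidf for %d documents' % (D, ))  # insert tfidf for 12 documents

D = (12, 13)  # if D were a row tuple, the bare form misfires:
try:
    print('insert tfidf for %d documents' % D)
except TypeError as error:
    print(error)  # not all arguments converted during string formatting
```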
@@ -7,17 +7,28 @@ from ngram.group import compute_groups
 from ngram.miam import compute_miam
 from gargantext_web.db import get_or_create_node
-def ngram_workflow(corpus):
+def ngram_workflow(corpus, n=5000):
     '''
     All the workflow to filter the ngrams.
     '''
-    compute_tfidf(corpus)
     compute_tfidf_global(corpus)
-    compute_cvalue(corpus,limit=10000) # size
-    compute_specificity(corpus,limit=10000)
+    part = round(n * 0.8)
+    compute_cvalue(corpus,limit=part) # size
+    part = round(part * 0.6)
+    compute_specificity(corpus,limit=part)
+    part = round(part * 0.5)
     # compute_stop(corpus)
-    compute_groups(corpus,limit_inf=1000, limit_sup=5000)
+    compute_groups(corpus,limit_inf=part, limit_sup=n)
-    compute_miam(corpus,limit=3000) # size
+    # compute_miam(corpus,limit=part) # size
+    compute_tfidf(corpus)
 #corpus=session.query(Node).filter(Node.id==244250).first()
 #ngram_workflow(corpus)
...
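The hard-coded limits (10000, 10000, 1000/5000, 3000) are replaced by a cascade derived from one size parameter `n`, and `compute_tfidf` now runs at the end of the pipeline. With the default `n=5000` the cascade works out to:

```python
n = 5000
part = round(n * 0.8)     # 4000 -> compute_cvalue(corpus, limit=part)
part = round(part * 0.6)  # 2400 -> compute_specificity(corpus, limit=part)
part = round(part * 0.5)  # 1200 -> compute_groups(corpus, limit_inf=part, limit_sup=n)
```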