Commit 082984a9 authored by PkSM3

Merge branch 'unstable' of ssh://delanoe.org:1979/gargantext into samuel

parents 6d73d2de 83d70c45
......@@ -8,6 +8,8 @@ from gargantext_web.db import Node, Ngram, NodeNgram, NodeNgramNgram, \
from gargantext_web.db import session, cache, get_or_create_node, bulk_insert
from analysis.lists import WeightedMatrix, UnweightedList, Translations
# keep list
def cooc(corpus=None
, field_X=None, field_Y=None
, miam_id=None, stop_id=None, group_id=None
......@@ -51,10 +53,11 @@ def cooc(corpus=None
NodeNgramX = aliased(NodeNgram)
NodeNgramY = aliased(NodeNgram)
cooc_score = func.sqrt(func.sum(NodeNgramX.weight) * func.sum(NodeNgramY.weight)).label('cooc_score')
doc_id = cache.NodeType['Document'].id
cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, func.count())
cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
.join(Node, Node.id == NodeNgramX.node_id)
.join(NodeNgramY, NodeNgramY.node_id == Node.id)
.filter(Node.parent_id==corpus.id, Node.type_id==doc_id)
......@@ -104,13 +107,14 @@ def cooc(corpus=None
# Cooc is symmetric, take only the main cooccurrences and cut at the limit
cooc_query = (cooc_query.filter(Node.parent_id == corpus.id, Node.type_id == doc_id)
cooc_query = (cooc_query
.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
.having(cooc_score > 1)
.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
.order_by(desc(func.count()))
.order_by(desc('cooc_score'))
.limit(limit)
#.limit(50)
)
matrix = WeightedMatrix(cooc_query)
......
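The hunks above replace the raw pair count with a symmetric co-occurrence score, sqrt(sum of X weights * sum of Y weights), keep only pairs scoring above 1 (the HAVING clause), and order by that score instead of by count. A minimal sketch of the scoring step in plain Python, with illustrative weights rather than the project's data:

from math import sqrt

def cooc_score(weights_x, weights_y):
    # Symmetric score used in the query above: sqrt(sum(w_x) * sum(w_y))
    return sqrt(sum(weights_x) * sum(weights_y))

# ngram X seen with weight 1 in three documents, ngram Y in two of them
print(cooc_score([1, 1, 1], [1, 1]))  # ~2.45, kept because it is > 1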
......@@ -229,10 +229,10 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
#print(n)
#print(m)
nodes_included = 300 #int(round(size/20,0))
nodes_included = 200 #int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific = 300 #int(round(size/10,0))
nodes_specific = 200 #int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use the included score for the node size
......@@ -267,6 +267,8 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
G.remove_nodes_from(nodes_to_remove)
uG = G.to_undirected()
partition = best_partition(uG)
print("Density of the graph:", nx.density(G))
except:
print("-" * 30)
PrintException()
......
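The second hunk only adds a density printout after the Louvain partition, but the surrounding steps are worth restating. A self-contained sketch of that sequence with networkx and python-louvain, on a toy graph rather than the real co-occurrence matrix:

import networkx as nx
from community import best_partition  # python-louvain

G = nx.Graph()
G.add_weighted_edges_from([("a", "b", 2.0), ("b", "c", 1.0), ("a", "c", 3.0), ("c", "d", 1.0)])
G.remove_nodes_from(["d"])              # stand-in for nodes_to_remove
uG = G.to_undirected()                  # Louvain expects an undirected graph
partition = best_partition(uG)          # {node: community_id}
print("Density of the graph:", nx.density(G))  # ratio of actual to possible edges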
......@@ -2,13 +2,15 @@
from celery import shared_task
from node import models
from django.db import transaction
import cProfile
#@app.task(bind=True)
@shared_task
def debug_task(request):
print('Request: {0!r}'.format(request))
from gargantext_web.db import session, Node
from gargantext_web.db import session, cache, Node
from ngram.workflow import ngram_workflow
......@@ -48,3 +50,18 @@ def apply_workflow(corpus_id):
#ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
update_processing(corpus, 0)
#@transaction.commit_manually
@shared_task
def empty_trash(corpus_id):
nodes = models.Node.objects.filter(type_id=cache.NodeType['Trash'].id).all()
with transaction.atomic():
for node in nodes:
try:
node.children.delete()
except Exception as error:
print(error)
node.delete()
print("Nodes deleted")
......@@ -29,6 +29,7 @@ import json
# SOME FUNCTIONS
from gargantext_web import settings
from gargantext_web.settings import DEBUG
from django.http import *
from django.shortcuts import render_to_response,redirect
......@@ -43,8 +44,9 @@ from gargantext_web.db import *
from sqlalchemy import or_, func
from gargantext_web import about
from gargantext_web.celery import empty_trash
from gargantext_web.db import NodeNgram, NodeNgramNgram
from gargantext_web.db import cache, NodeNgram, NodeNgramNgram
def login_user(request):
logout(request)
......@@ -416,17 +418,6 @@ def newpaginatorJSON(request , corpus_id):
return JsonHttpResponse(finaldict)
def empty_trash():
nodes = models.Node.objects.filter(type_id=cache.NodeType['Trash'].id).all()
with transaction.atomic():
for node in nodes:
try:
node.children.delete()
except Exception as error:
print(error)
node.delete()
def move_to_trash(node_id):
try:
node = session.query(Node).filter(Node.id == node_id).first()
......@@ -436,9 +427,16 @@ def move_to_trash(node_id):
session.add(node)
session.commit()
if DEBUG is False :
# TODO for the future: maybe add the id of the node
empty_trash.apply_async("corpus_id")
else:
empty_trash("corpus_id")
return(previous_type_id)
except Exception as error:
print("can not move to trash Node" + node_id + ":" + error)
print("can not move to trash Node" + str(node_id) + ":" + str(error))
def move_to_trash_multiple(request):
user = request.user
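The hunk above queues the new empty_trash task through Celery when DEBUG is off and runs it inline otherwise. For reference, Celery's Task.apply_async expects its positional arguments as a sequence, so a conventional version of that dispatch looks like the sketch below (flush_trash is a hypothetical wrapper and the corpus_id value is illustrative):

from gargantext_web.settings import DEBUG
from gargantext_web.celery import empty_trash

def flush_trash(corpus_id):
    # Queue the task in production, run it synchronously while debugging.
    if DEBUG is False:
        empty_trash.apply_async(args=(corpus_id,))  # args must be a tuple or list
    else:
        empty_trash(corpus_id)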
......@@ -521,6 +519,24 @@ def chart(request, project_id, corpus_id):
}))
return HttpResponse(html)
def sankey(request, corpus_id):
t = get_template('sankey.html')
user = request.user
date = datetime.datetime.now()
corpus = session.query(Node).filter(Node.id==corpus_id).first()
html = t.render(Context({\
'debug': settings.DEBUG,
'user' : user,\
'date' : date,\
'corpus' : corpus,\
}))
return HttpResponse(html)
def matrix(request, project_id, corpus_id):
t = get_template('matrix.html')
user = request.user
......@@ -539,7 +555,7 @@ def matrix(request, project_id, corpus_id):
return HttpResponse(html)
def graph(request, project_id, corpus_id):
def graph(request, project_id, corpus_id, generic=100, specific=100):
t = get_template('explorer.html')
user = request.user
date = datetime.datetime.now()
......@@ -569,6 +585,8 @@ def graph(request, project_id, corpus_id):
# import pprint
# pprint.pprint(results)
# if specific != None and generic != None :
graphurl = "corpus/"+str(corpus_id)+"/node_link.json"
html = t.render(Context({\
'debug': settings.DEBUG,
......@@ -684,7 +702,7 @@ def send_csv(request, corpus_id):
return response
# To get the data
from rest_v1_0.api import JsonHttpResponse
from rest_v1_0.api import JsonHttpResponse,CsvHttpResponse
from analysis.functions import get_cooc
def node_link(request, corpus_id):
'''
......@@ -692,18 +710,22 @@ def node_link(request, corpus_id):
'''
data = []
corpus = session.query(Node).filter(Node.id==corpus_id).first()
# filename = settings.MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (request.user , corpus.parent_id, corpus_id)
# print("file exists?:",os.path.isfile(filename))
# if os.path.isfile(filename):
# json_data = open(filename,"r")
# data = json.load(json_data)
# json_data.close()
# else:
data = get_cooc(request=request, corpus=corpus, type="node_link")
return JsonHttpResponse(data)
def sankey_csv(request, corpus_id):
data = []
corpus = session.query(Node).filter(Node.id==corpus_id).first()
data = [
["source", "target", "value"]
, ["Elvis_1", "Elvis_2", 1]
, ["Elvis_2", "Elvis_3", 2]
, ["Barry", "Elvis_3", 2]
]
return(CsvHttpResponse(data))
def adjacency(request, corpus_id):
'''
Create the HttpResponse object with the adjacency dataset.
......
......@@ -36,7 +36,7 @@ ngrams = {'adenoic cystic basal cell carcinoma' : 5
}
'''
def getNgrams(corpus=None, limit=160):
def getNgrams(corpus=None, limit=1000):
'''
getNgrams :: Corpus -> [(Int, String, String, Float)]
'''
......@@ -63,7 +63,7 @@ def getNgrams(corpus=None, limit=160):
PrintException()
return(terms)
def compute_cvalue(corpus=None, limit=160):
def compute_cvalue(corpus=None, limit=1000):
'''
computeCvalue :: Corpus
frequency :: String -> Int -> Int
......
......@@ -137,6 +137,7 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
miam_to_insert.add((miam_node.id, n[0],1))
#print([n for n in group])
for g in group:
if (miam_node.id, g[0],1) not in miam_to_insert:
#list_to_check.remove(g)
group_to_insert.append((node_group.id, n[0], g[0], 1))
print(n[1], "=", g[1])
......
......@@ -127,7 +127,6 @@ def compute_tfidf_global(corpus):
tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
# compute terms frequency sum
db, cursor = get_cursor()
......@@ -240,7 +239,7 @@ def compute_tfidf_global(corpus):
lnD = log(D)
cursor.execute('UPDATE tmp__idf SET idf = idf + %f' % (lnD, ))
# show off
dbg.show('insert tfidf for %d documents' % D)
dbg.show('insert tfidf for %d documents' % (D, ))
cursor.execute('''
INSERT INTO
%s (nodex_id, nodey_id, ngram_id, score)
......
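The UPDATE above adds ln(D) to every accumulated idf value just before the scores are inserted. Assuming the usual definition idf(t) = ln(D / df_t) = ln(D) - ln(df_t), and assuming the earlier pass accumulated -ln(df_t) per term, a quick sanity check in plain Python with made-up counts:

from math import log

D = 1000                      # documents in the corpus (illustrative)
df_t = 50                     # documents containing term t (illustrative)
idf = -log(df_t)              # assumed per-term accumulation before this step
idf = idf + log(D)            # the "SET idf = idf + lnD" update above
assert abs(idf - log(D / df_t)) < 1e-9  # matches ln(D / df_t)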
......@@ -7,17 +7,28 @@ from ngram.group import compute_groups
from ngram.miam import compute_miam
from gargantext_web.db import get_or_create_node
def ngram_workflow(corpus):
def ngram_workflow(corpus, n=5000):
'''
The whole workflow for filtering the ngrams.
'''
compute_tfidf(corpus)
compute_tfidf_global(corpus)
compute_cvalue(corpus,limit=10000) # size
compute_specificity(corpus,limit=10000)
part = round(n * 0.8)
compute_cvalue(corpus,limit=part) # size
part = round(part * 0.6)
compute_specificity(corpus,limit=part)
part = round(part * 0.5)
# compute_stop(corpus)
compute_groups(corpus,limit_inf=1000, limit_sup=5000)
compute_miam(corpus,limit=3000) # size
compute_groups(corpus,limit_inf=part, limit_sup=n)
# compute_miam(corpus,limit=part) # size
compute_tfidf(corpus)
#corpus=session.query(Node).filter(Node.id==244250).first()
#ngram_workflow(corpus)
......
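The reworked ngram_workflow now derives each stage's limit from the previous one instead of using fixed constants. Tracing the cascade for the default n=5000:

n = 5000
part = round(n * 0.8)      # 4000 -> compute_cvalue(corpus, limit=4000)
part = round(part * 0.6)   # 2400 -> compute_specificity(corpus, limit=2400)
part = round(part * 0.5)   # 1200 -> compute_groups(corpus, limit_inf=1200, limit_sup=5000)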