Commit 09b31df8 authored by delanoe

Merge branch 'samuel' into unstable

parents cc65af59 8dc52be7
......@@ -57,6 +57,7 @@ class WorkflowTracking:
cursor = connection.cursor()
try:
cursor.execute(the_query)
cursor.execute("COMMIT;")
finally:
connection.close()
except :
......
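For context, the explicit COMMIT added above is the pattern these background tasks use when writing through Django's raw connection, outside the ORM's transaction handling. A minimal standalone sketch of that pattern, assuming a Django project context (the helper name and the example UPDATE are illustrative, not part of this commit):

from django.db import connection

def run_and_commit(the_query, params=None):
    # Run a raw statement on Django's shared connection and commit explicitly,
    # mirroring WorkflowTracking above; closing the connection releases it.
    cursor = connection.cursor()
    try:
        if params is None:
            cursor.execute(the_query)
        else:
            cursor.execute(the_query, params)
        cursor.execute("COMMIT;")
    finally:
        connection.close()

# e.g. run_and_commit("UPDATE node_node SET name = %s WHERE id = %s", ["demo", 1])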
......@@ -43,9 +43,9 @@ def apply_workflow(corpus_id):
ngram_workflow(corpus)
#ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
update_state.processing_(corpus, "0")
print("End of the Workflow for corpus %d" % (corpus_id))
update_state.processing_(corpus, "0")
@shared_task
......
......@@ -91,11 +91,10 @@ urlpatterns = patterns('',
############################################################################
url(r'^tests/', include('tests.urls')),
# TODO Samuel, lines below were on your tests, are they still used ?
# can we delete them ?
url(r'^project/(\d+)/corpus/(\d+)/terms/ngrams.json$', samtest.get_ngrams_json),
url(r'^project/(\d+)/corpus/(\d+)/terms$', samtest.get_ngrams),
url(r'^project/(\d+)/corpus/(\d+)/stop_list.json$', samtest.get_stoplist)
url(r'^api/corpus/(\d+)$', samtest.get_corpus_state),
url(r'^test_cores$', samtest.get_cores)
)
......
......@@ -345,12 +345,14 @@ def corpus(request, project_id, corpus_id):
type_doc_id = cache.NodeType['Document'].id
number = session.query(func.count(Node.id)).filter(Node.parent_id==corpus_id, Node.type_id==type_doc_id).all()[0][0]
the_query = """ SELECT hyperdata FROM node_node WHERE id=%d """ % ( int(corpus_id) )
cursor = connection.cursor()
try:
processing = corpus.hyperdata['Processing']
except Exception as error:
print(error)
processing = 0
print('corpus',corpus_id,' , processing', processing)
cursor.execute(the_query)
processing = cursor.fetchone()[0]["Processing"]
except:
processing = "Error"
html = t.render(Context({
'debug': settings.DEBUG,
......@@ -569,6 +571,9 @@ def graph(request, project_id, corpus_id, generic=100, specific=100):
project_type_id = cache.NodeType['Project'].id
corpus_type_id = cache.NodeType['Corpus'].id
miamlist_type_id = cache.NodeType['MiamList'].id
miamlist = session.query(Node).filter(Node.user_id == request.user.id , Node.parent_id==corpus_id , Node.type_id == cache.NodeType['MiamList'].id ).first()
graphurl = "corpus/"+str(corpus_id)+"/node_link.json"
html = t.render(Context({\
......@@ -576,6 +581,7 @@ def graph(request, project_id, corpus_id, generic=100, specific=100):
'user': request.user,\
'date' : date,\
'corpus' : corpus,\
'list_id' : miamlist.id,\
'project' : project,\
'graphfile' : graphurl,\
}))
......
......@@ -140,7 +140,7 @@ def project(request, project_id):
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = language_id,
hyperdata = {'Processing' : 1,}
hyperdata = {'Processing' : "Parsing documents",}
)
session.add(corpus)
session.commit()
......
......@@ -6,7 +6,7 @@ from ngram.stop import compute_stop
from ngram.group import compute_groups
from gargantext_web.db import get_or_create_node
from ngram.mapList import compute_mapList
from ngram.occurrences import compute_occs
# from ngram.occurrences import compute_occs
from gargantext_web.db import session , Node , NodeNgram
from admin.utils import WorkflowTracking
......@@ -47,9 +47,8 @@ def ngram_workflow(corpus, n=5000):
update_state.processing_(corpus, "TF-IDF local score")
compute_tfidf(corpus)
update_state.processing_(corpus, "OCCS local score")
compute_occs(corpus)
# update_state.processing_(corpus, "OCCS local score")
# compute_occs(corpus)
#corpus=session.query(Node).filter(Node.id==540420).first()
#corpus=session.query(Node).filter(Node.id==559637).first()
......
......@@ -269,42 +269,42 @@ class Node(CTENode):
for ngram_text, weight in associations.items()
])
@current_app.task(filter=task_method)
def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
import time
total = 0
print("LOG::TIME: In workflow() parse_resources()")
start = time.time()
self.hyperdata['Processing'] = 1
self.save()
self.parse_resources()
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources() [s]",(end - start))
print("LOG::TIME: In workflow() / parse_resources()")
start = time.time()
print("LOG::TIME: In workflow() extract_ngrams()")
print("\n- - - - - - - - - -")
type_document = NodeType.objects.get(name='Document')
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
end = time.time()
print("- - - - - - - - - - \n")
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams() [s]",(end - start))
print("LOG::TIME: In workflow() / extract_ngrams()")
start = time.time()
print("In workflow() do_tfidf()")
from analysis.functions import do_tfidf
do_tfidf(self)
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
print("LOG::TIME: In workflow() / do_tfidf()")
print("In workflow() END")
self.hyperdata['Processing'] = 0
self.save()
# @current_app.task(filter=task_method)
# def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
# import time
# total = 0
# print("LOG::TIME: In workflow() parse_resources()")
# start = time.time()
# self.hyperdata['Processing'] = 1
# self.save()
# self.parse_resources()
# end = time.time()
# total += (end - start)
# print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources() [s]",(end - start))
# print("LOG::TIME: In workflow() / parse_resources()")
# start = time.time()
# print("LOG::TIME: In workflow() extract_ngrams()")
# print("\n- - - - - - - - - -")
# type_document = NodeType.objects.get(name='Document')
# self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
# end = time.time()
# print("- - - - - - - - - - \n")
# total += (end - start)
# print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams() [s]",(end - start))
# print("LOG::TIME: In workflow() / extract_ngrams()")
# start = time.time()
# print("In workflow() do_tfidf()")
# from analysis.functions import do_tfidf
# do_tfidf(self)
# end = time.time()
# total += (end - start)
# print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
# print("LOG::TIME: In workflow() / do_tfidf()")
# print("In workflow() END")
# self.hyperdata['Processing'] = 0
# self.save()
class Node_Hyperdata(models.Model):
node = models.ForeignKey(Node, on_delete=models.CASCADE)
......
......@@ -132,10 +132,7 @@ class List(APIView):
if request.GET.get('custom', False) != False:
ngrams_meta = self.get_metadata( ngram_ids , corpus_id )
ngram_ids = ngrams_meta["data"]
measurements["tfidf"] = {
"s" : ngrams_meta["secs"],
"n": len(ngrams_meta["data"].keys())
}
measurements["tfidf"] = { "s" : ngrams_meta["secs"], "n": len(ngrams_meta["data"].keys()) }
return JsonHttpResponse( {"data":ngram_ids , "time":measurements } )
......@@ -575,6 +572,8 @@ class Keep(APIView):
Delete ngrams from the map list
"""
group_rawreq = dict(request.data)
# print("group_rawreq:")
# print(group_rawreq)
from django.utils.html import escape
ngram_2del = [int(i) for i in list(group_rawreq.keys())]
corpus = session.query(Node).filter( Node.id==corpus_id ).first()
......
......@@ -45,19 +45,22 @@ class MedlineFetcher:
query = query.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' %(self.pubMedEutilsURL, self.pubMedDB, query)
eSearchResult = urlopen(eSearch)
data = eSearchResult.read()
root = etree.XML(data)
findcount = etree.XPath("/eSearchResult/Count/text()")
count = findcount(root)[0]
findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
queryKey = findquerykey(root)[0]
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
try:
eSearchResult = urlopen(eSearch)
data = eSearchResult.read()
root = etree.XML(data)
findcount = etree.XPath("/eSearchResult/Count/text()")
count = findcount(root)[0]
findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
queryKey = findquerykey(root)[0]
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
except:
count=0
queryKey=False
webEnv=False
origQuery=False
values = { "query":origQuery , "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
return values
......@@ -173,8 +176,13 @@ class MedlineFetcher:
self.q.join()
print('time:',time.perf_counter() - start)
Total = 0
Fails = 0
for globalresults in self.firstResults:
# globalresults = self.medlineEsearch(pubmedquery)
Total += 1
if globalresults["queryKey"]==False:
Fails += 1
if globalresults["count"]>0:
N+=globalresults["count"]
queryhyperdata = {
......@@ -198,4 +206,7 @@ class MedlineFetcher:
if query["retmax"]==0: query["retmax"]+=1
print(query["string"],"\t[",k,">",query["retmax"],"]")
if ((Fails+1)/(Total+1))==1 : # for identifying the epic fail or connection error
thequeries = [False]
return thequeries
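The Fails/Total counters added above let the Pubmed crawler distinguish "no results" from a dead connection: (Fails+1)/(Total+1) equals 1 only when every sub-query came back without a queryKey, in which case [False] is returned so the caller can surface a connection error. A small self-contained sketch of that check (the function name is illustrative):

def every_subquery_failed(first_results):
    total = fails = 0
    for res in first_results:
        total += 1
        if res.get("queryKey") is False:
            fails += 1
    # the ratio hits exactly 1 only when fails == total (also true for an empty batch)
    return (fails + 1) / (total + 1) == 1

assert every_subquery_failed([{"queryKey": False}, {"queryKey": False}])
assert not every_subquery_failed([{"queryKey": "1"}, {"queryKey": False}])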
......@@ -130,7 +130,7 @@ def doTheQuery(request , project_id):
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = None,
hyperdata = {'Processing' : 1,}
hyperdata = {'Processing' : "Parsing documents",}
)
session.add(corpus)
session.commit()
......@@ -243,7 +243,7 @@ def testISTEX(request , project_id):
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = None,
hyperdata = {'Processing' : 1,}
hyperdata = {'Processing' : "Parsing documents",}
)
session.add(corpus)
session.commit()
......
......@@ -85,7 +85,8 @@
</div>
</div>
<span style="display:none;" id="process_state">{{processing}}</span>
<span style="display:none;" id="corpus_id">{{corpus.id}}</span>
<div class="col-md-6">
<div class="jumbotron">
{% if processing == 0 or processing == "0" %}
......@@ -96,8 +97,9 @@
<li>Authors and Terms</li>
</ol>
{% else %}
<h3><img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Networks </h3>
<h6>(Updating: <i>{{processing}}</i>)</h6>
<h6>(Updating: <i id="process_id" data-since="date" >{{processing}}</i>)</h6>
<ol>
<li>Terms</li>
<li>Journals and Terms</li>
......@@ -134,6 +136,35 @@
return window.open(url_,'_blank');
}
var refresh_time = 3000 //ms
function corpus_monitorer() {
var url_ = "/api/corpus/"+$("#corpus_id").text()
$.ajax({
type: "GET",
url: url_,
dataType: "json",
success : function(data, textStatus, jqXHR) {
if(data["Processing"]=="0") {
window.location.reload()
} else {
$("#process_id").html(data["Processing"]+"...")
}
},
error: function(exception) {
console.log("exception!:"+exception.status)
}
});
}
if( $("#process_state").text()=="0" ) {
// workflow : finished!
} else {
setInterval(corpus_monitorer ,refresh_time);
}
</script>
......
......@@ -192,7 +192,7 @@ input[type=radio]:checked + label {
</table>
</p> -->
<p align="right">
<button id="Clean_All" class="btn btn-warning">Clean</button>
<!-- <button id="Clean_All" class="btn btn-warning">Clean</button> -->
<button id="Save_All" class="btn btn-primary">Save</button>
</p>
......
......@@ -136,6 +136,7 @@
<li>
<a>
<div id="graphid" style="visibility: hidden;">{{graphfile}}</div>
<input type="hidden" id="list_id" value="{{ list_id }}"></input>
<div id="jquerytemplatenb" style="visibility: hidden;">{{user.id}}</div>
</a>
</li>
......
......@@ -24,26 +24,6 @@
<script type="text/javascript">
var refresh_time = 5000 //ms
function corpus_monitorer() {
console.log("hola")
// $.ajax({
// type: "GET",
// url: "https://dl.dropboxusercontent.com/u/9975992/climat/ajax_file.json",
// dataType: "json",
// success : function(data, textStatus, jqXHR) {
// if( data.command ) {
// eval( data.command )
// }
// },
// error: function(exception) {
// console.log("exception!:"+exception.status)
// }
// });
}
setInterval(corpus_monitorer ,refresh_time);
</script>
......@@ -363,6 +343,7 @@
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
success: function(data) {
console.log("SUCCESS")
console.log("in getGlobalResults")
console.log(data)
console.log("enabling "+"#"+value.id)
......@@ -379,12 +360,15 @@
$('#submit_thing').prop('disabled', false);
} else {
$("#theresults").html("<i> <b>"+pubmedquery+"</b>: No results!.</i><br>")
if(data[0]==false)
$("#theresults").html("Pubmed connection error!</i><br>")
$('#submit_thing').prop('disabled', true);
}
},
error: function(result) {
console.log("Data not found");
$("#theresults").html("Pubmed connection error!</i><br>")
$('#submit_thing').prop('disabled', true);
}
});
}
......
......@@ -44,7 +44,7 @@
data-content='
<ul>
<li> Rename </li>
<li> Add new corpus </li>
<li><a href="/project/{{ project.id }}">Add new corpus</a></li>
<li><a href="/delete/{{ project.id }}">Delete</a></li>
</ul>
'>Manage</button>
......
......@@ -56,43 +56,6 @@ from rest_v1_0.api import JsonHttpResponse
from ngram.lists import listIds, listNgramIds, ngramList , doList
def test_page(request , project_id , corpus_id):
if not request.user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
try:
offset = int(project_id)
offset = int(corpus_id)
except ValueError:
raise Http404()
t = get_template('tests/test_select-boostrap.html')
user = cache.User[request.user.username].id
date = datetime.datetime.now()
project = cache.Node[int(project_id)]
corpus = cache.Node[int(corpus_id)]
type_doc_id = cache.NodeType['Document'].id
number = session.query(func.count(Node.id)).filter(Node.parent_id==corpus_id, Node.type_id==type_doc_id).all()[0][0]
try:
processing = corpus.hyperdata['Processing']
except Exception as error:
print(error)
processing = 0
html = t.render(Context({
'debug': settings.DEBUG,
'user': request.user.username,
'date': date,
'project': project,
'corpus' : corpus,
'processing' : processing,
'number' : number,
}))
return HttpResponse(html)
def get_ngrams(request , project_id , corpus_id ):
if not request.user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
......@@ -133,32 +96,6 @@ def get_ngrams(request , project_id , corpus_id ):
return HttpResponse(html)
def get_stoplist(request , corpus_id , doc_id):
"""Get All for a doc id"""
user_id = request.user.id
whitelist_type_id = cache.NodeType['WhiteList'].id
document_type_id = cache.NodeType['Document'].id
miam_id = listIds(typeList='MiamList', user_id=request.user.id, corpus_id=corpus_id)[0][0]
count_min = 2
size = 1000
corpus_id = int(corpus_id)
lists = dict()
for list_type in ['StopList']:
list_id = list()
list_id = listIds(user_id=request.user.id, corpus_id=int(corpus_id), typeList=list_type)
lists["%s" % list_id[0][0]] = list_type
doc_ngram_list = listNgramIds(corpus_id=corpus_id, list_id=list_id[0][0], doc_id=list_id[0][0], user_id=request.user.id)
StopList = {}
for n in doc_ngram_list:
StopList[ n[0] ] = True
results = StopList.keys() #[ "hola" , "mundo" ]
return JsonHttpResponse(StopList)
def get_journals(request , project_id , corpus_id ):
if not request.user.is_authenticated():
......@@ -216,122 +153,32 @@ def get_journals_json(request , project_id, corpus_id ):
from gargantext_web.db import session, cache, Node, NodeNgram
from sqlalchemy import or_, func
from sqlalchemy.orm import aliased
def get_ngrams_json(request , project_id, corpus_id ):
results = ["holaaaa" , "mundo"]
user_id = request.user.id
whitelist_type_id = cache.NodeType['WhiteList'].id
document_type_id = cache.NodeType['Document'].id
miam_id = listIds(typeList='MiamList', user_id=request.user.id, corpus_id=corpus_id)[0][0]
count_min = 2
size = 1000
corpus_id = int(corpus_id)
lists = dict()
for list_type in ['StopList']:
list_id = list()
list_id = listIds(user_id=request.user.id, corpus_id=int(corpus_id), typeList=list_type)
lists["%s" % list_id[0][0]] = list_type
doc_ngram_list = listNgramIds(corpus_id=corpus_id, list_id=list_id[0][0], doc_id=list_id[0][0], user_id=request.user.id)
StopList = {}
for n in doc_ngram_list:
StopList[ n[0] ] = True
# [ Get Uniq_Occs ]
myamlist_type_id = cache.NodeType['MiamList'].id
myamlist = session.query(Node).filter(Node.user_id == user_id , Node.parent_id==corpus_id , Node.type_id == myamlist_type_id ).first()
myamlists = session.query(Node).filter(Node.user_id == user_id , Node.parent_id==corpus_id , Node.type_id == myamlist_type_id ).all()
# sql_average = """SELECT avg(weight) as Average FROM node_node_ngram WHERE node_node_ngram.node_id=%d""" % (myamlist.id)
# cursor = connection.cursor()
# cursor.execute(sql_average)
# avg_result = cursor.fetchone()[0]
# threshold = min (10 , math.sqrt(avg_result) )
# OCCs = session.query(Node_Ngram).filter( Node_Ngram.node_id==myamlist.id , Node_Ngram.weight >= threshold ).all()
# [ / Get Uniq_Occs ]
Miam = aliased(NodeNgram)
sql_average = (session.query(NodeNgram.ngram_id, func.sum(NodeNgram.weight))
.join(Node, Node.id == NodeNgram.node_id)
.join(Miam, Miam.ngram_id == NodeNgram.ngram_id)
.filter(Node.parent_id == corpus_id, Node.type_id==cache.NodeType['Document'].id)
.filter(Miam.node_id==myamlist.id)
.group_by(NodeNgram.ngram_id)
.all()
)
# print([n for n in sql_average])
OCCs = {}
for ngram in sql_average:
OCCs [ ngram[0] ] = ngram[1]
# [ Initializing Ngrams_Scores with occ_uniq ]
Ngrams_Scores = {}
for ngram in OCCs:
if ngram not in StopList:
if ngram not in Ngrams_Scores:
Ngrams_Scores[ngram] = {}
Ngrams_Scores[ngram]["scores"] = {
"occ_uniq": round(OCCs[ngram]),
"tfidf_sum": 0.0
}
# [ / Initializing Ngrams_Scores with occ_uniq ]
# [ Getting TF-IDF scores (sum per each ngram) ]
NgramTFIDF = session.query(NodeNodeNgram).filter( NodeNodeNgram.nodex_id==corpus_id ).all()
for ngram in NgramTFIDF:
if ngram.ngram_id not in StopList:
if ngram.ngram_id in Ngrams_Scores:
Ngrams_Scores[ngram.ngram_id]["scores"]["tfidf_sum"] += ngram.score
# [ / Getting TF-IDF scores ]
# [ Preparing JSON-Array full of Scores! ]
Metrics = {
"ngrams":[],
"scores": {}
}
ngrams_ids = Ngrams_Scores.keys()
query = session.query(Ngram).filter(Ngram.id.in_( ngrams_ids ))
ngrams_data = query.all()
for ngram in ngrams_data:
if ngram.id not in StopList:
occ_uniq = occ_uniq = Ngrams_Scores[ngram.id]["scores"]["occ_uniq"]
Ngrams_Scores[ngram.id]["name"] = ngram.terms
Ngrams_Scores[ngram.id]["id"] = ngram.id
Ngrams_Scores[ngram.id]["scores"]["tfidf"] = Ngrams_Scores[ngram.id]["scores"]["tfidf_sum"] / occ_uniq
del Ngrams_Scores[ngram.id]["scores"]["tfidf_sum"]
Metrics["ngrams"].append( Ngrams_Scores[ngram.id] )
Metrics["scores"] = {
"initial":"occ_uniq",
"nb_docs":1,
"orig_nb_ngrams":1,
"nb_ngrams":len(Metrics["ngrams"]),
# "occs_threshold":threshold
}
# [ / Preparing JSON-Array full of Scores! ]
# print("miamlist:",myamlist.id)
# print("sql avg:",sql_average)
# print (avg_result)
# print ("LALALALALALALALLLALALALALA")
return JsonHttpResponse(Metrics)
def get_corpuses( request , node_ids ):
ngrams = [int(i) for i in node_ids.split("+") ]
results = session.query(Node.id,Node.hyperdata).filter(Node.id.in_(ngrams) ).all()
for r in results:
print(r)
return JsonHttpResponse( [ "tudo" , "bem" ] )
\ No newline at end of file
return JsonHttpResponse( [ "tudo" , "bem" ] )
def get_cores( request ):
import multiprocessing
cpus = multiprocessing.cpu_count()
return JsonHttpResponse( {"data":cpus} )
def get_corpus_state( request , corpus_id ):
if not request.user.is_authenticated():
return JsonHttpResponse( {"request" : "forbidden"} )
processing = ["Waiting"]
the_query = """ SELECT hyperdata FROM node_node WHERE id=%d """ % ( int(corpus_id) )
cursor = connection.cursor()
try:
cursor.execute(the_query)
processing = cursor.fetchone()[0]
finally:
connection.close()
# processing = corpus.hyperdata['Processing']
return JsonHttpResponse( processing )
\ No newline at end of file
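Taken together, get_corpus_state above and the corpus_monitorer script added to the project template form a simple polling loop: the page requests /api/corpus/<id> every few seconds and reloads once hyperdata['Processing'] reaches "0". A rough client-side equivalent in Python, assuming a local dev server (the base URL and helper name are illustrative):

import json
import time
from urllib.request import urlopen

def wait_for_corpus(corpus_id, base="http://localhost:8000", delay=3):
    # Poll the corpus-state endpoint until the workflow reports it is done.
    while True:
        with urlopen("%s/api/corpus/%d" % (base, corpus_id)) as resp:
            state = json.loads(resp.read().decode("utf-8"))
        if str(state.get("Processing")) == "0":
            return state
        print("still processing:", state.get("Processing"))
        time.sleep(delay)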