import os
from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect, HttpResponseForbidden
from sqlalchemy import func, and_, or_
from sqlalchemy.orm import aliased
from collections import defaultdict
from datetime import datetime
from time import sleep
from threading import Thread
from node.admin import CustomForm
from gargantext_web.db import *
from gargantext_web.db import get_or_create_node
from gargantext_web.views import session
from gargantext_web.settings import DEBUG, MEDIA_ROOT
from rest_v1_0.api import JsonHttpResponse
from django.db import connection
import json
import re
from parsing.corpustools import add_resource, parse_resources, extract_ngrams
from ngram.tfidf import compute_tfidf
from gargantext_web.celery import apply_workflow
from admin.utils import ensure_dir


def _parent_user_ids(user_id):
    """Return the ``user_parent`` values stored for *user_id* in ``node_user_user``."""
    cursor = connection.cursor()
    try:
        # Parameterized query: the driver does the quoting, not string formatting.
        cursor.execute(
            "SELECT user_parent FROM node_user_user WHERE user_id = %s",
            [int(user_id)],
        )
        return [row[0] for row in cursor.fetchall()]
    finally:
        cursor.close()


def _parse_id_list(raw_ids):
    """Turn an ``'a'``-separated string such as ``'12a34'`` into ``[12, 34]``.

    Raises ``Http404`` when an item is not an integer, mirroring how
    ``project`` treats an invalid project id.
    """
    try:
        return [int(item) for item in raw_ids.split('a')]
    except ValueError:
        raise Http404()


def _resource_type_name(corpus_id):
    """Return the resource type name of the first resource linked to *corpus_id*.

    Returns ``'(no resource)'`` when the corpus has no resource row or the
    row carries no type id.
    """
    row = (session
        .query(Resource.type_id)
        .join(Node_Resource, Node_Resource.resource_id == Resource.id)
        .join(Node, Node.id == Node_Resource.node_id)
        .filter(Node.id == corpus_id)
        .first()
    )
    if row is None or row[0] is None:
        return '(no resource)'
    return cache.ResourceType[row[0]].name


def project(request, project_id):
    """Render the page of one project and handle the "new corpus" upload form.

    On GET the view lists the project's corpora grouped by resource type,
    together with the per-type document counts used by the donut chart.
    On a valid POST it creates a Corpus node, stores the uploaded file,
    attaches it as a resource, starts the parsing workflow and redirects
    back to the project page.

    Raises ``Http404`` when *project_id* is not an integer or matches no
    Project node. Unauthenticated users are redirected to ``/auth/``; users
    who neither own the project nor have its owner as a ``user_parent``
    receive a JSON ``{"request": "forbidden"}`` response.
    """
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session
        .query(Node)
        .filter(Node.id == project_id)
        .filter(Node.type_id == cache.NodeType['Project'].id)
    ).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        in_group = any(
            parent_id == project.user_id
            for parent_id in _parent_user_ids(user.id)
        )
        if not in_group:
            return JsonHttpResponse({"request": "forbidden"})

    # Let's find out about the children nodes of the project.
    # The resource type is deliberately NOT joined into this query: per the
    # original note (ISSUE L51), doing so gave the wrong number of documents
    # for the pubmedquerier (x 5). It is looked up per corpus below instead.
    ChildrenNode = aliased(Node)
    corpus_query = (session
        .query(Node.id, Node.name, func.count(ChildrenNode.id), Node.hyperdata['Processing'])
        .filter(Node.parent_id == project.id)
        .filter(Node.type_id == cache.NodeType['Corpus'].id)
        .filter(and_(ChildrenNode.parent_id == Node.id,
                     ChildrenNode.type_id == cache.NodeType['Document'].id))
        .group_by(Node.id, Node.name)
        .order_by(Node.name)
        .all()
    )

    corpora_by_resourcetype = defaultdict(list)
    documents_count_by_resourcetype = defaultdict(int)
    corpora_count = 0
    seen_corpus_ids = set()
    for corpus_id, corpus_name, document_count, processing in corpus_query:
        if corpus_id in seen_corpus_ids:
            continue
        seen_corpus_ids.add(corpus_id)
        # Not optimized (one query per corpus), see ISSUE L51 above.
        resourcetype_name = _resource_type_name(corpus_id)
        corpora_by_resourcetype[resourcetype_name].append({
            'id': corpus_id,
            'name': corpus_name,
            'count': document_count,
            'processing': processing,
        })
        documents_count_by_resourcetype[resourcetype_name] += document_count
        corpora_count += 1

    # do the donut
    total_documents_count = sum(documents_count_by_resourcetype.values())
    donut = [
        {
            # strip a trailing " (...)" qualifier from the resource type name
            'source': re.sub(r' \(.*$', '', key),
            'count': value,
            'part': round(value * 100 / total_documents_count) if total_documents_count else 0,
        }
        for key, value in documents_count_by_resourcetype.items()
    ]

    # deal with the form
    if request.method == 'POST':
        # form validation
        form = CustomForm(request.POST, request.FILES)
        if form.is_valid():
            # extract information from the form
            name = form.cleaned_data['name']
            thefile = form.cleaned_data['file']
            resourcetype = cache.ResourceType[form.cleaned_data['type']]

            # corpus node instantiation
            corpus = Node(
                name=name,
                user_id=request.user.id,
                parent_id=project_id,
                type_id=cache.NodeType['Corpus'].id,
                # no default language at this point
                language_id=None,
                hyperdata={'Processing': "Parsing documents", },
            )
            session.add(corpus)
            session.commit()

            # If user is new, folder does not exist yet, create it then
            ensure_dir(request.user)

            # Save the uploaded file; the context manager closes the handle
            # even if a write fails, and chunks() avoids loading the whole
            # upload into memory at once.
            filepath = '%s/corpora/%s/%s' % (MEDIA_ROOT, request.user.username, thefile._name)
            with open(filepath, 'wb') as destination:
                for chunk in thefile.chunks():
                    destination.write(chunk)

            # add the uploaded resource to the corpus
            add_resource(corpus,
                user_id=request.user.id,
                type_id=resourcetype.id,
                file=filepath,
            )

            # let's start the workflow: through celery outside DEBUG,
            # in a daemon thread of this process otherwise
            try:
                if DEBUG is False:
                    apply_workflow.apply_async((corpus.id,),)
                else:
                    thread = Thread(target=apply_workflow, args=(corpus.id, ), daemon=True)
                    thread.start()
            except Exception as error:
                print('WORKFLOW ERROR')
                print(error)

            # redirect to the main project page
            # TODO need to wait before response (need corpus update)
            sleep(2)
            return HttpResponseRedirect('/project/' + str(project_id))
        else:
            print('ERROR: BAD FORM')
    else:
        form = CustomForm()

    # HTML output
    return render(request, 'project.html', {
        'form': form,
        'user': user,
        'date': datetime.now(),
        'project': project,
        'donut': donut,
        'list_corpora': dict(corpora_by_resourcetype),
        'whitelists': '',
        'blacklists': '',
        'cooclists': '',
        'number': corpora_count,
    })


def tfidf(request, corpus_id, ngram_ids):
    """Return, as JSON, the documents most relevant to the given ngrams.

    *ngram_ids* is an ``'a'``-separated string of ngram ids. At most five
    documents are returned, ordered by decreasing summed TFIDF score. When
    the TFIDF node yields no row at all, the documents of the corpus are
    ranked by their summed ngram weights instead.

    Each item holds ``id`` and ``score`` plus whichever of ``title``,
    ``publication_date``, ``journal``, ``authors`` and ``fields`` exist in
    the document's hyperdata.

    Raises ``Http404`` when an ngram id is not an integer or the corpus
    does not exist.
    """
    limit = 5
    ngram_ids = _parse_id_list(ngram_ids)

    corpus = session.query(Node).filter(Node.id == corpus_id).first()
    if corpus is None:
        raise Http404()
    tfidf_id = get_or_create_node(corpus=corpus, nodetype='Tfidf').id
    print(tfidf_id)

    document_type_id = cache.NodeType['Document'].id

    # request data
    scored_nodes = (session
        .query(Node, func.sum(NodeNodeNgram.score))
        .join(NodeNodeNgram, NodeNodeNgram.nodey_id == Node.id)
        .filter(NodeNodeNgram.nodex_id == tfidf_id)
        .filter(Node.type_id == document_type_id)
        .filter(NodeNodeNgram.ngram_id.in_(ngram_ids))
        .group_by(Node)
        .order_by(func.sum(NodeNodeNgram.score).desc())
        .limit(limit)
        .all()
    )

    # A Query object is never None, so the fallback has to be decided on the
    # fetched rows rather than on the query itself.
    if not scored_nodes:
        print("TFIDF error, juste take sums")
        scored_nodes = (session
            .query(Node, func.sum(NodeNgram.weight))
            .join(NodeNgram, NodeNgram.node_id == Node.id)
            .filter(Node.parent_id == corpus_id)
            .filter(Node.type_id == document_type_id)
            .filter(NodeNgram.ngram_id.in_(ngram_ids))
            .group_by(Node)
            .order_by(func.sum(NodeNgram.weight).desc())
            .limit(limit)
            .all()
        )

    # convert query result to a list of dicts
    nodes_list = []
    for node, score in scored_nodes:
        print("\t corpus:", corpus_id, "\t", node.name)
        node_dict = {
            'id': node.id,
            'score': score,
        }
        for key in ('title', 'publication_date', 'journal', 'authors', 'fields'):
            if key in node.hyperdata:
                node_dict[key] = node.hyperdata[key]
        nodes_list.append(node_dict)

    return JsonHttpResponse(nodes_list)


def getCorpusIntersection(request, corpuses_ids):
    """Score the visible ngrams of a map against a second corpus, as JSON.

    *corpuses_ids* is an ``'a'``-separated string holding two corpus ids;
    the second one is the corpus to compare with. The ngram ids to score are
    read from the ``nodeids`` POST field, parsed with ``ast.literal_eval``.

    The response is a dict mapping each ngram id to a score derived from the
    cooccurrence graphs of both corpora. It is ``False`` when the request is
    not a POST carrying a non-empty ``nodeids``, or when one of the corpora
    has no Cooccurrence node owned by the requesting user.

    Raises ``Http404`` when *corpuses_ids* does not hold two integers.
    """
    FinalDict = False
    if request.method == 'POST' and "nodeids" in request.POST and len(request.POST["nodeids"]) > 0:
        import ast
        import networkx as nx

        # Here are the visible nodes of the initial semantic map.
        node_ids = [int(i) for i in ast.literal_eval(request.POST["nodeids"])]

        corpuses_ids = _parse_id_list(corpuses_ids)
        if len(corpuses_ids) < 2:
            raise Http404()
        print(corpuses_ids)
        # corpuses_ids[1] will be the corpus to compare

        def get_score(corpus_id):
            """Return ``(summed weight per ngram, graph)``, or None without a cooc node."""
            cooc_row = (session
                .query(Node.id)
                .filter(Node.user_id == request.user.id,
                        Node.parent_id == corpus_id,
                        Node.type_id == cache.NodeType['Cooccurrence'].id)
                .first()
            )
            # first() yields None when nothing matches
            if cooc_row is None:
                return None

            # undirected graph only, because direction doesn't matter here:
            # coocs is a triangular matrix
            G = nx.Graph()
            ngrams_data = (session
                .query(NodeNgramNgram)
                .filter(NodeNgramNgram.node_id == cooc_row[0],
                        or_(NodeNgramNgram.ngramx_id.in_(node_ids),
                            NodeNgramNgram.ngramy_id.in_(node_ids)))
                .group_by(NodeNgramNgram)
                .all()
            )
            for ngram in ngrams_data:
                G.add_edge(ngram.ngramx_id, ngram.ngramy_id, weight=ngram.score)
                print(corpus_id, ngram)

            # Sum, for every ngram, the weights of the edges it takes part in.
            # G.edges() is used because edges_iter() only exists in networkx 1.x.
            Coocs = defaultdict(int)
            for n1, n2 in G.edges():
                weight = G[n1][n2]["weight"]
                Coocs[n1] += weight
                Coocs[n2] += weight
            return (dict(Coocs), G)

        result_0 = get_score(corpuses_ids[0])
        if result_0 is None:
            return JsonHttpResponse(FinalDict)
        result_1 = get_score(corpuses_ids[1])
        if result_1 is None:
            return JsonHttpResponse(FinalDict)
        Coocs_0, G_0 = result_0
        Coocs_1, G_1 = result_1

        FinalDict = {}
        measure = 'cooc'
        if measure == 'jacquard':
            for node in node_ids:
                in_0 = node in G_0
                in_1 = node in G_1
                if in_0 and in_1:
                    neighbors_0 = set(G_0.neighbors(node))
                    neighbors_1 = set(G_1.neighbors(node))
                    jacquard = len(neighbors_0 & neighbors_1) / len(neighbors_0 | neighbors_1)
                    FinalDict[node] = jacquard * 3
                elif in_0:
                    FinalDict[node] = 2
                elif in_1:
                    FinalDict[node] = 1
                else:
                    FinalDict[node] = 0
        elif measure == 'cooc':
            # Ratio of the average cooccurrence weight of each ngram in the
            # first corpus to the one in the compared corpus.
            for node in node_ids:
                in_0 = node in G_0
                in_1 = node in G_1
                if in_0 and in_1:
                    score_0 = Coocs_0[node] / G_0.degree(node)
                    score_1 = Coocs_1[node] / G_1.degree(node)
                    # NOTE(review): raises ZeroDivisionError if every edge
                    # weight of this ngram in the compared corpus is 0.
                    FinalDict[node] = 5 * score_0 / score_1
                elif in_0:
                    FinalDict[node] = 0.5
                elif in_1:
                    FinalDict[node] = 0.2
                else:
                    FinalDict[node] = 0
        print(FinalDict)

    return JsonHttpResponse(FinalDict)


def getUserPortfolio(request, project_id):
    """Return, as JSON, the projects visible to the user with their corpora.

    Projects are those owned by the requesting user, followed by those owned
    by each of the user's ``user_parent`` entries. Only corpora with at least
    10 child nodes are listed, and projects left without any listed corpus
    are omitted. The result maps each project id to
    ``{"proj_name": ..., "corpuses": [{"id", "name", "c"}, ...]}``.

    *project_id* is accepted but not used.
    """
    user_id = cache.User[request.user.username].id
    project_type_id = cache.NodeType['Project'].id
    corpus_type_id = cache.NodeType['Corpus'].id

    # the user's own projects first, then those of each parent user
    projs = []
    for owner_id in [user_id] + _parent_user_ids(user_id):
        projs.extend(
            session.query(Node)
            .filter(Node.user_id == owner_id, Node.type_id == project_type_id)
            .all()
        )

    results = {}
    for proj in projs:
        corpus_nodes = (session
            .query(Node)
            .filter(Node.parent_id == proj.id, Node.type_id == corpus_type_id)
            .all()
        )
        corpuses = []
        for corpus in corpus_nodes:
            doc_count = (session
                .query(func.count(Node.id))
                .filter(Node.parent_id == corpus.id)
                .scalar()
            )
            if doc_count >= 10:
                corpuses.append({
                    "id": corpus.id,
                    "name": corpus.name,
                    "c": doc_count,
                })
        if corpuses:
            results[proj.id] = {
                "proj_name": proj.name,
                "corpuses": corpuses,
            }

    return JsonHttpResponse(results)