Commit 2d1a9b89 authored by PkSM3

[UPDATE] newuser-nirvana: workflow__MOV + graph generation (writing ngrams in DB not included)

parent b7edf98a
...@@ -245,27 +245,6 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150 ...@@ -245,27 +245,6 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
return data return data
#def tfidf(corpus, document, ngram):
# '''
# Compute TF-IDF (Term Frequency - Inverse Document Frequency)
# See: http://en.wikipedia.org/wiki/Tf%E2%80%93idf
# '''
# try:
# occurences_of_ngram = Node_Ngram.objects.get(node=document, ngram=ngram).weight
# ngrams_by_document = sum([ x.weight for x in Node_Ngram.objects.filter(node=document)])
# term_frequency = occurences_of_ngram / ngrams_by_document
#
# xx = Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")).count()
# yy = Node_Ngram.objects.filter(ngram=ngram).count() # filter: ON node.parent=corpus
# inverse_document_frequency= log(xx/yy)
#
# # result = tf * idf
# result = term_frequency * inverse_document_frequency
# except Exception as error:
# print(error, ngram)
# result = 0
# return result
from analysis.tfidf import tfidf from analysis.tfidf import tfidf
def do_tfidf(corpus, reset=True): def do_tfidf(corpus, reset=True):
......
...@@ -25,6 +25,7 @@ from django import forms ...@@ -25,6 +25,7 @@ from django import forms
from collections import defaultdict from collections import defaultdict
from parsing.FileParsers import * from parsing.FileParsers import *
import os
# SOME FUNCTIONS # SOME FUNCTIONS
...@@ -282,9 +283,8 @@ def project(request, project_id): ...@@ -282,9 +283,8 @@ def project(request, project_id):
cooclists = ""#.children.filter(type=type_cooclist) cooclists = ""#.children.filter(type=type_cooclist)
for corpus in corpora: for corpus in corpora:
# print("corpus", corpus.pk , corpus.name , corpus.type_id)
docs_count = Node.objects.filter(parent=corpus, type=type_document).count() docs_count = Node.objects.filter(parent=corpus, type=type_document).count()
# print("corpus:", corpus.pk , " | name:",corpus.name , " | type:",corpus.type_id , " | #docs:",docs_count)
docs_total += docs_count docs_total += docs_count
corpus_view = dict() corpus_view = dict()
...@@ -727,6 +727,7 @@ def graph(request, project_id, corpus_id): ...@@ -727,6 +727,7 @@ def graph(request, project_id, corpus_id):
'date' : date,\ 'date' : date,\
'corpus' : corpus,\ 'corpus' : corpus,\
'project' : project,\ 'project' : project,\
'graphfile' : "hola_mundo",\
})) }))
return HttpResponse(html) return HttpResponse(html)
...@@ -839,18 +840,23 @@ def send_csv(request, corpus_id): ...@@ -839,18 +840,23 @@ def send_csv(request, corpus_id):
from gargantext_web.api import JsonHttpResponse from gargantext_web.api import JsonHttpResponse
from analysis.functions import get_cooc from analysis.functions import get_cooc
import json import json
from gargantext_web.settings import MEDIA_ROOT
def node_link(request, corpus_id): def node_link(request, corpus_id):
''' '''
Create the HttpResponse object with the node_link dataset. Create the HttpResponse object with the node_link dataset.
''' '''
import time
print("In node_link() START") data = []
start = time.time()
corpus = Node.objects.get(id=corpus_id)
filename = MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (request.user , corpus.parent.id, corpus_id)
print("file exists?:",os.path.isfile(filename))
if os.path.isfile(filename):
json_data = open(filename,"r")
data = json.load(json_data)
json_data.close()
else:
data = get_cooc(request=request, corpus_id=corpus_id, type="node_link") data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
end = time.time()
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" get_cooc() [s]",(end - start))
print("In node_link() END")
return JsonHttpResponse(data) return JsonHttpResponse(data)
def adjacency(request, corpus_id): def adjacency(request, corpus_id):
......
...@@ -287,16 +287,6 @@ class Node(CTENode): ...@@ -287,16 +287,6 @@ class Node(CTENode):
for p in proc: for p in proc:
p.join() p.join()
def pushScore( self , FINAL , n1,n2, score):
if not FINAL.has_key(n1):
FINAL[n1]=[]
FINAL[n1].append(score)
if not FINAL.has_key(n2):
FINAL[n2]=[]
FINAL[n2].append(score)
def parse_resources__MOV(self, verbose=False): def parse_resources__MOV(self, verbose=False):
# parse all resources into a list of metadata # parse all resources into a list of metadata
metadata_list = [] metadata_list = []
...@@ -436,10 +426,12 @@ class Node(CTENode): ...@@ -436,10 +426,12 @@ class Node(CTENode):
docID = i[0] docID = i[0]
associations = i[1] associations = i[1]
# [ considering just {2,3}-grams ]
termsCount = 0 termsCount = 0
for ngram_text, weight in associations.items(): for ngram_text, weight in associations.items():
if ngram_text in NGram2ID: # considering just {2,3}-grams if ngram_text in NGram2ID: # considering just {2,3}-grams
termsCount+=1 termsCount+=1
# [ / considering just {2,3}-grams ]
ngrams_by_document = termsCount # i re-calculed this because of *02* ngrams_by_document = termsCount # i re-calculed this because of *02*
terms = [] terms = []
...@@ -562,12 +554,12 @@ class Node(CTENode): ...@@ -562,12 +554,12 @@ class Node(CTENode):
total += (end - start) total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources()__MOV [s]",(end - start)) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources()__MOV [s]",(end - start))
# print("LOG::TIME: In workflow() writeMetadata__MOV()") print("LOG::TIME: In workflow() writeMetadata__MOV()")
# start = time.time() start = time.time()
# self.writeMetadata__MOV( metadata_list=theMetadata ) self.writeMetadata__MOV( metadata_list=theMetadata )
# end = time.time() end = time.time()
# total += (end - start) total += (end - start)
# print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start)) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start))
print("LOG::TIME: In workflow() extract_ngrams__MOV()") print("LOG::TIME: In workflow() extract_ngrams__MOV()")
...@@ -585,33 +577,28 @@ class Node(CTENode): ...@@ -585,33 +577,28 @@ class Node(CTENode):
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start)) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
# # print("LOG::TIME: In workflow() / do_tfidf()") # # print("LOG::TIME: In workflow() / do_tfidf()")
start = time.time()
# print("\n= = = = = = = = = = = = = = = =") print("LOG::TIME: In workflow() do_coocmatrix()")
# print("NUMBER OF NGRAMS:",len(resultDict["G"]))
# # M = resultDict["metrics"]
# # Metrics2 = sorted(M, key=lambda x: M[x]['C'])
# # for i in Metrics2:
# # print("as: ",i,":",M[i])
# print("= = = = = = = = = = = = = = = =\n")
jsongraph = self.do_coocmatrix__MOV ( resultDict["TERMS"] , resultDict["G"] , n=150) jsongraph = self.do_coocmatrix__MOV ( resultDict["TERMS"] , resultDict["G"] , n=150)
end = time.time()
import pprint total += (end - start)
pprint.pprint(jsongraph) print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_coocmatrix() [s]",(end - start))
print("the user:",self.user)
print("the project id:",self.parent.id)
print("the corpus id:",self.id)
# timestamp = str(datetime.datetime.now().isoformat())
# # filename = MEDIA_ROOT + '/corpora/%s/%s_%s__%s.json' % (self.user , self.parent.id, self.id , timestamp)
filename = MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (self.user , self.parent.id, self.id)
import json
f = open(filename,"w")
f.write( json.dumps(jsongraph) )
f.close()
# # # this is not working # # # this is not working
# # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) ) # # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
# start = time.time()
# print("LOG::TIME: In workflow() do_tfidf()")
# from analysis.functions import do_tfidf
# do_tfidf(self)
# end = time.time()
# total += (end - start)
# print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
# # # print("LOG::TIME: In workflow() / do_tfidf()")
print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" In workflow() END") print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" In workflow() END")
......
...@@ -132,10 +132,14 @@ def doTheQuery(request , project_id): ...@@ -132,10 +132,14 @@ def doTheQuery(request , project_id):
# do the WorkFlow # do the WorkFlow
try: try:
if DEBUG is True: if DEBUG is True:
corpus.workflow() # corpus.workflow() # old times...
# corpus.workflow__MOV() corpus.workflow__MOV()
# corpus.write_everything_to_DB()
else: else:
corpus.workflow.apply_async((), countdown=3) # corpus.workflow.apply_async((), countdown=3)
corpus.workflow__MOV() # synchronous! because is faaast
# corpus.write_everything_to_DB.apply_async((), countdown=3) # asynchronous
return JsonHttpResponse(["workflow","finished"]) return JsonHttpResponse(["workflow","finished"])
except Exception as error: except Exception as error:
......
...@@ -84,10 +84,7 @@ ...@@ -84,10 +84,7 @@
<ul> <ul>
{% for corpus in corpora %} {% for corpus in corpora %}
<li> {% ifnotequal corpus.count 0 %} <li> {% ifnotequal corpus.count 0 %}
<a href="/project/{{project.id}}/corpus/{{corpus.id}}"> <a href="/project/{{project.id}}/corpus/{{corpus.id}}"> {{corpus.name}} </a> , {{ corpus.count }} Documents
{{corpus.name}}
</a>
, {{ corpus.count }} Documents
{% else %} {% else %}
{{corpus.name}} : <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Processing, drink a cup of tea, and refresh the page :) {{corpus.name}} : <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Processing, drink a cup of tea, and refresh the page :)
{% endifnotequal %} {% endifnotequal %}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment