[UPDATE] newuser-nirvana: workflow__MOV + graph generation (writing ngrams to the DB is not included)

2d1a9b89 · PkSM3 · b7edf98a · 2d1a9b89 · 2d1a9b89 · 2d1a9b89
Commit 2d1a9b89 authored Mar 02, 2015 by PkSM3
Showing with 52 additions and 79 deletions

functions.py analysis/functions.py +0 -21

views.py gargantext_web/views.py +17 -11

models.py node/models.py +24 -37

views.py scrap_pubmed/views.py +7 -3

project.html templates/project.html +4 -7

No files found.
--- a/analysis/functions.py
+++ b/analysis/functions.py
@@ -245,27 +245,6 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
    return data


-#def tfidf(corpus, document, ngram):
-#    '''
-#    Compute TF-IDF (Term Frequency - Inverse Document Frequency)
-#    See: http://en.wikipedia.org/wiki/Tf%E2%80%93idf
-#    '''
-#    try:
-#        occurences_of_ngram = Node_Ngram.objects.get(node=document, ngram=ngram).weight
-#        ngrams_by_document = sum([ x.weight for x in Node_Ngram.objects.filter(node=document)])
-#        term_frequency = occurences_of_ngram / ngrams_by_document
-#    
-#        xx = Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")).count()
-#        yy = Node_Ngram.objects.filter(ngram=ngram).count() # filter: ON node.parent=corpus
-#        inverse_document_frequency= log(xx/yy)
-#        
-#        # result = tf * idf
-#        result = term_frequency * inverse_document_frequency
-#    except Exception as error:
-#        print(error, ngram)
-#        result = 0
-#    return result
-
 from analysis.tfidf import tfidf

 def do_tfidf(corpus, reset=True):

--- a/gargantext_web/views.py
+++ b/gargantext_web/views.py
@@ -25,6 +25,7 @@ from django import forms
 from collections import defaultdict

 from parsing.FileParsers import *
+import os

 # SOME FUNCTIONS

@@ -282,9 +283,8 @@ def project(request, project_id):
    cooclists       = ""#.children.filter(type=type_cooclist)
    
    for corpus in corpora:
-        # print("corpus", corpus.pk , corpus.name , corpus.type_id)
-
        docs_count =  Node.objects.filter(parent=corpus, type=type_document).count()
+        # print("corpus:", corpus.pk , " | name:",corpus.name , " | type:",corpus.type_id , " | #docs:",docs_count)
        docs_total += docs_count
        
        corpus_view = dict()
@@ -727,6 +727,7 @@ def graph(request, project_id, corpus_id):
            'date'      : date,\
            'corpus'    : corpus,\
            'project'   : project,\
+            'graphfile' : "hola_mundo",\
            }))
    
    return HttpResponse(html)
@@ -839,18 +840,23 @@ def send_csv(request, corpus_id):
 from gargantext_web.api import JsonHttpResponse
 from analysis.functions import get_cooc
 import json
-
+from gargantext_web.settings import MEDIA_ROOT
 def node_link(request, corpus_id):
    '''
    Create the HttpResponse object with the node_link dataset.
    '''   
-    import time
-    print("In node_link() START")
-    start = time.time()
+
+    data = []
+    
+    corpus = Node.objects.get(id=corpus_id)
+    filename = MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (request.user , corpus.parent.id, corpus_id)
+    print("file exists?:",os.path.isfile(filename))
+    if os.path.isfile(filename):
+        json_data = open(filename,"r")
+        data = json.load(json_data)
+        json_data.close()
+    else:
        data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
-    end = time.time()
-    print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" get_cooc() [s]",(end - start))
-    print("In node_link() END")
    return JsonHttpResponse(data)

 def adjacency(request, corpus_id):

--- a/node/models.py
+++ b/node/models.py
@@ -287,16 +287,6 @@ class Node(CTENode):
        for p in proc:
            p.join()

-
-    def pushScore( self , FINAL , n1,n2, score):
-        if not FINAL.has_key(n1):
-            FINAL[n1]=[]
-        FINAL[n1].append(score)
-        
-        if not FINAL.has_key(n2):
-            FINAL[n2]=[]
-        FINAL[n2].append(score)
-
    def parse_resources__MOV(self, verbose=False):
        # parse all resources into a list of metadata
        metadata_list = []
@@ -436,10 +426,12 @@ class Node(CTENode):
            docID = i[0]
            associations = i[1]

+            # [ considering just {2,3}-grams ]
            termsCount = 0
            for ngram_text, weight in associations.items():
                if ngram_text in NGram2ID: # considering just {2,3}-grams
                    termsCount+=1
+            # [ / considering just {2,3}-grams ]

            ngrams_by_document = termsCount # i re-calculed this because of *02*
            terms = []
@@ -562,12 +554,12 @@ class Node(CTENode):
        total += (end - start)
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources()__MOV [s]",(end - start))

-        # print("LOG::TIME: In workflow()    writeMetadata__MOV()")
-        # start = time.time()
-        # self.writeMetadata__MOV( metadata_list=theMetadata )
-        # end = time.time()
-        # total += (end - start)
-        # print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start))
+        print("LOG::TIME: In workflow()    writeMetadata__MOV()")
+        start = time.time()
+        self.writeMetadata__MOV( metadata_list=theMetadata )
+        end = time.time()
+        total += (end - start)
+        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start))


        print("LOG::TIME: In workflow()    extract_ngrams__MOV()")
@@ -585,33 +577,28 @@ class Node(CTENode):
        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
        # # print("LOG::TIME: In workflow()    / do_tfidf()")

-
-        # print("\n= = = = = = = = = = = = = = = =")
-        # print("NUMBER OF NGRAMS:",len(resultDict["G"]))
-        # # M = resultDict["metrics"]
-        # # Metrics2 = sorted(M, key=lambda x: M[x]['C'])
-
-        # # for i in Metrics2:
-        # #     print("as: ",i,":",M[i])
-        # print("= = = = = = = = = = = = = = = =\n")
-
+        start = time.time()
+        print("LOG::TIME: In workflow()    do_coocmatrix()")
        jsongraph = self.do_coocmatrix__MOV ( resultDict["TERMS"] , resultDict["G"] , n=150)
-
-        import pprint
-        pprint.pprint(jsongraph)
+        end = time.time()
+        total += (end - start)
+        print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_coocmatrix() [s]",(end - start))
+
+        print("the user:",self.user)
+        print("the project id:",self.parent.id)
+        print("the corpus id:",self.id)
+        # timestamp = str(datetime.datetime.now().isoformat())
+        # # filename = MEDIA_ROOT + '/corpora/%s/%s_%s__%s.json' % (self.user , self.parent.id, self.id , timestamp)
+        filename = MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (self.user , self.parent.id, self.id)
+        import json
+        f = open(filename,"w")
+        f.write( json.dumps(jsongraph) )
+        f.close()


        # # # this is not working
        # # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
        
-        # start = time.time()
-        # print("LOG::TIME: In workflow()    do_tfidf()")
-        # from analysis.functions import do_tfidf
-        # do_tfidf(self)
-        # end = time.time()
-        # total += (end - start)
-        # print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
-        # # # print("LOG::TIME: In workflow()    / do_tfidf()")
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+"   In workflow() END")



--- a/scrap_pubmed/views.py
+++ b/scrap_pubmed/views.py
@@ -132,10 +132,14 @@ def doTheQuery(request , project_id):
 		# do the WorkFlow
 		try:
 			if DEBUG is True:
-				corpus.workflow()
-				# corpus.workflow__MOV()
+				# corpus.workflow() # old times...
+				corpus.workflow__MOV()
+				# corpus.write_everything_to_DB()
 			else:
-				corpus.workflow.apply_async((), countdown=3)
+				# corpus.workflow.apply_async((), countdown=3)
+				corpus.workflow__MOV() # synchronous! because is faaast
+				# corpus.write_everything_to_DB.apply_async((), countdown=3) # asynchronous
+

 			return JsonHttpResponse(["workflow","finished"])
 		except Exception as error:

--- a/templates/project.html
+++ b/templates/project.html
@@ -84,10 +84,7 @@
 												<ul>
 														{% for corpus in corpora %}
 														<li> {% ifnotequal corpus.count 0 %}
-																		<a href="/project/{{project.id}}/corpus/{{corpus.id}}"> 
-																			{{corpus.name}}
-																		</a>
-																		, {{ corpus.count }} Documents 
+																		<a href="/project/{{project.id}}/corpus/{{corpus.id}}">  {{corpus.name}} </a> , {{ corpus.count }} Documents 
 															 {% else %}
 																 	{{corpus.name}} : <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Processing, drink a cup of tea, and refresh the page :)
 															 {% endifnotequal %}