[UPDATE] last progress (nothing definitive)

fcd75235 · PkSM3 · 5f4f3e0b · fcd75235 · fcd75235 · fcd75235
Commit fcd75235 authored Mar 25, 2015 by PkSM3
Showing with 146 additions and 82 deletions

requirements.txt init/requirements.txt +2 -2

corpustools.py parsing/corpustools.py +32 -5

MedlineFetcherDavid2015.py scrap_pubmed/MedlineFetcherDavid2015.py +9 -8

views.py scrap_pubmed/views.py +103 -67

No files found.
--- a/init/requirements.txt
+++ b/init/requirements.txt
@@ -38,9 +38,9 @@ graphviz==0.4
 ipython==2.2.0
 kombu==3.0.23
 lxml==3.4.1
-matplotlib==1.4.0
+#matplotlib==1.4.0
 networkx==1.9
-nltk==3.0a4
+#nltk==3.0a4
 nose==1.3.4
 numpy==1.8.2
 pandas==0.14.1

--- a/parsing/corpustools.py
+++ b/parsing/corpustools.py
@@ -206,34 +206,54 @@ def extract_ngrams(corpus, keys):
        .filter(Node.type_id == cache.NodeType['Document'].id)
    )
    # prepare data to be inserted
+    print("\n= = = = = =")
    dbg.show('find ngrams')
+    print('000001')
    languages_by_id = {
        language.id: language.iso2
        for language in session.query(Language)
    }
    
+    print('000002')
    ngrams_data = set()
    ngrams_language_data = set()
    ngrams_tag_data = set()

+    print('000003')
    node_ngram_list = defaultdict(lambda: defaultdict(int))
    for nodeinfo in metadata_query:
+        print('\t000004')
        node_id = nodeinfo[0]
        language_id = nodeinfo[1]
+
        if language_id is None:
            language_iso2 = default_language_iso2
        else:
            language_iso2 = languages_by_id.get(language_id, None)
        if language_iso2 is None:
            continue
+
+        print('\t000005')
+        print('\t',language_iso2)
        ngramsextractor = ngramsextractors[language_iso2]
+        print('\t',ngramsextractor)
+        print('\t000006')
        for text in nodeinfo[2:]:
            if text is not None and len(text):
+                print('\t\t000007')
                ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
+                # print(ngrams)
+                print('\t\t000008')
                for ngram in ngrams:
+                    print('\t\t\t000009')
+                    print('\t\t\t',ngram)
                    n = len(ngram)
+                    print('\t\t\tn:',n)
+                    print('\t\t\t000010')
                    terms    = ' '.join([token for token, tag in ngram]).lower()
-
+                    print('\t\t\t000011')
+                    import pprint
+                    pprint.pprint(cache.Tag)
                    # TODO BUG here
                    if n == 1:
                        tag_id   = cache.Tag[ngram[0][1]].id
@@ -243,13 +263,20 @@ def extract_ngrams(corpus, keys):
                        tag_id   = cache.Tag['NN'].id
                        #tag_id   =  14
                        #print('tag_id_2', tag_id)
-
+                    print('\t\t\t000012')
                    node_ngram_list[node_id][terms] += 1
-                    
+                    print('\t\t\t000013')
                    ngrams_data.add((n, terms))
+                    print('\t\t\t000014')
                    ngrams_language_data.add((terms, language_id))
+                    print('\t\t\t000015')
                    ngrams_tag_data.add((terms, tag_id))
-
+                    print('\t\t\t000016')
+                print('\t\t000018')
+        print('\t\t000019')
+        # dbg.show('\t000007')
+    
+    print('000020')
    # insert ngrams to temporary table
    dbg.show('find ids for the %d ngrams' % len(ngrams_data))
    db, cursor = get_cursor()
@@ -320,10 +347,10 @@ def extract_ngrams(corpus, keys):
    # commit to database
    db.commit()

+    print("= = = = = =\n")


 # tfidf calculation
-
 def compute_tfidf(corpus):
    dbg = DebugTime('Corpus #%d - tfidf' % corpus.id)
    # compute terms frequency sum

--- a/scrap_pubmed/MedlineFetcherDavid2015.py
+++ b/scrap_pubmed/MedlineFetcherDavid2015.py
@@ -40,7 +40,7 @@ class MedlineFetcher:

        "Get number of results for query 'query' in variable 'count'"
        "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
-        print(query)
+        # print(query)
        origQuery = query
        query = query.replace(' ', '%20')
            
@@ -79,7 +79,7 @@ class MedlineFetcher:

        queryNoSpace = query.replace(' ', '') # No space in directory and file names, avoids stupid errors
        
-        print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
+        # print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')

        retstart = 0
        eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
@@ -94,7 +94,7 @@ class MedlineFetcher:
    def downloadFile(self, item):
        url = item[0]
        filename = item[1]
-        print("\tin test_downloadFile:")
+        # print("\tin test_downloadFile:")
        # print(url,filename)
        data = urlopen(url)
        f = codecs.open(filename, "w" ,encoding='utf-8')
@@ -110,7 +110,7 @@ class MedlineFetcher:
    def test_downloadFile(self, item):
        url = item[0]
        filename = item[1]
-        print("\tin downloadFile:")
+        # print("\tin downloadFile:")
        data = urlopen(url)
        return data

@@ -119,7 +119,7 @@ class MedlineFetcher:
        # time.sleep(1) # pretend to do some lengthy work.
        returnvalue = self.medlineEsearch(item)
        with self.lock:
-            print(threading.current_thread().name, item)
+            # print(threading.current_thread().name, item)
            return returnvalue

    # The worker thread pulls an item from the queue and processes it
@@ -160,13 +160,13 @@ class MedlineFetcher:

        N = 0

-        print ("MedlineFetcher::serialFetcher :")
+        # print ("MedlineFetcher::serialFetcher :")
        thequeries = []
        globalresults = []
        for i in range(yearsNumber):
            year = str(2015 - i)
-            print ('YEAR ' + year)
-            print ('---------\n')
+            # print ('YEAR ' + year)
+            # print ('---------\n')
            pubmedquery = str(year) + '[dp] '+query
            self.q.put( pubmedquery ) #put task in the queue
        
@@ -196,5 +196,6 @@ class MedlineFetcher:
            retmax_forthisyear = int(round(globalLimit*proportion))
            query["retmax"] = retmax_forthisyear
            if query["retmax"]==0: query["retmax"]+=1
+            print(query["string"],"\t[",k,">",query["retmax"],"]")

        return thequeries
--- a/scrap_pubmed/views.py
+++ b/scrap_pubmed/views.py
-from django.shortcuts import redirect
-from django.shortcuts import render

-from django.http import Http404, HttpResponse, HttpResponseRedirect
 from django.template.loader import get_template
 from django.template import Context
 from django.contrib.auth.models import User, Group

 from scrap_pubmed.MedlineFetcherDavid2015 import MedlineFetcher

-from gargantext_web.api import JsonHttpResponse
 from urllib.request import urlopen, urlretrieve
 import json

-from gargantext_web.settings import MEDIA_ROOT
 # from datetime import datetime
 import time
 import datetime
@@ -21,9 +16,23 @@ import threading
 from django.core.files import File
 from gargantext_web.settings import DEBUG

-from node.models import Language, ResourceType, Resource, \
-        Node, NodeType, Node_Resource, Project, Corpus, \
-        Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram
+
+from django.shortcuts import redirect
+from django.shortcuts import render
+from django.http import Http404, HttpResponse, HttpResponseRedirect, HttpResponseForbidden
+
+from sqlalchemy import func
+from sqlalchemy.orm import aliased
+
+from collections import defaultdict
+import threading
+
+from node.admin import CustomForm
+from gargantext_web.db import *
+from gargantext_web.settings import DEBUG, MEDIA_ROOT
+from gargantext_web.api import JsonHttpResponse
+
+from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf


 def getGlobalStats(request ):
@@ -31,7 +40,7 @@ def getGlobalStats(request ):
 	alist = ["bar","foo"]

 	if request.method == "POST":
-		N = 100
+		N = 10
 		query = request.POST["query"]
 		print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
 		print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
@@ -72,17 +81,57 @@ def getGlobalStatsISTEXT(request ):
 def doTheQuery(request , project_id):
 	alist = ["hola","mundo"]

-	if request.method == "POST":
-		# query = request.POST["query"]
-		# name = request.POST["string"]
+	# SQLAlchemy session
+	session = Session()

-		# instancia = MedlineFetcher()
-		# thequeries = json.loads(query)
+	# do we have a valid project id?
+	try:
+		project_id = int(project_id)
+	except ValueError:
+		raise Http404()

-		# urlreqs = []
-		# for yearquery in thequeries:
-		# 	urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
-		# alist = ["tudo fixe" , "tudo bem"]
+	# do we have a valid project?
+	project = (session
+		.query(Node)
+		.filter(Node.id == project_id)
+		.filter(Node.type_id == cache.NodeType['Project'].id)
+	).first()
+
+	if project is None:
+		raise Http404()
+
+	# do we have a valid user?
+	user = request.user
+	if not user.is_authenticated():
+		return redirect('/login/?next=%s' % request.path)
+	if project.user_id != user.id:
+		return HttpResponseForbidden()
+
+
+	if request.method == "POST":
+		query = request.POST["query"]
+		name = request.POST["string"]
+
+		instancia = MedlineFetcher()
+		thequeries = json.loads(query)
+
+		urlreqs = []
+		for yearquery in thequeries:
+			urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
+		alist = ["tudo fixe" , "tudo bem"]
+
+		resourcetype = cache.ResourceType["pubmed"]
+
+		# corpus node instantiation (persisted through the SQLAlchemy session)
+		corpus = Node(
+			name = name,
+			user_id = request.user.id,
+			parent_id = project_id,
+			type_id = cache.NodeType['Corpus'].id,
+			language_id = None,
+		)
+		session.add(corpus)
+		session.commit()

 		# """
 		# urlreqs: List of urls to query.
@@ -91,57 +140,44 @@ def doTheQuery(request , project_id):
 		# 	eFetchResult.read()  # this will output the XML... normally you write this to a XML-file.
 		# """

-		# thefile = "how we do this here?"
-		# resource_type = ResourceType.objects.get(name="pubmed" )

-		# parent      = Node.objects.get(id=project_id)
-		# node_type   = NodeType.objects.get(name='Corpus')
-		# type_id = NodeType.objects.get(name='Document').id
-		# user_id = User.objects.get( username=request.user ).id
-
-		# corpus = Node(
-		# 	user=request.user,
-		# 	parent=parent,
-		# 	type=node_type,
-		# 	name=name,
-		# )
-		# corpus.save()
-
-		# tasks = MedlineFetcher()
-		# for i in range(8):
-		# 	t = threading.Thread(target=tasks.worker2) #thing to do
-		# 	t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
-		# 	t.start()
-		# for url in urlreqs:
-		# 	filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-		# 	tasks.q.put( [url , filename]) #put a task in th queue
-		# tasks.q.join() # wait until everything is finished
-
-		# dwnldsOK = 0
-		# for filename in tasks.firstResults:
-		# 	if filename!=False:
-		# 		corpus.add_resource( user=request.user, type=resource_type, file=filename )
-		# 		dwnldsOK+=1
+		tasks = MedlineFetcher()
+		for i in range(8):
+			t = threading.Thread(target=tasks.worker2) #thing to do
+			t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
+			t.start()
+		for url in urlreqs:
+			filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
+			tasks.q.put( [url , filename]) #put a task in th queue
+		tasks.q.join() # wait until everything is finished
+
+		dwnldsOK = 0
+		for filename in tasks.firstResults:
+			if filename!=False:
+				# add the uploaded resource to the corpus
+				add_resource(corpus,
+					user_id = request.user.id,
+					type_id = resourcetype.id,
+					file = filename,
+				)
+				dwnldsOK+=1
 			
-		# if dwnldsOK == 0: return JsonHttpResponse(["fail"])
-
-		# # do the WorkFlow
-		# try:
-		# 	if DEBUG is True:
-		# 		# corpus.workflow() # old times...
-		# 		corpus.workflow__MOV()
-		# 		# corpus.write_everything_to_DB()
-		# 	else:
-		# 		# corpus.workflow.apply_async((), countdown=3)
-		# 		corpus.workflow__MOV().apply_async((), countdown=3) # synchronous! because is faaast
-		# 		# corpus.write_everything_to_DB.apply_async((), countdown=3) # asynchronous
-
-
-		# 	return JsonHttpResponse(["workflow","finished"])
-		# except Exception as error:
-		# 	print(error)
-
-		return JsonHttpResponse(["out of service for the moment"])
+		if dwnldsOK == 0: return JsonHttpResponse(["fail"])
+
+		try: parse_resources(corpus)
+		except Exception as error: print("!OK parse:",error)
+
+		try: extract_ngrams(corpus, ['title'])
+		except Exception as error: print("!OK ngrams:",error)
+
+		# try: compute_tfidf(corpus)
+		# except Exception as error: print("!OK tfidf:",error)
+		
+		# # except Exception as error:
+		# # 	print('WORKFLOW ERROR')
+		# # 	print(error)
+		# # # redirect to the main project page
+		return HttpResponseRedirect('/project/' + str(project_id))

 	data = alist
 	return JsonHttpResponse(data)