[UPDATE] dynamic query for pubmed: 90%

92a7ee21 · PkSM3 · 955dd49b · 92a7ee21 · 92a7ee21 · 92a7ee21
Commit 92a7ee21 authored Jan 23, 2015 by PkSM3
10 changed files
--- a/gargantext_web/settings.py
+++ b/gargantext_web/settings.py
@@ -66,6 +66,7 @@ INSTALLED_APPS = (
    'cte_tree',
    'node',
    'ngram',
+    'scrap_pubmed',
    'django_hstore',
    'djcelery',
    'aldjemy',

--- a/gargantext_web/urls.py
+++ b/gargantext_web/urls.py
@@ -6,6 +6,7 @@ from django.contrib.auth.views import login
 from gargantext_web import views

 import gargantext_web.api
+import scrap_pubmed.views as pubmedscrapper


 admin.autodiscover()
@@ -65,7 +66,10 @@ urlpatterns = patterns('',
    url(r'^ngrams$', views.ngrams),
    url(r'^nodeinfo/(\d+)$', views.nodeinfo),
    url(r'^tests/mvc$', views.tests_mvc),
-    url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments)
+    url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
+    url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
+    url(r'^tests/pubmedquery/go$', pubmedscrapper.doTheQuery)
+
 )



--- a/scrap_pubmed/MedlineFetcherDavid2015.py
+++ b/scrap_pubmed/MedlineFetcherDavid2015.py
+# ****************************
+# *****  Medline Fetcher *****
+# ****************************
+
+# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays
+import sys
+if sys.version_info >= (3, 0): from urllib.request import urlopen
+else: from urllib import urlopen
+import os
+import time
+# import libxml2
+from lxml import etree
+
+class MedlineFetcher:
+    """Small client for the NCBI E-utilities (esearch / efetch) PubMed endpoints.
+
+    Typical use: `serialFetcher` runs one esearch per publication year and
+    returns, for each year, the history handle (`queryKey` / `webEnv`) plus the
+    number of records (`retmax`) to download with `medlineEfetchRAW`.
+    """
+
+    def __init__(self):
+        """Set the endpoint constants and create the output directory if needed."""
+        self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
+        self.pubMedDB = 'Pubmed'
+        self.reportType = 'medline'
+        # efetch dumps are written below this directory, relative to the
+        # current working directory of the process.
+        self.personalpath_mainPath = 'MedLine/'
+        if not os.path.isdir(self.personalpath_mainPath):
+            os.makedirs(self.personalpath_mainPath)
+            print ('Created directory ' + self.personalpath_mainPath)
+
+    def medlineEsearch(self , query):
+        """Run an esearch for `query` and return its result count and history handle.
+
+        Args:
+            query: raw (not yet URL-encoded) PubMed search term.
+
+        Returns:
+            dict with keys:
+            - "count": number of matching records (int)
+            - "queryKey", "webEnv": history identifiers consumed by `medlineEfetchRAW`
+
+        Raises:
+            IndexError: if the response lacks Count, QueryKey or WebEnv.
+        """
+        # Function-scope import, version-switched like the urlopen import at
+        # the top of the file.
+        if sys.version_info >= (3, 0): from urllib.parse import quote
+        else: from urllib import quote
+
+        print ("MedlineFetcher::medlineEsearch :")
+
+        # Percent-encode the whole term, not only the spaces: brackets, '&',
+        # '#' or non-ASCII characters would otherwise corrupt the request URL.
+        # Encoding to UTF-8 bytes first keeps quote() working on both Python 2
+        # (unicode input) and Python 3.
+        if not isinstance(query, bytes):
+            query = query.encode('utf-8')
+        query = quote(query)
+
+        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' %(self.pubMedEutilsURL, self.pubMedDB, query)
+        eSearchResult = urlopen(eSearch)
+        try:
+            data = eSearchResult.read()
+        finally:
+            # Release the connection even when reading fails.
+            eSearchResult.close()
+
+        root = etree.XML(data)
+
+        findcount = etree.XPath("/eSearchResult/Count/text()")
+        count = findcount(root)[0]
+
+        findquerykey = etree.XPath("/eSearchResult/QueryKey/text()")
+        queryKey = findquerykey(root)[0]
+
+        findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
+        webEnv = findwebenv(root)[0]
+
+        values = { "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
+        return values
+
+
+    # RETMAX:
+    # Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
+    # maximum of 100,000 records
+    def medlineEfetchRAW( self , fullquery):
+        """Download the first `retmax` records of a previous esearch to an XML file.
+
+        Args:
+            fullquery: dict as produced by `serialFetcher`, with keys
+                "string", "retmax", "count", "queryKey" and "webEnv".
+
+        The output file is `<personalpath_mainPath>Pubmed_<query without spaces>.xml`.
+        Only the first batch is fetched: the loop is deliberately cut short by
+        the `break` below.
+        """
+        query = fullquery["string"]
+        retmax = fullquery["retmax"]
+        count = fullquery["count"]
+        queryKey = fullquery["queryKey"]
+        webEnv = fullquery["webEnv"]
+
+        print ("MedlineFetcher::medlineEfetchRAW :")
+
+        queryNoSpace = query.replace(' ', '') # No space in directory and file names, avoids stupid errors
+        # The query can come from a web form: neutralise path separators so the
+        # file cannot be written outside of the output directory.
+        for separator in (os.sep, os.altsep, '/'):
+            if separator:
+                queryNoSpace = queryNoSpace.replace(separator, '_')
+
+        pubMedResultFileName = self.personalpath_mainPath + 'Pubmed_' + queryNoSpace + '.xml'
+
+        print ('Query "' , query , '"\t:\t' , count , ' results')
+        print ('Starting fetching at ' , time.asctime(time.localtime()) )
+
+        retstart = 0
+        # Binary mode stores the payload exactly as received on both Python 2
+        # and 3, without depending on the locale encoding; `with` closes the
+        # file even if a request fails.
+        with open(pubMedResultFileName, 'wb') as pubMedResultFile:
+            while(retstart < count):
+                eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
+                eFetchResult = urlopen(eFetch)
+                try:
+                    pubMedResultFile.write(eFetchResult.read())
+                finally:
+                    eFetchResult.close()
+                retstart += retmax
+                break # you shall not pass !! (only the first batch is wanted)
+
+        print ('Fetching for query ' , query , ' finished at ' , time.asctime(time.localtime()) )
+        print (retmax , ' results written to file ' , pubMedResultFileName , '\n' )
+        print("------------------------------------------")
+
+
+
+    # GLOBALLIMIT:
+    # Approximate number of publications to retrieve over all the years.
+    # The number retrieved for one year is (k/N)*GlobalLimit, used as RETMAX, where
+    # - k : number of publications of that year (according to PubMed)
+    # - N : sum of k over all the requested years
+    # - GlobalLimit : number of publications wanted.
+    def serialFetcher(self , yearsNumber , query, globalLimit, lastYear=None):
+        """Plan a sample of about `globalLimit` records spread over several years.
+
+        Args:
+            yearsNumber: how many consecutive years to query, counting backwards.
+            query: PubMed search term; "<year>[dp] " is prepended for each year.
+            globalLimit: total number of records wanted over all years.
+            lastYear: most recent year to query; defaults to the current year
+                (this used to be hard-coded to 2015).
+
+        Returns:
+            list of dicts, newest year first, with keys "string", "count",
+            "queryKey", "webEnv" and "retmax". Because each share is rounded,
+            the "retmax" values may not add up to exactly `globalLimit`.
+        """
+        if lastYear is None:
+            lastYear = time.localtime().tm_year
+
+        N = 0
+
+        print ("MedlineFetcher::serialFetcher :")
+        thequeries = []
+        for i in range(yearsNumber):
+            year = str(lastYear - i)
+            print ('YEAR ' + year)
+            print ('---------\n')
+            pubmedquery = year + '[dp] '+query
+            globalresults = self.medlineEsearch(pubmedquery)
+            N+=globalresults["count"]
+            querymetadata = {
+                "string": pubmedquery ,
+                "count": globalresults["count"] ,
+                "queryKey":globalresults["queryKey"] ,
+                "webEnv":globalresults["webEnv"] ,
+                "retmax":0
+            }
+            thequeries.append ( querymetadata )
+
+        print("Total Number:", N,"publications")
+        print("And i want just:",globalLimit,"publications")
+        print("---------------------------------------\n")
+
+        for yearquery in thequeries:
+            k = yearquery["count"]
+            # N == 0 means no year matched anything: every share is 0 instead
+            # of dividing by zero.
+            percentage = k/float(N) if N else 0.0
+            yearquery["retmax"] = int(round(globalLimit*percentage))
+
+        print ('Done !')
+        return thequeries
+
+
+
+# serialFetcher(yearsNumber=3, 'microbiota' , globalLimit=100 )
+# query = str(2015)+ '[dp] '+'microbiota'
+# medlineEsearch( query )
+
+# 
--- a/scrap_pubmed/__init__.py
+++ b/scrap_pubmed/__init__.py
--- a/scrap_pubmed/admin.py
+++ b/scrap_pubmed/admin.py
+from django.contrib import admin
+
+# Register your models here.
--- a/scrap_pubmed/models.py
+++ b/scrap_pubmed/models.py
+from django.db import models
+
+# Create your models here.
--- a/scrap_pubmed/tests.py
+++ b/scrap_pubmed/tests.py
+from django.test import TestCase
+
+# Create your tests here.
--- a/scrap_pubmed/views.py
+++ b/scrap_pubmed/views.py
+from django.shortcuts import redirect
+from django.shortcuts import render
+
+from django.http import Http404, HttpResponse, HttpResponseRedirect
+from django.template.loader import get_template
+from django.template import Context
+
+from scrap_pubmed.MedlineFetcherDavid2015 import MedlineFetcher
+
+from gargantext_web.api import JsonHttpResponse
+# Create your views here.
+
+def getGlobalStats(request ):
+	"""Return per-year PubMed statistics for the submitted query as JSON.
+
+	For a POST carrying a non-blank "query" field, the response is the list
+	built by MedlineFetcher.serialFetcher over 5 years with a global limit of
+	200 records. Any other request gets the placeholder list ["bar","foo"].
+	"""
+	print(request.method)
+	alist = ["bar","foo"]
+
+	if request.method == "POST":
+		# .get() instead of [...]: a POST without the "query" field must not
+		# raise (which would surface as an HTTP 500).
+		query = request.POST.get("query", "").strip()
+		# A blank term would ask PubMed for every publication of each year.
+		if query:
+			instancia = MedlineFetcher()
+			alist = instancia.serialFetcher( 5, query , 200 )
+
+	data = alist
+	return JsonHttpResponse(data)
+
+
+def doTheQuery(request ):
+	"""Placeholder endpoint for launching the PubMed fetch.
+
+	Logs the HTTP method and, for POST requests, the submitted form data,
+	then answers with a fixed JSON list; no query is executed yet.
+	"""
+	print(request.method)
+	if request.method == "POST":
+		print(request.POST)
+	return JsonHttpResponse(["hola","mundo"])
\ No newline at end of file
--- a/templates/project.html
+++ b/templates/project.html
@@ -25,7 +25,7 @@
        <div class="col-md-6">
            {% if project %}
            <h1>{{ project.name }}</h1>
-						<!--<h3> {{number}} corpora </h3>--!>
+						<!--<h3> {{number}} corpora </h3>-->
            {% endif %}
        </div>


--- a/templates/project_dynamic-pubmed.html
+++ b/templates/project_dynamic-pubmed.html
+
+{% extends "menu.html" %}
+
+{% block css %}
+{% load staticfiles %}
+<link rel="stylesheet" href="{% static "css/bootstrap.css" %}">
+
+<link rel="stylesheet" type="text/css" href="{% static "css/morris.css" %}">
+<link rel="stylesheet" type="text/css" href="{% static "css/jquery.easy-pie-chart.css"%}">
+<script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script>
+
+<script src="{% static "js/raphael-min.js"%}"></script>
+<script src="{% static "js/morris.min.js"%}"></script>
+
+
+{% endblock %}
+
+
+
+{% block content %}
+
+<div class="container theme-showcase" role="main">
+        <div class="jumbotron">
+				<div class="row">
+        <div class="col-md-6">
+            {% if project %}
+            <h1>{{ project.name }}</h1>
+						<!--<h3> {{number}} corpora </h3>-->
+            {% endif %}
+        </div>
+
+        <div class="col-md-4">
+				<p>
+				{% if donut %}
+	  		<div id="hero-donut" style="height: 200px;"></div>
+				{% endif %}
+        <center>
+            <button 
+								type="button" 
+								class="btn btn-primary btn-lg" 
+								data-container="body" 
+								data-toggle="popover" 
+								data-placement="bottom"
+								>Add a corpus</button>
+								<div id="popover-content" class="hide">
+
+					<form enctype="multipart/form-data" action="/project/{{project.id}}/" method="post">
+    {% csrf_token %}
+		{{ form.non_field_errors }}
+		{{ form.as_p}}
+		
+		{{ formResource.non_field_errors }}
+		{{ formResource.as_p}}
+		<input onclick='$("#semLoader").css("visibility", "visible"); $("#semLoader").show();' type="submit" name="submit" id="submit" class="btn" value="Add this corpus" /><div>
+		<div id="pubmedcrawl" align="right"><a data-toggle="modal" href="#stack1">&#10142; Query directly in PubMed</a></div>
+    </center>
+				</p>
+
+								</div>
+		        </div>
+
+        </div>
+
+
+        </div>
+        </div>
+</div>
+<!-- Add jumbotron container for each type of corpus (presse, science etc.) -->
+
+          <div id="semLoader" style="position:absolute; top:50%; left:40%; width:80px; visibility: hidden;">
+							<img src="{% static "js/libs/img2/loading-bar.gif" %}"></img>
+          </div>
+
+<div class="container">
+				{% if list_corpora  %}
+								<h1>Resources</h1>
+								<h2>Corpora</h2>
+										<ul>
+										{% for key, corpora in list_corpora.items %}
+										<li>{{ key }}</li>
+												<ul>
+														{% for corpus in corpora %}
+														<li> {% ifnotequal corpus.count 0 %}
+																		<a href="/project/{{project.id}}/corpus/{{corpus.id}}"> 
+																			{{corpus.name}}
+																		</a>
+																		, {{ corpus.count }} Documents 
+																 {% else %}
+																 {{corpus.name}} : <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Processing, drink a cup of tea, and refresh the page :)
+																 {% endifnotequal %}
+																		<button type="button" class="btn btn-xs btn-default" data-container="body" data-toggle="popover" data-placement="bottom" 
+																		data-content='
+																		<ul>
+																		<li> Rename </li>
+																		<li> Add new documents </li>
+																		<li><a href="/project/{{ project.id }}/corpus/{{ corpus.id}}/delete">Delete</a></li>
+																		</ul>
+																		'>Manage</button>
+																</li>
+														{% endfor %}
+												</ul>
+										{% endfor %}
+										</ul>
+				{% endif %}
+
+
+								{% if list_corporax  %}
+										<div class="col-md-4">
+                        <h3><a href="/project/{{project.id}}/corpus/{{corpus.id}}">{{corpus.name}}</a>
+												</h3>
+												<h4>{{ corpus.count }} Documents </h4>
+												<h5>Activity:</h5>
+												<div class="chart" data-percent="73">73%</div>
+										</div>
+								{% endif %}
+				
+						{% if whitelists  %}
+						<h2>Lists of Ngrams</h2>
+								<h3>White Lists</h3>
+								{% for list in whitelists %}
+										<ul>
+                    <li> {{list.name }}
+										</ul>
+								{% endfor %}
+						{% endif %}
+						
+            {% if blacklists  %}
+						<h3>Black Lists</h3>
+								{% for list in blacklists %}
+										<ul>
+                    <li> {{list.name }}
+										</ul>
+								{% endfor %}
+						{% endif %}
+						
+
+				
+        {% if cooclists  %}
+						<h2>Results (graphs)</h2>
+						<h3>Cooccurrences Lists</h3>
+								{% for list in cooclists %}
+										<ul>
+                    <li> {{list.name }}
+										</ul>
+								{% endfor %}
+						{% endif %}
+			  
+</div>
+
+
+  <!-- Modal -->
+  <div class="modal fade" id="stack1" tabindex="-1" role="dialog" aria-labelledby="myModalLabel" aria-hidden="true">
+    <div class="modal-dialog">
+      <div class="modal-content">
+
+		<div class="modal-header">
+			<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
+			<h3>Query to PubMed</h3>
+		</div>
+		<div class="modal-body">
+			<p>One fine body…</p>
+			<input id="daquery" type="text" class="input-lg" data-tabindex="2">
+			<a onclick="getGlobalResults();" class="btn">Scan</a>
+			<div id="results"></div>
+		</div>
+		<div class="modal-footer">
+		  <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
+		  <button onclick="doTheQuery();" disabled id="id_thebutton" type="button" class="btn btn-primary">Explore a sample!</button>
+		</div>
+      </div><!-- /.modal-content -->
+    </div><!-- /.modal-dialog -->
+  </div><!-- /.modal -->
+
+
+<script>
+		// Return the decoded value of the cookie called `name`, or null when
+		// the document has no such cookie.
+		function getCookie(name) {
+		    if (!document.cookie || document.cookie == '') {
+		        return null;
+		    }
+		    var wanted = name + '=';
+		    var keyLength = name.length + 1;
+		    var entries = document.cookie.split(';');
+		    for (var idx = 0; idx < entries.length; idx++) {
+		        var entry = jQuery.trim(entries[idx]);
+		        // Only the first entry starting with "<name>=" is used.
+		        if (entry.substring(0, keyLength) == wanted) {
+		            return decodeURIComponent(entry.substring(keyLength));
+		        }
+		    }
+		    return null;
+		}
+
+		var thequeries = []
+
+		// POST the per-year queries collected by the last scan to
+		// /tests/pubmedquery/go. Does nothing while the "Explore" button is
+		// disabled, i.e. before a scan has returned results.
+		function doTheQuery() {
+			if ( $('#id_thebutton').prop('disabled') ) return;
+			console.log("in doTheQuery:");
+
+			// `formData` used to be read here without being defined in this
+			// scope, so every click threw a ReferenceError. The payload is
+			// the scan result, sent as one JSON string in the "query" field.
+			var formData = { query: JSON.stringify(thequeries) };
+
+		    $.ajax({
+			  // contentType: "application/json",
+		      url: window.location.origin+"/tests/pubmedquery/go",
+		      data: formData,
+		      type: 'POST',
+		      beforeSend: function(xhr) {
+		        // Django rejects POSTs that lack the CSRF token header.
+		        xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
+		      },
+		      success: function(data) {
+				console.log("in doTheQuery()")
+		        console.log(data)
+		      },
+		        error: function(result) {
+		            console.log("Data not found");
+		        }
+		    });
+
+		}
+
+		// Send the typed query to /tests/pubmedquery, store the per-year
+		// answer in `thequeries` and show the total number of publications.
+		// The "Explore" button is enabled only when the total is positive.
+		function getGlobalResults(){
+			// AJAX to django
+			var pubmedquery = $("#daquery").val()
+			var formData = {query:pubmedquery}
+			$("#results").html('<img width="30px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img>')
+
+			// A new scan invalidates the previous one: forget its results and
+			// lock the button until this scan succeeds.
+			thequeries = []
+			$('#id_thebutton').prop('disabled', true);
+
+		    $.ajax({
+			  // contentType: "application/json",
+		      url: window.location.origin+"/tests/pubmedquery",
+		      data: formData,
+		      type: 'POST',
+		      beforeSend: function(xhr) {
+		        // Django rejects POSTs that lack the CSRF token header.
+		        xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
+		      },
+		      success: function(data) {
+				console.log("in getGlobalResults")
+		        console.log(data)
+
+	            thequeries = Array.isArray(data) ? data : []
+	            var N=0;
+	            // Indexed loop (not for..in) and a numeric fallback, so an
+	            // entry without a usable `count` cannot turn the total into NaN.
+	            for(var i=0; i<thequeries.length; i++) N += Number(thequeries[i].count) || 0
+
+	            if(N>0) {
+	            	$("#results").html("Result: "+N+" publications in the last 5 years")
+	            	$('#id_thebutton').prop('disabled', false);
+	            } else {
+	            	// Replace the spinner, which otherwise spins forever.
+	            	$("#results").html("No publications found")
+	            }
+
+		      },
+		        error: function(result) {
+		            console.log("Data not found");
+		            // Replace the spinner here too.
+		            $("#results").html("Data not found")
+		        }
+		    });
+
+
+		}
+        // Morris Donut Chart
+        // Emitted only when `donut` is set: the #hero-donut placeholder is
+        // rendered under the same condition, and Morris fails when its target
+        // element is missing.
+        {% if donut %}
+        Morris.Donut({
+            element: 'hero-donut',
+            data: [
+            {% for part in donut %}
+						// escapejs: the label sits inside a JS string literal.
+						{label: '{{ part.source|escapejs }}', value: {{ part.part }} },
+            {% endfor %}
+
+            ],
+            colors: ["@white", "@white"],
+            //colors: ["#30a1ec", "#76bdee"],
+            formatter: function (y) { return y + "%" }
+        });
+        {% endif %}
+
+</script>
+
+
+
+
+{% endblock %}