Commit 92a7ee21 authored by PkSM3's avatar PkSM3

[UPDATE] dynamic query for pubmed: 90%

parent 955dd49b
......@@ -66,6 +66,7 @@ INSTALLED_APPS = (
'cte_tree',
'node',
'ngram',
'scrap_pubmed',
'django_hstore',
'djcelery',
'aldjemy',
......
......@@ -6,6 +6,7 @@ from django.contrib.auth.views import login
from gargantext_web import views
import gargantext_web.api
import scrap_pubmed.views as pubmedscrapper
admin.autodiscover()
......@@ -65,7 +66,10 @@ urlpatterns = patterns('',
url(r'^ngrams$', views.ngrams),
url(r'^nodeinfo/(\d+)$', views.nodeinfo),
url(r'^tests/mvc$', views.tests_mvc),
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments)
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'^tests/pubmedquery/go$', pubmedscrapper.doTheQuery)
)
......
# ****************************
# ***** Medline Fetcher *****
# ****************************
# MEDLINE USER REQUIREMENT : Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays
import sys
if sys.version_info >= (3, 0): from urllib.request import urlopen
else: from urllib import urlopen
import os
import time
# import libxml2
from lxml import etree
class MedlineFetcher:
    """Thin client around the NCBI E-utilities ESearch / EFetch endpoints for PubMed."""

    def __init__(self):
        """Store the endpoint settings and create the output directory if needed."""
        self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
        self.pubMedDB = 'Pubmed'
        self.reportType = 'medline'
        self.personalpath_mainPath = 'MedLine/'
        if not os.path.isdir(self.personalpath_mainPath):
            os.makedirs(self.personalpath_mainPath)
            print ('Created directory ' + self.personalpath_mainPath)

    def medlineEsearch(self, query):
        """Run an ESearch for ``query`` and return the hit count and history keys.

        Returns a dict with:
          - "count"    : number of matching records (int)
          - "queryKey" : history query key, consumed by medlineEfetchRAW
          - "webEnv"   : history web environment, consumed by medlineEfetchRAW

        Raises IndexError when the response lacks one of those elements.
        """
        print ("MedlineFetcher::medlineEsearch :")
        # Local import so the method works on both Python 2 and Python 3.
        try:
            from urllib.parse import quote
        except ImportError:
            from urllib import quote
        # Percent-encode the whole term: escaping spaces only lets characters
        # such as '&', '#' or '+' change the meaning of the request URL.
        term = quote(query.encode('utf-8'), safe='')
        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' % (
            self.pubMedEutilsURL, self.pubMedDB, term)
        eSearchResult = urlopen(eSearch)
        try:
            data = eSearchResult.read()
        finally:
            # Always release the connection, even if reading fails.
            eSearchResult.close()
        root = etree.XML(data)
        count = root.xpath("/eSearchResult/Count/text()")[0]
        queryKey = root.xpath("/eSearchResult/QueryKey/text()")[0]
        webEnv = root.xpath("/eSearchResult/WebEnv/text()")[0]
        values = {"count": int(str(count)), "queryKey": queryKey, "webEnv": webEnv}
        return values

    # RETMAX:
    # Total number of UIDs from the retrieved set to be shown in the XML output (default=20)
    # maximum of 100,000 records
    def medlineEfetchRAW(self, fullquery):
        """Fetch the first ``retmax`` records of a stored query and save them as XML.

        ``fullquery`` is a dict with the keys "string", "retmax", "count",
        "queryKey" and "webEnv" (the shape built by serialFetcher). The result
        is written to '<mainPath>Pubmed_<query without spaces>.xml'.
        """
        query = fullquery["string"]
        retmax = fullquery["retmax"]
        count = fullquery["count"]
        queryKey = fullquery["queryKey"]
        webEnv = fullquery["webEnv"]
        print ("MedlineFetcher::medlineEfetchRAW :")
        # No space in directory and file names, avoids stupid errors.
        # NOTE(review): the query text ends up in a file name unsanitized;
        # confirm it can never contain path separators before exposing this.
        queryNoSpace = query.replace(' ', '')
        pubMedResultFileName = self.personalpath_mainPath + 'Pubmed_' + queryNoSpace + '.xml'
        print ('Query "' , query , '"\t:\t' , count , ' results')
        print ('Starting fetching at ' , time.asctime(time.localtime()) )
        retstart = 0
        # Binary mode stores the server bytes untouched on Python 2 and 3 alike
        # (no decode / locale re-encode round trip); 'with' closes the file
        # even when the download raises.
        with open(pubMedResultFileName, 'wb') as pubMedResultFile:
            while retstart < count:
                eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' % (
                    self.pubMedEutilsURL, self.reportType, retstart, retmax,
                    self.pubMedDB, queryKey, webEnv)
                eFetchResult = urlopen(eFetch)
                try:
                    pubMedResultFile.write(eFetchResult.read())
                finally:
                    eFetchResult.close()
                retstart += retmax
                # Deliberate cap: only the first batch is downloaded. It also
                # keeps the loop finite when retmax is 0.
                break
        print ('Fetching for query ' , query , ' finished at ' , time.asctime(time.localtime()) )
        print (retmax , ' results written to file ' , pubMedResultFileName , '\n' )
        print("------------------------------------------")

    # GLOBALLIMIT:
    # I will retrieve this exact amount of publications.
    # The publications i'll retrieve per year will be = (k/N)*GlobalLimit <- i'll use this as RETMAX
    # - k : Number of publications of x year (according to pubmed)
    # - N : Sum of every k belonging to {X} (total number of pubs according to pubmed)
    # - GlobalLimit : Number of publications i want.
    def serialFetcher(self, yearsNumber, query, globalLimit, lastYear=2015):
        """Count the hits of ``query`` per year and share ``globalLimit`` among the years.

        One ESearch is issued for each of the ``yearsNumber`` years ending at
        ``lastYear`` (inclusive, going backwards). Each year then receives
        ``retmax = round(globalLimit * k / N)``, where k is that year's count
        and N the total over all years.

        Returns a list of dicts (most recent year first) with the keys
        "string", "count", "queryKey", "webEnv" and "retmax". Nothing is
        downloaded here.
        """
        N = 0
        print ("MedlineFetcher::serialFetcher :")
        thequeries = []
        for i in range(yearsNumber):
            year = str(lastYear - i)
            print ('YEAR ' + year)
            print ('---------\n')
            pubmedquery = year + '[dp] ' + query
            globalresults = self.medlineEsearch(pubmedquery)
            N += globalresults["count"]
            thequeries.append({
                "string": pubmedquery,
                "count": globalresults["count"],
                "queryKey": globalresults["queryKey"],
                "webEnv": globalresults["webEnv"],
                "retmax": 0,
            })
        print("Total Number:", N,"publications")
        print("And i want just:",globalLimit,"publications")
        print("---------------------------------------\n")
        # With no hit at all there is nothing to share out: every retmax keeps
        # its initial 0 instead of dividing by zero.
        if N > 0:
            for yearquery in thequeries:
                percentage = yearquery["count"] / float(N)
                yearquery["retmax"] = int(round(globalLimit * percentage))
        print ('Done !')
        return thequeries
# serialFetcher(yearsNumber=3, 'microbiota' , globalLimit=100 )
# query = str(2015)+ '[dp] '+'microbiota'
# medlineEsearch( query )
#
from django.contrib import admin
# Register your models here.
from django.db import models
# Create your models here.
from django.test import TestCase
# Create your tests here.
from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect
from django.template.loader import get_template
from django.template import Context
from scrap_pubmed.MedlineFetcherDavid2015 import MedlineFetcher
from gargantext_web.api import JsonHttpResponse
# Create your views here.
def getGlobalStats(request):
    """Return per-year PubMed statistics for the posted query as JSON.

    On POST, the "query" field is passed to MedlineFetcher.serialFetcher for
    5 years and a global limit of 200. Any other method gets a placeholder list.
    """
    print(request.method)
    if request.method != "POST":
        return JsonHttpResponse(["bar", "foo"])
    # Read the field before building the fetcher, as the constructor may
    # create a directory.
    term = request.POST["query"]
    fetcher = MedlineFetcher()
    return JsonHttpResponse(fetcher.serialFetcher(5, term, 200))
def doTheQuery(request):
    """Log the request method (and the POST payload, if any) and answer with a placeholder JSON list."""
    print(request.method)
    if request.method == "POST":
        # The payload is only echoed to stdout; nothing is processed.
        print(request.POST)
    return JsonHttpResponse(["hola", "mundo"])
\ No newline at end of file
......@@ -25,7 +25,7 @@
<div class="col-md-6">
{% if project %}
<h1>{{ project.name }}</h1>
<!--<h3> {{number}} corpora </h3>--!>
<!--<h3> {{number}} corpora </h3>-->
{% endif %}
</div>
......
{% extends "menu.html" %}
{% block css %}
{% load staticfiles %}
<link rel="stylesheet" href="{% static "css/bootstrap.css" %}">
<link rel="stylesheet" type="text/css" href="{% static "css/morris.css" %}">
<link rel="stylesheet" type="text/css" href="{% static "css/jquery.easy-pie-chart.css"%}">
<script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script>
<script src="{% static "js/raphael-min.js"%}"></script>
<script src="{% static "js/morris.min.js"%}"></script>
{% endblock %}
{% block content %}
<div class="container theme-showcase" role="main">
<div class="jumbotron">
<div class="row">
<div class="col-md-6">
{% if project %}
<h1>{{ project.name }}</h1>
<!--<h3> {{number}} corpora </h3>-->
{% endif %}
</div>
<div class="col-md-4">
<p>
{% if donut %}
<div id="hero-donut" style="height: 200px;"></div>
{% endif %}
<center>
<button
type="button"
class="btn btn-primary btn-lg"
data-container="body"
data-toggle="popover"
data-placement="bottom"
>Add a corpus</button>
<div id="popover-content" class="hide">
<form enctype="multipart/form-data" action="/project/{{project.id}}/" method="post">
{% csrf_token %}
{{ form.non_field_errors }}
{{ form.as_p}}
{{ formResource.non_field_errors }}
{{ formResource.as_p}}
<input onclick='$("#semLoader").css("visibility", "visible"); $("#semLoader").show();' type="submit" name="submit" id="submit" class="btn" value="Add this corpus" /><div>
<div id="pubmedcrawl" align="right"><a data-toggle="modal" href="#stack1">&#10142; Query directly in PubMed</a></div>
</center>
</p>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- Add jumbotron container for each type of corpus (presse, science etc.) -->
<div id="semLoader" style="position:absolute; top:50%; left:40%; width:80px; visibility: hidden;">
<img src="{% static "js/libs/img2/loading-bar.gif" %}"></img>
</div>
<div class="container">
{% if list_corpora %}
<h1>Resources</h1>
<h2>Corpora</h2>
<ul>
{% for key, corpora in list_corpora.items %}
<li>{{ key }}</li>
<ul>
{% for corpus in corpora %}
<li> {% ifnotequal corpus.count 0 %}
<a href="/project/{{project.id}}/corpus/{{corpus.id}}">
{{corpus.name}}
</a>
, {{ corpus.count }} Documents
{% else %}
{{corpus.name}} : <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Processing, drink a cup of tea, and refresh the page :)
{% endifnotequal %}
<button type="button" class="btn btn-xs btn-default" data-container="body" data-toggle="popover" data-placement="bottom"
data-content='
<ul>
<li> Rename </li>
<li> Add new documents </li>
<li><a href="/project/{{ project.id }}/corpus/{{ corpus.id}}/delete">Delete</a></li>
</ul>
'>Manage</button>
</li>
{% endfor %}
</ul>
{% endfor %}
</ul>
{% endif %}
{% if list_corporax %}
<div class="col-md-4">
<h3><a href="/project/{{project.id}}/corpus/{{corpus.id}}">{{corpus.name}}</a>
</h3>
<h4>{{ corpus.count }} Documents </h4>
<h5>Activity:</h5>
<div class="chart" data-percent="73">73%</div>
</div>
{% endif %}
{# N-gram lists: each sub-section is guarded by its own list, and the shared heading shows when either list exists. #}
{% if whitelists or blacklists %}
<h2>Lists of Ngrams</h2>
{% endif %}
{% if whitelists %}
<h3>White Lists</h3>
<ul>
{% for list in whitelists %}
<li>{{ list.name }}</li>
{% endfor %}
</ul>
{% endif %}
{% if blacklists %}
<h3>Black Lists</h3>
<ul>
{% for list in blacklists %}
<li>{{ list.name }}</li>
{% endfor %}
</ul>
{% endif %}
{% if cooclists %}
<h2>Results (graphs)</h2>
<h3>Cooccurrences Lists</h3>
<ul>
{% for list in cooclists %}
<li>{{ list.name }}</li>
{% endfor %}
</ul>
{% endif %}
</div>
<!-- Modal -->
<div class="modal fade" id="stack1" tabindex="-1" role="dialog" aria-labelledby="myModalLabel" aria-hidden="true">
<div class="modal-dialog">
<div class="modal-content">
<div class="modal-header">
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">×</button>
<h3>Query to PubMed</h3>
</div>
<div class="modal-body">
<p>One fine body…</p>
<input id="daquery" type="text" class="input-lg" data-tabindex="2">
<a onclick="getGlobalResults();" class="btn">Scan</a>
<div id="results"></div>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
<button onclick="doTheQuery();" disabled id="id_thebutton" type="button" class="btn btn-primary">Explore a sample!</button>
</div>
</div><!-- /.modal-content -->
</div><!-- /.modal-dialog -->
</div><!-- /.modal -->
<script>
// Returns the URI-decoded value of the cookie called `name`,
// or null when the document has no such cookie.
function getCookie(name) {
    if (!document.cookie || document.cookie == '') return null;
    var prefix = name + '=';
    var entries = document.cookie.split(';');
    for (var idx = 0; idx < entries.length; idx++) {
        var entry = jQuery.trim(entries[idx]);
        // The first entry starting with "<name>=" wins.
        if (entry.substring(0, name.length + 1) == prefix) {
            return decodeURIComponent(entry.substring(name.length + 1));
        }
    }
    return null;
}
var thequeries = []
// POSTs the query typed in #daquery to /tests/pubmedquery/go.
// Does nothing while the "Explore a sample!" button is still disabled.
function doTheQuery() {
    if ( $('#id_thebutton').prop('disabled') ) return;
    console.log("in doTheQuery:");
    // The payload must be built here: the `formData` of getGlobalResults()
    // is local to that function and is not visible from this one.
    // NOTE(review): same payload shape as getGlobalResults(); add the per-year
    // stats held in `thequeries` if the server view ends up needing them.
    var formData = {query: $("#daquery").val()};
    $.ajax({
        // contentType: "application/json",
        url: window.location.origin+"/tests/pubmedquery/go",
        data: formData,
        type: 'POST',
        beforeSend: function(xhr) {
            xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
        },
        success: function(data) {
            console.log("in doTheQuery()");
            console.log(data);
        },
        error: function(result) {
            console.log("Data not found");
        }
    });
}
// Asks the server how many publications match the query typed in #daquery.
// On success the per-year stats are kept in `thequeries`, the total is shown
// in #results, and the "Explore a sample!" button is enabled only when there
// is at least one publication.
function getGlobalResults(){
    // AJAX to django
    var pubmedquery = $("#daquery").val();
    var formData = {query: pubmedquery};
    // A new scan invalidates the previous one: lock the button until it succeeds.
    $('#id_thebutton').prop('disabled', true);
    $("#results").html('<img width="30px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img>');
    $.ajax({
        // contentType: "application/json",
        url: window.location.origin+"/tests/pubmedquery",
        data: formData,
        type: 'POST',
        beforeSend: function(xhr) {
            xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
        },
        success: function(data) {
            console.log("in getGlobalResults");
            console.log(data);
            thequeries = data;
            var N = 0;
            // Indexed loop: for...in would also walk any enumerable property
            // added to the array prototype.
            for (var i = 0; i < thequeries.length; i++) N += thequeries[i].count || 0;
            if (N > 0) {
                $("#results").html("Result: "+N+" publications in the last 5 years");
                $('#id_thebutton').prop('disabled', false);
            } else {
                // Replace the spinner, otherwise it would spin forever.
                $("#results").html("No publications found for this query");
            }
        },
        error: function(result) {
            console.log("Data not found");
            // Replace the spinner here too so the user sees the scan failed.
            $("#results").html("The query could not be processed");
        }
    });
}
// Morris Donut Chart
Morris.Donut({
element: 'hero-donut',
data: [
{% if donut %}
{% for part in donut %}
{label: '{{ part.source }}', value: {{ part.part }} },
{% endfor %}
{% endif %}
],
colors: ["@white", "@white"],
//colors: ["#30a1ec", "#76bdee"],
formatter: function (y) { return y + "%" }
});
</script>
{% endblock %}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment