Commit 3832070e authored by delanoe

[FEAT] Adding scrappers (not finished yet).

parent daca4e31
@@ -21,27 +21,29 @@ import gargantext.views.pages.urls
from annotations import urls as annotations_urls
from annotations.views import main as annotations_main_view
# Module "Graph Explorer"
# Module "Graph Explorer"
#from graphExplorer import urls as graphExplorer_urls
from graphExplorer.rest import Graph
from graphExplorer.views import explorer
urlpatterns = [
    url(r'^admin/', admin.site.urls),
    url(r'^generated/', include(gargantext.views.generated.urls)),
    url(r'^api/', include(gargantext.views.api.urls)),
    url(r'^', include(gargantext.views.pages.urls)),

    # Module Annotation
    # tempo: unchanged doc-annotations routes --
    url(r'^annotations/', include(annotations_urls)),
    url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view),

    # Module "Graph Explorer"
    url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer),
    url(r'^projects/(\d+)/corpora/(\d+)/graph$', Graph.as_view()),
    # to be removed:
    url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
    #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
    #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer_urls))
]
from scrappers import urls as scrappers_urls

urlpatterns = [ url(r'^admin/', admin.site.urls)
              , url(r'^generated/', include(gargantext.views.generated.urls))
              , url(r'^api/', include(gargantext.views.api.urls))
              , url(r'^', include(gargantext.views.pages.urls))

              # Module Annotation
              # tempo: unchanged doc-annotations routes --
              , url(r'^annotations/', include(annotations_urls))
              , url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view)

              # Module "Graph Explorer"
              , url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer)
              , url(r'^projects/(\d+)/corpora/(\d+)/graph$', Graph.as_view())
              # to be removed:
              , url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
              #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
              #url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer_urls))
              , url(r'^scrappers/', include(scrappers_urls))
              ]
import os

from gargantext.settings import MEDIA_ROOT

def ensure_dir(user):
    '''
    If the user is new, their corpora folder does not exist yet: create it.
    '''
    dirpath = '%s/corpora/%s' % (MEDIA_ROOT, user.username)
    if not os.path.exists(dirpath):
        print("Creating folder %s" % dirpath)
        os.makedirs(dirpath)
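For context, a minimal sketch of how this helper might be called; the user object below is constructed only for illustration and is not part of this commit:

# Hypothetical usage: call ensure_dir() before writing any corpus file for a user.
from django.contrib.auth.models import User

user = User(username='alice')   # assumed user object, for illustration only
ensure_dir(user)                # creates MEDIA_ROOT/corpora/alice if missing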
# ****************************
# ***** Medline Fetcher *****
# ****************************

# MEDLINE USER REQUIREMENT: run retrieval scripts on weekends or
# between 9 pm and 5 am Eastern Time on weekdays.

import sys
if sys.version_info >= (3, 0):
    from urllib.request import urlopen
else:
    from urllib import urlopen

import os
import time
import datetime
import codecs
import threading
from queue import Queue

from lxml import etree
from django.core.files import File
class MedlineFetcher:

    def __init__(self):
        self.queue_size = 8
        self.q = Queue()
        self.firstResults = []
        self.lock = threading.Lock()  # lock to serialize console output
        self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
        self.pubMedDB = 'Pubmed'
        self.reportType = 'medline'

    def medlineEsearch(self, query):
        """Get the number of results for 'query' (in 'count'), plus 'queryKey'
        and 'webEnv', the NCBI history tokens reused by medlineEfetchRAW."""
        origQuery = query
        query = query.replace(' ', '%20')
        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' \
                  % (self.pubMedEutilsURL, self.pubMedDB, query)
        try:
            eSearchResult = urlopen(eSearch)
            data = eSearchResult.read()
            root = etree.XML(data)
            count = etree.XPath("/eSearchResult/Count/text()")(root)[0]
            queryKey = etree.XPath("/eSearchResult/QueryKey/text()")(root)[0]
            webEnv = etree.XPath("/eSearchResult/WebEnv/text()")(root)[0]
        except Exception:  # network error or unexpected XML
            count = 0
            queryKey = False
            webEnv = False
            origQuery = False
        values = {"query": origQuery, "count": int(str(count)),
                  "queryKey": queryKey, "webEnv": webEnv}
        return values
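    # For illustration, a successful call returns something like (values assumed):
    #   {"query": "2015[dp] brain", "count": 52933, "queryKey": "1", "webEnv": "NCID_1_..."}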
    # RETMAX:
    # Number of UIDs from the retrieved set to include in the XML output
    # (default=20, maximum 100,000 records per request).
    def medlineEfetchRAW(self, fullquery):
        """Build the efetch URL that returns 'retmax' articles for the query."""
        query = fullquery["string"]
        retmax = fullquery["retmax"]
        count = fullquery["count"]
        queryKey = fullquery["queryKey"]
        webEnv = fullquery["webEnv"]
        queryNoSpace = query.replace(' ', '')  # no space in directory/file names; currently unused
        retstart = 0
        eFetch = ('%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml'
                  '&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s'
                  % (self.pubMedEutilsURL, self.reportType, retstart, retmax,
                     self.pubMedDB, queryKey, webEnv))
        return eFetch
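    # For illustration (hypothetical values): with retmax=100, query_key=1 and a
    # WebEnv token obtained from medlineEsearch, the URL built above looks like:
    #   http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?email=youremail@example.org
    #     &rettype=medline&retmode=xml&retstart=0&retmax=100&db=Pubmed&query_key=1&WebEnv=NCID_...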
    def ensure_dir(self, f):
        d = os.path.dirname(f)
        if not os.path.exists(d):
            os.makedirs(d)

    # generic!
    def downloadFile(self, item):
        url = item[0]
        filename = item[1]
        data = urlopen(url)
        f = codecs.open(filename, "w", encoding='utf-8')
        myfile = File(f)
        myfile.write(data.read().decode('utf-8'))
        myfile.close()
        f.close()
        with self.lock:
            print(threading.current_thread().name, filename + " OK")
        return filename
    # generic!
    def test_downloadFile(self, item):
        url = item[0]
        filename = item[1]
        data = urlopen(url)
        return data

    # generic!
    def do_work(self, item):
        returnvalue = self.medlineEsearch(item)
        with self.lock:
            # print(threading.current_thread().name, item)
            return returnvalue

    # The worker thread pulls an item from the queue and processes it.
    def worker(self):
        while True:
            item = self.q.get()
            self.firstResults.append(self.do_work(item))
            self.q.task_done()

    # Download worker: same pattern, but fetches files instead of result counts.
    def worker2(self):
        while True:
            item = self.q.get()
            try:
                result = self.downloadFile(item)
            except Exception:
                result = False
            self.firstResults.append(result)
            self.q.task_done()

    # Split list 'l' into chunks of size 'n'.
    def chunks(self, l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]
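        # e.g. list(MedlineFetcher().chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]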
    # GLOBALLIMIT:
    # Retrieve exactly this many publications overall.
    # The per-year allocation, used as RETMAX, is (k/N) * globalLimit, where:
    #   - k : number of publications for year x (according to PubMed)
    #   - N : sum of all k over the selected years (total number of pubs)
    #   - globalLimit : total number of publications wanted
    def serialFetcher(self, yearsNumber, query, globalLimit):
        # Create the queue and thread pool.
        for i in range(self.queue_size):
            t = threading.Thread(target=self.worker)
            t.daemon = True  # thread dies when the main (only non-daemon) thread exits
            t.start()
        start = time.perf_counter()
        N = 0
        thequeries = []
        globalresults = []
        for i in range(yearsNumber):
            year = str(2015 - i)
            pubmedquery = str(year) + '[dp] ' + query
            self.q.put(pubmedquery)  # put a task in the queue
        self.q.join()
        print('time:', time.perf_counter() - start)

        Total = 0
        Fails = 0
        for globalresults in self.firstResults:
            Total += 1
            if globalresults["queryKey"] == False:
                Fails += 1
            if globalresults["count"] > 0:
                N += globalresults["count"]
                queryhyperdata = {
                    "string": globalresults["query"],
                    "count": globalresults["count"],
                    "queryKey": globalresults["queryKey"],
                    "webEnv": globalresults["webEnv"],
                    "retmax": 0
                }
                thequeries.append(queryhyperdata)

        print("Total Number:", N, "publications")
        print("Requested:", globalLimit, "publications")
        print("---------------------------------------\n")

        for i, query in enumerate(thequeries):
            k = query["count"]
            proportion = k / float(N)
            retmax_forthisyear = int(round(globalLimit * proportion))
            query["retmax"] = retmax_forthisyear
            if query["retmax"] == 0:
                query["retmax"] += 1
            print(query["string"], "\t[", k, ">", query["retmax"], "]")

        if ((Fails + 1) / (Total + 1)) == 1:  # every query failed: connection error
            thequeries = [False]
        return thequeries
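To make the RETMAX allocation concrete, here is a small standalone check with made-up counts (illustration only, not part of the commit):

# Hypothetical example: two years with k=300 and k=100 publications, so N=400.
# With globalLimit=100 the fetcher requests round(100*300/400)=75 and
# round(100*100/400)=25 records respectively; max(1, ...) mirrors the
# "if retmax==0: retmax+=1" rule above.
counts = {'2015[dp] brain': 300, '2014[dp] brain': 100}
N = sum(counts.values())
globalLimit = 100
retmax = {q: max(1, int(round(globalLimit * k / float(N))))
          for q, k in counts.items()}
assert retmax == {'2015[dp] brain': 75, '2014[dp] brain': 25}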
from django.conf.urls import url

import scrappers.pubmed as pubmed

# /!\ url patterns here are *without* the trailing slash
urlpatterns = [ url(r'^pubmed/query$', pubmed.getGlobalStats)
              ]
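A quick way to exercise the new route (a sketch, assuming a configured Django project with these urls installed; the 'query' POST field is an assumption suggested by the template JS below, not guaranteed by this commit):

from django.test import Client

client = Client()
# note: no trailing slash, matching the pattern above
response = client.post('/scrappers/pubmed/query', {'query': 'brain'})
print(response.status_code)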
@@ -3,15 +3,16 @@
{% block css %}
{% load staticfiles %}
<link rel="stylesheet" href="{% static "css/bootstrap.css" %}">
<script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script>
<script type="text/javascript" src="{% static "js/gargantext/garganrest.js" %}"></script>
<link rel="stylesheet" href="http://code.jquery.com/ui/1.11.2/themes/smoothness/jquery-ui.css">
<script type="text/javascript" src="{% static "js/morris.min.js" %}"></script>
<script type="text/javascript" src="{% static "js/morris.min.js" %}"></script>
<link rel="stylesheet" href="{% static "css/morris.css" %}">
<script src="{% static "js/raphael-min.js"%}"></script>
<script type="text/javascript" src="{% static "js/morris.min.js" %}"></script>
<script type="text/javascript" src="{% static "js/jquery/jquery.min.js" %}"></script>
<script type="text/javascript" src="{% static "js/gargantext/garganrest.js" %}"></script>
<link rel="stylesheet" href="http://code.jquery.com/ui/1.11.2/themes/smoothness/jquery-ui.css">
<style type="text/css">
.ui-autocomplete {
z-index: 5000;
@@ -203,7 +204,7 @@
<div id="pubmedcrawl" style="visibility: hidden;">
Do you have a file already? &nbsp;
<input type="radio" id="file_yes" name="file1" onclick="FileOrNotFile(this.value);" class="file1" value="true" checked>Yes </input>
<input type="radio" id="file_no" name="file1" onclick="FileOrNotFile(this.value);" class="file1" value="false">No </input>
<input type="radio" id="file_no" name="file1" onclick="FileOrNotFile(this.value);" class="file1" value="false">No </input>
</div>
</td>
</tr>
@@ -329,10 +330,10 @@
var theType = $("#id_type option:selected").html();
if(theType=="Pubmed (xml format)") {
$.ajax({
// contentType: "application/json",
url: window.location.origin+"/tests/pubmedquery",
url: window.location.origin+"/scrappers/pubmed/query",
data: formData,
type: 'POST',
beforeSend: function(xhr) {