Commit d0003ef9 authored by PkSM3

[FEAT] pubmed scraper

parent ed1311f3
......@@ -274,7 +274,6 @@ def do_tfidf(corpus, reset=True):
NodeNodeNgram.objects.filter(nodex=corpus).delete()
if isinstance(corpus, Node) and corpus.type.name == "Corpus":
print(Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")))
for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
for node_ngram in Node_Ngram.objects.filter(node=document):
try:
......
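The do_tfidf() hunk above walks every Document child of the corpus and its Node_Ngram rows to weight ngrams. A minimal, ORM-free sketch of the tf-idf weighting this implies, assuming the per-document ngram counts have already been collected into plain dicts (all names below are illustrative, not the project's API):

from math import log

def tfidf(corpora):
    """corpora: {doc_id: {ngram: count}} -> {doc_id: {ngram: tf-idf}} (illustrative sketch)."""
    n_docs = len(corpora)
    # document frequency: in how many documents each ngram appears
    df = {}
    for counts in corpora.values():
        for ngram in counts:
            df[ngram] = df.get(ngram, 0) + 1
    weights = {}
    for doc_id, counts in corpora.items():
        total = sum(counts.values()) or 1
        weights[doc_id] = {
            ngram: (count / total) * log(n_docs / df[ngram])
            for ngram, count in counts.items()
        }
    return weights

# example with two toy documents
print(tfidf({1: {"gene": 3, "cell": 1}, 2: {"cell": 2, "protein": 4}}))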
......@@ -69,7 +69,8 @@ urlpatterns = patterns('',
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery)
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)
)
......
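The two new routes expose the Pubmed and ISTEX scrapers per project, next to the existing /tests/pubmedquery stats endpoint. A hedged sketch of how a client could exercise them, assuming a local dev server, the third-party requests package, and that CSRF/auth are not enforced on these test routes; only the "query" and "string" POST fields are confirmed by this diff:

import requests

BASE = "http://localhost:8000"  # assumed dev server

# 1) ask for global result counts per year for a query
stats = requests.post(BASE + "/tests/pubmedquery", data={"query": "microbiota"})
print(stats.json())

# 2) launch the actual harvest for project 1 (Pubmed), or the ISTEX variant
requests.post(BASE + "/tests/project/1/pubmedquery/go", data={"query": "microbiota"})
requests.post(BASE + "/tests/project/1/ISTEXquery/go",
              data={"query": "microbiota", "string": "microbiota"})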
......@@ -223,8 +223,14 @@ def project(request, project_id):
corpus_view['count'] = corpus.children.count()
# just take the first element of the corpora and read its type.
corpus_type = Node_Resource.objects.filter(node=corpus)[0].resource.type
list_corpora[corpus_type].append(corpus_view)
resource_corpus = Node_Resource.objects.filter(node=corpus)
if len(resource_corpus)>0:
# print(Node_Resource.objects.filter(node=corpus).all())
corpus_type = Node_Resource.objects.filter(node=corpus)[0].resource.type
list_corpora[corpus_type].append(corpus_view)
donut_part[corpus_type] += docs_count
else: print(" Node_Resource = this.corpus(",corpus.pk,") ... nothing, why?")
## Avoid listing repeated elements, e.g. when the dynamic query is used (one entry per XML file)
# for node_resource in Node_Resource.objects.filter(node=corpus):
......@@ -237,6 +243,8 @@ def project(request, project_id):
if docs_total == 0 or docs_total is None:
docs_total = 1
# The donut will show: percentage of documents by resource type
donut = [ {'source': key,
'count': donut_part[key] ,
'part' : round(donut_part[key] * 100 / docs_total) } \
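The donut data aggregates docs_count per resource type and turns each bucket into a rounded percentage, with docs_total clamped to at least 1 so an empty project does not divide by zero. The same computation, isolated from the Django view, as a plain-Python sketch:

from collections import defaultdict

def build_donut(corpora):
    """corpora: list of (resource_type, docs_count) pairs -> donut parts in percent."""
    donut_part = defaultdict(int)
    for corpus_type, docs_count in corpora:
        donut_part[corpus_type] += docs_count
    docs_total = sum(donut_part.values()) or 1   # avoid division by zero
    return [{"source": key,
             "count": donut_part[key],
             "part": round(donut_part[key] * 100 / docs_total)}
            for key in donut_part]

print(build_donut([("pubmed", 120), ("istex", 60), ("pubmed", 20)]))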
......@@ -246,12 +254,15 @@ def project(request, project_id):
if request.method == 'POST':
form = CustomForm(request.POST, request.FILES)
if form.is_valid():
name = form.cleaned_data['name']
thefile = form.cleaned_data['file']
resource_type = ResourceType.objects.get(id=str( form.cleaned_data['type'] ))
print(request.POST['type'])
print(form.cleaned_data['type'])
resource_type = ResourceType.objects.get(name=str( form.cleaned_data['type'] ))
print("-------------")
print(name,"|",resource_type,"|",thefile)
......@@ -326,6 +337,7 @@ def project(request, project_id):
})
else:
form = CustomForm()
return render(request, 'project.html', {
'form' : form,
......@@ -748,9 +760,12 @@ def node_link(request, corpus_id):
'''
Create the HttpResponse object with the node_link dataset.
'''
import time
print("In node_link() START")
start = time.time()
data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
end = time.time()
print ("LOG::TIME: get_cooc() [s]",(end - start))
print("In node_link() END")
return JsonHttpResponse(data)
......
......@@ -98,13 +98,10 @@ from django import forms
from django.utils.translation import ugettext_lazy as _
class CustomForm(forms.Form):
name = forms.CharField( label='Name', max_length=199 , required=True)
parsing_options = ResourceType.objects.all().values_list('id', 'name')
type = forms.IntegerField( widget=forms.Select( choices= parsing_options) , required=True )
name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
type = ModelChoiceField( ResourceType.objects.all() , widget=forms.Select(attrs={'onchange':'CustomForSelect( $("option:selected", this).text() );'}) )
file = forms.FileField()
# Description: clean_file()
"""
* file_.content_type - Example: ['application/pdf', 'image/jpeg']
......
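CustomForm now resolves the resource type through a ModelChoiceField instead of a raw IntegerField over (id, name) choices, which is why the view switched its lookup from id= to name=. A minimal sketch of an equivalent form and how its cleaned data could be consumed; the import path and anything beyond what the diff shows are assumptions:

from django import forms
from node.models import ResourceType   # assumed import path

class CustomForm(forms.Form):
    name = forms.CharField(label='Name', max_length=199,
                           widget=forms.TextInput(attrs={'required': 'true'}))
    type = forms.ModelChoiceField(queryset=ResourceType.objects.all())
    file = forms.FileField()

def handle_upload(request):
    form = CustomForm(request.POST, request.FILES)
    if form.is_valid():
        # cleaned_data['type'] is already a ResourceType instance, so the
        # extra ResourceType.objects.get(name=...) round-trip in the view is
        # only needed if the instance has to be re-fetched by display name.
        return (form.cleaned_data['name'],
                form.cleaned_data['type'],
                form.cleaned_data['file'])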
......@@ -163,6 +163,7 @@ class Node(CTENode):
for node_resource in self.node_resource.filter(parsed=False):
resource = node_resource.resource
parser = defaultdict(lambda:FileParser.FileParser, {
'istext' : ISText,
'pubmed' : PubmedFileParser,
'isi' : IsiFileParser,
'ris' : RisFileParser,
......@@ -171,6 +172,7 @@ class Node(CTENode):
'europress_english' : EuropressFileParser,
})[resource.type.name]()
metadata_list += parser.parse(str(resource.file))
# print(parser.parse(str(resource.file)))
# retrieve info from the database
type_id = NodeType.objects.get(name='Document').id
langages_cache = LanguagesCache()
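parse_resources() picks the parser class from a defaultdict keyed by the resource type name (now including 'istext'), so unknown types silently fall back to the generic FileParser. A toy, framework-free sketch of the same dispatch pattern; the parser classes here are stand-ins:

from collections import defaultdict

class FileParser:
    def parse(self, path): return ["generic:" + path]

class PubmedFileParser(FileParser):
    def parse(self, path): return ["pubmed:" + path]

class ISTextParser(FileParser):
    def parse(self, path): return ["istext:" + path]

def parse_resource(type_name, path):
    parser_cls = defaultdict(lambda: FileParser, {
        'pubmed': PubmedFileParser,
        'istext': ISTextParser,
    })[type_name]
    return parser_cls().parse(path)

print(parse_resource('pubmed', 'corpus.xml'))   # -> ['pubmed:corpus.xml']
print(parse_resource('unknown', 'corpus.xml'))  # falls back to FileParser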
......@@ -183,6 +185,8 @@ class Node(CTENode):
language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
if isinstance(language, tuple):
language = language[0]
# print("metadata_values:")
# print("\t",metadata_values,"\n- - - - - - - - - - - - ")
Node(
user_id = user_id,
type_id = type_id,
......@@ -191,7 +195,6 @@ class Node(CTENode):
language_id = language.id if language else None,
metadata = metadata_values
).save()
# make metadata filterable
self.children.all().make_metadata_filterable()
......@@ -236,17 +239,32 @@ class Node(CTENode):
@current_app.task(filter=task_method)
def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
print("In workflow() parse_resources()")
import time
print("LOG::TIME: In workflow() parse_resources()")
start = time.time()
self.parse_resources()
print("In workflow() / parse_resources()")
print("In workflow() extract_ngrams()")
end = time.time()
print ("LOG::TIME: parse_resources() [s]",(end - start))
print("LOG::TIME: In workflow() / parse_resources()")
start = time.time()
print("LOG::TIME: In workflow() extract_ngrams()")
type_document = NodeType.objects.get(name='Document')
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
print("In workflow() / extract_ngrams()")
end = time.time()
print ("LOG::TIME: ",(end - start))
print ("LOG::TIME: extract_ngrams() [s]",(end - start))
print("LOG::TIME: In workflow() / extract_ngrams()")
start = time.time()
print("In workflow() do_tfidf()")
from analysis.functions import do_tfidf
do_tfidf(self)
print("In workflow() / do_tfidf()")
end = time.time()
print ("LOG::TIME: do_tfidf() [s]",(end - start))
print("LOG::TIME: In workflow() / do_tfidf()")
print("In workflow() END")
class Node_Metadata(models.Model):
......
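workflow() now brackets each stage (parse_resources, extract_ngrams, do_tfidf) with time.time() calls and LOG::TIME prints. The same instrumentation could be factored into a small context manager; a sketch only, not part of this commit:

import time
from contextlib import contextmanager

@contextmanager
def log_time(label):
    start = time.time()
    print("LOG::TIME: In", label)
    yield
    print("LOG::TIME:", label, "[s]", time.time() - start)

# usage, mirroring the workflow stages:
with log_time("parse_resources()"):
    time.sleep(0.1)          # stand-in for self.parse_resources()
with log_time("extract_ngrams()"):
    time.sleep(0.1)          # stand-in for the ngram extraction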
......@@ -25,6 +25,7 @@ class PubmedFileParser(FileParser):
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
"realdate_full_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
......@@ -51,6 +52,13 @@ class PubmedFileParser(FileParser):
except:
pass
#Title-Decision
Title=""
if not metadata["title"] or metadata["title"]=="":
if "title2" in metadata:
metadata["title"] = metadata["title2"]
else: metadata["title"] = ""
# Date-Decision
# forge.iscpif.fr/issues/1418
RealDate = ""
......@@ -68,19 +76,25 @@ class PubmedFileParser(FileParser):
if "publication_month" in metadata: PubmedDate+=" "+metadata["publication_month"]
if "publication_day" in metadata: PubmedDate+=" "+metadata["publication_day"]
Decision=""
if len(RealDate)>4:
if len(RealDate)>8: decision = datetime.strptime(RealDate, '%Y %b %d').date()
else: decision = datetime.strptime(RealDate, '%Y %b').date()
else: decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
if len(RealDate)>8:
try: Decision = datetime.strptime(RealDate, '%Y %b %d').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else:
try: Decision = datetime.strptime(RealDate, '%Y %b').date()
except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
if "publication_year" in metadata: metadata["publication_year"] = str(decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(decision.day)
if "publication_year" in metadata: metadata["publication_year"] = str(Decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
if "publication_day" in metadata: metadata["publication_day"] = str(Decision.day)
if "realdate_year_" in metadata: metadata.pop("realdate_year_")
if "realdate_month_" in metadata: metadata.pop("realdate_month_")
if "realdate_day_" in metadata: metadata.pop("realdate_day_")
if "title2" in metadata: metadata.pop("title2")
# print(metadata)
metadata_list.append(metadata)
# return the list of metadata
return metadata_list
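The date decision now prefers the Medline RealDate string, choosing '%Y %b %d' or '%Y %b' by its length, and falls back to the assembled PubmedDate ('%Y %m %d') whenever that parse fails. The same fallback, isolated from the parser class as a small sketch:

from datetime import datetime

def decide_date(real_date, pubmed_date):
    """real_date: e.g. '2014 Sep 12' or '2014 Sep'; pubmed_date: e.g. '2014 9 12'."""
    if len(real_date) > 4:
        fmt = '%Y %b %d' if len(real_date) > 8 else '%Y %b'
        try:
            return datetime.strptime(real_date, fmt).date()
        except ValueError:
            pass
    return datetime.strptime(pubmed_date, '%Y %m %d').date()

print(decide_date("2014 Sep 12", "2014 9 12"))  # -> 2014-09-12
print(decide_date("2014-2015", "2014 1 1"))     # unparsable RealDate -> 2014-01-01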
......@@ -2,3 +2,4 @@ from parsing.FileParsers.RisFileParser import RisFileParser
from parsing.FileParsers.IsiFileParser import IsiFileParser
from parsing.FileParsers.PubmedFileParser import PubmedFileParser
from parsing.FileParsers.EuropressFileParser import EuropressFileParser
from parsing.FileParsers.ISText import ISText
......@@ -10,29 +10,36 @@ import os
import time
# import libxml2
from lxml import etree
from datetime import datetime
from django.core.files import File
import threading
from queue import Queue
import time
class MedlineFetcher:
def __init__(self):
self.queue_size = 8
self.q = Queue()
self.firstResults = []
self.lock = threading.Lock() # lock to serialize console output
self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
self.pubMedDB = 'Pubmed'
self.reportType = 'medline'
self.personalpath_mainPath = 'MedLine/'
if not os.path.isdir(self.personalpath_mainPath):
os.makedirs(self.personalpath_mainPath)
print ('Created directory ' + self.personalpath_mainPath)
# Return the:
# Return the globalResults!:
# - count    = number of results for the query
# - queryKey = NCBI history key, reused by medlineEfetch
# - webEnv   = NCBI history environment, reused by medlineEfetch
def medlineEsearch(self , query):
print ("MedlineFetcher::medlineEsearch :")
# print ("MedlineFetcher::medlineEsearch :")
"Get number of results for query 'query' in variable 'count'"
"Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
origQuery = query
query = query.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' %(self.pubMedEutilsURL, self.pubMedDB, query)
......@@ -50,13 +57,7 @@ class MedlineFetcher:
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]
# doc = libxml2.parseDoc(data)
# count = doc.xpathEval('eSearchResult/Count/text()')[0]
# queryKey = doc.xpathEval('eSearchResult/QueryKey/text()')[0]
# webEnv = doc.xpathEval('eSearchResult/WebEnv/text()')[0]
# print count, queryKey, webEnv
values = { "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
print(values)
values = { "query":origQuery , "count": int(str(count)), "queryKey": queryKey , "webEnv":webEnv }
return values
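medlineEsearch() builds an esearch.fcgi URL with usehistory=y and extracts count, queryKey and webEnv from the XML, now also carrying the original query back in the result dict. A compact standalone sketch of that call (network access to NCBI assumed; error handling omitted):

from urllib.request import urlopen
from urllib.parse import quote
from lxml import etree

def esearch(query, db='Pubmed'):
    base = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
    url = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' % (base, db, quote(query))
    root = etree.parse(urlopen(url)).getroot()
    return {
        "query":    query,
        "count":    int(root.findtext("Count")),
        "queryKey": root.findtext("QueryKey"),
        "webEnv":   root.findtext("WebEnv"),
    }

# print(esearch('2014[dp] microbiota'))   # needs network access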
......@@ -72,40 +73,58 @@ class MedlineFetcher:
queryKey = fullquery["queryKey"]
webEnv = fullquery["webEnv"]
print ("MedlineFetcher::medlineEfetchRAW :")
"Fetch medline result for query 'query', saving results to file every 'retmax' articles"
queryNoSpace = query.replace(' ', '') # No space in directory and file names, avoids stupid errors
# pubmedqueryfolder = personalpath.pubMedAbstractsPath + 'Pubmed_' + queryNoSpace
# if not os.path.isdir(pubmedqueryfolder):
# os.makedirs(pubmedqueryfolder)
pubMedResultFileName = self.personalpath_mainPath + 'Pubmed_' + queryNoSpace + '.xml'
pubMedResultFile = open(pubMedResultFileName, 'w')
print ('Query "' , query , '"\t:\t' , count , ' results')
print ('Starting fetching at ' , time.asctime(time.localtime()) )
print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
retstart = 0
# while(retstart < count):
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
return eFetch
# if sys.version_info >= (3, 0): pubMedResultFile.write(eFetchResult.read().decode('utf-8'))
# else: pubMedResultFile.write(eFetchResult.read())
# retstart += retmax
# break # you shall not pass !!
# pubMedResultFile.close()
# print ('Fetching for query ' , query , ' finished at ' , time.asctime(time.localtime()) )
# print (retmax , ' results written to file ' , pubMedResultFileName , '\n' )
# print("------------------------------------------")
# return ["everything","ok"]
# generic!
def downloadFile(self, item):
url = item[0]
filename = item[1]
print("\tin downloadFile:")
print(url,filename)
data = urlopen(url)
f = open(filename, 'w')
myfile = File(f)
myfile.write( data.read().decode('utf-8') )
myfile.close()
f.close()
with self.lock:
print(threading.current_thread().name, filename+" OK")
return filename
# generic!
def do_work(self,item):
# time.sleep(1) # pretend to do some lengthy work.
returnvalue = self.medlineEsearch(item)
with self.lock:
print(threading.current_thread().name, item)
return returnvalue
# The worker thread pulls an item from the queue and processes it
def worker(self):
while True:
item = self.q.get()
self.firstResults.append(self.do_work(item))
self.q.task_done()
def worker2(self):
while True:
item = self.q.get()
self.firstResults.append(self.downloadFile(item))
self.q.task_done()
def chunks(self , l , n):
print("chunks:")
for i in range(0, len(l), n):
yield l[i:i+n]
# GLOBALLIMIT:
# I will retrieve this exact amount of publications.
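worker() and worker2() implement the usual thread-pool-over-Queue pattern: daemon threads pull items, run do_work() or downloadFile(), append the result to firstResults and mark the task done, while q.join() blocks until the queue drains. A self-contained sketch of that pattern with a dummy work function:

import threading
from queue import Queue

q = Queue()
results = []
lock = threading.Lock()

def do_work(item):
    return item * item          # stand-in for medlineEsearch / downloadFile

def worker():
    while True:
        item = q.get()
        out = do_work(item)
        with lock:               # serialize access to the shared result list
            results.append(out)
        q.task_done()

for _ in range(8):               # same pool size as MedlineFetcher.queue_size
    t = threading.Thread(target=worker)
    t.daemon = True              # dies when the main thread exits
    t.start()

for item in range(10):
    q.put(item)
q.join()                         # wait until every task is processed
print(sorted(results))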
......@@ -115,22 +134,34 @@ class MedlineFetcher:
# - GlobalLimit : number of publications I want.
def serialFetcher(self , yearsNumber , query, globalLimit):
# Create the queue and thread pool.
for i in range(self.queue_size):
t = threading.Thread(target=self.worker)
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
start = time.perf_counter()
N = 0
print ("MedlineFetcher::serialFetcher :")
thequeries = []
globalresults = []
for i in range(yearsNumber):
year = str(2015 - i)
print ('YEAR ' + year)
print ('---------\n')
# medlineEfetch(str(year) + '[dp] '+query , 20000)
# medlineEfetchRAW(str(year) + '[dp] '+query , retmax=300)
pubmedquery = str(year) + '[dp] '+query
globalresults = self.medlineEsearch(pubmedquery)
self.q.put( pubmedquery ) #put task in the queue
self.q.join()
print('time:',time.perf_counter() - start)
for globalresults in self.firstResults:
# globalresults = self.medlineEsearch(pubmedquery)
if globalresults["count"]>0:
N+=globalresults["count"]
querymetadata = {
"string": pubmedquery ,
"string": globalresults["query"] ,
"count": globalresults["count"] ,
"queryKey":globalresults["queryKey"] ,
"webEnv":globalresults["webEnv"] ,
......@@ -149,11 +180,3 @@ class MedlineFetcher:
query["retmax"] = retmax_forthisyear
return thequeries
# serialFetcher(yearsNumber=3, 'microbiota' , globalLimit=100 )
# query = str(2015)+ '[dp] '+'microbiota'
# medlineEsearch( query )
#
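The tail of serialFetcher (collapsed above) gives each yearly query a retmax budget. Assuming the split is proportional to each year's result count out of the grand total N, it could look like the sketch below; the exact formula is not visible in this diff:

def allocate_retmax(thequeries, total_count, global_limit):
    """Split global_limit across yearly queries, proportional to each year's count.
    Illustrative only: the real formula lives in the collapsed part of serialFetcher."""
    for query in thequeries:
        proportion = query["count"] / total_count if total_count else 0
        query["retmax"] = int(round(global_limit * proportion))
    return thequeries

queries = [{"count": 700}, {"count": 300}]
print(allocate_retmax(queries, 1000, 300))   # -> retmax 210 and 90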
......@@ -14,6 +14,8 @@ import json
from gargantext_web.settings import MEDIA_ROOT
from datetime import datetime
import time
import threading
from django.core.files import File
from gargantext_web.settings import DEBUG
......@@ -28,14 +30,16 @@ def getGlobalStats(request ):
if request.method == "POST":
query = request.POST["query"]
print ("LOG::TIME: query =", query )
print ("LOG::TIME: N =", 300 )
instancia = MedlineFetcher()
alist = instancia.serialFetcher( 5, query , 100 )
# alist = instancia.serialFetcher( 5, query , int(request.POST["N"]) )
alist = instancia.serialFetcher( 5, query , 300 )
data = alist
return JsonHttpResponse(data)
from parsing.FileParsers import PubmedFileParser
def doTheQuery(request , project_id):
alist = ["hola","mundo"]
......@@ -78,17 +82,20 @@ def doTheQuery(request , project_id):
corpus.save()
try:
tasks = MedlineFetcher()
# configure the queue: start the worker threads
for i in range(8):
t = threading.Thread(target=tasks.worker2) # each worker downloads one queued file
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
print(url)
data = urlopen(url)
xmlname = MEDIA_ROOT + '/corpora/%s/%s.xml' % (request.user, str(datetime.now().microsecond))
f = open(xmlname, 'w')
myfile = File(f)
myfile.write( data.read().decode('utf-8') )
myfile.close()
f.close()
corpus.add_resource( user=request.user, type=resource_type, file=xmlname )
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
tasks.q.put( [url , filename]) # put a task in the queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
......@@ -96,7 +103,6 @@ def doTheQuery(request , project_id):
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
except Exception as error:
print(error)
......@@ -106,4 +112,79 @@ def doTheQuery(request , project_id):
print("lele",error)
data = alist
return JsonHttpResponse(data)
\ No newline at end of file
return JsonHttpResponse(data)
def testISTEX(request , project_id):
print(request.method)
alist = ["bar","foo"]
if request.method == "POST":
# print(alist)
query = "-"
query_string = "-"
N = 60
if "query" in request.POST: query = request.POST["query"]
if "string" in request.POST: query_string = request.POST["string"].replace(" ","+")
# if "N" in request.POST: N = request.POST["N"]
print(query_string , query , N)
urlreqs = []
pagesize = 50
tasks = MedlineFetcher()
chunks = list(tasks.chunks(range(N), pagesize))
for k in chunks:
if (k[0]+pagesize)>N: pagesize = N-k[0]
urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
print(urlreqs)
# urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
resource_type = ResourceType.objects.get(name="istext" )
parent = Node.objects.get(id=project_id)
node_type = NodeType.objects.get(name='Corpus')
type_id = NodeType.objects.get(name='Document').id
user_id = User.objects.get( username=request.user ).id
corpus = Node(
user=request.user,
parent=parent,
type=node_type,
name=query,
)
corpus.save()
# configure the queue: start the worker threads
for i in range(8):
t = threading.Thread(target=tasks.worker2) # each worker downloads one queued file
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
tasks.q.put( [url , filename]) # put a task in the queue
tasks.q.join() # wait until everything is finished
for filename in tasks.firstResults:
corpus.add_resource( user=request.user, type=resource_type, file=filename )
corpus.save()
# do the WorkFlow
try:
if DEBUG is True:
corpus.workflow()
else:
corpus.workflow.apply_async((), countdown=3)
return JsonHttpResponse(["workflow","finished"])
except Exception as error:
print(error)
data = [query_string,query,N]
return JsonHttpResponse(data)
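testISTEX reuses MedlineFetcher.chunks() to paginate the ISTEX API with from/size parameters, shrinking the last page so the total never exceeds N, then feeds the URLs to the same worker2 download pool as doTheQuery. The URL building alone, as a standalone sketch (the query string here is a placeholder):

def istex_urls(query_string, n, pagesize=50):
    urls = []
    for start in range(0, n, pagesize):
        size = min(pagesize, n - start)      # shrink the last page
        urls.append("http://api.istex.fr/document/?q=" + query_string
                    + "&output=*&from=" + str(start) + "&size=" + str(size))
    return urls

for url in istex_urls("microbiota", 60):
    print(url)   # two pages: from=0&size=50, then from=50&size=10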