Commit c816759e authored by sim

Add pubmed scraper

parent 68aef175
from .pubmed import PubmedScraper

__all__ = ['PubmedScraper']


from scrapy.utils.iterators import xmliter_lxml as xmliter

from gargantext.datasource import Scraper, Request
from gargantext.datasource.items import DocumentLoader
from gargantext.datasource.responses import XmlResponse


class PubmedDocumentLoader(DocumentLoader):
    def parse(self, article):
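        # The selector below picks the date leaves of a PubMedPubDate element
        # (Year/Month/Day, optionally Hour/Minute/Second) in document order.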
        LD = '*[self::Year or self::Month or self::Day or self::Hour or self::Minute or self::Second]'

        pubmed_id = article.xpath('PubmedData/ArticleIdList/ArticleId[@IdType="pubmed"]/text()').extract_first()

        self.add_values({
            'id': pubmed_id,
            'url': 'https://www.ncbi.nlm.nih.gov/pubmed/%s' % pubmed_id,
            'authors': [
                {
                    'firstname': author.xpath('ForeName/text()').extract_first(),
                    'lastname': author.xpath('LastName/text()').extract_first(),
                    'affiliation': author.xpath('AffiliationInfo/Affiliation/text()').extract_first(),
                }
                for author in article.xpath('MedlineCitation/Article/AuthorList/Author')
            ]
        })

        self.add_xpaths_text({
            'title': 'MedlineCitation/Article/ArticleTitle',
            'abstract': 'MedlineCitation/Article/Abstract/AbstractText',
            'source': 'MedlineCitation/Article/Journal/Title',
            'lang': 'MedlineCitation/Article/Language',
            # https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html#pubdate
            'publication': 'PubmedData/History/PubMedPubDate[@PubStatus="accepted"]/' + LD,
            # https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html#datecompleted
            'creation': 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/' + LD,
        })


class PubmedScraper(Scraper):
"""Pubmed scraper for Medline database using E-utilities
`API documentation <https://dataguide.nlm.nih.gov/eutilities/utilities.html>`
`PubMed <https://en.wikipedia.org/wiki/PubMed>`_ is a free search engine
accessing primarily the MEDLINE database of references and abstracts on
life sciences and biomedical topics. The United States National Library of
Medicine (NLM) at the National Institutes of Health maintains the database
as part of the Entrez system of information retrieval.
"""
    name = 'pubmed'
    expects = XmlResponse
    default_parser = 'parse_efetch'

    base_url = 'https://www.ncbi.nlm.nih.gov/entrez/eutils'

    webenv = None
    querykey = None
    retmax = Scraper.BATCH_SIZE

    def dispatch(self):
        if not (self.webenv and self.querykey and self.count is not None):
            yield Request('%s/esearch.fcgi' % self.base_url,
                          callback=self.parse_esearch,
                          params={
                              'db': 'pubmed',
                              'retmode': 'xml',
                              'usehistory': 'y',
                              'sort': 'pub+date',
                              'term': self.query,
                          })

        yield from self.request_results()

    def request_results(self):
        if not self.count_only and self.webenv and self.querykey:
            # XXX PubMed documentation is confusing: need to start at 0, not 1
            for retstart in range(0, self.limit, self.retmax):
                yield Request('%s/efetch.fcgi' % self.base_url,
                              callback=self.parse_efetch,
                              params={
                                  'db': 'pubmed',
                                  'retstart': retstart,
                                  'retmax': self.retmax,
                                  'retmode': 'xml',
                                  'term': self.query,
                                  'query_key': self.querykey,
                                  'WebEnv': self.webenv,
                              })

    def parse_esearch(self, response):
        result = response.xpath('/eSearchResult')

        self.count = int(result.xpath('./Count/text()').extract_first())
        self.webenv = result.xpath('./WebEnv/text()').extract_first()
        self.querykey = result.xpath('./QueryKey/text()').extract_first()

        yield {
            'count': self.count,
            'query': {
                'webenv': self.webenv,
                'querykey': self.querykey,
            },
        }

        yield from self.request_results()

    def parse_efetch(self, response):
        for article in xmliter(response, 'PubmedArticle'):
            yield PubmedDocumentLoader(article).load()
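
For reference, the esearch/efetch round trip implemented by the scraper can be reproduced outside of gargantext. The sketch below is illustrative only and not part of this commit: it assumes the third-party requests package and uses an arbitrary query term, page size and result cap.

# Standalone sketch of the two-step E-utilities flow used by PubmedScraper
# (illustrative, not part of this commit; requires the `requests` package).
import requests

BASE = 'https://www.ncbi.nlm.nih.gov/entrez/eutils'

# Step 1: esearch with usehistory=y stores the result set server-side and
# returns its handle (WebEnv/query_key) along with the total count.
search = requests.get('%s/esearch.fcgi' % BASE, params={
    'db': 'pubmed',
    'retmode': 'json',   # JSON keeps the sketch short; the scraper uses XML
    'usehistory': 'y',
    'term': 'microbiome',
}).json()['esearchresult']

count = int(search['count'])

# Step 2: page through the stored results with efetch, retmax records at a time.
for retstart in range(0, min(count, 300), 100):
    batch = requests.get('%s/efetch.fcgi' % BASE, params={
        'db': 'pubmed',
        'retstart': retstart,
        'retmax': 100,
        'retmode': 'xml',
        'query_key': search['querykey'],
        'WebEnv': search['webenv'],
    })
    print(retstart, len(batch.content), 'bytes of MEDLINE XML')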