Commit c816759e authored by sim

Add pubmed scraper

parent 68aef175
from .pubmed import PubmedScraper

__all__ = ['PubmedScraper']


from scrapy.utils.iterators import xmliter_lxml as xmliter

from gargantext.datasource import Scraper, Request
from gargantext.datasource.items import DocumentLoader
from gargantext.datasource.responses import XmlResponse


class PubmedDocumentLoader(DocumentLoader):
    def parse(self, article):
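        # The selector below picks the date leaves of a PubMedPubDate element
        # (Year/Month/Day, optionally Hour/Minute/Second) in document order.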
        LD = '*[self::Year or self::Month or self::Day or self::Hour or self::Minute or self::Second]'

        pubmed_id = article.xpath('PubmedData/ArticleIdList/ArticleId[@IdType="pubmed"]/text()').extract_first()

        self.add_values({
            'id': pubmed_id,
            'url': 'https://www.ncbi.nlm.nih.gov/pubmed/%s' % pubmed_id,
            'authors': [
                {
                    'firstname': author.xpath('ForeName/text()').extract_first(),
                    'lastname': author.xpath('LastName/text()').extract_first(),
                    'affiliation': author.xpath('AffiliationInfo/Affiliation/text()').extract_first(),
                }
                for author in article.xpath('MedlineCitation/Article/AuthorList/Author')
            ]
        })

        self.add_xpaths_text({
            'title': 'MedlineCitation/Article/ArticleTitle',
            'abstract': 'MedlineCitation/Article/Abstract/AbstractText',
            'source': 'MedlineCitation/Article/Journal/Title',
            'lang': 'MedlineCitation/Article/Language',
            # https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html#pubdate
            'publication': 'PubmedData/History/PubMedPubDate[@PubStatus="accepted"]/' + LD,
            # https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html#datecompleted
            'creation': 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/' + LD,
        })


class PubmedScraper(Scraper):
"""Pubmed scraper for Medline database using E-utilities
`API documentation <https://dataguide.nlm.nih.gov/eutilities/utilities.html>`
`PubMed <https://en.wikipedia.org/wiki/PubMed>`_ is a free search engine
accessing primarily the MEDLINE database of references and abstracts on
life sciences and biomedical topics. The United States National Library of
Medicine (NLM) at the National Institutes of Health maintains the database
as part of the Entrez system of information retrieval.
"""
    name = 'pubmed'
    expects = XmlResponse
    default_parser = 'parse_efetch'

    base_url = 'https://www.ncbi.nlm.nih.gov/entrez/eutils'

    webenv = None
    querykey = None
    retmax = Scraper.BATCH_SIZE

    def dispatch(self):
        if not (self.webenv and self.querykey and self.count is not None):
            yield Request('%s/esearch.fcgi' % self.base_url,
                          callback=self.parse_esearch,
                          params={
                              'db': 'pubmed',
                              'retmode': 'xml',
                              'usehistory': 'y',
                              'sort': 'pub+date',
                              'term': self.query,
                          })

        yield from self.request_results()

    def request_results(self):
        if not self.count_only and self.webenv and self.querykey:
            # XXX PubMed documentation is confusing: need to start at 0, not 1
            for retstart in range(0, self.limit, self.retmax):
                yield Request('%s/efetch.fcgi' % self.base_url,
                              callback=self.parse_efetch,
                              params={
                                  'db': 'pubmed',
                                  'retstart': retstart,
                                  'retmax': self.retmax,
                                  'retmode': 'xml',
                                  'term': self.query,
                                  'query_key': self.querykey,
                                  'WebEnv': self.webenv,
                              })

    def parse_esearch(self, response):
        result = response.xpath('/eSearchResult')

        self.count = int(result.xpath('./Count/text()').extract_first())
        self.webenv = result.xpath('./WebEnv/text()').extract_first()
        self.querykey = result.xpath('./QueryKey/text()').extract_first()

        yield {
            'count': self.count,
            'query': {
                'webenv': self.webenv,
                'querykey': self.querykey,
            },
        }

        yield from self.request_results()

    def parse_efetch(self, response):
        for article in xmliter(response, 'PubmedArticle'):
            yield PubmedDocumentLoader(article).load()
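
For reference, the esearch/efetch round trip implemented by the scraper can be reproduced outside of gargantext. The sketch below is illustrative only and not part of this commit: it assumes the third-party requests package and uses an arbitrary query term, page size and result cap.

# Standalone sketch of the two-step E-utilities flow used by PubmedScraper
# (illustrative, not part of this commit; requires the `requests` package).
import requests

BASE = 'https://www.ncbi.nlm.nih.gov/entrez/eutils'

# Step 1: esearch with usehistory=y stores the result set server-side and
# returns its handle (WebEnv/query_key) along with the total count.
search = requests.get('%s/esearch.fcgi' % BASE, params={
    'db': 'pubmed',
    'retmode': 'json',   # JSON keeps the sketch short; the scraper uses XML
    'usehistory': 'y',
    'term': 'microbiome',
}).json()['esearchresult']

count = int(search['count'])

# Step 2: page through the stored results with efetch, retmax records at a time.
for retstart in range(0, min(count, 300), 100):
    batch = requests.get('%s/efetch.fcgi' % BASE, params={
        'db': 'pubmed',
        'retstart': retstart,
        'retmax': 100,
        'retmode': 'xml',
        'query_key': search['querykey'],
        'WebEnv': search['webenv'],
    })
    print(retstart, len(batch.content), 'bytes of MEDLINE XML')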