Commit b053ed96 authored by sim

Pubmed scraper: use JournalIssue to get publication date

parent ed572281
......@@ -10,7 +10,10 @@ __all__ = ['PubmedScraper']
class PubmedDocumentLoader(DocumentLoader):
def parse(self, article):
# https://dtd.nlm.nih.gov/ncbi/pubmed/out/doc/2018/el-PubMedPubDate.html
LD = '*[self::Year or self::Month or self::Day or self::Hour or self::Minute or self::Second]'
# https://dtd.nlm.nih.gov/ncbi/pubmed/out/doc/2018/el-PubDate.html
PD = '*[self::MedlineDate or self::Year or self::Month or self::Day or self::Season]'
pubmed_id = article.xpath('PubmedData/ArticleIdList/ArticleId[@IdType="pubmed"]/text()').extract_first()
......@@ -32,8 +35,8 @@ class PubmedDocumentLoader(DocumentLoader):
'abstract': 'MedlineCitation/Article/Abstract/AbstractText',
'source': 'MedlineCitation/Article/Journal/Title',
'lang': 'MedlineCitation/Article/Language',
# https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html#pubdate
'publication': 'PubmedData/History/PubMedPubDate[@PubStatus="accepted"]/' + LD,
# https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html#journalissue
'publication': 'MedlineCitation/Article/Journal/JournalIssue/PubDate/' + PD,
# https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html#datecompleted
'creation': 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/' + LD,
})
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment