from django.db import transaction
from lxml import etree
from .FileParser import FileParser
from ..NgramsExtractors import *
# NOTE: the two imports below deliberately stay after the wildcard import
# (as in the original file) so they cannot be shadowed by it.
from datetime import datetime
from io import BytesIO


class PubmedFileParser(FileParser):
    """Parser for PubMed XML exports (``PubmedArticle`` elements)."""

    # Hyperdata key -> ElementPath expression, relative to a PubmedArticle.
    # Keys ending with "_" and "title2" are intermediate values; all of them
    # except "realdate_full_" are removed before an article is returned.
    _HYPERDATA_PATHS = {
        "journal": 'MedlineCitation/Article/Journal/Title',
        "title": 'MedlineCitation/Article/ArticleTitle',
        "abstract": 'MedlineCitation/Article/Abstract/AbstractText',
        "title2": 'MedlineCitation/Article/VernacularTitle',
        "language_iso3": 'MedlineCitation/Article/Language',
        # The predicate value must be quoted, and PubMed names the
        # attribute "IdType"; the former "[@type=doi]" never matched.
        "doi": "PubmedData/ArticleIdList/ArticleId[@IdType='doi']",
        "realdate_full_": 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
        "realdate_year_": 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Year',
        "realdate_month_": 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Month',
        "realdate_day_": 'MedlineCitation/Article/Journal/JournalIssue/PubDate/Day',
        "publication_year": 'MedlineCitation/DateCreated/Year',
        "publication_month": 'MedlineCitation/DateCreated/Month',
        "publication_day": 'MedlineCitation/DateCreated/Day',
        "authors": 'MedlineCitation/Article/AuthorList',
    }

    # Intermediate keys dropped from every hyperdata dict before returning.
    _TEMPORARY_KEYS = (
        "realdate_year_",
        "realdate_month_",
        "realdate_day_",
        "title2",
    )

    def _parse(self, file):
        """Parse a PubMed XML document into a list of hyperdata dicts.

        :param file: raw XML as ``bytes``, or anything ``etree.parse``
            accepts (file name, file object, ...).
        :return: a list with one dict per ``PubmedArticle`` element found
            directly under the document root.
        """
        # Entities are not resolved and malformed XML is recovered from.
        xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
        source = BytesIO(file) if isinstance(file, bytes) else file
        xml = etree.parse(source, parser=xml_parser)
        return [
            self._parse_article(xml_article)
            for xml_article in xml.findall('PubmedArticle')
        ]

    def _parse_article(self, xml_article):
        """Extract the hyperdata dict of a single ``PubmedArticle`` element."""
        hyperdata = {}
        for key, path in self._HYPERDATA_PATHS.items():
            xml_node = xml_article.find(path)
            if xml_node is None:
                # Field absent from this article: leave the key unset.
                continue
            if key == 'authors':
                hyperdata[key] = self._format_authors(xml_node)
            else:
                hyperdata[key] = xml_node.text

        # Title decision: fall back to the vernacular title, then to "".
        if not hyperdata.get("title"):
            hyperdata["title"] = hyperdata.get("title2") or ""

        # Date decision (forge.iscpif.fr/issues/1418): prefer the journal
        # issue date, fall back to the record creation date.
        if "realdate_full_" in hyperdata:
            real_date = hyperdata["realdate_full_"] or ""
        else:
            real_date = self._join_date_parts(
                hyperdata, "realdate_year_", "realdate_month_", "realdate_day_"
            )
        hyperdata["realdate_full_"] = real_date
        # Keep only the part before the first "-" (e.g. the start of a
        # "1998 Dec-1999 Jan" range) for parsing.
        real_date = real_date.split("-")[0]
        pubmed_date = self._join_date_parts(
            hyperdata, "publication_year", "publication_month", "publication_day"
        )

        decision = self._decide_date(real_date, pubmed_date)
        if decision is not None:
            # NOTE(review): as before, publication_* keys are only
            # overwritten when already present; a record without
            # DateCreated gets no publication_* keys even when the journal
            # date parsed fine. Confirm whether that is intended.
            decided_parts = (
                ("publication_year", decision.year),
                ("publication_month", decision.month),
                ("publication_day", decision.day),
            )
            for key, value in decided_parts:
                if key in hyperdata:
                    hyperdata[key] = str(value)

        for key in self._TEMPORARY_KEYS:
            hyperdata.pop(key, None)
        return hyperdata

    @staticmethod
    def _format_authors(xml_author_list):
        """Return the authors as "ForeName LastName", joined with ", ".

        Children lacking both name parts (e.g. collective names) are
        skipped instead of discarding the whole author list.
        """
        names = []
        for xml_author in xml_author_list:
            parts = (
                xml_author.findtext('ForeName'),
                xml_author.findtext('LastName'),
            )
            full_name = ' '.join(part for part in parts if part)
            if full_name:
                names.append(full_name)
        return ', '.join(names)

    @staticmethod
    def _join_date_parts(hyperdata, year_key, month_key, day_key):
        """Concatenate the available date parts as "year month day".

        Missing or empty parts are skipped; month and day are each
        prefixed with a single space, the year is not.
        """
        joined = hyperdata.get(year_key) or ""
        for key in (month_key, day_key):
            part = hyperdata.get(key)
            if part:
                joined += " " + part
        return joined

    @staticmethod
    def _decide_date(real_date, pubmed_date):
        """Pick the date of an article.

        ``real_date`` is tried first, as '%Y %b %d' when longer than 8
        characters or as '%Y %b' when longer than 4; otherwise, or when it
        does not parse, ``pubmed_date`` is tried as '%Y %m %d'.

        :return: a ``datetime.date``, or ``None`` when nothing parses.
        """
        if len(real_date) > 8:
            real_format = '%Y %b %d'
        elif len(real_date) > 4:
            real_format = '%Y %b'
        else:
            real_format = None

        if real_format is not None:
            try:
                return datetime.strptime(real_date, real_format).date()
            except ValueError:
                pass  # fall back to the PubMed creation date below

        try:
            return datetime.strptime(pubmed_date, '%Y %m %d').date()
        except ValueError:
            return None