Commit c77bf9d2 authored by Mathieu Rodic

[FEATURE] File parsers - Started working on the IsiFileParser class

parent 6c4a607c
from parsing.FileParsers.FileParser import FileParser
class EuropressFileParser(FileParser):
    """Stub parser: the class exists, but parsing is not implemented yet."""

    def parse(self, parentNode=None, tag=True):
        """Do nothing and return ``None``.

        The original signature had no ``self``, so calling ``parse`` on an
        instance raised ``TypeError``.  The parameters mirror the ``parse``
        signature used by the other file parsers; both are currently unused.

        :param parentNode: unused placeholder (same name as in sibling parsers).
        :param tag: unused placeholder (same name as in sibling parsers).
        """
        pass
#import FileParser
#
#class EuropressFileParser(FileParser, contents):
#
# def parse():
# pass
#
from django.db import transaction from django.db import transaction
from FileParser import FileParser from parsing.FileParsers.FileParser import FileParser
class IsiFileParser(FileParser):
    """Parser for tag-prefixed (ISI-style) export files -- work in progress.

    Every significant line starts with a two-character tag; the value starts
    at the fourth character.  Only the tags listed in ``_parameters`` are
    handled, any other tag is silently ignored.
    """

    # How each known tag is handled:
    #   "metadata"  -> values are stored in the record under ``key``; they are
    #                  joined with spaces when ``concatenate`` is true and kept
    #                  as a list otherwise.
    #   "delimiter" -> the tag closes the current record.
    _parameters = {
        "ER": {"type": "delimiter"},
        "AU": {"type": "metadata", "key": "authors", "concatenate": False},
        "AB": {"type": "metadata", "key": "abstract", "concatenate": True},
    }

    def _store_values(self, metadata, key, values):
        """Store ``values`` gathered for tag ``key``; return True if the tag ends a record."""
        parameter = self._parameters.get(key)
        if parameter is None:
            # Either no tag has been read yet (key is None) or the tag is not
            # one this parser handles: nothing to store, record still open.
            return False
        if parameter["type"] == "metadata":
            metadata[parameter["key"]] = (
                ' '.join(values) if parameter["concatenate"] else values
            )
            return False
        return parameter["type"] == "delimiter"

    def parse(self, parentNode=None, tag=True):
        """Read the first record of the file and print its metadata.

        Lines are taken from ``self.file`` (presumably set up by ``FileParser``
        as an iterable of text lines -- confirm against the base class).
        Parsing stops at the first delimiter tag; the collected metadata is
        printed, not returned, so the method always returns ``None``.

        :param parentNode: currently unused.
        :param tag: currently unused.
        """
        metadata = {}
        last_key = None
        last_values = []
        for raw_line in self.file:
            # Strip the line terminator explicitly: slicing off the last
            # character would eat real data on a final line without newline.
            line = raw_line.rstrip("\r\n")
            if len(line) < 2:
                continue
            parameter_key = line[:2]
            # A blank tag field marks a continuation line: its value belongs
            # to the tag that is currently open, so it must not close it.
            if parameter_key.strip() and parameter_key != last_key:
                if self._store_values(metadata, last_key, last_values):
                    print(metadata)
                    break
                last_key = parameter_key
                last_values = []
            last_values.append(line[3:])
        else:
            # End of file reached without hitting ``break``: the tag still
            # open has no following line to trigger its storage, do it here.
            if self._store_values(metadata, last_key, last_values):
                print(metadata)
\ No newline at end of file
...@@ -7,7 +7,7 @@ import datetime ...@@ -7,7 +7,7 @@ import datetime
class PubmedFileParser(FileParser): class PubmedFileParser(FileParser):
def parse(self, parentNode, tag=True): def parse(self, parentNode=None, tag=True):
# open the file as XML # open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True) xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
documents = [] documents = []
......
#from parsing.FileParsers import EuropressFileParser from parsing.FileParsers.IsiFileParser import IsiFileParser
from parsing.FileParsers import PubmedFileParser from parsing.FileParsers.PubmedFileParser import PubmedFileParser
from parsing.FileParsers.EuropressFileParser import EuropressFileParser
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment