Commit e6f6eca9 authored by Mathieu Rodic

More corrections

parent 07a6f374
from django.db import transaction
from FileParser import FileParser
class EuropressFileParser(FileParser, contents):
class EuropressFileParser(FileParser):
def parse: def parse(self, parentNode):
pass pass
\ No newline at end of file
...@@ -84,7 +84,8 @@ class FileParser: ...@@ -84,7 +84,8 @@ class FileParser:
resource = Resource(guid=guid) resource = Resource(guid=guid)
# If the parent node already has a child with this resource, pass # If the parent node already has a child with this resource, pass
# (is it a good thing?) # (is it a good thing?)
if parentNode.get_descendants(). if parentNode.get_descendants().filter(resource=resource).exists():
return None
# create the document itself # create the document itself
childNode = Node( childNode = Node(
user = parentNode.pk, user = parentNode.pk,
......
from django.db import transaction
from FileParser import FileParser
class IsiFileParser(FileParser):
    """File parser that creates one document per article found in the file.

    NOTE(review): despite its name, this class currently reads the file as
    PubMed-style XML (``PubmedArticle`` elements). The original draft also
    started with a body-less ``for line in self.__file:`` loop ("read the
    file, line by line"), which was a syntax error and has been removed.
    A real line-oriented ISI reader is presumably still to be written --
    TODO confirm with the author.
    """

    def parse(self, parentNode):
        """Yield the documents created from each article of the file.

        Every ``PubmedArticle`` element of ``self._file`` is read, its
        metadata extracted, and ``self.create_document`` is called with
        ``parentNode`` as parent. All database operations happen inside a
        single ``transaction.atomic()`` block.

        Parameters:
            parentNode: node under which the documents are created; it is
                passed through unchanged to ``self.create_document``.

        Yields:
            Each truthy value returned by ``self.create_document``. Falsy
            results are skipped, the same way ``PubmedFileParser.parse``
            filters them (``create_document`` presumably returns ``None``
            for already-known resources -- TODO confirm).

        Raises:
            AttributeError: if one of the expected XML elements is missing
                from an article (``find`` returns ``None``).
            ValueError: if a date component is not an integer or does not
                form a valid date.
            KeyError: if the article language is not a key of
                ``self._languages_iso3``.
        """
        # Local import: this module does not import ``datetime`` at file
        # level. NOTE(review): ``etree`` is not imported here either and
        # must be provided at module level (presumably
        # ``from lxml import etree`` -- TODO confirm).
        import datetime

        # open the file as XML
        xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
        xml = etree.parse(self._file, parser=xml_parser)
        # parse all the articles, one by one
        # all database operations should be performed within one transaction
        xml_articles = xml.findall('PubmedArticle')
        with transaction.atomic():
            for xml_article in xml_articles:
                # extract data from the document
                date_year = int(xml_article.find('MedlineCitation/DateCreated/Year').text)
                date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
                date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
                metadata = {
                    # other metadata should also be included:
                    # authors, submission date, etc.
                    "date_pub": datetime.date(date_year, date_month, date_day),
                    "journal": xml_article.find('MedlineCitation/Article/Journal/Title').text,
                    "title": xml_article.find('MedlineCitation/Article/ArticleTitle').text,
                    "language_iso3": xml_article.find('MedlineCitation/Article/Language').text,
                    # ``[type=doi]`` is not a valid ElementPath predicate;
                    # PubMed stores the identifier kind in ``IdType``.
                    "doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[@IdType="doi"]').text,
                }
                contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
                # create the document in the database
                document = self.create_document(
                    parentNode=parentNode,
                    title=metadata["title"],
                    contents=contents,
                    # the language is stored under "language_iso3" above
                    language=self._languages_iso3[metadata["language_iso3"].lower()],
                    metadata=metadata,
                    guid=metadata["doi"],
                )
                if document:
                    yield document
...@@ -6,11 +6,12 @@ class PubmedFileParser(FileParser): ...@@ -6,11 +6,12 @@ class PubmedFileParser(FileParser):
def parse(self, parentNode): def parse(self, parentNode):
# open the file as XML # open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False,recover=True) xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
xml = etree.parse(self._file, parser=xml_parser) xml = etree.parse(self._file, parser=xml_parser)
# parse all the articles, one by one # parse all the articles, one by one
# all database operations should be performed within one transaction # all database operations should be performed within one transaction
xml_articles = xml.findall('PubmedArticle') xml_articles = xml.findall('PubmedArticle')
documents = []
with transaction.atomic(): with transaction.atomic():
for xml_article in xml_articles: for xml_article in xml_articles:
# extract data from the document # extract data from the document
...@@ -28,7 +29,7 @@ class PubmedFileParser(FileParser): ...@@ -28,7 +29,7 @@ class PubmedFileParser(FileParser):
} }
contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
# create the document in the database # create the document in the database
yield self.create_document( document = self.create_document(
parentNode = parentNode parentNode = parentNode
title = metadata["title"], title = metadata["title"],
contents = contents, contents = contents,
...@@ -36,3 +37,6 @@ class PubmedFileParser(FileParser): ...@@ -36,3 +37,6 @@ class PubmedFileParser(FileParser):
metadata = metadata, metadata = metadata,
guid = metadata["doi"], guid = metadata["doi"],
) )
if document:
documents.append(document)
return documents
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment