Commit c3969192 authored by Mathieu Rodic

Kept working on the parsers.

parent ea8c9e28
......@@ -15,17 +15,31 @@ class FileParser:
"""Add a document to the database.
"""
def create_document(self, title, contents, language, metadata, guid=None):
    """Add a document to the database.

    Creates (or retrieves, when ``guid`` is given) the resource for the
    document, builds the document node, makes sure a parser is registered
    for ``language`` and returns the node.

    NOTE(review): ``title``, ``contents`` and ``metadata`` are accepted but
    not used yet -- the node construction and the parsing step are still
    placeholders (see the TODO comments below).
    """
    # create or retrieve a resource for that document, based on its guid
    if guid is None:
        resource = Resource(guid=guid)
    else:
        try:
            resource = Resource.get(guid=guid)
        except Exception:
            # NOTE(review): presumably raised when no resource has this guid;
            # narrow to the specific lookup error once it is known.
            resource = Resource(guid=guid)
    # create the document itself
    document = Node(
        # TODO: WRITE STUFF HERE!!! (fields of the node are not defined yet)
    )
    # parse it!
    # TODO: beware the language!!!!
    # NOTE(review): assumes self._parsers is a mapping keyed by language;
    # .get() avoids a KeyError when the language was never registered.
    if self._parsers.get(language) is None:
        self._parsers[language] = NltkParser
    # TODO: WRITE STUFF HERE!!! (actually run the parser on the contents)
    # return the created document
    return document
"""Useful method to detect the document encoding.
Not sure it should be here actually.
......
......@@ -8,7 +8,7 @@ class PubmedFileParser(FileParser):
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False,recover=True)
xml = etree.parse(self._file, parser=xml_parser)
# parse all the abstracts
# parse all the articles, one by one
# all database operations should be performed within one transaction
xml_articles = xml.findall('PubmedArticle')
with transaction.atomic():
......@@ -18,13 +18,21 @@ class PubmedFileParser(FileParser):
date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
metadata = {
"date": datetime.date(year, month, day),
# other metadata should also be included:
# authors, submission date, etc.
"date_pub": datetime.date(year, month, day),
"journal": xml_article.find('MedlineCitation/Article/Journal/Title').text
"title": xml_article.find('MedlineCitation/Article/ArticleTitle').text
"doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]')
# other metadata should also be included: submission date, etc.
"language": xml_article.find('MedlineCitation/Article/Language').text
"doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]').text
}
contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
# create the document in the database
childNode =
childNode
childNode = self.create_document(
title = metadata["title"],
contents = contents,
language = metadata["language"],
metadata = metadata,
guid = metadata["doi"],
)
parentNode.add_child(childNode)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment