Commit c3969192 authored by Mathieu Rodic

Kept working on the parsers.

parent ea8c9e28
...@@ -15,17 +15,31 @@ class FileParser: ...@@ -15,17 +15,31 @@ class FileParser:
"""Add a document to the database. """Add a document to the database.
""" """
def create_document(self, title, contents, metadata, resource_guid=None): def create_document(self, title, contents, language, metadata, guid=None):
# create or retrieve a resource for that document, based on its user id # create or retrieve a resource for that document, based on its user id
if resource_guid is None: if guid is None:
resource = Resource(guid=resource_guid) resource = Resource(guid=guid)
else: else:
try: try:
resource = Resource.get(guid=resource_guid) resource = Resource.get(guid=guid)
except: except:
resource = Resource(guid=resource_guid) resource = Resource(guid=guid)
# create the document itself # create the document itself
document = document = Node(
# WRITE STUFF HERE!!!
)
# parse it!
# TODO: beware the language!!!!
if self._parsers[language] = None:
self._parsers[language] = NltkParser
# WRITE STUFF HERE!!!
# return the created document
return document
"""Useful method to detect the document encoding. """Useful method to detect the document encoding.
Not sure it should be here actually. Not sure it should be here actually.
......
...@@ -8,7 +8,7 @@ class PubmedFileParser(FileParser): ...@@ -8,7 +8,7 @@ class PubmedFileParser(FileParser):
# open the file as XML # open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False,recover=True) xml_parser = etree.XMLParser(resolve_entities=False,recover=True)
xml = etree.parse(self._file, parser=xml_parser) xml = etree.parse(self._file, parser=xml_parser)
# parse all the abstracts # parse all the articles, one by one
# all database operations should be performed within one transaction # all database operations should be performed within one transaction
xml_articles = xml.findall('PubmedArticle') xml_articles = xml.findall('PubmedArticle')
with transaction.atomic(): with transaction.atomic():
...@@ -18,13 +18,21 @@ class PubmedFileParser(FileParser): ...@@ -18,13 +18,21 @@ class PubmedFileParser(FileParser):
date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text) date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text) date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
metadata = { metadata = {
"date": datetime.date(year, month, day), # other metadata should also be included:
# authors, submission date, etc.
"date_pub": datetime.date(year, month, day),
"journal": xml_article.find('MedlineCitation/Article/Journal/Title').text "journal": xml_article.find('MedlineCitation/Article/Journal/Title').text
"title": xml_article.find('MedlineCitation/Article/ArticleTitle').text "title": xml_article.find('MedlineCitation/Article/ArticleTitle').text
"doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]') "language": xml_article.find('MedlineCitation/Article/Language').text
# other metadata should also be included: submission date, etc. "doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]').text
} }
contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
# create the document in the database # create the document in the database
childNode = childNode = self.create_document(
childNode title = metadata["title"],
contents = contents,
language = metadata["language"],
metadata = metadata,
guid = metadata["doi"],
)
parentNode.add_child(childNode)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment