Commit ea8c9e28 authored by Mathieu Rodic

Worked on Pubmed parser.

Added a Resource database object.
parent f0604128
......@@ -4,10 +4,10 @@ import Collections
"""
class FileParser:
def __init__(self, file=None, path="", encoding="utf8"):
def __init__(self, file=None, filepath="", encoding="utf8"):
# ...get the file item...
if file is None:
self._file = open(path, "rb")
self._file = open(filepath, "rb")
else:
self._file = file
# ...and parse!
......@@ -15,7 +15,7 @@ class FileParser:
"""Add a document to the database.
"""
def add_document(self, parent, title, contents, metadata, resource_guid=None):
def create_document(self, title, contents, metadata, resource_guid=None):
# create or retrieve a resource for that document, based on its user id
if resource_guid is None:
resource = Resource(guid=resource_guid)
......
from django.db import transaction
from FileParser import FileParser
class PubmedFileParser(FileParser):
    """Parser for PubMed XML exports.

    Reads the file object stored in ``self._file`` by ``FileParser`` and
    handles every ``PubmedArticle`` element found in it.
    """

    def parse(self, parentNode):
        """Extract metadata and abstract text from each ``PubmedArticle``.

        All database operations run inside a single transaction.

        :param parentNode: node the parsed documents relate to.
            NOTE(review): the original code stopped before using it and
            ``create_document`` takes no parent argument -- confirm how the
            created documents should be attached to this node.
        :raises AttributeError: if an article lacks one of the date, journal,
            title or abstract elements (``find`` returns ``None`` for them).
        """
        # open the file as XML
        xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
        xml = etree.parse(self._file, parser=xml_parser)
        # parse all the abstracts
        # all database operations should be performed within one transaction
        xml_articles = xml.findall('PubmedArticle')
        with transaction.atomic():
            for xml_article in xml_articles:
                # extract data from the document
                date_year = int(xml_article.find('MedlineCitation/DateCreated/Year').text)
                date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
                date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
                # the DOI is looked up through the IdType attribute of ArticleId;
                # it is kept as None when the article does not declare one
                doi_element = xml_article.find(
                    'PubmedData/ArticleIdList/ArticleId[@IdType="doi"]'
                )
                metadata = {
                    "date": datetime.date(date_year, date_month, date_day),
                    "journal": xml_article.find('MedlineCitation/Article/Journal/Title').text,
                    "title": xml_article.find('MedlineCitation/Article/ArticleTitle').text,
                    "doi": None if doi_element is None else doi_element.text,
                    # other metadata should also be included: submission date, etc.
                }
                contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
                # create the document in the database
                # NOTE(review): the original statement was left unfinished
                # ("childNode ="); this call follows the create_document
                # signature of FileParser -- confirm it is the intended one.
                self.create_document(
                    title=metadata["title"],
                    contents=contents,
                    metadata=metadata,
                )
......@@ -10,21 +10,29 @@ from time import time
def upload_to(instance, filename):
    """Return the storage path of an uploaded file.

    The path has the form ``corpora/<username>/<timestamp>/<filename>``,
    where the username is read from ``instance.user`` and the timestamp is
    the current time in seconds, written with six decimals.
    """
    owner_name = instance.user.username
    uploaded_at = time()
    return 'corpora/%s/%f/%s' % (owner_name, uploaded_at, filename)
class Resource(models.Model):
    """Database record for a resource: an identifier plus an optional file."""
    # identifier of the resource, at most 255 characters
    guid = models.CharField(max_length=255)
    # uploaded file (may be left blank); its path is computed by upload_to()
    # NOTE(review): upload_to() reads instance.user.username, but this model
    # declares no `user` field -- confirm uploads work for Resource instances.
    file = models.FileField(upload_to=upload_to, blank=True)
class NodeType(models.Model):
    """Named type that nodes refer to through ``Node.type``."""
    # label of the type, at most 200 characters
    name = models.CharField(max_length=200)

    def __str__(self):
        # a type is displayed by its name
        return self.name
class Node(MP_Node):
#parent = models.ForeignKey('self', related_name='children_set', null=True, db_index=True)
user = models.ForeignKey(User)
type = models.ForeignKey(NodeType)
name = models.CharField(max_length=200)
user = models.ForeignKey(User)
type = models.ForeignKey(NodeType)
name = models.CharField(max_length=200)
date = models.DateField(default=timezone.now(), blank=True)
metadata = hstore.DictionaryField(blank=True)
date = models.DateField(default=timezone.now(), blank=True)
file = models.FileField(upload_to=upload_to, blank=True)
metadata = hstore.DictionaryField(blank=True)
# the 'file' column should be deprecated soon;
# use resources instead.
file = models.FileField(upload_to=upload_to, blank=True)
resources = models.ManyToManyField(Resource)
#objects = hstore.HStoreManager()
def __str__(self):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment