Commit ea8c9e28 authored by Mathieu Rodic

Worked on Pubmed parser.

Added a Resource database object.
parent f0604128
...@@ -4,10 +4,10 @@ import Collections ...@@ -4,10 +4,10 @@ import Collections
""" """
class FileParser: class FileParser:
def __init__(self, file=None, path="", encoding="utf8"): def __init__(self, file=None, filepath="", encoding="utf8"):
# ...get the file item... # ...get the file item...
if file is None: if file is None:
self._file = open(path, "rb") self._file = open(filepath, "rb")
else: else:
self._file = file self._file = file
# ...and parse! # ...and parse!
...@@ -15,7 +15,7 @@ class FileParser: ...@@ -15,7 +15,7 @@ class FileParser:
"""Add a document to the database. """Add a document to the database.
""" """
def add_document(self, parent, title, contents, metadata, resource_guid=None): def create_document(self, title, contents, metadata, resource_guid=None):
# create or retrieve a resource for that document, based on its user id # create or retrieve a resource for that document, based on its user id
if resource_guid is None: if resource_guid is None:
resource = Resource(guid=resource_guid) resource = Resource(guid=resource_guid)
......
from django.db import transaction
from FileParser import FileParser from FileParser import FileParser
class PubmedFileParser(FileParser): class PubmedFileParser(FileParser):
def parse(self): def parse(self, parentNode):
# open the file as XML # open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False,recover=True) xml_parser = etree.XMLParser(resolve_entities=False,recover=True)
xml = etree.parse(self._file, parser=xml_parser) xml = etree.parse(self._file, parser=xml_parser)
# find all the abstracts # parse all the abstracts
xml_docs = xml.findall('PubmedArticle/MedlineCitation') # all database operations should be performed within one transaction
for xml_doc in xml_docs: xml_articles = xml.findall('PubmedArticle')
metadata = {} with transaction.atomic():
date_year = int(xml_doc.find('DateCreated/Year').text) for xml_article in xml_articles:
date_month = int(xml_doc.find('DateCreated/Month').text) # extract data from the document
date_day = int(xml_doc.find('DateCreated/Day').text) date_year = int(xml_article.find('MedlineCitation/DateCreated/Year').text)
metadata["date"] = datetime.date(year, month, day) date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
metadata["journal"] = xml_doc.find('Article/Journal/Title').text date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
metadata["title"] = xml_doc.find('Article/ArticleTitle').text metadata = {
contents = "date": datetime.date(year, month, day),
\ No newline at end of file "journal": xml_article.find('MedlineCitation/Article/Journal/Title').text
"title": xml_article.find('MedlineCitation/Article/ArticleTitle').text
"doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]')
# other metadata should also be included: submission date, etc.
}
contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
# create the document in the database
childNode =
childNode
...@@ -10,21 +10,29 @@ from time import time ...@@ -10,21 +10,29 @@ from time import time
def upload_to(instance, filename): def upload_to(instance, filename):
return 'corpora/%s/%f/%s' % (instance.user.username, time(), filename) return 'corpora/%s/%f/%s' % (instance.user.username, time(), filename)
class Resource(models.Model):
guid = models.CharField(max_length=255)
file = models.FileField(upload_to=upload_to, blank=True)
class NodeType(models.Model): class NodeType(models.Model):
name = models.CharField(max_length=200) name = models.CharField(max_length=200)
def __str__(self): def __str__(self):
return self.name return self.name
class Node(MP_Node): class Node(MP_Node):
#parent = models.ForeignKey('self', related_name='children_set', null=True, db_index=True) #parent = models.ForeignKey('self', related_name='children_set', null=True, db_index=True)
user = models.ForeignKey(User) user = models.ForeignKey(User)
type = models.ForeignKey(NodeType) type = models.ForeignKey(NodeType)
name = models.CharField(max_length=200) name = models.CharField(max_length=200)
date = models.DateField(default=timezone.now(), blank=True)
metadata = hstore.DictionaryField(blank=True)
date = models.DateField(default=timezone.now(), blank=True) # the 'file' column should be deprecated soon;
file = models.FileField(upload_to=upload_to, blank=True) # use resources instead.
metadata = hstore.DictionaryField(blank=True) file = models.FileField(upload_to=upload_to, blank=True)
resources = models.ManyToManyField(Resource)
#objects = hstore.HStoreManager() #objects = hstore.HStoreManager()
def __str__(self): def __str__(self):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment