Kept working on the parsers.

c3969192 · Mathieu Rodic · ea8c9e28 · c3969192 · c3969192
Commit c3969192 authored Oct 15, 2014 by Mathieu Rodic
Hide whitespace changes
Inline Side-by-side

Showing with 34 additions and 12 deletions

FileParser.py mat-parsing/FileParsers/FileParser.py +20 -6

PubmedFileParser.py mat-parsing/FileParsers/PubmedFileParser.py +14 -6

No files found.
--- a/mat-parsing/FileParsers/FileParser.py
+++ b/mat-parsing/FileParsers/FileParser.py
@@ -15,17 +15,31 @@ class FileParser:
    """Add a document to the database.
    """
-    def create_document(self, title, contents, metadata, resource_guid=None):
+    def create_document(self, title, contents, language, metadata, guid=None):
        # create or retrieve a resource for that document, based on its user id
-        if resource_guid is None:
+        if guid is None:
-            resource = Resource(guid=resource_guid)
+            resource = Resource(guid=guid)
        else:
            try:
-                resource = Resource.get(guid=resource_guid)
+                resource = Resource.get(guid=guid)
            except:
-                resource = Resource(guid=resource_guid)
+                resource = Resource(guid=guid)
        # create the document itself
-        document = 
+        document = Node(
+            # WRITE STUFF HERE!!!
+        )
+        # parse it!
+        # TODO: beware the language!!!!
+        if self._parsers[language] is None:  # no parser set for this language yet: fall back to NltkParser
+            self._parsers[language] = NltkParser
+        # WRITE STUFF HERE!!!
+        # return the created document
+        return document
    """Useful method to detect the document encoding.
    Not sure it should be here actually.

--- a/mat-parsing/FileParsers/PubmedFileParser.py
+++ b/mat-parsing/FileParsers/PubmedFileParser.py
@@ -8,7 +8,7 @@ class PubmedFileParser(FileParser):
        # open the file as XML
        xml_parser = etree.XMLParser(resolve_entities=False,recover=True)
        xml = etree.parse(self._file, parser=xml_parser)
-        # parse all the abstracts
+        # parse all the articles, one by one
        # all database operations should be performed within one transaction
        xml_articles = xml.findall('PubmedArticle')
        with transaction.atomic():
@@ -18,13 +18,21 @@ class PubmedFileParser(FileParser):
                date_month  = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
                date_day    = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
                metadata    = {
-                    "date":     datetime.date(year, month, day),
+                    # other metadata should also be included:
+                    # authors, submission date, etc.
+                    "date_pub": datetime.date(year, month, day),
                    "journal":  xml_article.find('MedlineCitation/Article/Journal/Title').text
                    "title":    xml_article.find('MedlineCitation/Article/ArticleTitle').text
-                    "doi":      xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]')
+                    "language": xml_article.find('MedlineCitation/Article/Language').text
-                    # other metadata should also be included: submission date, etc.
+                    "doi":      xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]').text
                }
                contents    = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
                # create the document in the database
-                childNode   = 
+                childNode   = self.create_document(
-                childNode
+                    title       = metadata["title"],
+                    contents    = contents,
+                    language    = metadata["language"],
+                    metadata    = metadata,
+                    guid        = metadata["doi"],
+                )
+                parentNode.add_child(childNode)