Added the file parsers.

Allows parsing from various sources.

Added the file parsers.
Allows parsing from various sources.
0602bc3b · Mathieu Rodic · c868c1c3 · 0602bc3b · 0602bc3b · 0602bc3b
Commit 0602bc3b authored Oct 15, 2014 by Mathieu Rodic
6 changed files
--- a/mat-parsing/FileParsers/FileParser.py
+++ b/mat-parsing/FileParsers/FileParser.py
+import Collections
+
+"""Base class for performing files parsing depending on their type.
+"""
+class FileParser:
+    
+    # Keep a handle on the file to parse, then run parse() right away.
+    #   file:     an already opened file object; used as-is when given.
+    #   path:     opened in binary mode ("rb") only when `file` is None.
+    #   encoding: accepted but not used anywhere in the visible code.
+    def __init__(self, file=None, path="", encoding="utf8"):
+        # ...get the file item...
+        if file is None:
+            # NOTE(review): the handle opened here is never closed in the
+            # visible code.
+            self._file = open(path, "rb")
+        else:
+            self._file = file
+        # ...and parse!
+        self.parse()
+    
+    """Add a document to the database.
+    """
+    # NOTE(review): `parent`, `title`, `contents` and `metadata` are not used
+    # yet, and `Resource` is neither imported nor defined in the visible part
+    # of this file.
+    def add_document(self, parent, title, contents, metadata, resource_guid=None):
+        # create or retrieve a resource for that document, based on its guid
+        if resource_guid is None:
+            resource = Resource(guid=resource_guid)
+        else:
+            try:
+                resource = Resource.get(guid=resource_guid)
+            # NOTE(review): the bare except treats any failure of the lookup
+            # (including KeyboardInterrupt/SystemExit) as "not found" and
+            # falls back to building a new Resource with the same guid.
+            except:
+                resource = Resource(guid=resource_guid)
+        # create the document itself
+        # NOTE(review): this assignment is left unfinished in this commit.
+        document = 
+    
+    """Useful method to detect the document encoding.
+    Not sure it should be here actually.
+    """
+    # Placeholder: currently does nothing and returns None.
+    def detect_encoding(self, string):
+        # see the chardet library
+        pass
+    
+    """Parse the data.
+    This method shall be overriden by inherited classes.
+    """
+    # No-op in the base class; it is called from __init__, so a subclass
+    # override runs as soon as the parser is constructed.
+    def parse(self):
+        pass
\ No newline at end of file
--- a/mat-parsing/FileParsers/PubmedFileParser.py
+++ b/mat-parsing/FileParsers/PubmedFileParser.py
+from FileParser import FileParser
+
+
+"""Parser for XML files made of PubmedArticle/MedlineCitation elements.
+"""
+class PubmedFileParser(FileParser):
+    
+    """Parse the file held in self._file as XML.
+    For each MedlineCitation element, collects the creation date, the
+    journal title and the article title into a metadata dictionary.
+    """
+    # NOTE(review): `etree` and `datetime` are not imported in the visible
+    # part of this file; `resolve_entities` and `recover` look like lxml
+    # parser options -- confirm which etree implementation is intended.
+    def parse(self):
+        # open the file as XML
+        xml_parser = etree.XMLParser(resolve_entities=False,recover=True)
+        xml = etree.parse(self._file, parser=xml_parser)
+        # find all the abstracts
+        xml_docs = xml.findall('PubmedArticle/MedlineCitation')
+        for xml_doc in xml_docs:
+            metadata = {}
+            date_year = int(xml_doc.find('DateCreated/Year').text)
+            date_month = int(xml_doc.find('DateCreated/Month').text)
+            date_day = int(xml_doc.find('DateCreated/Day').text)
+            # build the date from the values parsed just above (the previous
+            # code referenced undefined names year/month/day)
+            metadata["date"] = datetime.date(date_year, date_month, date_day)
+            metadata["journal"] = xml_doc.find('Article/Journal/Title').text
+            metadata["title"] = xml_doc.find('Article/ArticleTitle').text
+            # NOTE(review): this assignment is left unfinished in this commit.
+            contents = 
\ No newline at end of file
--- a/mat-parsing/FileParsers/__init__.py
+++ b/mat-parsing/FileParsers/__init__.py
-import Collections
-
-
-class FileParser:
-    
-    def __init__(self, file=None, path=""):
-        # initialize output...
-        self.text = ""
-        self.metadata = {}
-        self.ngram_count = Collections.defaultdict(int)
-        # ...get contents...
-        if file is None:
-            file = open(path, "rb")
-        self.contents = file.readall()
-        # ...parse, then extract the words!
-        self.parse()
-        self.extract()
-    
-    def detect_encoding(self, string):
-        # see chardet
-        pass
-    
-    def parse(self, contents):
-        pass
-        
-    def extract(self):
-        re_sentence = re.compile(r'''(?x) # set flag to allow verbose regexps
-            (?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
-            | \w+(?:-\w+)* # words with optional internal hyphens
-            | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
-            | \.\.\. # ellipsis
-            | [][.,;"'?():-_`] # these are separate tokens
-            ''', re.UNICODE | re.MULTILINE | re.DOTALL)
-        for line in self.text.split('\n'):
-            for token in re_sentence.findall(line):
-                pass
-   
-   
-from EuropressFileParser import EuropressFileParser
\ No newline at end of file
+from EuropressFileParser import EuropressFileParser
+from PubmedFileParser import PubmedFileParser
\ No newline at end of file
--- a/mat-parsing/Taggers/NltkTagger.py
+++ b/mat-parsing/Taggers/NltkTagger.py
@@ -12,5 +12,6 @@ class NltkTagger(Tagger):

 # tagger = NltkTagger()
 # tagger.start()
-# tagger.send_text("This is not a sentence. Or, is it? I wish it was; I could perform tagging tests on it.")
+# # tagger.send_text("This is not a sentence. Or, is it? I wish it was; I could perform tagging tests on it.")
+# tagger.send_text("This is not a sentence.")
 # print(tagger.end())
\ No newline at end of file
--- a/mat-parsing/Taggers/Tagger.py
+++ b/mat-parsing/Taggers/Tagger.py
@@ -12,7 +12,7 @@ _re_sentence = re.compile(r'''(?x)  # set flag to allow verbose regexps
    ''', re.UNICODE | re.MULTILINE | re.DOTALL)


-"""This class is a model for performing tagging in a pipeline fashion.
+"""This base class is a model for performing tagging in a pipeline fashion.
 When started, it initiates the parser;
 when passed text, the text is piped to the parser.
 When ended, the parser is closed and the tagged word returned
@@ -24,7 +24,7 @@ class Tagger:
        self.buffer = []
    
    """Initialize the tagger.
-    This method shall be overriden by inherited classes.
+    This method can be overridden by inherited classes.
    """
    def start(self):
        pass
@@ -44,8 +44,8 @@ class Tagger:
            )
    
    """Ends the tagger and returns the tagged tokens.
-    This method shall be overriden by inherited classes.
-    Example of output: [('The', 'DET'), ('dog', 'NOM'), ('is', 'VER'), ('green', 'ADJ'), ('.', 'PUN')]
+    This method can be overridden by inherited classes.
+    Example of output: [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]
    """
    def end(self):
        return self.buffer

--- a/mat-parsing/__init__.py
+++ b/mat-parsing/__init__.py
+from Taggers import *
+from FileParsers import *
+
+
 import zipfile
 import Collections
 # import chardet