Commit 0602bc3b authored by Mathieu Rodic

Added the file parsers.

Allows parsing from various sources.
parent c868c1c3
import Collections
class FileParser:
    """Base class for performing files parsing depending on their type.

    The constructor obtains a file object -- either the one passed in, or one
    opened in binary mode from ``path`` -- and immediately calls ``parse()``.
    Subclasses provide the actual parsing by overriding ``parse()``.
    """

    def __init__(self, file=None, path="", encoding="utf8"):
        """Store the file object to parse, then parse it.

        Args:
            file: an already opened file-like object; when ``None``, the
                file located at ``path`` is opened in binary mode instead.
            path: path of the file to open when ``file`` is ``None``.
            encoding: encoding name, kept on the instance as
                ``self._encoding`` so that ``parse()`` implementations
                can use it.
        """
        # Stored before parse() runs, so overriding classes can rely on it.
        self._encoding = encoding
        # ...get the file item...
        if file is None:
            self._file = open(path, "rb")
        else:
            self._file = file
        # ...and parse!
        self.parse()

    def add_document(self, parent, title, contents, metadata, resource_guid=None):
        """Add a document to the database.

        Only the resource lookup is implemented. Building the document
        itself was left unfinished, so the method raises once the resource
        has been resolved.

        Raises:
            NotImplementedError: always, after the resource lookup.
        """
        # create or retrieve a resource for that document, based on its guid
        if resource_guid is None:
            resource = Resource(guid=resource_guid)
        else:
            try:
                resource = Resource.get(guid=resource_guid)
            except Exception:
                # Lookup failed: fall back to a fresh resource. Catching
                # Exception (not everything) lets KeyboardInterrupt and
                # SystemExit propagate.
                resource = Resource(guid=resource_guid)
        # TODO: create the document itself from `parent`, `title`,
        # `contents`, `metadata` and `resource`.
        raise NotImplementedError("document creation is not implemented yet")

    def detect_encoding(self, string):
        """Useful method to detect the document encoding.

        Not sure it should be here actually. Currently a stub that returns
        ``None``; see the chardet library for a possible implementation.
        """
        pass

    def parse(self):
        """Parse the data.

        This method shall be overridden by subclasses; the base
        implementation does nothing.
        """
        pass
\ No newline at end of file
from FileParser import FileParser
class PubmedFileParser(FileParser):
    """File parser for XML files made of ``PubmedArticle`` entries.

    NOTE(review): ``etree`` is not imported in the visible part of this
    module. The ``resolve_entities`` / ``recover`` keyword arguments suggest
    lxml's ``etree`` -- confirm and import it at file level.
    """

    def parse(self):
        """Read the metadata of every ``PubmedArticle/MedlineCitation`` element.

        For each citation, a ``metadata`` dict is filled with the keys
        ``"date"`` (a ``datetime.date`` built from ``DateCreated``),
        ``"journal"`` and ``"title"``. Extracting the contents and storing
        the document are not implemented yet, so the metadata is currently
        discarded at the end of each iteration.
        """
        # Local import: the module does not import datetime at file level.
        import datetime

        # open the file as XML
        xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
        xml = etree.parse(self._file, parser=xml_parser)
        # find all the abstracts
        for xml_doc in xml.findall('PubmedArticle/MedlineCitation'):
            metadata = {}
            date_year = int(xml_doc.find('DateCreated/Year').text)
            date_month = int(xml_doc.find('DateCreated/Month').text)
            date_day = int(xml_doc.find('DateCreated/Day').text)
            # Use the values parsed just above (the bare names year / month /
            # day were never defined).
            metadata["date"] = datetime.date(date_year, date_month, date_day)
            metadata["journal"] = xml_doc.find('Article/Journal/Title').text
            metadata["title"] = xml_doc.find('Article/ArticleTitle').text
            # TODO: extract the document contents from `xml_doc` and store
            # the document together with `metadata`.
\ No newline at end of file
import collections
import re
class FileParser:
    """Read the contents of a file, parse them, then tokenize the text.

    Attributes set by the constructor:
        text: the text to tokenize; starts empty and is meant to be filled
            by ``parse()``.
        metadata: dict meant to be filled by ``parse()``.
        ngram_count: ``defaultdict(int)`` of counters, initially empty.
        contents: whatever ``read()`` returned for the file.
    """

    # Tokenizer pattern, compiled once for the whole class instead of on
    # every extract() call.
    _RE_TOKEN = re.compile(r'''(?x) # set flag to allow verbose regexps
          (?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)* # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
        | \.\.\. # ellipsis
        | [][.,;"'?():_`-] # these are separate tokens
        ''', re.UNICODE | re.MULTILINE | re.DOTALL)
    # In the last alternative the hyphen is placed last so that it is a
    # literal: written as `:-_` it formed a range from ':' to '_', which
    # excluded '-' itself and matched characters such as '<', '=', '@', '^'.

    def __init__(self, file=None, path=""):
        """Load the contents, then run ``parse()`` and ``extract()``.

        Args:
            file: an already opened file-like object; it is read but left
                open, since the caller owns it.
            path: path of the file to read (in binary mode) when ``file``
                is ``None``.
        """
        # initialize output...
        self.text = ""
        self.metadata = {}
        self.ngram_count = collections.defaultdict(int)
        # ...get contents...
        if file is None:
            # We opened it, so we close it as soon as it has been read.
            with open(path, "rb") as opened:
                self.contents = opened.read()
        else:
            # read() is available on every file-like object, whereas
            # readall() only exists on raw (unbuffered) streams.
            self.contents = file.read()
        # ...parse, then extract the words!
        self.parse(self.contents)
        self.extract()

    def detect_encoding(self, string):
        """Detect the encoding of ``string``.

        Currently a stub that returns ``None``; see chardet.
        """
        pass

    def parse(self, contents):
        """Parse ``contents``, as read by the constructor.

        Does nothing in this class.
        """
        pass

    def extract(self):
        """Tokenize ``self.text`` line by line.

        The tokens are currently discarded: nothing is recorded in
        ``ngram_count`` yet.
        """
        for line in self.text.split('\n'):
            for token in self._RE_TOKEN.findall(line):
                pass
from EuropressFileParser import EuropressFileParser
\ No newline at end of file
from EuropressFileParser import EuropressFileParser
from PubmedFileParser import PubmedFileParser
\ No newline at end of file
......@@ -12,5 +12,6 @@ class NltkTagger(Tagger):
# tagger = NltkTagger()
# tagger.start()
# tagger.send_text("This is not a sentence. Or, is it? I wish it was; I could perform tagging tests on it.")
# # tagger.send_text("This is not a sentence. Or, is it? I wish it was; I could perform tagging tests on it.")
# tagger.send_text("This is not a sentence.")
# print(tagger.end())
\ No newline at end of file
......@@ -12,7 +12,7 @@ _re_sentence = re.compile(r'''(?x) # set flag to allow verbose regexps
''', re.UNICODE | re.MULTILINE | re.DOTALL)
"""This class is a model for performing tagging in a pipeline fashion.
"""This base class is a model for performing tagging in a pipeline fashion.
When started, it initiates the parser;
when passed text, the text is piped to the parser.
When ended, the parser is closed and the tagged word returned
......@@ -24,7 +24,7 @@ class Tagger:
self.buffer = []
"""Initialize the tagger.
This method shall be overridden by subclasses.
This method can be overridden by subclasses.
"""
def start(self):
pass
......@@ -44,8 +44,8 @@ class Tagger:
)
"""Ends the tagger and returns the tagged tokens.
This method shall be overridden by subclasses.
Example of output: [('The', 'DET'), ('dog', 'NOM'), ('is', 'VER'), ('green', 'ADJ'), ('.', 'PUN')]
This method can be overridden by subclasses.
Example of output: [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]
"""
def end(self):
return self.buffer
......
from Taggers import *
from FileParsers import *
import zipfile
import Collections
# import chardet
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment