Commit 47d8168a authored by Mathieu Rodic

Adding a new structure to parse documents - still in progress.

parent 4b397ad5
class EuropressFileParser(FileParser):
    """File parser specialization (work in progress).

    Subclasses ``FileParser`` and overrides ``parse``; the actual parsing
    logic has not been written yet.
    """

    def parse(self, contents=None):
        """Parse the raw document contents; currently a no-op placeholder.

        Args:
            contents: Raw data to parse. Optional, so the method can be
                called with or without the data.
        """
        pass
\ No newline at end of file
import collections
import re
class FileParser:
    """Base class that loads a document, parses it and tokenizes its text.

    On construction the raw contents are read from ``file`` (or from ``path``
    when no file object is given), then ``parse`` and ``extract`` are called
    in that order. Subclasses are expected to override ``parse``.

    Attributes set by ``__init__``:
        text: starts as an empty string.
        metadata: starts as an empty dict.
        ngram_count: ``defaultdict(int)``, starts empty.
        contents: whatever ``read()`` returned for the input.
    """

    # Compiled once for the class instead of on every extract() call.
    # The hyphen in the punctuation class is escaped so that it is a literal
    # character rather than an accidental ':'-to-'_' range (which would also
    # match '<', '=', '>', '@', '^' and the backslash).
    _TOKEN_RE = re.compile(r'''
        (?:[A-Z])(?:\.[A-Z])+\.?    # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*              # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                    # ellipsis
        | [\]\[.,;"'?():_`\-]       # these are separate tokens
        ''', re.VERBOSE | re.UNICODE | re.MULTILINE | re.DOTALL)

    def __init__(self, file=None, path=""):
        """Read the input, then parse it and extract its tokens.

        Args:
            file: Object with a ``read()`` method. It is read but not closed;
                closing stays the caller's responsibility.
            path: Path opened in binary mode when ``file`` is None.

        Raises:
            OSError: If ``file`` is None and ``path`` cannot be opened.
        """
        # initialize output...
        self.text = ""
        self.metadata = {}
        self.ngram_count = collections.defaultdict(int)
        # ...get contents...
        if file is None:
            # We opened this handle ourselves, so we also close it.
            with open(path, "rb") as handle:
                self.contents = handle.read()
        else:
            self.contents = file.read()
        # ...parse, then extract the words!
        self.parse(self.contents)
        self.extract()

    def detect_encoding(self, string):
        """Placeholder for encoding detection; currently does nothing."""
        # see chardet
        pass

    def parse(self, contents):
        """Hook for subclasses; the base implementation does nothing.

        Args:
            contents: The raw data read by ``__init__``.
        """
        pass

    def extract(self):
        """Tokenize ``self.text`` line by line.

        Tokens are found with ``_TOKEN_RE`` but are not stored anywhere yet.
        """
        for line in self.text.split('\n'):
            for token in self._TOKEN_RE.findall(line):
                # TODO: tokens are not recorded yet (e.g. in ngram_count).
                pass
from EuropressFileParser import EuropressFileParser
\ No newline at end of file
class TreeTagger:
    """Empty placeholder class; it defines no attributes or methods yet."""
\ No newline at end of file
class Tagger:
    """Skeleton of a tagger built around a simple list buffer.

    ``start`` resets the buffer, ``send`` is a no-op placeholder and ``end``
    returns the buffer.
    """

    def __init__(self):
        # Create the buffer up front so end() works even when start() was
        # never called (otherwise the attribute would not exist yet).
        self.buffer = []

    def start(self):
        """Reset the buffer to a new empty list."""
        self.buffer = []

    def send(self, text):
        """Accept a piece of text; currently does nothing with it."""
        pass

    def end(self):
        """Return the buffer (the list object held by the instance)."""
        return self.buffer
from NltkTagger import NltkTagger
from TreeTagger import TreeTagger
\ No newline at end of file
import zipfile
import Collections
# import chardet
# This allows the fast retrieval of ngram ids
# from the cache instead of using the database
class Ngram_Cache:
    """In-memory map from normalized n-gram terms to the n-gram's ``pk``.

    Terms are stripped and lower-cased before lookup, so ``" Foo "`` and
    ``"foo"`` share one cache entry.
    """

    def __init__(self):
        # normalized terms -> ngram.pk
        self._cache = {}

    def get(self, terms):
        """Return the ``pk`` of the n-gram for ``terms``.

        On a cache miss the n-gram is fetched with ``NGram.get``; if that
        raises, a new ``NGram`` is created and saved. The resulting ``pk``
        is cached for later calls.

        Args:
            terms: The n-gram text; leading/trailing whitespace and letter
                case are ignored.

        Returns:
            The ``pk`` attribute of the matching (or newly created) n-gram.
        """
        terms = terms.strip().lower()
        if terms not in self._cache:
            try:
                ngram = NGram.get(terms=terms)
            except Exception:
                # The lookup's "not found" exception type is not visible
                # here, so catch Exception -- but no longer swallow
                # KeyboardInterrupt/SystemExit as a bare ``except`` did.
                # NOTE(review): n is taken to be the number of words in the
                # n-gram; the previous code passed the character count
                # (len(terms)) -- confirm against the NGram model.
                ngram = NGram(terms=terms, n=len(terms.split()))
                ngram.save()
            self._cache[terms] = ngram.pk
        return self._cache[terms]
class Parser:
    """Creates child nodes from the zip archives attached to a node."""

    def __init__(self):
        pass

    def parse_file(self, file):
        """Placeholder for parsing a single file; currently does nothing."""
        # TODO: check the GUID!
        pass

    def parse_node(self, node):
        """Add one "Document" child to ``node`` per member of each zip resource.

        Resources without a file, or whose file is not a zip archive, are
        skipped.

        Args:
            node: Object exposing ``resources`` (iterable of objects with a
                ``file`` attribute), ``user`` and ``add_child``.
        """
        for resource in node.resources:
            # Inspect each resource's own file (not an attribute of the
            # whole ``node.resources`` collection).
            resource_file = resource.file
            if not resource_file or not zipfile.is_zipfile(resource_file):
                continue
            with zipfile.ZipFile(resource_file, "r") as archive:
                for filename in archive.namelist():
                    # The member is opened but its content is not consumed
                    # yet; the context manager guarantees the handle is
                    # closed again.
                    with archive.open(filename, "r"):
                        node.add_child(
                            type=NodeType.get(name="Document"),
                            user=node.user,
                        )

    def parse_node_recursively(self, node):
        """Run ``parse_node`` on ``node`` and on each of its descendants."""
        self.parse_node(node)
        for descendant in node.get_descendants():
            self.parse_node(descendant)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment