Adding a new structure to parse documents - still in progress.

47d8168a · Mathieu Rodic · 4b397ad5 · 47d8168a · 47d8168a · 47d8168a
Commit 47d8168a authored Oct 14, 2014 by Mathieu Rodic
5 changed files
--- a/mat-parsing/FileParsers/EuropressFileParser.py
+++ b/mat-parsing/FileParsers/EuropressFileParser.py
+class EuropressFileParser(FileParser):
+    """FileParser subclass, presumably for Europress exports (work in progress)."""
+    # NOTE(review): FileParser is not imported in this file; confirm how it is
+    # meant to be brought into scope.
+    def parse(self, contents):
+        """Parse ``contents``; nothing is implemented yet, so this is a no-op.
+
+        The signature mirrors ``FileParser.parse(self, contents)``.
+        """
+        pass
\ No newline at end of file
--- a/mat-parsing/FileParsers/__init__.py
+++ b/mat-parsing/FileParsers/__init__.py
+import collections
+import re
+class FileParser:
+    """Base class that loads a document and tokenizes its text.
+
+    On construction the raw contents are read, then ``parse`` and ``extract``
+    are called in that order. ``parse`` and ``detect_encoding`` are no-ops here.
+    """
+    # Tokenizer pattern, compiled once for all instances. Alternatives are
+    # tried in the order listed. In the last character class the hyphen is
+    # placed last so that it is a literal "-" and not a range operator.
+    _TOKEN_PATTERN = re.compile(r'''(?x) # set flag to allow verbose regexps
+            (?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
+            | \w+(?:-\w+)* # words with optional internal hyphens
+            | \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
+            | \.\.\. # ellipsis
+            | [][.,;"'?():_`-] # these are separate tokens
+            ''', re.UNICODE | re.MULTILINE | re.DOTALL)
+    def __init__(self, file=None, path=""):
+        """Read the document, then run ``parse`` and ``extract``.
+
+        file: readable file-like object; when None, ``path`` is opened in
+            binary mode instead.
+        path: filesystem path, only used when ``file`` is None.
+        """
+        # initialize output...
+        self.text = ""
+        self.metadata = {}
+        self.ngram_count = collections.defaultdict(int)
+        # ...get contents...
+        if file is None:
+            # The handle is opened here, so it is also closed here.
+            with open(path, "rb") as handle:
+                self.contents = handle.read()
+        else:
+            # A caller-supplied file is left open; closing it is up to the caller.
+            self.contents = file.read()
+        # ...parse, then extract the words!
+        self.parse(self.contents)
+        self.extract()
+    def detect_encoding(self, string):
+        """Not implemented yet; always returns None."""
+        # see chardet
+        pass
+    def parse(self, contents):
+        """Hook called by ``__init__`` with the raw contents; a no-op here."""
+        pass
+    def extract(self):
+        """Tokenize ``self.text`` line by line.
+
+        The tokens are not recorded anywhere yet (work in progress).
+        """
+        for line in self.text.split('\n'):
+            for token in self._TOKEN_PATTERN.findall(line):
+                # TODO: tokens are found but not used yet.
+                pass
+from EuropressFileParser import EuropressFileParser
\ No newline at end of file
--- a/mat-parsing/Taggers/TreeTagger.py
+++ b/mat-parsing/Taggers/TreeTagger.py
+class TreeTagger:
+    """Empty placeholder class; no attributes or methods are defined yet."""
\ No newline at end of file
--- a/mat-parsing/Taggers/__init__.py
+++ b/mat-parsing/Taggers/__init__.py
+class Tagger:
+    """Minimal tagger skeleton built around a ``buffer`` list."""
+    def start(self):
+        """Reset ``self.buffer`` to a new empty list."""
+        self.buffer = list()
+    def send(self, text):
+        """Accept ``text``; nothing is done with it yet."""
+        return None
+    def end(self):
+        """Return the current ``self.buffer`` (set by ``start``)."""
+        collected = self.buffer
+        return collected
+from NltkTagger import NltkTagger
+from TreeTagger import TreeTagger
\ No newline at end of file
--- a/mat-parsing/__init__.py
+++ b/mat-parsing/__init__.py
+import zipfile
+import Collections
+# import chardet
+# This allows the fast retrieval of ngram ids
+# from the cache instead of using the database
+class Ngram_Cache:
+    """In-memory map from normalized terms to the ``pk`` of their NGram."""
+    def __init__(self):
+        self._cache = {}
+    def get(self, terms):
+        """Return the ``pk`` of the NGram for ``terms``, creating it if needed.
+
+        ``terms`` is stripped and lower-cased before being used as the cache
+        key. On a cache miss the NGram is fetched with ``NGram.get``; if that
+        raises, a new NGram is saved and its ``pk`` is cached instead.
+        """
+        terms = terms.strip().lower()
+        if terms not in self._cache:
+            try:
+                ngram = NGram.get(terms=terms)
+            except Exception:
+                # Catching Exception instead of using a bare except lets
+                # KeyboardInterrupt and SystemExit propagate.
+                # NOTE(review): narrow this to the model's "not found"
+                # exception once its type is confirmed.
+                # n is the number of whitespace-separated words, not the
+                # number of characters - NOTE(review): confirm this matches
+                # what the NGram.n field is meant to hold.
+                ngram = NGram(terms=terms, n=len(terms.split()))
+                ngram.save()
+            self._cache[terms] = ngram.pk
+        return self._cache[terms]
+class Parser:
+    """Creates child nodes from the zipped resources of a node (work in progress)."""
+    def __init__(self):
+        pass
+    def parse_file(self, file):
+        """Not implemented yet; always returns None."""
+        # TODO: check the GUID (the original note read "CHECKER GUID")
+        pass
+    def parse_node(self, node):
+        """Add one "Document" child to ``node`` per member of each zip resource.
+
+        Resources whose ``file`` is falsy, or is not a zip archive, are skipped.
+        """
+        for resource in node.resources:
+            # Test the resource being iterated, not the resource collection.
+            if not resource.file or not zipfile.is_zipfile(resource.file):
+                continue
+            with zipfile.ZipFile(resource.file, "r") as zipFile:
+                for filename in zipFile.namelist():
+                    # The member is opened in a with-block so that it is always
+                    # closed. TODO: its contents are not used yet.
+                    with zipFile.open(filename, "r") as member:
+                        node.add_child(
+                            type = NodeType.get(name="Document"),
+                            user = node.user,
+                        )
+    def parse_node_recursively(self, node):
+        """Run ``parse_node`` on ``node``, then on each of ``node.get_descendants()``."""
+        self.parse_node(node)
+        for descendant in node.get_descendants():
+            self.parse_node(descendant)