Added new classes for token tagging. They are now fully working!

For now, two taggers can be used, using a common interface: * TreeTagger * NltkTagger

Added new classes for token tagging. They are now fully working!
For now, two taggers can be used, using a common interface: * TreeTagger * NltkTagger
c868c1c3 · Mathieu Rodic · 0885f2ef · c868c1c3 · c868c1c3 · c868c1c3
Commit c868c1c3 authored Oct 14, 2014 by Mathieu Rodic
Showing with 163 additions and 14 deletions

NltkTagger.py mat-parsing/Taggers/NltkTagger.py +16 -0

Tagger.py mat-parsing/Taggers/Tagger.py +60 -0

TreeTagger.py mat-parsing/Taggers/TreeTagger.py +87 -2

__init__.py mat-parsing/Taggers/__init__.py +0 -12

No files found.
--- a/mat-parsing/Taggers/NltkTagger.py
+++ b/mat-parsing/Taggers/NltkTagger.py
+from Tagger import Tagger
+
+import nltk
+
+
+class NltkTagger(Tagger):
+    """Tagger implementation that delegates the tagging to ``nltk.pos_tag``."""
+
+    def send_tokens(self, tokens):
+        """Tag ``tokens`` and append the resulting pairs to ``self.buffer``."""
+        tagged = nltk.pos_tag(tokens)
+        self.buffer.extend(tagged)
+
+
+
+# tagger = NltkTagger()
+# tagger.start()
+# tagger.send_text("This is not a sentence. Or, is it? I wish it was; I could perform tagging tests on it.")
+# print(tagger.end())
\ No newline at end of file
--- a/mat-parsing/Taggers/Tagger.py
+++ b/mat-parsing/Taggers/Tagger.py
+import re
+
+
+"""This regular expression is really good at tokenizing a text!
+"""
+# Tokenizing pattern used by Tagger.send_text(): every match is one token.
+# Alternatives are tried in order, so an earlier branch wins whenever several
+# of them could match at the same position.
+_re_sentence = re.compile(r'''(?x)  # set flag to allow verbose regexps
+    (?:[A-Z])(?:\.[A-Z])+\.?        # abbreviations, e.g. U.S.A.
+    | \w+(?:-\w+)*                  # words with optional internal hyphens
+    | \$?\d+(?:\.\d+)?%?            # currency and percentages, e.g. $12.40, 82%
+    | \.\.\.                        # ellipsis
+    | [][.,;"'?!():_`-]             # these are separate tokens; the hyphen is
+                                    # last so it is a literal, not a range
+    ''', re.UNICODE | re.MULTILINE | re.DOTALL)
+
+
+"""This class is a model for performing tagging in a pipeline fashion.
+When started, it initiates the parser;
+when passed text, the text is piped to the parser.
+When ended, the parser is closed and the tagged word returned
+in a tuple format.
+"""
+class Tagger:
+    """Base class for taggers used in a pipeline fashion.
+
+    Typical use is ``start()``, any number of ``send_text()`` /
+    ``send_tokens()`` calls, then ``end()`` to get the tagged tokens;
+    ``tag()`` chains the three steps for a single text.
+    """
+
+    def __init__(self):
+        # Tagged tokens collected so far, as (token, tag) tuples.
+        self.buffer = []
+
+    def start(self):
+        """Initialize the tagger.
+
+        This method shall be overridden by inherited classes.
+        """
+        pass
+
+    def send_tokens(self, tokens):
+        """Send a list of tokens to be tagged.
+
+        This method shall be overridden by inherited classes.
+        """
+        pass
+
+    def send_text(self, text):
+        """Send a text to be tagged.
+
+        The text is split on newlines; each line is tokenized with
+        ``_re_sentence`` and its tokens are passed to ``send_tokens()``.
+        """
+        for line in text.split('\n'):
+            self.send_tokens(_re_sentence.findall(line))
+
+    def end(self):
+        """End the tagger and return the tagged tokens.
+
+        This method shall be overridden by inherited classes.
+        Example of output:
+        [('The', 'DET'), ('dog', 'NOM'), ('is', 'VER'), ('green', 'ADJ'), ('.', 'PUN')]
+        """
+        return self.buffer
+
+    def tag(self, text):
+        """Start the tagger, pipe ``text`` through it, end it, return the result.
+
+        Each call works on a fresh buffer, so the returned list only holds
+        the tokens of ``text`` and a list returned by an earlier call is
+        never modified afterwards.
+        """
+        # Reset before start(): subclasses may hand self.buffer to a worker.
+        self.buffer = []
+        self.start()
+        self.send_text(text)
+        return self.end()
+        
--- a/mat-parsing/Taggers/TreeTagger.py
+++ b/mat-parsing/Taggers/TreeTagger.py
-class TreeTagger:
-    pass
\ No newline at end of file
+from Tagger import Tagger
+
+import subprocess
+import threading
+import time
+
+
+# TODO: have a look at "queue" instead of "list" (cf. http://stackoverflow.com/questions/17564804/in-python-how-to-wait-until-only-the-first-thread-is-finished)
+
+class identity_dict(dict):
+    """Dictionary whose item lookup returns the key itself when it is absent."""
+
+    def __missing__(self, key):
+        # Invoked by dict item lookup for unknown keys; the key is returned
+        # as the value and nothing is stored in the dictionary.
+        return key
+    
+# Replacement table applied by _readOutput() to the first ':'-separated part
+# of each tag emitted by TreeTagger; tags without an entry are kept unchanged.
+# The targets look like Penn Treebank tags (presumably to line up with the
+# output of NltkTagger -- to be confirmed).
+_tag_replacements = identity_dict({
+    "NOM": "NN",    # noun
+    "NAM": "NN",    # proper name
+    "ADJ": "JJ",    # adjective
+    "VER": "VB",    # verb
+    # NOTE(review): "PRP" is the Penn tag for personal pronouns, not for
+    # prepositions ("IN") -- confirm this target is intended.
+    "PREP": "PRP",
+    "KON": "CC",    # conjunction
+    "DET": "DT",    # determiner
+    "PRO": "DT",
+    # Do we also have to take semicolons, commas and other points into account?
+})
+
+
+def _readOutput(output, buffer):
+    """Collect tagged tokens read from ``output`` into ``buffer``.
+
+    ``output`` is a binary stream read line by line. For every line holding
+    at least two whitespace-separated fields, the first field is the token
+    and the second one the tag; the tag is cut at its first ':' and passed
+    through ``_tag_replacements`` before ``(token, tag)`` is appended to
+    ``buffer``.
+
+    Reading stops at the ``<end/>`` marker line, or at end of stream so that
+    a closed pipe cannot keep the caller waiting forever.
+    """
+    while True:
+        line = output.readline()
+        if not line:
+            # End of stream: no further data will ever arrive.
+            break
+        if line.strip() == b"<end/>":
+            break
+        fields = line.decode('utf8').split()
+        if len(fields) < 2:
+            # Blank or single-field line: there is no (token, tag) pair here.
+            continue
+        token, tag = fields[:2]
+        tag = _tag_replacements[tag.split(':')[0]]
+        buffer.append((token, tag))
+
+
+"""Use TreeTagger for the tagging.
+Shall be used for french texts.
+"""
+class TreeTagger(Tagger):
+    """Tagger that pipes tokens through an external TreeTagger process.
+
+    The process is launched with the French parameter file.
+    """
+
+    def start(self, treeTaggerPath = "../../../nlp/pythonwrapperP3/treetagger"):
+        """Launch TreeTagger and the thread that collects its output.
+
+        ``treeTaggerPath`` is the directory holding ``bin/tree-tagger`` and
+        ``lib/french-utf8.par``.
+        """
+        binaryFile = "%s/bin/tree-tagger" % treeTaggerPath
+        tagcmdlist = [
+            binaryFile,
+            "%s/lib/french-utf8.par" % treeTaggerPath,
+            "-token",
+            "-lemma",
+            "-sgml",
+            "-quiet"
+        ]
+        self._popen = subprocess.Popen(
+            tagcmdlist,     # Use a list of params in place of a string.
+            bufsize=0,      # Not buffered to retrieve data asap from TreeTagger
+            executable=binaryFile, # As we have it, specify it
+            stdin=subprocess.PIPE,  # Get a pipe to write input data to TreeTagger process
+            stdout=subprocess.PIPE, # Get a pipe to read processing results from TreeTagger
+            stderr=subprocess.PIPE, # Captured, but never read by this class
+        )
+        self._input, self._output = self._popen.stdin, self._popen.stdout
+        # The reader thread appends to self.buffer while tokens are being sent.
+        self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, ))
+        self._thread.start()
+
+    def send_tokens(self, tokens):
+        """Write every token to the process input, one UTF-8 line per token."""
+        for token in tokens:
+            self._input.write(bytes(token + "\n", "utf8"))
+
+    def end(self):
+        """Flush the pipeline, stop TreeTagger and return the tagged tokens."""
+        try:
+            # send the end marker followed by some dummy tokens, then wait
+            # for the reader thread to have seen the marker
+            self.send_tokens("<end/> Les sanglots longs des violons de l ' automne bercent mon coeur d ' une langueur monotone .".split())
+            self._thread.join()
+        finally:
+            # Stop the process, reap it so that no zombie is left behind,
+            # and release the three pipes opened by start().
+            self._popen.kill()
+            self._popen.wait()
+            for pipe in (self._popen.stdin, self._popen.stdout, self._popen.stderr):
+                pipe.close()
+        # returns the tagged tokens
+        return self.buffer
+
+
+# tagger = TreeTagger()
+# tagger.start()
+# tagger.send_text("Ceci n'est pas une phrase, n'est-ce pas? Parfois, il faut tester des phrases ; mêmes celles avec des points-virgules.")
+# print(tagger.end())
\ No newline at end of file
--- a/mat-parsing/Taggers/__init__.py
+++ b/mat-parsing/Taggers/__init__.py
-class Tagger:
-
-    def start(self):
-        self.buffer = []
-        
-    def send(self, text):
-        pass
-        
-    def end(self):
-        return self.buffer
-
-        
 from NltkTagger import NltkTagger
 from TreeTagger import TreeTagger
\ No newline at end of file