Changed the taggers to facilitate their use.

See test.py for examples.

Changed the taggers to facilitate their use.
See test.py for examples.
4394900c · Mathieu Rodic · 727d5711 · 4394900c · 727d5711 · 4394900c
Commit 4394900c authored Oct 18, 2014 by Mathieu Rodic
5 changed files
--- a/mat-parsing/Taggers/NltkTagger.py
+++ b/mat-parsing/Taggers/NltkTagger.py
@@ -5,13 +5,6 @@ import nltk

 class NltkTagger(Tagger):
    
-    def send_tokens(self, tokens):
-        self.buffer += nltk.pos_tag(tokens)
+    def tag_tokens(self, tokens, single=True):
+        return nltk.pos_tag(tokens)

-
-
-# tagger = NltkTagger()
-# tagger.start()
-# # tagger.send_text("This is not a sentence. Or, is it? I wish it was; I could perform tagging tests on it.")
-# tagger.send_text("This is not a sentence.")
-# print(tagger.end())
\ No newline at end of file
--- a/mat-parsing/Taggers/Tagger.dev.py
+++ b/mat-parsing/Taggers/Tagger.dev.py
-import re
-
-
-"""This regular expression is really good at tokenizing a text!
-"""
-_re_sentence = re.compile(r'''(?x)  # set flag to allow verbose regexps
-    (?:[A-Z])(?:\.[A-Z])+\.?        # abbreviations, e.g. U.S.A.
-    | \w+(?:-\w+)*                  # words with optional internal hyphens
-    | \$?\d+(?:\.\d+)?%?            # currency and percentages, e.g. $12.40, 82%
-    | \.\.\.                        # ellipsis
-    | [][.,;"'?!():-_`]             # these are separate tokens
-    ''', re.UNICODE | re.MULTILINE | re.DOTALL)
-
-
-"""This base class is a model for performing tagging in a pipeline fashion.
-When started, it initiates the parser;
-when passed text, the text is piped to the parser.
-When ended, the parser is closed and the tagged word returned
-in a tuple format.
-"""
-class Tagger:
-
-    def __init__(self):
-        self.buffer = []
-        self.start()
-        
-    def __del__(self):
-        self.stop()
-    
-    """Initializes the tagger. This method is called by the constructor.
-    This method can be overriden by inherited classes.
-    """
-    def start(self):
-        pass
-    
-    """Ends the tagger.
-    This method is called by the destructor.
-    This method can be overriden by inherited classes.
-    """
-    def end(self):
-        pass
-
-    """This method is userful in the case of pipelines requiring
-    boundaries around blocks of text.
-    """
-    def tagging_start(self):
-        pass
-        
-    def tagging_end(self):
-        pass
-        
-    """Returns the tagged tokens.
-    This method shall be overriden by inherited classes.
-    Example of input: ['This', 'is', 'not', 'a', 'sentence', '.']
-    Example of output: [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]
-    """
-    def tag_tokens(self, tokens, block=False):
-        if not block:
-            self.tagging_start()
-        # do something with the tokens here
-        if not block:
-            self.tagging_end()
-            return []
-        
-    """Send a text to be tagged.
-    """
-    def tag_text(self, text):
-        self.tagging_start()
-        for line in text.split('\n'):
-            tokens = _re_sentence.findall(line)
-            self.tag_tokens(tokens, True)
-        self.tagging_end()
-        return []
-
--- a/mat-parsing/Taggers/Tagger.py
+++ b/mat-parsing/Taggers/Tagger.py
 import re


-"""This regular expression is really good at tokenizing a text!
-"""
-_re_sentence = re.compile(r'''(?x)  # set flag to allow verbose regexps
-    (?:[A-Z])(?:\.[A-Z])+\.?        # abbreviations, e.g. U.S.A.
-    | \w+(?:-\w+)*                  # words with optional internal hyphens
-    | \$?\d+(?:\.\d+)?%?            # currency and percentages, e.g. $12.40, 82%
-    | \.\.\.                        # ellipsis
-    | [][.,;"'?!():-_`]             # these are separate tokens
-    ''', re.UNICODE | re.MULTILINE | re.DOTALL)
-
-
 """This base class is a model for performing tagging in a pipeline fashion.
 When started, it initiates the parser;
 when passed text, the text is piped to the parser.
@@ -21,40 +10,63 @@ in a tuple format.
 class Tagger:

    def __init__(self):
+        # This regular expression is really good at tokenizing a text!
+        self._re_sentence = re.compile(r'''(?x)  # set flag to allow verbose regexps
+            (?:[A-Z])(?:\.[A-Z])+\.?        # abbreviations, e.g. U.S.A.
+            | \w+(?:-\w+)*                  # words with optional internal hyphens
+            | \$?\d+(?:\.\d+)?%?            # currency and percentages, e.g. $12.40, 82%
+            | \.\.\.                        # ellipsis
+            | [][.,;"'?!():-_`]             # these are separate tokens
+            ''', re.UNICODE | re.MULTILINE | re.DOTALL)
        self.buffer = []
+        self.start()
+        
+    def __del__(self):
+        self.stop()
    
-    """Initialize the tagger.
+    """Initializes the tagger. This method is called by the constructor.
    This method can be overriden by inherited classes.
    """
    def start(self):
        pass
+    
+    """Ends the tagger.
+    This method is called by the destructor.
+    This method can be overridden by inherited classes.
+    """
+    def stop(self):
+        pass

-    """Send a list of tokens to be tagged.
-    This method shall be overriden by inherited classes.
+    """This method is useful in the case of pipelines requiring
+    boundaries around blocks of text.
    """
-    def send_tokens(self, tokens):
+    def tagging_start(self):
        pass
        
-    """Send a text to be tagged.
-    """
-    def send_text(self, text):
-        for line in text.split('\n'):
-            self.send_tokens(
-                _re_sentence.findall(line)
-            )
-    
-    """Ends the tagger and returns the tagged tokens.
-    This method can be overriden by inherited classes.
+    def tagging_end(self):
+        pass
+        
+    """Returns the tagged tokens.
+    This method shall be overridden by inherited classes.
+    Example of input: ['This', 'is', 'not', 'a', 'sentence', '.']
    Example of output: [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]
    """
-    def end(self):
-        return self.buffer
-    
-    """Starts the tagger, pipes the text,
-    ends the tagger, returns the result.
-    """
-    def tag(self, text):
-        self.start()
-        self.send_text(text)
-        return self.end()
+    def tag_tokens(self, tokens, single=True):
+        if single:
+            self.tagging_start()
+        # do something with the tokens here
+        if single:
+            self.tagging_end()
+        return []
        
+    """Send a text to be tagged.
+    """
+    def tag_text(self, text):
+        tokens_tags = []
+        self.tagging_start()
+        for line in text.split('\n'):
+            tokens = self._re_sentence.findall(line)
+            tokens_tags += self.tag_tokens(tokens, False)
+        self.tagging_end()
+        return tokens_tags
+
--- a/mat-parsing/Taggers/TreeTagger.py
+++ b/mat-parsing/Taggers/TreeTagger.py
@@ -23,16 +23,20 @@ _tag_replacements = identity_dict({
    # Do we also have to take semicolons, comas and other points into account?
 })

-
 def _readOutput(output, buffer):
+    hasStarted = False
    while True:
        line = output.readline()
        if line:
-            if line == b"<end/>\n":
+            if line == b"<block>\n":
+                hasStarted = True
+                continue
+            if line == b"<block/>\n":
                break
-            token, tag = line.decode('utf8').split()[:2]
-            tag = _tag_replacements[tag.split(':')[0]]
-            buffer.append((token, tag))
+            if hasStarted:
+                token, tag = line.decode('utf8').split()[:2]
+                tag = _tag_replacements[tag.split(':')[0]]
+                buffer.append((token, tag))
        else:
            time.sleep(0.1)

@@ -62,26 +66,41 @@ class TreeTagger(Tagger):
        )
        self._input, self._output = self._popen.stdin, self._popen.stdout
        # self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start()
+        # self.buffer = OutputBuffer()
+        
+    def stop(self):
+        # terminates the 'treetagger' process
+        self._popen.kill()
+        self._popen.terminate()
+        
+        
+    def tagging_start(self):
+        self.buffer = []
        self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, ))
        self._thread.start()
+        self._input.write(b"<block>\n")
+        
+    def tagging_end(self):
+        self._input.write(b"<block/>\n")
+        # sends some dummy tokens, then waits for the text to be processed
+        self.tag_tokens("Les sanglots longs des violons de l ' automne bercent mon coeur d ' une langueur monotone .".split(), False)
+        self._thread.join()
+        
        
-    def send_tokens(self, tokens):
+    def tag_tokens(self, tokens, single=True):
+        if single:
+            self.tagging_start()
        for token in tokens:
            self._input.write(bytes(token + "\n", "utf8"))
-    
-    def end(self):
-        # send some dummy tokens, then wait for the text to be treated
-        self.send_tokens("<end/> Les sanglots longs des violons de l ' automne bercent mon coeur d ' une langueur monotone .".split())
-        # wait for the thread to end
-        self._thread.join()
-        # terminates the 'treetagger' process
-        self._popen.kill()
-        self._popen.terminate()
-        # returns the tagged tokens
+        if single:
+            self.tagging_end()
+            return self.buffer
+            
+    def tag_text(self, text):
+        self.tagging_start()
+        for line in text.split('\n'):
+            tokens = self._re_sentence.findall(line)
+            self.tag_tokens(tokens, False)
+        self.tagging_end()
        return self.buffer

-
-# tagger = TreeTagger()
-# tagger.start()
-# tagger.send_text("Ceci n'est pas une phrase, n'est-ce pas? Parfois, il faut tester des phrases ; mêmes celles avec des points-virgules.")
-# print(tagger.end())
\ No newline at end of file
--- a/mat-parsing/Taggers/test.py
+++ b/mat-parsing/Taggers/test.py
+
+from NltkTagger import NltkTagger
+tagger = NltkTagger()
+text0 = "Forman Brown (1901–1996) was one of the world's leaders in puppet theatre in his day, as well as an important early gay novelist. He was a member of the Yale Puppeteers and the driving force behind Turnabout Theatre. He was born in Otsego, Michigan, in 1901 and died in 1996, two days after his 95th birthday. Brown briefly taught at North Carolina State College, followed by an extensive tour of Europe."
+text1 = "James Patrick (born c. 1940) is the pseudonym of a Scottish sociologist, which he used to publish a book A Glasgow Gang Observed. It attracted some attention in Scotland when it was published in 1973. It was based on research he had done in 1966, when he was aged 26. At that time he was working as a teacher in an Approved School, a Scottish reformatory. One gang member in the school, \"Tim Malloy\" (born 1950, also a pseudonym and a generic term for a Glasgow Catholic), agreed to infiltrate him into his gang in Maryhill in Glasgow. Patrick spent four months as a gang member, observing their behaviour."
+
+# from TreeTagger import TreeTagger
+# tagger = TreeTagger()
+# text0 = "La saison 1921-1922 du Foot-Ball Club Juventus est la vingtième de l'histoire du club, créé vingt-cinq ans plus tôt en 1897. La société turinoise qui fête cette année son 25e anniversaire prend part à l'édition du championnat dissident d'Italie de la CCI (appelé alors la Première division), la dernière édition d'une compétition annuelle de football avant l'ère fasciste de Mussolini."
+# text1 = "Le terme oblong désigne une forme qui est plus longue que large et dont les angles sont arrondis. En langage bibliographique, oblong signifie un format dont la largeur excède la hauteur. Ce qui correspond au format paysage en termes informatiques et \"à l'italienne\", pour l'imprimerie."
+
+print()
+print(tagger.tag_text(text0))
+print()
+print(tagger.tag_text(text1))
+print()
\ No newline at end of file