Commit 4394900c authored by Mathieu Rodic

Changed the taggers to facilitate their use.

See test.py for examples.
parent 727d5711
...@@ -5,13 +5,6 @@ import nltk ...@@ -5,13 +5,6 @@ import nltk
class NltkTagger(Tagger): class NltkTagger(Tagger):
def send_tokens(self, tokens): def tag_tokens(self, tokens, single=True):
self.buffer += nltk.pos_tag(tokens) return nltk.pos_tag(tokens)
# tagger = NltkTagger()
# tagger.start()
# # tagger.send_text("This is not a sentence. Or, is it? I wish it was; I could perform tagging tests on it.")
# tagger.send_text("This is not a sentence.")
# print(tagger.end())
\ No newline at end of file
import re
"""This regular expression is really good at tokenizing a text!
"""
# Tokenizer pattern, written in verbose mode; alternatives are tried left to
# right at each position, so earlier alternatives win over later ones.
#
# The hyphen in the final character class is escaped on purpose: written as
# `:-_` it is parsed as the character range ':'..'_', which silently drops a
# standalone '-' and instead matches unrelated characters such as '<', '=',
# '>', '@' and '^'.
_re_sentence = re.compile(r'''
    (?:[A-Z])(?:\.[A-Z])+\.?    # abbreviations, e.g. U.S.A.
  | \w+(?:-\w+)*                # words with optional internal hyphens
  | \$?\d+(?:\.\d+)?%?          # currency and percentages, e.g. $12.40, 82%
  | \.\.\.                      # ellipsis
  | [\]\[.,;"'?!():\-_`]        # these are separate tokens
''', re.VERBOSE | re.UNICODE | re.MULTILINE | re.DOTALL)
"""This base class is a model for performing tagging in a pipeline fashion.
When started, it initiates the parser;
when passed text, the text is piped to the parser.
When ended, the parser is closed and the tagged word returned
in a tuple format.
"""
class Tagger:
    """Base class for taggers that work in a pipeline fashion.

    The constructor calls ``start()`` and the destructor calls ``stop()``.
    Subclasses override the hooks below and ``tag_tokens`` to do the
    actual tagging; the base implementation tags nothing.
    """

    def __init__(self):
        self.buffer = []
        self.start()

    def __del__(self):
        # The destructor relies on ``stop()``; it must exist on the base
        # class, otherwise finalization raises AttributeError.
        self.stop()

    def start(self):
        """Initialize the tagger; called by the constructor.

        Can be overridden by subclasses.
        """
        pass

    def stop(self):
        """End the tagger; called by the destructor.

        Can be overridden by subclasses.
        """
        pass

    def end(self):
        """Backward-compatible alias for ``stop()``."""
        return self.stop()

    def tagging_start(self):
        """Hook for pipelines requiring a boundary before a block of text."""
        pass

    def tagging_end(self):
        """Hook for pipelines requiring a boundary after a block of text."""
        pass

    def tag_tokens(self, tokens, block=False):
        """Return the tagged tokens.

        Shall be overridden by subclasses; this base version returns ``[]``.

        Example of input: ['This', 'is', 'not', 'a', 'sentence', '.']
        Example of output: [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'),
        ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]

        ``block`` is True when the caller (e.g. ``tag_text``) already
        handles ``tagging_start()`` / ``tagging_end()`` itself.
        """
        if not block:
            self.tagging_start()
        # do something with the tokens here
        if not block:
            self.tagging_end()
        return []

    def tag_text(self, text):
        """Tokenize ``text`` line by line and return all tagged tokens.

        The whole text is wrapped in a single ``tagging_start()`` /
        ``tagging_end()`` pair; each line's tokens are passed to
        ``tag_tokens`` with ``block=True`` and the results are concatenated.
        """
        tagged = []
        self.tagging_start()
        for line in text.split('\n'):
            tokens = _re_sentence.findall(line)
            # Collect the per-line results instead of discarding them.
            tagged += self.tag_tokens(tokens, True)
        self.tagging_end()
        return tagged
import re import re
"""This regular expression is really good at tokenizing a text!
"""
_re_sentence = re.compile(r'''(?x) # set flag to allow verbose regexps
(?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
| \w+(?:-\w+)* # words with optional internal hyphens
| \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| \.\.\. # ellipsis
| [][.,;"'?!():-_`] # these are separate tokens
''', re.UNICODE | re.MULTILINE | re.DOTALL)
"""This base class is a model for performing tagging in a pipeline fashion. """This base class is a model for performing tagging in a pipeline fashion.
When started, it initiates the parser; When started, it initiates the parser;
when passed text, the text is piped to the parser. when passed text, the text is piped to the parser.
...@@ -21,40 +10,63 @@ in a tuple format. ...@@ -21,40 +10,63 @@ in a tuple format.
class Tagger: class Tagger:
def __init__(self): def __init__(self):
# This regular expression is really good at tokenizing a text!
self._re_sentence = re.compile(r'''(?x) # set flag to allow verbose regexps
(?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
| \w+(?:-\w+)* # words with optional internal hyphens
| \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| \.\.\. # ellipsis
| [][.,;"'?!():-_`] # these are separate tokens
''', re.UNICODE | re.MULTILINE | re.DOTALL)
self.buffer = [] self.buffer = []
self.start()
def __del__(self):
self.stop()
"""Initialize the tagger. """Initializes the tagger. This method is called by the constructor.
This method can be overriden by inherited classes. This method can be overriden by inherited classes.
""" """
def start(self): def start(self):
pass pass
"""Ends the tagger.
This method is called by the destructor.
This method can be overriden by inherited classes.
"""
def stop(self):
pass
"""Send a list of tokens to be tagged. """This method is userful in the case of pipelines requiring
This method shall be overriden by inherited classes. boundaries around blocks of text.
""" """
def send_tokens(self, tokens): def tagging_start(self):
pass pass
"""Send a text to be tagged. def tagging_end(self):
""" pass
def send_text(self, text):
for line in text.split('\n'): """Returns the tagged tokens.
self.send_tokens( This method shall be overriden by inherited classes.
_re_sentence.findall(line) Example of input: ['This', 'is', 'not', 'a', 'sentence', '.']
)
"""Ends the tagger and returns the tagged tokens.
This method can be overriden by inherited classes.
Example of output: [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')] Example of output: [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]
""" """
def end(self): def tag_tokens(self, tokens, single=True):
return self.buffer if single:
self.tagging_start()
"""Starts the tagger, pipes the text, # do something with the tokens here
ends the tagger, returns the result. if single:
""" self.tagging_end()
def tag(self, text): return []
self.start()
self.send_text(text)
return self.end()
"""Send a text to be tagged.
"""
def tag_text(self, text):
tokens_tags = []
self.tagging_start()
for line in text.split('\n'):
tokens = self._re_sentence.findall(line)
tokens_tags += self.tag_tokens(tokens, False)
self.tagging_end()
return tokens_tags
...@@ -23,16 +23,20 @@ _tag_replacements = identity_dict({ ...@@ -23,16 +23,20 @@ _tag_replacements = identity_dict({
# Do we also have to take semicolons, comas and other points into account? # Do we also have to take semicolons, comas and other points into account?
}) })
def _readOutput(output, buffer): def _readOutput(output, buffer):
hasStarted = False
while True: while True:
line = output.readline() line = output.readline()
if line: if line:
if line == b"<end/>\n": if line == b"<block>\n":
hasStarted = True
continue
if line == b"<block/>\n":
break break
token, tag = line.decode('utf8').split()[:2] if hasStarted:
tag = _tag_replacements[tag.split(':')[0]] token, tag = line.decode('utf8').split()[:2]
buffer.append((token, tag)) tag = _tag_replacements[tag.split(':')[0]]
buffer.append((token, tag))
else: else:
time.sleep(0.1) time.sleep(0.1)
...@@ -62,26 +66,41 @@ class TreeTagger(Tagger): ...@@ -62,26 +66,41 @@ class TreeTagger(Tagger):
) )
self._input, self._output = self._popen.stdin, self._popen.stdout self._input, self._output = self._popen.stdin, self._popen.stdout
# self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start() # self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start()
# self.buffer = OutputBuffer()
def stop(self):
# terminates the 'treetagger' process
self._popen.kill()
self._popen.terminate()
def tagging_start(self):
self.buffer = []
self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )) self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, ))
self._thread.start() self._thread.start()
self._input.write(b"<block>\n")
def tagging_end(self):
self._input.write(b"<block/>\n")
# sends some dummy tokens, then wait for the text to be treated
self.tag_tokens("Les sanglots longs des violons de l ' automne bercent mon coeur d ' une langueur monotone .".split(), False)
self._thread.join()
def send_tokens(self, tokens): def tag_tokens(self, tokens, single=True):
if single:
self.tagging_start()
for token in tokens: for token in tokens:
self._input.write(bytes(token + "\n", "utf8")) self._input.write(bytes(token + "\n", "utf8"))
if single:
def end(self): self.tagging_end()
# send some dummy tokens, then wait for the text to be treated return self.buffer
self.send_tokens("<end/> Les sanglots longs des violons de l ' automne bercent mon coeur d ' une langueur monotone .".split())
# wait for the thread to end def tag_text(self, text):
self._thread.join() self.tagging_start()
# terminates the 'treetagger' process for line in text.split('\n'):
self._popen.kill() tokens = self._re_sentence.findall(line)
self._popen.terminate() self.tag_tokens(tokens, False)
# returns the tagged tokens self.tagging_end()
return self.buffer return self.buffer
# tagger = TreeTagger()
# tagger.start()
# tagger.send_text("Ceci n'est pas une phrase, n'est-ce pas? Parfois, il faut tester des phrases ; mêmes celles avec des points-virgules.")
# print(tagger.end())
\ No newline at end of file
from NltkTagger import NltkTagger
# English sample texts fed to the NLTK-based tagger.
text0 = "Forman Brown (1901–1996) was one of the world's leaders in puppet theatre in his day, as well as an important early gay novelist. He was a member of the Yale Puppeteers and the driving force behind Turnabout Theatre. He was born in Otsego, Michigan, in 1901 and died in 1996, two days after his 95th birthday. Brown briefly taught at North Carolina State College, followed by an extensive tour of Europe."
text1 = "James Patrick (born c. 1940) is the pseudonym of a Scottish sociologist, which he used to publish a book A Glasgow Gang Observed. It attracted some attention in Scotland when it was published in 1973. It was based on research he had done in 1966, when he was aged 26. At that time he was working as a teacher in an Approved School, a Scottish reformatory. One gang member in the school, \"Tim Malloy\" (born 1950, also a pseudonym and a generic term for a Glasgow Catholic), agreed to infiltrate him into his gang in Maryhill in Glasgow. Patrick spent four months as a gang member, observing their behaviour."
tagger = NltkTagger()

# Alternative setup using the TreeTagger with French sample texts:
# from TreeTagger import TreeTagger
# tagger = TreeTagger()
# text0 = "La saison 1921-1922 du Foot-Ball Club Juventus est la vingtième de l'histoire du club, créé vingt-cinq ans plus tôt en 1897. La société turinoise qui fête cette année son 25e anniversaire prend part à l'édition du championnat dissident d'Italie de la CCI (appelé alors la Première division), la dernière édition d'une compétition annuelle de football avant l'ère fasciste de Mussolini."
# text1 = "Le terme oblong désigne une forme qui est plus longue que large et dont les angles sont arrondis. En langage bibliographique, oblong signifie un format dont la largeur excède la hauteur. Ce qui correspond au format paysage en termes informatiques et \"à l'italienne\", pour l'imprimerie."

# Print each tagged sample preceded by a blank line, then a trailing blank line.
for sample in (text0, text1):
    print()
    print(tagger.tag_text(sample))
print()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment