Commit c868c1c3 authored by Mathieu Rodic

Added new classes, for token tagging. They are now fully working!

For now, two taggers can be used, using a common interface:
* TreeTagger
* NltkTagger
parent 0885f2ef
from Tagger import Tagger
import nltk
class NltkTagger(Tagger):
    """Tagger implementation that delegates the tagging to `nltk.pos_tag`."""

    def send_tokens(self, tokens):
        """Tag `tokens` with `nltk.pos_tag` and append the result to the buffer."""
        tagged = nltk.pos_tag(tokens)
        self.buffer.extend(tagged)
# tagger = NltkTagger()
# tagger.start()
# tagger.send_text("This is not a sentence. Or, is it? I wish it was; I could perform tagging tests on it.")
# print(tagger.end())
\ No newline at end of file
import re
"""This regular expression is really good at tokenizing a text!
"""
# Tokenizing pattern. Every group is non-capturing, so `findall` returns one
# plain string per token. Alternatives are tried in the order listed below.
# The hyphen sits at the END of the final character class so that it is a
# literal: written as `:-_` it formed a range from ':' to '_' which matched
# unintended characters (< = > @ \ ^) and never matched a standalone '-'.
_re_sentence = re.compile(r'''(?x)  # set flag to allow verbose regexps
      (?:[A-Z])(?:\.[A-Z])+\.?      # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*                  # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?            # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                        # ellipsis
    | [][.,;"'?!():_`-]             # these are separate tokens
''', re.UNICODE | re.MULTILINE | re.DOTALL)
"""This class is a model for performing tagging in a pipeline fashion.
When started, it initiates the parser;
when passed text, the text is piped to the parser.
When ended, the parser is closed and the tagged word returned
in a tuple format.
"""
class Tagger:
    """Common interface for taggers working in a pipeline fashion.

    Typical usage: `start()`, then any number of `send_text()` /
    `send_tokens()` calls, then `end()` to collect what was tagged.
    `tag()` chains the three steps for a single text.
    """

    def __init__(self):
        # Tagged tokens collected so far; `end()` hands this list back.
        self.buffer = []

    def start(self):
        """Initialize the tagger.

        This method shall be overridden by inherited classes; the base
        implementation does nothing.
        """

    def send_tokens(self, tokens):
        """Send a list of tokens to be tagged.

        This method shall be overridden by inherited classes; the base
        implementation does nothing.
        """

    def send_text(self, text):
        """Send a text to be tagged.

        The text is cut on newlines; each line is tokenized with
        `_re_sentence` and its tokens are passed to `send_tokens`.
        """
        lines = text.split('\n')
        for line in lines:
            tokens = _re_sentence.findall(line)
            self.send_tokens(tokens)

    def end(self):
        """End the tagger and return the tagged tokens.

        This method shall be overridden by inherited classes; the base
        implementation simply returns the buffer.
        Example of output:
        [('The', 'DET'), ('dog', 'NOM'), ('is', 'VER'), ('green', 'ADJ'), ('.', 'PUN')]
        """
        return self.buffer

    def tag(self, text):
        """Start the tagger, pipe `text` through it, end it, return the result."""
        self.start()
        self.send_text(text)
        tagged = self.end()
        return tagged
class TreeTagger:
    """Empty placeholder class: it defines no attributes and no methods."""
\ No newline at end of file
from Tagger import Tagger
import subprocess
import threading
import time
# TODO: have a look at "queue" instead of "list" (cf. http://stackoverflow.com/questions/17564804/in-python-how-to-wait-until-only-the-first-thread-is-finished)
class identity_dict(dict):
    """Dictionary whose subscript lookup falls back to the key itself.

    `d[key]` returns the stored value when `key` is present and `key`
    otherwise. Missing keys are not inserted.
    """

    def __missing__(self, key):
        """Return `key` unchanged; `dict` calls this hook when `d[key]` misses."""
        return key
# Replacement table applied to the tags read from TreeTagger's output.
# Because this is an `identity_dict`, a tag that is not listed here is
# returned unchanged when looked up with `_tag_replacements[tag]`.
# NOTE(review): "ADJ" -> "NN", "VER" -> "JJ" and "PREP" -> "PRP" look unusual
# if Penn-Treebank-style tags are the target -- confirm they are intentional.
# Do we also have to take semicolons, commas and other points into account?
_tag_replacements = identity_dict(
    NOM="NN",
    NAM="NN",
    ADJ="NN",
    VER="JJ",
    PREP="PRP",
    KON="CC",
    DET="DT",
    PRO="DT",
)
def _readOutput(output, buffer):
    """Collect `(token, tag)` pairs read from `output` into `buffer`.

    Lines are read one at a time until the `<end/>` sentinel line is met or
    `output` reaches end-of-file. For every other line, the first two
    whitespace-separated fields are taken as the token and its tag; the tag
    is cut at the first ':' and translated through `_tag_replacements`.

    :param output: binary stream exposing `readline()`.
    :param buffer: list receiving the `(token, tag)` tuples (mutated in place).
    """
    while True:
        line = output.readline()
        if not line:
            # An empty read means end-of-file (e.g. the process exited):
            # stop here instead of polling forever, so that whoever joins
            # the reading thread cannot hang.
            break
        if line.rstrip(b"\r\n") == b"<end/>":
            break
        fields = line.decode('utf8').split()
        if len(fields) < 2:
            # Not a "token tag ..." line (blank line, lone markup, ...):
            # skip it rather than let a ValueError kill the reader.
            continue
        token, tag = fields[:2]
        buffer.append((token, _tag_replacements[tag.split(':')[0]]))
"""Use TreeTagger for the tagging.
Shall be used for french texts.
"""
class TreeTagger(Tagger):
    """Tagger implementation driving an external TreeTagger process.

    Uses the French parameter file, so it shall be used for French texts.
    """

    def start(self, treeTaggerPath = "../../../nlp/pythonwrapperP3/treetagger"):
        """Launch the TreeTagger process and the thread collecting its output.

        :param treeTaggerPath: TreeTagger directory; `bin/tree-tagger` and
            `lib/french-utf8.par` are resolved relative to it.
        """
        binaryFile = "%s/bin/tree-tagger" % treeTaggerPath
        tagcmdlist = [
            binaryFile,
            "%s/lib/french-utf8.par" % treeTaggerPath,
            "-token",
            "-lemma",
            "-sgml",
            "-quiet"
        ]
        self._popen = subprocess.Popen(
            tagcmdlist,             # Use a list of params in place of a string.
            bufsize=0,              # Not buffered to retrieve data asap from TreeTagger
            executable=binaryFile,  # As we have it, specify it
            stdin=subprocess.PIPE,  # Get a pipe to write input data to TreeTagger process
            stdout=subprocess.PIPE, # Get a pipe to read processing results from TreeTagger
            stderr=subprocess.PIPE, # Capture TreeTagger's error output (never read in this class)
        )
        self._input, self._output = self._popen.stdin, self._popen.stdout
        # The reader thread fills self.buffer while tokens are still being sent.
        self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, ))
        self._thread.start()

    def send_tokens(self, tokens):
        """Write `tokens` to TreeTagger's standard input, one token per line."""
        for token in tokens:
            self._input.write(bytes(token + "\n", "utf8"))

    def end(self):
        """Flush the pipeline, stop TreeTagger and return the tagged tokens.

        :return: the buffer of `(token, tag)` tuples filled by the reader thread.
        """
        try:
            # send the end marker followed by some dummy tokens,
            # then wait for the text to be treated
            self.send_tokens("<end/> Les sanglots longs des violons de l ' automne bercent mon coeur d ' une langueur monotone .".split())
            # wait for the thread to end
            self._thread.join()
        finally:
            # Stop the 'treetagger' process, reap it so that no zombie is
            # left behind, and release the three pipes opened by start().
            self._popen.kill()
            self._popen.wait()
            for pipe in (self._input, self._output, self._popen.stderr):
                if pipe is not None:
                    pipe.close()
        # returns the tagged tokens
        return self.buffer
# tagger = TreeTagger()
# tagger.start()
# tagger.send_text("Ceci n'est pas une phrase, n'est-ce pas? Parfois, il faut tester des phrases ; mêmes celles avec des points-virgules.")
# print(tagger.end())
\ No newline at end of file
class Tagger:
    """Minimal tagger skeleton: `start` resets the buffer, `end` returns it."""

    def start(self):
        """Create a fresh, empty result buffer."""
        self.buffer = list()

    def send(self, text):
        """Accept `text`; this base implementation does nothing with it."""
        return None

    def end(self):
        """Return the result buffer created by `start`."""
        collected = self.buffer
        return collected
from NltkTagger import NltkTagger
from TreeTagger import TreeTagger
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment