Commit 6a7b8b89 authored by Administrator

[FIX] real path for lib/parsers

parent 44871a3f
/srv/gargantext-taggers
\ No newline at end of file
from .Tagger import Tagger
import subprocess
import threading
import time
# TODO: have a look at "queue" instead of "list" (cf. http://stackoverflow.com/questions/17564804/in-python-how-to-wait-until-only-the-first-thread-is-finished)
class identity_dict(dict):
def __missing__(self, key):
return key
# Penn Treebank equivalents for the tagger's French tagset.
_tag_replacements = identity_dict({
    "NOM":  "NN",
    "NAM":  "NNP",
    "ADJ":  "JJ",
    "VER":  "VB",
    "PREP": "IN",
    "KON":  "CC",
    "DET":  "DT",
    "PRO":  "PRP",
    # Do we also have to take semicolons, commas and other punctuation marks into account?
})
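
# The identity_dict above returns the key itself for missing entries, so any
# tag without an explicit replacement passes through unchanged. A quick
# illustration (not part of the tagger):
#
#     _tag_replacements['NOM']      # -> 'NN'       (mapped)
#     _tag_replacements['UNKNOWN']  # -> 'UNKNOWN'  (falls through)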
def _readOutput(output, buffer):
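    """Read the tagger's output pipe, keeping only the lines between the
    b"<block>" and b"<block/>" markers, and append (token, tag) couples
    to the given buffer."""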
hasStarted = False
while True:
line = output.readline()
if line:
if line == b"<block>\n":
hasStarted = True
continue
if line == b"<block/>\n":
break
if hasStarted:
token, tag = line.decode('utf8').split()[:2]
tag = _tag_replacements[tag.split(':')[0]]
buffer.append((token, tag))
else:
time.sleep(0.1)
"""Use MElt for the tagging.
"""
class Melt(Tagger):
def start(self, taggerPath = "/usr/local/bin/"):
binaryFile = "%s/MElt" % taggerPath
tagcmdlist = [
binaryFile,
"-l",
]
self._popen = subprocess.Popen(
tagcmdlist, # Use a list of params in place of a string.
bufsize=0, # Not buffered to retrieve data asap from Tagger
executable=binaryFile, # As we have it, specify it
stdin=subprocess.PIPE, # Get a pipe to write input data to Tagger process
stdout=subprocess.PIPE, # Get a pipe to read processing results from Tagger
stderr=subprocess.PIPE, # Get a pipe to read processing results from Tagger
)
self._input, self._output = self._popen.stdin, self._popen.stdout
# self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start()
# self.buffer = OutputBuffer()
    def stop(self):
        # stop the MElt process (SIGTERM first, then SIGKILL)
        try:
            self._popen.terminate()
            self._popen.kill()
        except Exception:
            pass
def tagging_start(self):
self.buffer = []
self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, ))
self._thread.start()
#self._input.write(b"<block>\n")
def tagging_end(self):
#self._input.write(b"<block/>\n")
        # send some dummy tokens, then wait for the text to be processed
#self.tag_tokens("Les sanglots longs des violons de l ' automne bercent mon coeur d ' une langueur monotone .".split(), False)
self._thread.join()
def tag_tokens(self, tokens, single=True):
if single:
self.tagging_start()
for token in tokens:
self._input.write(bytes(token + "\n", "utf8"))
if single:
self.tagging_end()
return self.buffer
def tag_text(self, text):
self.tagging_start()
for line in text.split('\n'):
tokens = self._re_sentence.findall(line)
self.tag_tokens(tokens, False)
self.tagging_end()
return self.buffer
from .Tagger import Tagger
from .lib.melttagger.tagger import POSTagger, Token, DAGParser, DAGReader
import subprocess
import sys
import os
# references for tag equivalents:
# - http://cs.nyu.edu/grishman/jet/guide/PennPOS.html
# - http://www.lattice.cnrs.fr/sites/itellier/SEM.html
class identity_dict(dict):
def __missing__(self, key):
return key
_tag_replacements = identity_dict({
'DET': 'DT',
'NC': 'NN',
'NPP': 'NNP',
'ADJ': 'JJ',
'PONCT': '.',
'ADVWH': 'WRB',
'ADV': 'RB',
'DETWH': 'WDT',
'PROWH': 'WP',
'ET': 'FW',
'VINF': 'VB',
'I': 'UH',
'CS': 'IN',
# 'CLS': '',
# 'CLR': '',
# 'CLO': '',
# 'PRO': '',
# 'PROREL': '',
# 'P': '',
# 'P+D': '',
# 'P+PRO': '',
# 'V': '',
# 'VPR': '',
# 'VPP': '',
# 'VS': '',
# 'VIMP': '',
# 'PREF': '',
# 'ADJWH': '',
})
class MeltTagger(Tagger):
def start(self, language='fr', melt_data_path='lib/melttagger'):
basepath = os.path.dirname(os.path.realpath(__file__))
path = os.path.join(basepath, melt_data_path)
self._pos_tagger = POSTagger()
self._pos_tagger.load_tag_dictionary('%s/%s/tag_dict.json' % (path, language))
self._pos_tagger.load_lexicon('%s/%s/lexicon.json' % (path, language))
self._pos_tagger.load_model('%s/%s' % (path, language))
self._preprocessing_commands = (
# ('/usr/local/bin/clean_noisy_characters.sh', ),
('%s/MElt_normalizer.pl' % path, '-nc', '-c', '-d', '%s/%s' % (path, language), '-l', language, ),
('%s/segmenteur.pl' % path, '-a', '-ca', '-af=%s/pctabr' % path, '-p', 'r'),
)
self._lemmatization_commands = (
('%s/MElt_postprocess.pl' % path, '-npp', '-l', language),
('%s/MElt_lemmatizer.pl' % path, '-m', '%s/%s' % (path, language)),
)
def stop(self):
pass
def _pipe(self, text, commands, encoding='utf8'):
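        """Pipe the text through the given sequence of external commands and
        return the decoded output; anything the commands write to stderr is
        forwarded to sys.stderr."""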
text = text.encode(encoding)
for command in commands:
process = subprocess.Popen(
command,
bufsize=0,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
text, err = process.communicate(text)
if len(err):
print(err.decode(encoding), file=sys.stderr)
return text.decode(encoding)
def _tag(self, text):
preprocessed = self._pipe(text, self._preprocessing_commands)
for sentence in preprocessed.split('\n'):
words = sentence.split(' ')
tokens = [Token(word) for word in words]
tagged_tokens = self._pos_tagger.tag_token_sequence(tokens)
for token in tagged_tokens:
if len(token.string):
yield (token.string, _tag_replacements[token.label], )
def tag_text(self, text, lemmatize=True):
tagged_tokens = self._tag(text)
if not lemmatize:
for tagged_token in tagged_tokens:
yield tagged_token
return
# lemmatization
command_input = ' '.join(
'%s/%s' % (token, tag)
for token, tag in tagged_tokens
)
lemmatized = self._pipe(command_input, self._lemmatization_commands)
for token in lemmatized.split():
if len(token):
values = token.split('/')
yield (values[0], values[1], values[2].replace('*', ''))
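
# Example usage (an untested sketch; assumes the MElt models and scripts are
# installed under lib/melttagger, as loaded by start()):
#
#     tagger = MeltTagger()
#     for token, tag, lemma in tagger.tag_text('Le chat dort.'):
#         print(token, tag, lemma)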
from .Tagger import Tagger
import nltk
class NltkTagger(Tagger):
def tag_tokens(self, tokens, single=True):
return nltk.pos_tag(tokens)
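
# Example usage (a sketch; nltk.pos_tag needs the NLTK tagger model to be
# downloaded first, e.g. nltk.download('averaged_perceptron_tagger') on
# recent NLTK versions):
#
#     NltkTagger().tag_tokens(['This', 'is', 'not', 'a', 'sentence', '.'])
#     # -> [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]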
This repo contains all the files for the Gargantext taggers.
Developers should reference this path:
/srv/gargantext_lib/gargantext-taggers/.
This repo should therefore be located in /srv/gargantext_lib.
import re
"""This base class is a model for performing tagging in a pipeline fashion.
When started, it initiates the parser;
when passed text, the text is piped to the parser.
When ended, the parser is closed and the tagged word returned
in a tuple format.
"""
class Tagger:
def __init__(self):
# This regular expression is really good at tokenizing a text!
self._re_sentence = re.compile(r'''(?x) # set flag to allow verbose regexps
(?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
| \w+(?:-\w+)* # words with optional internal hyphens
| \$?\d+(?:\.\d+)?%? # currency and percentages, e.g. $12.40, 82%
| \.\.\. # ellipsis
| [][.,;"'?!():-_`] # these are separate tokens
''', re.UNICODE | re.MULTILINE | re.DOTALL)
self.buffer = []
self.start()
def __del__(self):
self.stop()
"""Initializes the tagger. This method is called by the constructor.
This method can be overriden by inherited classes.
"""
def start(self):
pass
"""Ends the tagger.
This method is called by the destructor.
This method can be overriden by inherited classes.
"""
def stop(self):
pass
"""This method is userful in the case of pipelines requiring
boundaries around blocks of text.
"""
def tagging_start(self):
pass
def tagging_end(self):
pass
"""Returns the tagged tokens.
This method shall be overriden by inherited classes.
Example of input: ['This', 'is', 'not', 'a', 'sentence', '.']
Example of output: [('This', 'DT'), ('is', 'VBZ'), ('not', 'RB'), ('a', 'DT'), ('sentence', 'NN'), ('.', '.')]
"""
def tag_tokens(self, tokens, single=True):
if single:
self.tagging_start()
# do something with the tokens here
if single:
self.tagging_end()
return []
"""Send a text to be tagged.
"""
# Not used right now
def tag_text(self, text):
tokens_tags = []
self.tagging_start()
for line in text.split('\n'):
tokens = self._re_sentence.findall(line)
tokens_tags += self.tag_tokens(tokens, False)
self.tagging_end()
return tokens_tags
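
# A minimal subclass sketch (illustrative only, not part of the library):
# a real tagger overrides tag_tokens() and returns (token, tag) couples.
#
#     class DummyTagger(Tagger):
#         def tag_tokens(self, tokens, single=True):
#             # tag everything as a noun, just to show the expected output shape
#             return [(token, 'NN') for token in tokens]
#
#     DummyTagger().tag_tokens(['This', 'is', 'not', 'a', 'sentence', '.'])
#     # -> [('This', 'NN'), ('is', 'NN'), ('not', 'NN'), ...]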
from .Tagger import Tagger
import subprocess
import threading
import time
# TODO: have a look at "queue" instead of "list" (cf. http://stackoverflow.com/questions/17564804/in-python-how-to-wait-until-only-the-first-thread-is-finished)
class identity_dict(dict):
def __missing__(self, key):
return key
# Penn Treebank equivalents for the tagger's French tagset.
_tag_replacements = identity_dict({
    "NOM":  "NN",
    "NAM":  "NNP",
    "ADJ":  "JJ",
    "VER":  "VB",
    "PREP": "IN",
    "KON":  "CC",
    "DET":  "DT",
    "PRO":  "PRP",
    # Do we also have to take semicolons, commas and other punctuation marks into account?
})
def _readOutput(output, buffer):
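    """Read the tagger's output pipe, keeping only the lines between the
    b"<block>" and b"<block/>" markers, and append (token, tag) couples
    to the given buffer."""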
hasStarted = False
while True:
line = output.readline()
if line:
if line == b"<block>\n":
hasStarted = True
continue
if line == b"<block/>\n":
break
if hasStarted:
token, tag = line.decode('utf8').split()[:2]
tag = _tag_replacements[tag.split(':')[0]]
buffer.append((token, tag))
else:
time.sleep(0.1)
"""Use TreeTagger for the tagging.
Shall be used for french texts.
"""
class TreeTagger(Tagger):
def start(self, treeTaggerPath = "./parsing/Taggers/treetagger"):
binaryFile = "%s/bin/tree-tagger" % treeTaggerPath
tagcmdlist = [
binaryFile,
"%s/lib/french-utf8.par" % treeTaggerPath,
"-token",
"-lemma",
"-sgml",
"-quiet"
]
self._popen = subprocess.Popen(
tagcmdlist, # Use a list of params in place of a string.
bufsize=0, # Not buffered to retrieve data asap from TreeTagger
executable=binaryFile, # As we have it, specify it
stdin=subprocess.PIPE, # Get a pipe to write input data to TreeTagger process
stdout=subprocess.PIPE, # Get a pipe to read processing results from TreeTagger
stderr=subprocess.PIPE, # Get a pipe to read processing results from TreeTagger
)
self._input, self._output = self._popen.stdin, self._popen.stdout
# self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start()
# self.buffer = OutputBuffer()
    def stop(self):
        # stop the 'tree-tagger' process (SIGTERM first, then SIGKILL)
        try:
            self._popen.terminate()
            self._popen.kill()
        except Exception:
            pass
def tagging_start(self):
self.buffer = []
self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, ))
self._thread.start()
self._input.write(b"<block>\n")
def tagging_end(self):
self._input.write(b"<block/>\n")
        # send some dummy tokens, then wait for the text to be processed
self.tag_tokens("Les sanglots longs des violons de l ' automne bercent mon coeur d ' une langueur monotone .".split(), False)
self._thread.join()
def tag_tokens(self, tokens, single=True):
if single:
self.tagging_start()
for token in tokens:
self._input.write(bytes(token + "\n", "utf8"))
if single:
self.tagging_end()
return self.buffer
def tag_text(self, text):
self.tagging_start()
for line in text.split('\n'):
tokens = self._re_sentence.findall(line)
self.tag_tokens(tokens, False)
self.tagging_end()
return self.buffer
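
# Example usage (an untested sketch; assumes a TreeTagger installation with
# the French parameter file under ./parsing/Taggers/treetagger):
#
#     tagger = TreeTagger()
#     print(tagger.tag_text("Les violons de l'automne bercent mon coeur."))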
from .Tagger import Tagger
from .lib.nlpserver.client import NLPClient
class TurboTagger(Tagger):
def start(self):
self._nlpclient = NLPClient()
def stop(self):
if hasattr(self, '_nlpclient'):
del self._nlpclient
def tag_text(self, text):
if not hasattr(self, '_nlpclient'):
self._nlpclient = NLPClient()
tokens_tags = []
for sentence in self._nlpclient.tag(text):
for token, tag in sentence:
tokens_tags.append((token, tag, ))
return tokens_tags
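
# Example usage (a sketch; assumes the NLP server from .lib.nlpserver is
# running and reachable by NLPClient):
#
#     tagger = TurboTagger()
#     print(tagger.tag_text('This is a sentence.'))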
from .Tagger import Tagger
from .NltkTagger import NltkTagger
from .TreeTagger import TreeTagger
from .TurboTagger import TurboTagger
from .MeltTagger import MeltTagger
/srv/gargantext_lib/taggers
\ No newline at end of file