Commit bd33187b authored by Mathieu Rodic

[FEAT] multi-lingual capabilities for MElt

https://forge.iscpif.fr/issues/1511
parent 8aff58f1
......@@ -6,5 +6,5 @@ class EnglishNgramsExtractor(NgramsExtractor):
def start(self):
self.tagger = NltkTagger()
# self.tagger = MeltTagger(language='en')
\ No newline at end of file
......@@ -6,4 +6,4 @@ class FrenchNgramsExtractor(NgramsExtractor):
def start(self):
self.tagger = TreeTagger()
# self.tagger = MeltTagger(language='fr')
......@@ -12,7 +12,8 @@ import os
class identity_dict(dict):
    """Dictionary that maps any missing key to itself.

    Used as a tag-translation table: tags with an explicit mapping are
    converted, all others pass through unchanged. Lookups of missing keys
    do not insert them into the dict.
    """
    def __missing__(self, missing_key):
        # Fallback invoked by dict.__getitem__ when the key is absent.
        return missing_key
_tag_replacements = identity_dict({
_tag_replacements = dict()
_tag_replacements['fr'] = identity_dict({
'DET': 'DT',
'NC': 'NN',
'NPP': 'NNP',
......@@ -46,11 +47,18 @@ _tag_replacements = identity_dict({
# 'PREF': '',
# 'ADJWH': '',
})
_tag_replacements['en'] = identity_dict()
class MeltTagger(Tagger):
def start(self, language='fr', melt_data_path='lib/melttagger'):
def __init__(self, *args, **kwargs):
    """Initialize the tagger for a given language.

    Keyword Args:
        language (str): language code selecting the POS tag replacement
            table (e.g. 'fr' or 'en'); defaults to 'fr'. It is popped
            from kwargs before delegating to the parent initializer.

    Raises:
        KeyError: if no replacement table exists for the language.
    """
    self.language = kwargs.pop('language', 'fr')
    # Select the per-language tag translation table from the module-level map.
    self._tag_replacements = _tag_replacements[self.language]
    # Zero-argument super() instead of super(self.__class__, self):
    # the latter recurses infinitely if MeltTagger is ever subclassed,
    # because self.__class__ would then be the subclass, not MeltTagger.
    super().__init__(*args, **kwargs)
def start(self, melt_data_path='lib/melttagger'):
language = self.language
basepath = os.path.dirname(os.path.realpath(__file__))
path = os.path.join(basepath, melt_data_path)
self._pos_tagger = POSTagger()
......@@ -99,7 +107,7 @@ class MeltTagger(Tagger):
# without lemmatization
if not lemmatize:
for form, tag in tagged_tokens:
yield (form, _tag_replacements[tag])
yield (form, self._tag_replacements[tag])
return
# with lemmatization
command_input = ' '.join(
......@@ -110,4 +118,4 @@ class MeltTagger(Tagger):
for token in lemmatized.split():
if len(token):
values = token.split('/')
yield (values[0], _tag_replacements[values[1]], values[2].replace('*', ''))
yield (values[0], self._tag_replacements[values[1]], values[2].replace('*', ''))
from parsing.Taggers import MeltTagger
# from parsing.Taggers.melttagger.tagger import POSTagger, Token, DAGParser, DAGReader
# # references:
# # - http://cs.nyu.edu/grishman/jet/guide/PennPOS.html
# # - http://www.lattice.cnrs.fr/sites/itellier/SEM.html
# class identity_dict(dict):
# def __missing__(self, key):
# return key
# _tag_replacements = identity_dict({
# 'DET': 'DT',
# 'NC': 'NN',
# 'NPP': 'NNP',
# 'ADJ': 'JJ',
# 'PONCT': '.',
# 'ADVWH': 'WRB',
# 'ADV': 'RB',
# 'DETWH': 'WDT',
# 'PROWH': 'WP',
# 'ET': 'FW',
# 'VINF': 'VB',
# 'I': 'UH',
# 'CS': 'IN',
# # 'CLS': '',
# # 'CLR': '',
# # 'CLO': '',
# # 'PRO': '',
# # 'PROREL': '',
# # 'P': '',
# # 'P+D': '',
# # 'P+PRO': '',
# # 'V': '',
# # 'VPR': '',
# # 'VPP': '',
# # 'VS': '',
# # 'VIMP': '',
# # 'PREF': '',
# # 'ADJWH': '',
# })
# import subprocess
# class MeltTagger:
# def __init__(self, language='fr', melt_data_path='./parsing/Taggers/melttagger'):
# path = '%s/%s' % (melt_data_path, language)
# self.pos_tagger = POSTagger()
# self.pos_tagger.load_tag_dictionary('%s/tag_dict.json' % path)
# self.pos_tagger.load_lexicon('%s/lexicon.json' % path)
# self.pos_tagger.load_model('%s' % path)
# self._preprocessing_commands = (
# # ('/usr/local/bin/clean_noisy_characters.sh', ),
# # ('/usr/local/bin/MElt_normalizer.pl', '-nc', '-c', '-d', '/usr/local/share/melt/normalization/%s' % language, '-l', language, ),
# ('/usr/local/share/melt/segmenteur.pl', '-a', '-ca', '-af=/usr/local/share/melt/pctabr', '-p', 'r'),
# )
# self._lemmatization_commands = (
# ('/usr/local/bin/MElt_postprocess.pl', '-npp', '-l', language),
# ('MElt_lemmatizer.pl', '-m', '/usr/local/share/melt/%s' % language),
# )
# def pipe(self, text, commands, encoding='utf8'):
# text = text.encode(encoding)
# # print(text.decode(encoding))
# for command in commands:
# # print(command)
# process = subprocess.Popen(
# command,
# bufsize=0,
# stdin=subprocess.PIPE,
# stdout=subprocess.PIPE,
# stderr=subprocess.PIPE,
# )
# text, err = process.communicate(text)
# # print()
# # print(text.decode(encoding))
# if len(err):
# print(err.decode(encoding))
# return text.decode(encoding)
# def tag(self, text, encoding='utf8', lemmatize=True):
# preprocessed = self.pipe(text, self._preprocessing_commands)
# if lemmatize:
# result = ''
# for sentence in preprocessed.split('\n'):
# words = sentence.split(' ')
# tokens = [Token(word) for word in words]
# tagged_tokens = self.pos_tagger.tag_token_sequence(tokens)
# # result += ' '.join(token.__str__() for token in tagged_tokens)
# for token in tagged_tokens:
# if len(token.string):
# result += '%s/%s ' % (token.string, token.label, )
# result += '\n'
# lemmatized = self.pipe(result, self._lemmatization_commands)
# for sentence in lemmatized.split('\n'):
# for token in sentence.split(' '):
# if len(token):
# yield tuple(token.split('/'))
# else:
# for sentence in preprocessed.split('\n'):
# words = sentence.split(' ')
# tokens = [Token(word) for word in words]
# tagged_tokens = self.pos_tagger.tag_token_sequence(tokens)
# for token in tagged_tokens:
# if len(token.string):
# yield (token.string, _tag_replacements[token.label], )
if __name__ == '__main__':
from time import time
t0 = time()
tagger = MeltTagger()
print(time() - t0)
print()
text = """Le vieil hôtel de ville, construit de 1608 à 1610 est le plus ancien bâtiment de la ville de Wiesbaden. Il se dresse sur la place centrale de la vieille ville, la Place du Palais, qui abrite aujourd'hui le Parlement de l'État de Hesse, l'église et l'hôtel de ville.
texts = {
'en':
"""Air raids on Japan by the Allies in World War II caused extensive destruction and casualties; the most commonly cited estimates are 333,000 killed and 473,000 wounded.
During the first years of the Pacific War, these attacks were limited to the Doolittle Raid in April 1942 and small-scale raids on military positions in the Kuril Islands starting in mid-1943. Strategic bombing raids began in June 1944 and were greatly expanded in November. The raids initially attempted to target industrial facilities, but from March 1945 onwards were generally directed against urban areas. Aircraft flying from aircraft carriers and the Ryukyu Islands also frequently struck targets in Japan during 1945 in preparation for an Allied invasion planned for October. In early August, the cities of Hiroshima and Nagasaki were struck and mostly destroyed by atomic bombs. Japan's military and civil defenses were not capable of protecting the country, and the Allied forces generally suffered few losses. The bombing campaign was one of the main factors in the Japanese government's decision to surrender in mid-August 1945. Nevertheless, there has been a long-running debate over the attacks on Japanese cities, and the decision to use atomic weapons has been particularly controversial.
""",
'fr':
"""Le vieil hôtel de ville, construit de 1608 à 1610 est le plus ancien bâtiment de la ville de Wiesbaden. Il se dresse sur la place centrale de la vieille ville, la Place du Palais, qui abrite aujourd'hui le Parlement de l'État de Hesse, l'église et l'hôtel de ville.
Il a été construit dans le style Renaissance. On a ajouté, en 1828, un étage de style romantique historié. Sur les bas-reliefs des cinq fenêtres de l'étage, en bois, étaient représentées les vertus de la force, la justice, la charité, de prudence et de modération, alors que la pierre a remplacé par des copies. Le pièces de chêne d'origine peut être visitées aujourd'hui au Musée de Wiesbaden. Aujourd'hui, le bâtiment sert de bureau de la ville de Wiesbaden.
Devant le porche, entre l'hôtel de Ville et l'Ancien hôtel de ville, se trouve la colonne centrale de Nassau, un lion couronné avec bouclier.
Il s'agit de construire progressivement, à partir des données initiales, un sous-graphe dans lequel sont classés les différents sommets par ordre croissant de leur distance minimale au sommet de départ. La distance correspond à la somme des poids des arêtes empruntées.
......@@ -129,7 +16,18 @@ if __name__ == '__main__':
Le plus proche des sommets adjacents est alors ajouté au sous-graphe.
La seconde étape consiste à mettre à jour les distances des sommets adjacents à ce dernier. Encore une fois, on recherche alors le sommet doté de la distance la plus faible. Comme tous les sommets n'avaient plus une valeur infinie, il est donc possible que le sommet choisi ne soit pas un des derniers mis à jour.
On l'ajoute au sous-graphe, puis on continue ainsi à partir du dernier sommet ajouté, jusqu'à épuisement des sommets ou jusqu'à sélection du sommet d'arrivée.
"""
""",
}
language = 'en'
text = texts[language]
if __name__ == '__main__':
from time import time
t0 = time()
tagger = MeltTagger(language=language)
print(time() - t0)
print()
i = 0
t0 = time()
for x in tagger.tag_text(text, lemmatize=True):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment