Commit dde756c1 authored by Mathieu Rodic

[DEBUG] trying to fit 'MElt FR' as a tagger

parent 3b565087
from .NgramsExtractor import NgramsExtractor from .NgramsExtractor import NgramsExtractor
from ..Taggers import NltkTagger from ..Taggers import NltkTagger, MeltTagger
class EnglishNgramsExtractor(NgramsExtractor): class EnglishNgramsExtractor(NgramsExtractor):
def start(self): def start(self):
self.tagger = NltkTagger() # self.tagger = NltkTagger()
# self.tagger = MeltTagger(language='en') self.tagger = MeltTagger(language='en')
\ No newline at end of file
...@@ -29,9 +29,9 @@ class NgramsExtractor: ...@@ -29,9 +29,9 @@ class NgramsExtractor:
Returns a list of the ngrams found in the given text. Returns a list of the ngrams found in the given text.
""" """
def extract_ngrams(self, contents): def extract_ngrams(self, contents):
tagged_ngrams = list(self.tagger.tag_text(contents)) tagged_tokens = list(self.tagger.tag_text(contents))
if len(tagged_ngrams): if len(tagged_tokens):
grammar_parsed = self._grammar.parse(tagged_ngrams) grammar_parsed = self._grammar.parse(tagged_tokens)
for subtree in grammar_parsed.subtrees(): for subtree in grammar_parsed.subtrees():
if subtree.label() == self._label: if subtree.label() == self._label:
yield subtree.leaves() yield subtree.leaves()
...@@ -102,7 +102,7 @@ class MeltTagger(Tagger): ...@@ -102,7 +102,7 @@ class MeltTagger(Tagger):
if len(token.string): if len(token.string):
yield (token.string, token.label, ) yield (token.string, token.label, )
def tag_text(self, text, lemmatize=True): def tag_text(self, text, lemmatize=False):
tagged_tokens = self._tag(text) tagged_tokens = self._tag(text)
# without lemmatization # without lemmatization
if not lemmatize: if not lemmatize:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment