Commit 94e16652 authored by Romain Loth

suppression des tags XML avant le parsing POS

parent 27b02533
from ..Taggers import TurboTagger #NltkTagger #,
from ..Taggers import NltkTagger
import nltk
from re import sub
"""Base class for all ngrams extractors.
......@@ -8,6 +9,12 @@ class NgramsExtractor:
"""Class instanciation.
This method can be overridden.
-*-*-*-*-
contents:
[...'__dict__', '_grammar', '_label', '_rule', 'extract_ngrams', 'start', 'stop', 'tagger']
"""
def __init__(self, rule="{<JJ.*>*<NN.*|>+<JJ.*>*}"):
# TODO add this regex
......@@ -21,7 +28,7 @@ class NgramsExtractor:
self.stop()
def start(self):
    """Create the POS tagger used by this extractor.

    Stores a new ``NltkTagger`` instance on ``self.tagger``.
    """
    # Only the NltkTagger assignment ever took effect: a TurboTagger was
    # previously built on the line before and overwritten at once, so that
    # instantiation is dropped.
    # NOTE(review): assumes TurboTagger() has no side effect the extractor
    # relies on -- confirm.
    self.tagger = NltkTagger()
def stop(self):
    """Shutdown hook; does nothing in this class and returns ``None``."""
    return None
......@@ -31,9 +38,21 @@ class NgramsExtractor:
Returns a list of the ngrams found in the given text.
"""
def extract_ngrams(self, contents):
    """Yield the n-grams found in *contents*.

    The text is cleaned with ``self._prepare_text``, tagged with
    ``self.tagger.tag_text`` and parsed with ``self._grammar``. The leaves
    of every subtree whose label equals ``self._label`` are yielded, one
    list of leaves per matching subtree. Nothing is yielded when tagging
    produces no tokens.
    """
    clean_contents = self._prepare_text(contents)
    # Tag the cleaned text only. Tagging the raw contents as well would
    # double the tagging cost for a result that is never used.
    tagged_tokens = list(self.tagger.tag_text(clean_contents))
    if not tagged_tokens:
        return
    grammar_parsed = self._grammar.parse(tagged_tokens)
    for subtree in grammar_parsed.subtrees():
        if subtree.label() == self._label:
            yield subtree.leaves()
@staticmethod
def _prepare_text(text_contents):
    """Return *text_contents* with XML-like tags removed.

    Every ``<...>`` span holding at most 45 non-``>`` characters between
    the angle brackets is deleted; longer spans are left untouched. The
    result is meant to be handed to the POS tagger.
    """
    tag_pattern = r"<[^>]{0,45}>"
    without_tags = sub(tag_pattern, "", text_contents)
    return without_tags
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment