Commit 94e16652 authored by Romain Loth

suppression des tags XML avant le parsing POS

parent 27b02533
from ..Taggers import TurboTagger #NltkTagger #, from ..Taggers import NltkTagger
import nltk import nltk
from re import sub
"""Base class for all ngrams extractors. """Base class for all ngrams extractors.
...@@ -8,6 +9,12 @@ class NgramsExtractor: ...@@ -8,6 +9,12 @@ class NgramsExtractor:
"""Class instanciation. """Class instanciation.
This method can be overriden. This method can be overriden.
-*-*-*-*-
contenus :
[...'__dict__', '_grammar', '_label', '_rule', 'extract_ngrams', 'start', 'stop', 'tagger']
""" """
def __init__(self, rule="{<JJ.*>*<NN.*|>+<JJ.*>*}"): def __init__(self, rule="{<JJ.*>*<NN.*|>+<JJ.*>*}"):
# TODO add this regex # TODO add this regex
...@@ -21,7 +28,7 @@ class NgramsExtractor: ...@@ -21,7 +28,7 @@ class NgramsExtractor:
self.stop() self.stop()
def start(self):
    """Create the part-of-speech tagger and store it on ``self.tagger``.

    ``extract_ngrams`` reads ``self.tagger``, so this must have run
    before any extraction is attempted.
    """
    self.tagger = NltkTagger()
def stop(self):
    """Shutdown hook; intentionally does nothing in this base class."""
    pass
...@@ -31,9 +38,21 @@ class NgramsExtractor: ...@@ -31,9 +38,21 @@ class NgramsExtractor:
Returns a list of the ngrams found in the given text. Returns a list of the ngrams found in the given text.
""" """
def extract_ngrams(self, contents):
    """Yield the leaves of every chunk labelled ``self._label`` in *contents*.

    The text is first cleaned with ``self._prepare_text``, then tagged
    with ``self.tagger.tag_text``, and the tagged tokens are parsed with
    ``self._grammar``. Nothing is yielded when tagging produces no tokens.

    :param contents: raw text to analyse.
    :return: generator producing ``subtree.leaves()`` for each matching
        subtree (presumably lists of ``(token, tag)`` pairs -- confirm
        against the tagger in use).
    """
    # Strip markup first so it does not disturb the POS tagger.
    clean_contents = self._prepare_text(contents)

    # Tagging happens here; materialise the result so emptiness can be tested.
    tagged_tokens = list(self.tagger.tag_text(clean_contents))
    if not tagged_tokens:
        return

    grammar_parsed = self._grammar.parse(tagged_tokens)
    for subtree in grammar_parsed.subtrees():
        if subtree.label() == self._label:
            yield subtree.leaves()
@staticmethod
def _prepare_text(text_contents):
    """Clean the text for better POS tagging.

    Removes every ``<...>`` span whose interior is at most 45 characters
    long and contains no ``>``. Longer angle-bracketed spans are left
    untouched.

    :param text_contents: text possibly containing XML-like tags.
    :return: the text with those spans removed.
    """
    # NOTE(review): tags are replaced by the empty string, so words that
    # were separated only by a tag ("foo<br/>bar") end up joined.
    # strip xml tags
    return sub(r"<[^>]{0,45}>", "", text_contents)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment