Commit 0f26d8a2 authored by c24b's avatar c24b

Adding a call to Tagger directly getting rid of ngramsextractors wrapper

parent 403913fc
from gargantext.util.languages import languages
from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_NPN
import nltk
import re
class NgramsExtractor:
def __init__(self, tagger):
self._tagger = tagger()
@staticmethod
def clean_text(text):
"""Clean the text for better POS tagging.
For now, only removes (short) XML tags.
"""
return re.sub(r'<[^>]{0,45}>', '', text)
def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
text = self.clean_text(text)
grammar = nltk.RegexpParser(label + ': ' + rule)
tagged_tokens = list(self._tagger.tag_text(text))
if len(tagged_tokens):
grammar_parsed = grammar.parse(tagged_tokens)
for subtree in grammar_parsed.subtrees():
if subtree.label() == label:
if len(subtree) < max_n_words:
yield subtree.leaves()
# ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
class NgramsExtractors(dict):
def __missing__(self, key):
if not isinstance(key, str):
raise KeyError
if len(key) == 2 and key == key.lower():
tagger = LANGUAGES[key]['tagger']
self[key] = NgramsExtractor(tagger)
else:
self[key] = self[LANGUAGES[key].iso3]
return self[key]
# this below will be shared within the current thread
ngramsextractors = NgramsExtractors()
......@@ -3,13 +3,13 @@ When started, it initiates the parser;
when passed text, the text is piped to the parser.
When ended, the parser is closed and the tagged word returned as a tuple.
"""
from constants import RULE_JJNN, DEFAULT_MAX_NGRAM_LEN
import re
import nltk
class Tagger:
def __init__(self):
def __init__(self, text):
# This regular expression is really good at tokenizing a text!
self._re_sentence = re.compile(r'''(?x) # set flag to allow verbose regexps
(?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
......@@ -19,8 +19,29 @@ class Tagger:
| [][.,;"'?!():-_`] # these are separate tokens
''', re.UNICODE | re.MULTILINE | re.DOTALL)
self.buffer = []
self.text = clean_text(text)
self.start()
def clean_text(text):
"""Clean the text for better POS tagging.
For now, only removes (short) XML tags.
"""
return re.sub(r'<[^>]{0,45}>', '', text)
def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
text = self.clean_text(text)
grammar = nltk.RegexpParser(label + ': ' + rule)
tagged_tokens = list(self.tag_text(self.text))
if len(tagged_tokens):
grammar_parsed = grammar.parse(tagged_tokens)
for subtree in grammar_parsed.subtrees():
if subtree.label() == label:
if len(subtree) < max_n_words:
yield subtree.leaves()
# ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
def __del__(self):
self.stop()
......@@ -29,6 +50,7 @@ class Tagger:
This method is called by the constructor, and can be overriden by
inherited classes.
"""
self.extract(self.text)
def stop(self):
"""Ends the tagger.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment