Commit 0f26d8a2 authored by c24b's avatar c24b

Adding a direct call to Tagger, getting rid of the ngramsextractors wrapper

parent 403913fc
from gargantext.util.languages import languages
from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_NPN
import nltk
import re
class NgramsExtractor:
    """Extract n-gram chunks (e.g. noun phrases) from raw text via a POS tagger.

    The tagger class passed to the constructor is instantiated once and
    reused for every call to ``extract``.
    """

    def __init__(self, tagger):
        # `tagger` is a class, not an instance: build one tagger per extractor.
        self._tagger = tagger()

    @staticmethod
    def clean_text(text):
        """Clean the text for better POS tagging.

        For now, only removes (short) XML tags.
        """
        return re.sub(r'<[^>]{0,45}>', '', text)

    def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
        """Yield the leaves of each chunk matched by `rule` and labelled `label`.

        Parameters:
            text: raw input text; short XML tags are stripped before tagging.
            rule: chunking rule handed to nltk.RegexpParser.
            label: chunk label whose subtrees are collected.
            max_n_words: only chunks strictly shorter than this are yielded.

        Yields:
            Lists of (token, POS tag) pairs,
            e.g. [('wild', 'JJ'), ('pollinators', 'NNS')].
        """
        tagged_tokens = list(self._tagger.tag_text(self.clean_text(text)))
        if not tagged_tokens:
            # Nothing to parse: skip building the grammar entirely.
            return
        grammar = nltk.RegexpParser(label + ': ' + rule)
        for subtree in grammar.parse(tagged_tokens).subtrees():
            # Keep only reasonably short chunks carrying the requested label.
            if subtree.label() == label and len(subtree) < max_n_words:
                yield subtree.leaves()
class NgramsExtractors(dict):
    """Lazy cache of NgramsExtractor instances, keyed by language code.

    A lowercase 2-letter key builds a fresh extractor from that language's
    configured tagger; any other string key is resolved through the
    language's ``iso3`` code, so aliases share a single extractor instance.

    Raises:
        KeyError: if the key is not a string, or if the language is unknown
            to ``LANGUAGES``.
    """

    def __missing__(self, key):
        # Only string language codes are valid keys.
        if not isinstance(key, str):
            # Include the offending key so the error is diagnosable.
            raise KeyError(key)
        if len(key) == 2 and key == key.lower():
            tagger = LANGUAGES[key]['tagger']
            self[key] = NgramsExtractor(tagger)
        else:
            # Alias: delegate to the canonical iso3 entry (re-entering
            # __missing__ if that entry has not been built yet).
            self[key] = self[LANGUAGES[key].iso3]
        return self[key]
# Module-level instance: lazily builds and caches one NgramsExtractor per
# language key on first access, so repeated lookups reuse the same extractor.
# NOTE(review): the original comment said this is "shared within the current
# thread" — as a module global it is module-level state; confirm the intended
# threading model with the callers.
ngramsextractors = NgramsExtractors()
...@@ -3,13 +3,13 @@ When started, it initiates the parser; ...@@ -3,13 +3,13 @@ When started, it initiates the parser;
when passed text, the text is piped to the parser. when passed text, the text is piped to the parser.
When ended, the parser is closed and the tagged word returned as a tuple. When ended, the parser is closed and the tagged word returned as a tuple.
""" """
from constants import RULE_JJNN, DEFAULT_MAX_NGRAM_LEN
import re import re
import nltk
class Tagger: class Tagger:
def __init__(self): def __init__(self, text):
# This regular expression is really good at tokenizing a text! # This regular expression is really good at tokenizing a text!
self._re_sentence = re.compile(r'''(?x) # set flag to allow verbose regexps self._re_sentence = re.compile(r'''(?x) # set flag to allow verbose regexps
(?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A. (?:[A-Z])(?:\.[A-Z])+\.? # abbreviations, e.g. U.S.A.
...@@ -19,8 +19,29 @@ class Tagger: ...@@ -19,8 +19,29 @@ class Tagger:
| [][.,;"'?!():-_`] # these are separate tokens | [][.,;"'?!():-_`] # these are separate tokens
''', re.UNICODE | re.MULTILINE | re.DOTALL) ''', re.UNICODE | re.MULTILINE | re.DOTALL)
self.buffer = [] self.buffer = []
self.text = clean_text(text)
self.start() self.start()
def clean_text(text):
"""Clean the text for better POS tagging.
For now, only removes (short) XML tags.
"""
return re.sub(r'<[^>]{0,45}>', '', text)
def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
text = self.clean_text(text)
grammar = nltk.RegexpParser(label + ': ' + rule)
tagged_tokens = list(self.tag_text(self.text))
if len(tagged_tokens):
grammar_parsed = grammar.parse(tagged_tokens)
for subtree in grammar_parsed.subtrees():
if subtree.label() == label:
if len(subtree) < max_n_words:
yield subtree.leaves()
# ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
def __del__(self): def __del__(self):
self.stop() self.stop()
...@@ -29,6 +50,7 @@ class Tagger: ...@@ -29,6 +50,7 @@ class Tagger:
This method is called by the constructor, and can be overriden by This method is called by the constructor, and can be overriden by
inherited classes. inherited classes.
""" """
self.extract(self.text)
def stop(self): def stop(self):
"""Ends the tagger. """Ends the tagger.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment