Adding a direct call to Tagger, getting rid of the ngramsextractors wrapper

0f26d8a2 · c24b · 403913fc · 0f26d8a2 · 0f26d8a2
Commit 0f26d8a2 authored Jul 27, 2016 by c24b
Hide whitespace changes
Inline Side-by-side

Showing with 70 additions and 3 deletions

ngramsextractors.py gargantext/util/ngramsextractors.py +45 -0

_Tagger.py gargantext/util/taggers/_Tagger.py +25 -3

No files found.
--- a/gargantext/util/ngramsextractors.py
+++ b/gargantext/util/ngramsextractors.py
+from gargantext.util.languages import languages
+from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_NPN
+
+import nltk
+import re
+class NgramsExtractor:
+
+    def __init__(self, tagger):
+        self._tagger = tagger()
+
+    @staticmethod
+    def clean_text(text):
+        """Clean the text for better POS tagging.
+        For now, only removes (short) XML tags.
+        """
+        return re.sub(r'<[^>]{0,45}>', '', text)
+
+    def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
+        text = self.clean_text(text)
+        grammar = nltk.RegexpParser(label + ': ' + rule)
+        tagged_tokens = list(self._tagger.tag_text(text))
+        if len(tagged_tokens):
+            grammar_parsed = grammar.parse(tagged_tokens)
+            for subtree in grammar_parsed.subtrees():
+                if subtree.label() == label:
+                    if len(subtree) < max_n_words:
+                        yield subtree.leaves()
+                            # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
+
+
+class NgramsExtractors(dict):
+
+    def __missing__(self, key):
+        if not isinstance(key, str):
+            raise KeyError
+        if len(key) == 2 and key == key.lower():
+            tagger = LANGUAGES[key]['tagger']
+            self[key] = NgramsExtractor(tagger)
+        else:
+            self[key] = self[LANGUAGES[key].iso3]
+        return self[key]
+
+
+# the instance below will be shared within the current thread
+ngramsextractors = NgramsExtractors()
--- a/gargantext/util/taggers/_Tagger.py
+++ b/gargantext/util/taggers/_Tagger.py
@@ -3,13 +3,13 @@ When started, it initiates the parser;
 when passed text, the text is piped to the parser.
 When ended, the parser is closed and the tagged word returned as a tuple.
 """
-
+from constants import RULE_JJNN, DEFAULT_MAX_NGRAM_LEN
 import re
-
+import nltk

 class Tagger:

-    def __init__(self):
+    def __init__(self, text):
        # This regular expression is really good at tokenizing a text!
        self._re_sentence = re.compile(r'''(?x)  # set flag to allow verbose regexps
            (?:[A-Z])(?:\.[A-Z])+\.?        # abbreviations, e.g. U.S.A.
@@ -19,8 +19,29 @@ class Tagger:
            | [][.,;"'?!():-_`]             # these are separate tokens
            ''', re.UNICODE | re.MULTILINE | re.DOTALL)
        self.buffer = []
+        self.text = clean_text(text)
        self.start()

+
+    def clean_text(text):
+        """Clean the text for better POS tagging.
+        For now, only removes (short) XML tags.
+        """
+        return re.sub(r'<[^>]{0,45}>', '', text)
+
+    def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
+        text = self.clean_text(text)
+        grammar = nltk.RegexpParser(label + ': ' + rule)
+        tagged_tokens = list(self.tag_text(self.text))
+        if len(tagged_tokens):
+            grammar_parsed = grammar.parse(tagged_tokens)
+            for subtree in grammar_parsed.subtrees():
+                if subtree.label() == label:
+                    if len(subtree) < max_n_words:
+                        yield subtree.leaves()
+                            # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
+
+
    def __del__(self):
        self.stop()

@@ -29,6 +50,7 @@ class Tagger:
        This method is called by the constructor, and can be overridden by
        inherited classes.
        """
+        self.extract(self.text)

    def stop(self):
        """Ends the tagger.