[NGRAMS EXTRACTOR] Move chunking rules into constants; revert to a simple default rule for debugging.

eb15d263 · delanoe · 3b281cbc · eb15d263 · eb15d263
Commit eb15d263 authored May 03, 2016 by delanoe
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 4 deletions

constants.py gargantext/constants.py +8 -0

ngramsextractors.py gargantext/util/ngramsextractors.py +2 -4

No files found.
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -215,3 +215,11 @@ BATCH_NGRAMSEXTRACTION_SIZE = 1024
 # Scrapers config
 QUERY_SIZE_N_MAX     = 1000
 QUERY_SIZE_N_DEFAULT = 1000
+# Grammar rules for chunking
+RULE_JJNN   = "{<JJ.*>*<NN.*|>+<JJ.*>*}"
+RULE_JJDTNN = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
+RULE_TINA   = "^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)\
+               +?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?\
+               ,){0,2}?(N.?.?,|\?,)+?)+?)*?$"
--- a/gargantext/util/ngramsextractors.py
+++ b/gargantext/util/ngramsextractors.py
 from gargantext.util.languages import languages
-from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN
+from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_JJDTNN
 import nltk
 import re
 class NgramsExtractor:
    def __init__(self, tagger):
@@ -17,7 +15,7 @@ class NgramsExtractor:
        """
        return re.sub(r'<[^>]{0,45}>', '', text)
-    def extract(self, text, rule='{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}', label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
+    def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
        text = self.clean_text(text)
        grammar = nltk.RegexpParser(label + ': ' + rule)
        tagged_tokens = list(self._tagger.tag_text(text))