Commit eb15d263 authored by delanoe's avatar delanoe

[NGRAMS EXTRACTOR] rules in constant, back to a simple rule for debugging.

parent 3b281cbc
......@@ -215,3 +215,11 @@ BATCH_NGRAMSEXTRACTION_SIZE = 1024
# Scrapers config
QUERY_SIZE_N_MAX = 1000
QUERY_SIZE_N_DEFAULT = 1000
# Grammar rules for chunking
RULE_JJNN = "{<JJ.*>*<NN.*|>+<JJ.*>*}"
RULE_JJDTNN = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
RULE_TINA = "^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)\
+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?\
,){0,2}?(N.?.?,|\?,)+?)+?)*?$"
from gargantext.util.languages import languages
from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN
from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_JJDTNN
import nltk
import re
class NgramsExtractor:
def __init__(self, tagger):
......@@ -17,7 +15,7 @@ class NgramsExtractor:
"""
return re.sub(r'<[^>]{0,45}>', '', text)
def extract(self, text, rule='{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}', label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
text = self.clean_text(text)
grammar = nltk.RegexpParser(label + ': ' + rule)
tagged_tokens = list(self._tagger.tag_text(text))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment