Commit 11fadebb authored by c24b

TAGGER .extract(text)

parent 9ee3d71d
......@@ -148,7 +148,7 @@ RESOURCETYPES = [
'name': 'Europresse',
'format': 'Europresse',
'parser': "EuropresseParser",
'file_formats':["zip"],
'file_formats':["zip", "txt"],
'crawler': None,
'default_languages': ['en', 'fr'],
},
......@@ -156,7 +156,7 @@ RESOURCETYPES = [
'name': 'Jstor [RIS]',
'format': 'RIS',
'parser': "RISParser",
'file_formats':["zip"],
'file_formats':["zip", "txt"],
'crawler': None,
'default_languages': ['en'],
},
......@@ -172,7 +172,7 @@ RESOURCETYPES = [
'name': 'Scopus [RIS]',
'format': 'RIS',
'parser': "RISParser",
'file_formats':["zip"],
'file_formats':["zip", "txt"],
'crawler': None,
'default_languages': ['en'],
},
......@@ -180,7 +180,7 @@ RESOURCETYPES = [
'name': 'Web of Science [ISI]',
'format': 'ISI',
'parser': "ISIParser",
'file_formats':["zip"],
'file_formats':["zip", "txt"],
#'crawler': "ISICrawler",
'crawler': None,
'default_languages': ['en'],
......@@ -205,7 +205,7 @@ RESOURCETYPES = [
'name': 'ISTex',
'format': 'json',
'parser': "ISTexParser",
'file_formats':["zip"],
'file_formats':["zip", "txt"],
'crawler': None,
'default_languages': ['en', 'fr'],
},
......
......@@ -10,8 +10,20 @@ class NltkTagger(Tagger):
self.tagr = PerceptronTagger()
super(self.__class__, self).__init__(*args, **kwargs)
#~ def __start__(self):
#def __start__(self):
#~ self.tagr = PerceptronTagger()
def tag_tokens(self, tokens, single=True):
    """Tag an already-tokenized sequence with the wrapped tagger.

    The call is forwarded to ``self.tagr.tag`` and its result is returned
    unchanged. ``single`` is accepted for interface compatibility but is
    not used by this implementation.
    """
    tagged = self.tagr.tag(tokens)
    return tagged
def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
    """Yield the tagged leaves of every chunk in *text* that matches *rule*.

    The text is cleaned with ``self.clean_text`` (the cleaned version is
    kept on ``self.text``), tagged with ``self.tag_text``, then chunked by
    an ``nltk.RegexpParser`` built from ``label + ': ' + rule``.

    Each yielded item is the list of ``(token, tag)`` leaves of one subtree
    labelled *label*, e.g. ``[('wild', 'JJ'), ('pollinators', 'NNS')]``.
    Nothing is yielded when tagging produces no tokens.
    """
    self.text = self.clean_text(text)
    chunker = nltk.RegexpParser(label + ': ' + rule)
    tagged = list(self.tag_text(self.text))
    if not tagged:
        return
    for chunk in chunker.parse(tagged).subtrees():
        # NOTE(review): the strict `<` drops chunks of exactly
        # `max_n_words` tokens -- confirm this is intended.
        if chunk.label() == label and len(chunk) < max_n_words:
            yield chunk.leaves()
......@@ -74,6 +74,7 @@ class TreeTagger(Tagger):
self._input, self._output = self._popen.stdin, self._popen.stdout
# self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start()
# self.buffer = OutputBuffer()
# self.extract(self.text)
def stop(self):
# terminates the 'treetagger' process
......
......@@ -6,12 +6,13 @@ class TurboTagger:
def start(self):
    """Create the NLP client and keep it on the instance as ``_nlpclient``."""
    client = NLPClient()
    self._nlpclient = client
    #self.extract(self.text)
def stop(self):
    """Drop the NLP client reference, if one is currently held.

    Safe to call when no client was created: in that case nothing happens.
    """
    if not hasattr(self, '_nlpclient'):
        return
    del self._nlpclient
def tag_text(self, text):
def extract(self, text):
if not hasattr(self, '_nlpclient'):
self._nlpclient = NLPClient()
try:
......
......@@ -19,8 +19,7 @@ class Tagger:
| [][.,;"'?!():-_`] # these are separate tokens
''', re.UNICODE | re.MULTILINE | re.DOTALL)
self.buffer = []
#self.start()
self.start()
def clean_text(self, text):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment