Commit 4211b1e2 authored by c24b

Revert taggers modifications

parent aa70aaa8
@@ -8,22 +8,10 @@ class NltkTagger(Tagger):
#import nltk
def __init__(self, *args, **kwargs):
self.tagr = PerceptronTagger()
#super(self.__class__, self).__init__(*args, **kwargs)
super(self.__class__, self).__init__(*args, **kwargs)
#def __start__(self):
#~ def __start__(self):
#~ self.tagr = PerceptronTagger()
def tag_tokens(self, tokens, single=True):
return self.tagr.tag(tokens)
# def extract(self, text, rule=RULE_JJNN, label='NP', max_n_words=DEFAULT_MAX_NGRAM_LEN):
# self.text = self.clean_text(text)
# grammar = nltk.RegexpParser(label + ': ' + rule)
# tagged_tokens = list(self.tag_text(self.text))
# if len(tagged_tokens):
# grammar_parsed = grammar.parse(tagged_tokens)
# for subtree in grammar_parsed.subtrees():
# if subtree.label() == label:
# if len(subtree) < max_n_words:
# yield subtree.leaves()
# # ex: [('wild', 'JJ'), ('pollinators', 'NNS')]
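For readability, a minimal sketch of the NltkTagger shape this hunk works with, keeping the active super().__init__ call and dropping the commented-out extract() draft. The Tagger base class comes from the project; its import path below is an assumption, not taken from the diff.

```python
from nltk.tag.perceptron import PerceptronTagger  # NLTK's averaged perceptron POS tagger

# from .Tagger import Tagger  # project base class; exact import path is an assumption


class NltkTagger(Tagger):
    def __init__(self, *args, **kwargs):
        self.tagr = PerceptronTagger()
        # initialise the shared Tagger machinery
        super(self.__class__, self).__init__(*args, **kwargs)

    def tag_tokens(self, tokens, single=True):
        # returns (token, POS) pairs, e.g. [('wild', 'JJ'), ('pollinators', 'NNS')]
        return self.tagr.tag(tokens)
```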
@@ -74,7 +74,6 @@ class TreeTagger(Tagger):
self._input, self._output = self._popen.stdin, self._popen.stdout
# self._thread = threading.Thread(target=_readOutput, args=(self._output, self.buffer, )).start()
# self.buffer = OutputBuffer()
# self.extract(self.text)
def stop(self):
# terminates the 'treetagger' process
......
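The TreeTagger hunk only shows the stdin/stdout wiring and the stop() comment, so the following is a hedged sketch of the start/stop pattern it implies: a long-lived 'treetagger' subprocess spoken to over pipes. The command line, its options, and the Tagger base class are assumptions; only _popen, _input, _output and the stop() comment come from the diff.

```python
import subprocess


class TreeTagger(Tagger):  # Tagger is the project's base class (assumed)
    def start(self):
        # the real command and options are not visible in this diff
        self._popen = subprocess.Popen(
            ["tree-tagger"],          # placeholder command (assumption)
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        self._input, self._output = self._popen.stdin, self._popen.stdout

    def stop(self):
        # terminates the 'treetagger' process
        self._popen.kill()
        self._popen.wait()
```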
@@ -6,13 +6,12 @@ class TurboTagger:
def start(self):
self._nlpclient = NLPClient()
#self.extract(self.text)
def stop(self):
if hasattr(self, '_nlpclient'):
del self._nlpclient
def extract(self, text):
def tag_text(self, text):
if not hasattr(self, '_nlpclient'):
self._nlpclient = NLPClient()
try:
......
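After this hunk, TurboTagger exposes start/stop plus a tag_text method. A hedged sketch of that shape, assuming NLPClient is the project's NLP-server client; the body of tag_text is cut off by the collapsed context, so only the lazy-initialisation part is shown.

```python
class TurboTagger:
    def start(self):
        self._nlpclient = NLPClient()  # project NLP-server client (import assumed)

    def stop(self):
        # drop the client so a later tag_text() call recreates it lazily
        if hasattr(self, '_nlpclient'):
            del self._nlpclient

    def tag_text(self, text):
        if not hasattr(self, '_nlpclient'):
            self._nlpclient = NLPClient()
        # ... the actual tagging call lives in the part of the hunk collapsed above ...
```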
@@ -19,7 +19,8 @@ class Tagger:
| [][.,;"'?!():-_`] # these are separate tokens
''', re.UNICODE | re.MULTILINE | re.DOTALL)
self.buffer = []
self.start()
#self.start()
def clean_text(self, text):
......
@@ -9,7 +9,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
"""
@param ngrams_data a set like {('single word', 2), ('apple', 1),...}
"""
#print('INTEGRATE')
print('INTEGRATE')
# integrate ngrams
ngrams_ids = bulk_insert_ifnotexists(
model = Ngram,
@@ -59,7 +59,6 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
#sort docs by lang?
# for lang, tagger in tagger_bots.items():
for documents_count, document in enumerate(corpus.children('DOCUMENT')):
if document.id not in corpus.hyperdata["skipped_docs"]:
language_iso2 = document.hyperdata.get('language_iso2')
if language_iso2 not in supported_taggers_lang:
@@ -73,7 +72,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
else:
tagger = tagger_bots[language_iso2]
print(tagger)
#print(language_iso2)
#>>> romain-stable-patch
#to do verify if document has no KEYS to index
@@ -109,10 +108,11 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
# integrate ngrams and nodes-ngrams
if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
print(len(nodes_ngrams_count),">=", BATCH_NGRAMSEXTRACTION_SIZE)
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
nodes_ngrams_count.clear()
ngrams_data.clear()
if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
if documents_count % BATCH_PARSING_SIZE == 0:
corpus.status('Ngrams', progress=documents_count+1)
corpus.save_hyperdata()
session.add(corpus)
......
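The extract_ngrams hunks revolve around one batching pattern: per-document n-gram associations accumulate in nodes_ngrams_count / ngrams_data, get flushed in bulk once BATCH_NGRAMSEXTRACTION_SIZE is reached, and progress is reported every BATCH_PARSING_SIZE documents. A hedged sketch of that inner loop, using only names visible in the diff; the tagging and counting itself is elided.

```python
for documents_count, document in enumerate(corpus.children('DOCUMENT')):
    # ... pick tagger_bots[language_iso2], tag the indexed fields,
    #     and fill nodes_ngrams_count / ngrams_data (elided in this diff) ...

    # flush the accumulated associations in bulk once the batch is full
    if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
        _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
        nodes_ngrams_count.clear()
        ngrams_data.clear()

    # periodic progress report on the corpus node
    if documents_count % BATCH_PARSING_SIZE == 0:
        corpus.status('Ngrams', progress=documents_count + 1)
        corpus.save_hyperdata()
        session.add(corpus)
        # the matching commit sits in the context collapsed after this hunk
```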
@@ -99,15 +99,14 @@ def parse(corpus):
#skipped docs to remember for later processing
skipped_docs = []
documents_count = 0
#BY RESOURCE
for i,resource in enumerate(resources):
if resource["extracted"] is True:
continue
else:
# BY documents
d = 0
for documents_count, hyperdata in enumerate(parserbot(resource["path"])):
for hyperdata in parserbot(resource["path"]):
# indexed text fields defined in CONSTANTS
for k in DEFAULT_INDEX_FIELDS:
if k in hyperdata.keys():
@@ -126,32 +125,45 @@ def parse(corpus):
name = hyperdata.get('title', '')[:255],
hyperdata = hyperdata,
)
#corpus.save_hyperdata()
# session.add(document)
# session.commit()
session.add(document)
session.commit()
if "error" in hyperdata.keys():
#document.status("error")
document.status('Parsing', error= document.hyperdata["error"])
document.save_hyperdata()
#document.status('Parsing', error= document.hyperdata["error"])
#document.save_hyperdata()
#session.add(document)
#session.commit()
#adding skipped_docs for later processsing if error in parsing
skipped_docs.append(document.id)
#BATCH_PARSING_SIZE
if documents_count % BATCH_PARSING_SIZE == 0:
corpus.status('Docs', progress=documents_count)
corpus.save_hyperdata()
#session.add(corpus)
#session.commit()
session.add(corpus)
session.commit()
documents_count += 1
# update info about the resource
resource['extracted'] = True
#print( "resource n°",i, ":", d, "docs inside this file")
#finally store documents for this corpus
session.add(corpus)
session.commit()
#finally store documents for this corpus
corpus.status('Parsing', progress=documents_count+1, complete=True)
#corpus.status('Parsing', complete =True)
corpus.save_hyperdata()
#session.add(corpus)
#session.commit()
#adding parsing error to document level
for node_id in skipped_docs:
node = session.query(Node).filter(Node.id== node_id).first()
node.status("Parsing", "Error in parsing")
node.save_hyperdata()
#session.flush()
#skipped_nodes = session.query(Node).filter(Node.id.in_(skipped_docs)).all()
#mods = [node.status('Parsing', "Error in parsing:skipped") for node in skipped_nodes]
#STORING AGREGATIONS INFO (STATS)
#skipped_docs
......
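Read together, the parse() hunks switch document counting from enumerate() to a manual documents_count += 1 (so the count keeps accumulating across resources), commit each document as it is created, collect the ids of documents whose hyperdata carries an "error" key into skipped_docs, and mark those documents afterwards. A hedged sketch of that flow, using only names that appear in the diff; the document constructor itself is truncated there, so a hypothetical make_document() helper stands in for it.

```python
documents_count = 0
skipped_docs = []          # docs to revisit later because parsing reported an error

for i, resource in enumerate(resources):
    if resource["extracted"] is True:
        continue
    for hyperdata in parserbot(resource["path"]):
        # ... indexed text fields from DEFAULT_INDEX_FIELDS handled here (truncated in the diff) ...
        document = make_document(                   # hypothetical stand-in for the
            name=hyperdata.get('title', '')[:255],  # constructor truncated in the diff
            hyperdata=hyperdata,
        )
        session.add(document)
        session.commit()

        if "error" in hyperdata.keys():
            skipped_docs.append(document.id)

        if documents_count % BATCH_PARSING_SIZE == 0:
            corpus.status('Docs', progress=documents_count)
            corpus.save_hyperdata()
            session.add(corpus)
            session.commit()
        documents_count += 1

    # update info about the resource
    resource['extracted'] = True

# finally store documents for this corpus
session.add(corpus)
session.commit()
corpus.status('Parsing', progress=documents_count + 1, complete=True)
corpus.save_hyperdata()

# flag the documents that were skipped because of a parsing error
for node_id in skipped_docs:
    node = session.query(Node).filter(Node.id == node_id).first()
    node.status("Parsing", "Error in parsing")
    node.save_hyperdata()
```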