Commit e8d5e001 authored by Romain Loth

parsing+extraction: removing old debug messages

parent 5ce424f9
@@ -115,8 +115,6 @@ class EuropresseParser(Parser):
         # parse all the articles, one by one
         for html_article in html_articles:
             try:
-                print("==============================new article")
                 # if there is no header at all, we must skip
                 all_header = html_article.xpath(entire_header_xpath)
                 all_header_text = " ".join(scrap_text(all_header))
...
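Note on the surrounding code: the loop skips any article that has no header at all. A minimal standalone sketch of that pattern with lxml, where `entire_header_xpath` and `scrap_text` are assumptions standing in for the project's real xpath and helper:

```python
from lxml import html

def scrap_text(nodes):
    # assumed helper: collect the text content of the matched nodes
    return [node.text_content().strip() for node in nodes]

entire_header_xpath = ".//div[@class='header']"  # hypothetical; the real xpath is built upstream

def parse_articles(html_articles):
    for html_article in html_articles:
        # if there is no header at all, skip this article
        all_header = html_article.xpath(entire_header_xpath)
        if not all_header:
            continue
        yield " ".join(scrap_text(all_header))

print(list(parse_articles([html.fromstring(
    "<div><div class='header'>Le Monde 2017</div><p>body</p></div>")])))
```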
@@ -27,7 +27,6 @@ class ISTexParser(Parser):
         }
         suma = 0
-        print(len(json_docs))
         for json_doc in json_docs:
             hyperdata = {}
...
@@ -32,7 +32,7 @@ class Tagger:
         self.text = self.clean_text(text)
         grammar = nltk.RegexpParser(label + ': ' + rule)
         tagged_tokens = list(self.tag_text(self.text))
-        print("the tagged_tokens", tagged_tokens)
+        # print("the tagged_tokens", tagged_tokens)
         if len(tagged_tokens):
             grammar_parsed = grammar.parse(tagged_tokens)
             for subtree in grammar_parsed.subtrees():
...
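The Tagger change only mutes a print; the interesting part is the chunking around it: `nltk.RegexpParser` compiles a `label + rule` string and extracts matching subtrees from the POS-tagged tokens. A self-contained sketch with an illustrative noun-phrase rule:

```python
import nltk

label, rule = 'NP', '{<JJ>*<NN.*>+}'  # illustrative noun-phrase rule
grammar = nltk.RegexpParser(label + ': ' + rule)

tagged_tokens = [('old', 'JJ'), ('debug', 'NN'), ('messages', 'NNS')]
if len(tagged_tokens):
    grammar_parsed = grammar.parse(tagged_tokens)
    for subtree in grammar_parsed.subtrees():
        if subtree.label() == label:
            # the chunked tokens form one candidate ngram
            print(' '.join(tok for tok, tag in subtree.leaves()))
```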
@@ -11,7 +11,8 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
     £TODO: load whole word dictionary in ram and check existence before inserting to db => sequential insert => probably faster!
     """
-    print('INTEGRATE', len(ngrams_data), len(nodes_ngrams_count))
+    # print('INTEGRATE', len(ngrams_data), len(nodes_ngrams_count))
+    print('INTEGRATE')
     # integrate ngrams (aka new words)
     ngrams_ids = bulk_insert_ifnotexists(
         model = Ngram, # todo type should :str ~~> :str|:re) !!!
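The £TODO in the docstring suggests an optimization: keep the whole word dictionary in RAM and test membership before inserting, so only genuinely new ngrams hit the db. A sketch of that idea, with `insert_row` as a stand-in for the real insert:

```python
known_ngrams = {}  # terms -> ngram id, loaded once into RAM

def integrate_ngrams(ngrams_data, insert_row):
    # only touch the db for genuinely new words
    for terms in ngrams_data:
        if terms not in known_ngrams:
            known_ngrams[terms] = insert_row(terms)  # sequential insert of new words
    return {terms: known_ngrams[terms] for terms in ngrams_data}

# usage with a fake insert that just hands out ids
ids = iter(range(1, 1000))
print(integrate_ngrams({'debug', 'message'}, lambda terms: next(ids)))
```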
@@ -118,7 +119,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
             # integrate ngrams and nodes-ngrams
             if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
-                print(len(nodes_ngrams_count), ">=", BATCH_NGRAMSEXTRACTION_SIZE)
+                # print(len(nodes_ngrams_count), ">=", BATCH_NGRAMSEXTRACTION_SIZE)
                 _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
                 nodes_ngrams_count.clear()
                 ngrams_data.clear()
...
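This hunk is the batch-flush pattern: counts accumulate until `BATCH_NGRAMSEXTRACTION_SIZE`, get handed to `_integrate_associations`, and the buffers are cleared. A generic sketch (the batch size value is assumed; `integrate` stands in for the db call):

```python
BATCH_NGRAMSEXTRACTION_SIZE = 4096  # assumed value; configured elsewhere in the project

def extract(items, integrate):
    nodes_ngrams_count = {}  # (node_id, ngram) -> count
    ngrams_data = set()      # distinct ngram strings seen in this batch
    for node_id, ngram in items:
        key = (node_id, ngram)
        nodes_ngrams_count[key] = nodes_ngrams_count.get(key, 0) + 1
        ngrams_data.add(ngram)
        # integrate ngrams and nodes-ngrams once the threshold is reached
        if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
            integrate(nodes_ngrams_count, ngrams_data)
            nodes_ngrams_count.clear()
            ngrams_data.clear()
    if nodes_ngrams_count:
        integrate(nodes_ngrams_count, ngrams_data)  # final partial batch
```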
@@ -68,7 +68,8 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
             lang_result['skipped'].append(hyperdata["language_name"])
     else:
-        print("[WARNING] no language_* found in document [parsing.py]")
+        print("WARNING no language_* found in document [parsing.py] => "
+              + ("(detecting)" if DETECT_LANG else "(using default)"))
         if DETECT_LANG:
             # no language has been indexed
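The rewritten warning spells out what happens next: with `DETECT_LANG` set the text goes through language detection, otherwise the default language is used. A sketch of that branch, assuming the `langdetect` package and a `DEFAULT_LANGUAGE` constant (the project's actual detector may differ):

```python
DETECT_LANG = True       # project setting toggled in the config
DEFAULT_LANGUAGE = 'en'  # assumed default

def guess_language(text):
    if DETECT_LANG:
        # no language has been indexed: detect one from the text itself
        from langdetect import detect  # assumed detector, not necessarily the project's
        return detect(text)
    return DEFAULT_LANGUAGE

print(guess_language("Ceci est un document en français."))  # -> 'fr'
```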
@@ -93,7 +94,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
 def parse(corpus):
     try:
         print("PARSING")
-        print("DETECT_LANG?", DETECT_LANG)
+        # print("DETECT_LANG?", DETECT_LANG)
         corpus.status('Docs', progress=0)
         #1 corpus => 1 or multi resources.path (for crawlers)
         resources = corpus.resources()
@@ -107,7 +108,9 @@ def parse(corpus):
                 #corpus.status(error)
                 raise ValueError("Resource '%s' has no Parser" %resource["name"])
             parserbot = load_parser(source)
-            print(parserbot)
+            # print(parserbot)
             #observed languages in default languages
             observed_languages = []
             #skipped_languages
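`load_parser(source)` resolves a resource's declared source to a Parser class, and the ValueError above covers sources without one. A hypothetical registry-style sketch of that dispatch (the real mapping and names live in the project's constants):

```python
class EuropresseParser: ...
class ISTexParser: ...

PARSERS = {'Europresse': EuropresseParser, 'ISTex': ISTexParser}  # hypothetical registry

def load_parser(source):
    parser_cls = PARSERS.get(source['name'])
    if parser_cls is None:
        # mirrors the ValueError raised at the call site in the diff
        raise ValueError("Resource '%s' has no Parser" % source['name'])
    return parser_cls()

parserbot = load_parser({'name': 'Europresse'})
```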
@@ -218,10 +221,10 @@ def parse(corpus):
             # the nice iso2 codes
             observed_langs = dict(Counter(observed_languages))
-            print("#LANGAGES OK")
-            print(observed_langs)
-            print("#LANGUAGES UNKNOWN")
-            print(skipped_langs)
+            # print("#LANGAGES OK")
+            # print(observed_langs)
+            # print("#LANGUAGES UNKNOWN")
+            # print(skipped_langs)
             top_langs = sorted(observed_langs.items(), key = lambda x: x[1], reverse=True)
             if len(top_langs) > 0:
...
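The final hunk tallies the observed iso2 codes with `collections.Counter` and ranks them; the four muted prints used to dump those dicts. A minimal sketch of the tally-and-rank step:

```python
from collections import Counter

observed_languages = ['en', 'fr', 'en', 'de', 'en', 'fr']  # example iso2 codes
observed_langs = dict(Counter(observed_languages))
top_langs = sorted(observed_langs.items(), key=lambda x: x[1], reverse=True)
if len(top_langs) > 0:
    corpus_language = top_langs[0][0]  # the most frequent language wins: 'en'
print(top_langs)  # [('en', 3), ('fr', 2), ('de', 1)]
```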