Commit 403913fc authored by c24b

filtering out ignored docs

parent 961ba2d8
@@ -36,7 +36,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
     db.commit()

-def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_INDEX_SUBGRAMS):
+def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams=DEFAULT_INDEX_SUBGRAMS):
     """Extract ngrams for every document below the given corpus.
     Default language is given by the resource type.
     The result is then inserted into the database.
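For context: the new keys default replaces the hard-coded ('title', 'abstract', ) tuple with a shared constant. A minimal sketch of the constants involved; the names DEFAULT_INDEX_FIELDS and DEFAULT_INDEX_SUBGRAMS come from the diff, while the values shown are assumptions inferred from the old default:

# Hypothetical constants module; the names come from the diff,
# the values are assumptions inferred from the previous signature.
DEFAULT_INDEX_FIELDS = ('title', 'abstract')  # hyperdata keys to run extraction on
DEFAULT_INDEX_SUBGRAMS = False                # whether to also index sub-ngrams
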
@@ -52,38 +52,12 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_INDEX_SUBGRAMS):
     documents_count = 0
     # load available taggers for the source's default languages
     tagger_bots = {lang: load_tagger(lang) for lang in source['default_languages']}
-    # skip documents that have an unsupported language
-    corpus.skipped_docs = [doc.id for doc in corpus.children('DOCUMENT') if doc.hyperdata["language_iso2"] not in source["default_languages"]]
-    print(corpus.hyperdata["languages"])
-    # add it to the corpus.Language info
-    # diff = set(corpus.hyperdata["languages"].keys()) - set(source["default_languages"])
-    # if len(diff) > 1:
-    #     if lang_doc in corpus.hyperdata['languages']:
-    #         skipped_lg_infos = corpus.hyperdata['languages'].pop(lang_doc)
-    #         corpus.hyperdata['languages']['__skipped__'][lang_doc] = skipped_lg_infos
-    #         corpus.save_hyperdata()
-    #         session.commit()
-    #         continue
-    for documents_count, document in enumerate(corpus.children('DOCUMENT')):
+    # skip documents flagged previously for a parsing error or an unsupported language
+    docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
+    #print(corpus.hyperdata["languages"])
+    for documents_count, document in enumerate(docs):
         lang_doc = document.hyperdata['language_iso2']
-        if document not in corpus.skipped_docs:
-        # if document.id in corpus.skipped_docs:
-        #     # get the language of the current document
-        #     # skip the document
-        #     print('Unsupported language: `%s` (doc #%i)' % (lang_doc, document.id))
-        #     # and remember that for later processes (e.g. stemming)
-        #     #document.hyperdata['__skipped__'] = 'ngrams_extraction'
-        #     #document.save_hyperdata()
-        #     #session.commit()
-        #     continue
-        #
-        #
-        # else:
-        # extract ngrams on each of the considered keys
         ngramextractor = tagger_bots[lang_doc]
         for key in keys:
             value = document.hyperdata.get(key, None)
             if not isinstance(value, str):
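Taken together, the two hunks replace a per-iteration skip check with up-front filtering: documents flagged during parsing (for a parsing error or an unsupported language) are excluded once, and the loop then only sees documents it can actually process. Below is a runnable sketch of the resulting control flow under a simplified document model; corpus.skipped_docs, the per-language tagger dict, and the key loop mirror the diff, while the Doc class, the stand-in tagger, and the sample data are hypothetical:

# Sketch of the post-commit extraction flow. The skipped-docs filter,
# the per-language tagger dict and the key loop mirror the diff; the
# Doc class and the tagger are illustrative stand-ins.

DEFAULT_INDEX_FIELDS = ('title', 'abstract')

class Doc:
    def __init__(self, id, hyperdata):
        self.id = id
        self.hyperdata = hyperdata

def load_tagger(lang):
    # stand-in tagger: a naive whitespace tokenizer
    return lambda text: text.lower().split()

def extract_ngrams_sketch(docs, skipped_docs, default_languages,
                          keys=DEFAULT_INDEX_FIELDS):
    # one tagger per supported language, built once up front
    tagger_bots = {lang: load_tagger(lang) for lang in default_languages}
    # filter out documents flagged earlier, instead of testing inside the loop
    kept = [doc for doc in docs if doc.id not in skipped_docs]
    for document in kept:
        tagger = tagger_bots[document.hyperdata['language_iso2']]
        for key in keys:
            value = document.hyperdata.get(key, None)
            if not isinstance(value, str):
                continue  # missing or non-text field: nothing to extract
            yield document.id, key, tagger(value)

if __name__ == '__main__':
    docs = [
        Doc(1, {'language_iso2': 'en', 'title': 'Ngram extraction'}),
        Doc(2, {'language_iso2': 'xx', 'title': 'Unsupported language'}),
    ]
    # doc 2 was flagged during parsing, so it never reaches a tagger
    for row in extract_ngrams_sketch(docs, skipped_docs={2},
                                     default_languages=['en']):
        print(row)

Because the filter is applied once, every document that reaches the loop is guaranteed to have a supported language, so the tagger lookup cannot fail.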