Commit 403913fc authored by c24b's avatar c24b

filtering out ignored docs

parent 961ba2d8
...@@ -36,7 +36,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor): ...@@ -36,7 +36,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
db.commit() db.commit()
def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_INDEX_SUBGRAMS): def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_INDEX_SUBGRAMS):
"""Extract ngrams for every document below the given corpus. """Extract ngrams for every document below the given corpus.
Default language is given by the resource type. Default language is given by the resource type.
The result is then inserted into database. The result is then inserted into database.
...@@ -52,38 +52,12 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_ ...@@ -52,38 +52,12 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
documents_count = 0 documents_count = 0
    #load available taggers for source default language #load available taggers for source default language
tagger_bots = {lang: load_tagger(lang) for lang in source['default_languages']} tagger_bots = {lang: load_tagger(lang) for lang in source['default_languages']}
    #skipped documents that have an unsupported language #skipped documents that have been skipped previously for parsing error or unsupported language
corpus.skipped_docs = [doc.id for doc in corpus.children('DOCUMENT') if doc.hyperdata["language_iso2"] not in source["default_languages"]] docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
print(corpus.hyperdata["languages"]) #print(corpus.hyperdata["languages"])
#add it to corpus.Language info for documents_count, document in enumerate(docs):
#diff = set(corpus.hyperdata["languages"].keys()) - set(source["default_languages"]))
#if len(diff) > 1:
# if lang_doc in corpus.hyperdata['languages']:
# skipped_lg_infos = corpus.hyperdata['languages'].pop(lang_doc)
# corpus.hyperdata['languages']['__skipped__'][lang_doc] = skipped_lg_infos
# corpus.save_hyperdata()
# session.commit()
# continue
for documents_count, document in enumerate(corpus.children('DOCUMENT')):
lang_doc = document.hyperdata['language_iso2'] lang_doc = document.hyperdata['language_iso2']
if document not in corpus.skipped_docs:
# if document.id in corpus.skipped_docs:
# # get the langage of the current document
# # skip document
# print('Unsupported language: `%s` (doc #%i)' % (lang_doc, document.id))
# # and remember that for later processes (eg stemming)
# #document.hyperdata['__skipped__'] = 'ngrams_extraction'
# #document.save_hyperdata()
# #session.commit()
# continue
#
#
# else:
# extract ngrams on each of the considered keys
ngramextractor = tagger_bots[lang_doc] ngramextractor = tagger_bots[lang_doc]
for key in keys: for key in keys:
value = document.hyperdata.get(key, None) value = document.hyperdata.get(key, None)
if not isinstance(value, str): if not isinstance(value, str):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment