Commit 64fc3681 authored by Romain Loth

FIX unrecognized languages at ngrams extraction were stopping downstream processes like stemming: the skip is now recorded in the metadata, so that further processes can continue and simply skip the document.
parent d45d3a91
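To picture the intended behaviour, here is a small sketch (not code from this commit; the function name and loop body are illustrative) of how a later step such as stemming can read the `__skipped__` flag written at ngrams extraction and move on instead of crashing:

```python
# Illustrative sketch only: the function name and the loop body are assumptions;
# document.hyperdata and corpus.children('DOCUMENT') are the same accessors
# used in the diffs below.

def stem_documents(corpus, stemmers_by_lg):
    for document in corpus.children('DOCUMENT'):
        if document.hyperdata.get('__skipped__'):
            # flagged as 'ngrams_extraction' when its language was unsupported
            continue
        lang = document.hyperdata.get('language_iso2', '__unknown__')
        stemmer = stemmers_by_lg.get(lang, stemmers_by_lg['__unknown__'])
        # ... stemmer.stem(term) would be applied to the document's terms here
```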
@@ -25,8 +25,9 @@ def prepare_stemmers(corpus):
         '__unknown__' : SnowballStemmer("english")
     }
     for lgiso2 in corpus.hyperdata['languages'].keys():
-        lgname = languages[lgiso2].name.lower()
-        stemmers_by_lg[lgiso2] = SnowballStemmer(lgname)
+        if (lgiso2 != '__skipped__'):
+            lgname = languages[lgiso2].name.lower()
+            stemmers_by_lg[lgiso2] = SnowballStemmer(lgname)
     return stemmers_by_lg
 
 def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
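A minimal usage sketch for the dict built by `prepare_stemmers` (the `fr` entry and the `zz` lookup are invented examples): the new condition keeps the `'__skipped__'` key out of the stemmers, and any language code without a dedicated stemmer falls back to the `'__unknown__'` english one.

```python
from nltk.stem.snowball import SnowballStemmer

# hypothetical result of prepare_stemmers() on a corpus with french documents;
# '__skipped__' never appears here because the loop above now filters it out
stemmers_by_lg = {
    '__unknown__': SnowballStemmer("english"),
    'fr': SnowballStemmer("french"),
}

# lookup with fallback for a code that has no dedicated stemmer
stemmer = stemmers_by_lg.get('zz', stemmers_by_lg['__unknown__'])
print(stemmer.stem("languages"))   # falls back to the english stemmer
```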
@@ -45,15 +45,27 @@ def extract_ngrams(corpus, keys=('title', 'abstract', )):
     ngrams_data = set()
     # extract ngrams
     resource_type_index = corpus.resources()[0]['type']
     resource_type = RESOURCETYPES[resource_type_index]
     default_language_iso2 = resource_type['default_language']
     for documents_count, document in enumerate(corpus.children('DOCUMENT')):
         # get ngrams extractor for the current document
         language_iso2 = document.hyperdata.get('language_iso2', default_language_iso2)
         try:
             # this looks for a parser in constants.LANGUAGES
             ngramsextractor = ngramsextractors[language_iso2]
         except KeyError:
-            print('Unrecognized language: `%s`' % (language_iso2, ))
+            # skip document
+            print('Unsupported language: `%s`' % (language_iso2, ))
+            # and remember that for later processes (eg stemming)
+            document.hyperdata['__skipped__'] = 'ngrams_extraction'
+            document.save_hyperdata()
+            session.commit()
+            if language_iso2 in corpus.hyperdata['languages']:
+                skipped_lg_infos = corpus.hyperdata['languages'].pop(language_iso2)
+                corpus.hyperdata['languages']['__skipped__'][language_iso2] = skipped_lg_infos
+                corpus.save_hyperdata()
+                session.commit()
             continue
         # extract ngrams on each of the considered keys
         for key in keys:
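The effect of this `except` branch on the corpus-level `languages` metadata, shown on a made-up example (the `'zz'` code and the counts are invented; the real values are whatever `parse()` stored):

```python
# before ngrams extraction, as written by parse() (see next diff)
languages = {'en': 120, 'zz': 3, '__skipped__': {}}

# a 'zz' document hits the KeyError branch above: its entry is popped out of
# the main mapping and filed under '__skipped__', so that prepare_stemmers
# and other later steps never try to treat 'zz' as a stemmable language
languages['__skipped__']['zz'] = languages.pop('zz')

print(languages)   # {'en': 120, '__skipped__': {'zz': 3}}
```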
@@ -49,8 +49,10 @@ def parse(corpus):
             documents_count += 1
         # update info about the resource
         resource['extracted'] = True
-    # add a corpus-level info about languages
+    # add a corpus-level info about languages...
     corpus.hyperdata['languages'] = observed_languages
+    # ...with a special key inside for skipped languages at ngrams_extraction
+    corpus.hyperdata['languages']['__skipped__'] = {}
     # commit all changes
     corpus.status('parsing', progress=documents_count, complete=True)
     corpus.save_hyperdata()
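A minimal sketch of the structure these two added lines produce, assuming `observed_languages` is a per-language record accumulated during parsing (that accumulation is outside this hunk, so the stored values may differ in the real code):

```python
from collections import defaultdict

# stand-in for the language codes of the parsed documents
parsed_language_codes = ['en', 'en', 'fr']

observed_languages = defaultdict(int)
for language_iso2 in parsed_language_codes:
    observed_languages[language_iso2] += 1

# what the two added lines store on the corpus
languages = dict(observed_languages)
languages['__skipped__'] = {}   # reserved for ngrams_extraction to fill later
print(languages)                # {'en': 2, 'fr': 1, '__skipped__': {}}
```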