Commit 5a80b033 authored by c24b

Tagger[__unknown__]

parent 322f49e2
......@@ -53,6 +53,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
#print(LANGUAGES.keys())
tagger_bots = {lang: load_tagger(lang) for lang in corpus.hyperdata["languages"] \
if lang != "__unknown__"}
tagger_bots["__unknown__"] = load_tagger("en")
print("#TAGGERS LOADED: ", tagger_bots)
supported_taggers_lang = tagger_bots.keys()
print("#SUPPORTED TAGGER LANGS", supported_taggers_lang)
......
......@@ -63,7 +63,8 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
#no language have been indexed
#detectlang by joining on the DEFAULT_INDEX_FIELDS
text_fields2 = list(set(DEFAULT_INDEX_FIELDS) & set(hyperdata.keys()))
print(len(text_fields2))
if len(text_fields2) < 2:
print("[WARNING] missing %s key" %text_fields)
text = " ".join([hyperdata[k] for k in text_fields2])
if len(text) < 10:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment