Commit 9ee3d71d authored by c24b

DEBUG Taggers + Stemmers info

parent 903e78c0
......@@ -30,7 +30,7 @@ class RISParser(Parser):
}
def parse(self, file):
print("=====> PARSING RIS")
hyperdata = {}
last_key = None
last_values = []
......
......@@ -50,9 +50,11 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
# 1) compute stems/lemmas
# and group if same stem/lemma
stemmers = prepare_stemmers(corpus)
print("# STEMMERS LOADED", stemmers)
supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"] \
if lang != "__unknown__" \
and lang in LANGUAGES.keys()]
print("#SUPPORTED STEMMERS LANGS", supported_stemmers_lang)
# todo dict {lg => {ngrams_todo} }
todo_ngrams_per_lg = defaultdict(set)
......
......@@ -53,7 +53,9 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
#print(LANGUAGES.keys())
tagger_bots = {lang: load_tagger(lang) for lang in corpus.hyperdata["languages"] \
if lang != "__unknown__" and lang in LANGUAGES.keys()}
print("#TAGGERS LOADED: ", tagger_bots)
supported_taggers_lang = tagger_bots.keys()
print("#SUPPORTED TAGGER LANGS", supported_taggers_lang)
#sort docs by lang?
# for lang, tagger in tagger_bots.items():
for documents_count, document in enumerate(corpus.children('DOCUMENT')):
......
......@@ -143,13 +143,22 @@ def parse(corpus):
#les jolis iso2
observed_langs = dict(Counter(observed_languages))
# les documents
print(corpus.children("DOCUMENT").count(), "docs parsed")
docs = corpus.children("DOCUMENT").count()
if docs == 0:
print("[WARNING] PARSING FAILED!!!!!")
corpus.status('Parsing', error= "No documents parsed")
#document.save_hyperdata()
print(docs, "parsed")
#LANGUAGES INFO
print("#LANGAGES OK")
print(observed_langs)
print("#LANGUAGES UNKNOWN")
print(skipped_langs)
corpus.hyperdata["language_id"] = sorted(observed_langs.items(), key = lambda x: x[1], reverse=True)[0][0]
top_langs = sorted(observed_langs.items(), key = lambda x: x[1], reverse=True)
if len(top_langs) > 0:
corpus.hyperdata["language_id"] = top_langs[0][0]
else:
corpus.hyperdata["language_id"] = "__unknown__"
print("#MAIN language of the CORPUS", corpus.hyperdata["language_id"])
corpus.hyperdata["languages"] = dict(observed_langs)
......
......@@ -81,6 +81,7 @@ class NewCorpusForm(forms.Form):
def project(request, project_id):
# current user
user = cache.User[request.user.id]
# viewed project
project = session.query(Node).filter(Node.id == project_id).first()
if project is None:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment