Commit d0fd5ad0 authored by Romain Loth

parsing.py: comments + add_lang return values and integration

parent 9e7284d2
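
In short: add_lang() used to return a (hyperdata, observed_languages, skipped_languages) tuple, and only a 2-tuple on some early-return paths, which made the caller fragile; after this commit it always returns a single dict. A schematic of the new contract, with field names taken from the diff below and sample values invented for illustration:

    lang_result = {
        'observed': ['en', 'fr'],   # running list of recognized iso2 codes
        'skipped': ['xx'],          # running list of unrecognized codes
        'doc_prediction': None,     # or a (iso2, iso3, name) tuple when DETECT_LANG ran
        'doc_error': None           # or an error string for this document
    }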
"""
Deserialize an external document collection with a parserbot
then transfer it to our DB as document nodes within a corpus
"""
from gargantext.util.db import * from gargantext.util.db import *
from gargantext.models import * from gargantext.models import *
from gargantext.constants import * from gargantext.constants import *
...@@ -7,82 +11,83 @@ from re import sub ...@@ -7,82 +11,83 @@ from re import sub
from gargantext.util.languages import languages, detect_lang from gargantext.util.languages import languages, detect_lang
 def add_lang(hyperdata, observed_languages, skipped_languages):
-    '''utility to add lang information
+    '''utility to gather corpus-level lang information
+       and also detect unknown languages (if constants.DETECT_LANG is true)
+
     1. on language_iso2
     2. on other format language_%f
     3. on text from concatenation of DEFAULT_INDEX_FIELDS
+
+    TODO factorize with _Parser.format_hyperdata_languages()
     '''
-    if "language_iso2" in hyperdata.keys():
-        if hyperdata["language_iso2"] not in LANGUAGES.keys():
-            skipped_languages.append(hyperdata["language_iso2"])
-            return observed_languages,skipped_languages
-        observed_languages.append(hyperdata["language_iso2"])
-        return hyperdata, observed_languages,skipped_languages
+    # this will be our return value
+    lang_result = {
+        # the 2 global counts to update
+        'observed': observed_languages,
+        'skipped':  skipped_languages,
+
+        # optional new value for doc
+        'doc_prediction': None,   # tuple with (iso2, iso3, name)
+
+        # optional error for doc
+        'doc_error': None         # str
+    }
+
+    if "language_iso2" in hyperdata.keys():
+        lang = hyperdata["language_iso2"]
+        if lang not in LANGUAGES.keys():
+            lang_result['skipped'].append(lang)    # FIXME: perhaps better to do this at ngrams_extraction
+        else:
+            lang_result['observed'].append(lang)
 
     elif "language_iso3" in hyperdata.keys():
         #convert
         try:
             lang = languages[hyperdata["language_iso3"]].iso2
             if lang not in LANGUAGES.keys():
-                skipped_languages.append(lang)
-                return observed_languages,skipped_languages
-            observed_languages.append(lang)
-            return hyperdata, observed_languages,skipped_languages
+                lang_result['skipped'].append(lang)    # idem
+            else:
+                lang_result['observed'].append(lang)
         except KeyError:
-            print("LANG not referenced", (hyperdata["language_iso3"]))
-            skipped_languages.append(hyperdata["language_iso3"])
-            return hyperdata, observed_languages,skipped_languages
+            print("LANG not referenced", hyperdata["language_iso3"])
+            lang_result['skipped'].append(hyperdata["language_iso3"])
 
     elif "language_name" in hyperdata.keys():
         try:
             #convert
             lang = languages[hyperdata["language_name"]].iso2
             if lang not in LANGUAGES.keys():
-                skipped_languages.append(lang)
-                return hyperdata, observed_languages,skipped_languages
-            observed_languages.append(lang)
-            return hyperdata, observed_languages,skipped_languages
+                lang_result['skipped'].append(lang)    # idem
+            else:
+                lang_result['observed'].append(lang)
         except KeyError:
-            print("LANG Not referenced", (hyperdata["language_name"]))
-            skipped_languages.append(hyperdata["language_name"])
-            return hyperdata, observed_languages,skipped_languages
+            print("LANG Not referenced", hyperdata["language_name"])
+            lang_result['skipped'].append(hyperdata["language_name"])
 
     else:
         print("[WARNING] no language_* found in document [parsing.py]")
-        if DETECT_LANG is False:
-            #skipped_languages.append("__unknown__")
-            hyperdata["language_iso2"] = "__unknown__"
-            return hyperdata, observed_languages,skipped_languages
 
-        #no language have been indexed
-        #detectlang by joining on the DEFAULT_INDEX_FIELDS
-        text_fields2 = list(set(DEFAULT_INDEX_FIELDS) & set(hyperdata.keys()))
-        if len(text_fields2) < 2:
-            print("[WARNING] missing %s key" %text_fields)
-
-        text = " ".join([hyperdata[k] for k in text_fields2])
-        if len(text) < 10:
-            hyperdata["error"] = "Error: no TEXT fields to index"
-            skipped_languages.append("__unknown__")
-            return hyperdata, observed_languages,skipped_languages
-        else:
-            #detect_lang return iso2
-            lang = detect_lang(text)
-            for k in ["iso2", "iso3", "name"]:
-                hyperdata["language_"+k] = getattr(lang, k)
-            if lang.iso2 not in LANGUAGES.keys():
-                #hyperdata["language_iso2"] = "__unknown__"
-                skipped_languages.append(lang.iso2)
-                return hyperdata, observed_languages,skipped_languages
-            observed_languages.append(lang.iso2)
-            return hyperdata, observed_languages,skipped_languages
+    if DETECT_LANG:
+        # no language has been indexed yet:
+        # detect lang by joining the available DEFAULT_INDEX_FIELDS
+        text_fields = list(set(DEFAULT_INDEX_FIELDS) & set(hyperdata.keys()))
+
+        text = " ".join([hyperdata[k] for k in text_fields])
+        if len(text) < 10:
+            lang_result["doc_error"] = "Error: not enough text to index"
+        else:
+            # detect_lang returns an object o with o.iso2, o.iso3 ...
+            lang = detect_lang(text)
+            # NB a tuple(...), not a bare generator, so the caller can index it
+            lang_result["doc_prediction"] = tuple(getattr(lang, k) for k in ["iso2", "iso3", "name"])
+            if lang.iso2 not in LANGUAGES.keys():
+                lang_result['skipped'].append(lang.iso2)    # idem
+            else:
+                lang_result['observed'].append(lang.iso2)
+
+    return lang_result
 
 
 def parse(corpus):
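
For reference, a minimal sketch of how a caller consumes the new return value; the hyperdata sample below is invented for illustration:

    # hypothetical document record, for illustration only
    hyperdata = {"language_iso2": "en", "title": "A short example"}
    observed_languages, skipped_languages = [], []

    lang_result = add_lang(hyperdata, observed_languages, skipped_languages)

    # the global counts come back under explicit keys
    # (same list objects, mutated in place by the appends)
    observed_languages = lang_result['observed']
    skipped_languages = lang_result['skipped']

    # per-document outcomes are now separate from the global stats
    if lang_result['doc_error']:
        print("lang detection problem:", lang_result['doc_error'])
    elif lang_result['doc_prediction']:
        iso2, iso3, name = lang_result['doc_prediction']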
@@ -116,7 +121,7 @@ def parse(corpus):
             if resource["extracted"] is True:
                 continue
             else:
-                # BY documents
+                # BY documents (cf. _Parser.__iter__)
                 for hyperdata in parserbot(resource["path"]):
                     # indexed text fields defined in CONSTANTS
                     for k in DEFAULT_INDEX_FIELDS:
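
The only interface assumption this loop makes is that a parserbot instance is iterable over per-document hyperdata dicts (that is what the _Parser.__iter__ reference in the new comment points to). Schematically, with the dict contents invented for illustration:

    for hyperdata in parserbot(resource["path"]):
        # each item is a plain dict of metadata fields, e.g.
        # {"title": "...", "abstract": "...", "language_iso2": "en"}
        ...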
@@ -126,11 +131,28 @@ def parse(corpus):
                             except Exception as error:
                                 hyperdata["error"] = "Error normalize_chars"
 
-                    #adding lang into record hyperdata JUST if not declared
-                    hyperdata, observed_languages, skipped_languages = add_lang(hyperdata, observed_languages, skipped_languages)
+                    # adding lang into record hyperdata JUST if not declared
+                    lang_infos = add_lang(hyperdata, observed_languages, skipped_languages)
+
+                    # update document
+                    if lang_infos['doc_error']:
+                        hyperdata['warning'] = lang_infos['doc_error']
+
+                    if lang_infos['doc_prediction']:
+                        prediction = lang_infos['doc_prediction']
+                        hyperdata['language_iso2'] = prediction[0]
+                        hyperdata['language_iso3'] = prediction[1]
+                        hyperdata['language_name'] = prediction[2]
+                        del prediction
+
+                    # update stats
+                    observed_languages = lang_infos['observed']
+                    skipped_languages = lang_infos['skipped']
+                    del lang_infos
 
+                    # -----------------------
                     # save as corpus DB child
-                    # ----------------
+                    # -----------------------
                     document = corpus.add_child(
                         typename = 'DOCUMENT',
                         name = hyperdata.get('title', '')[:255],
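
Since the prediction tuple is ordered as (iso2, iso3, name), the three assignments above could equivalently be written as one loop over the language_* keys; a stylistic sketch only, not part of the commit:

    if lang_infos['doc_prediction']:
        for key, value in zip(["iso2", "iso3", "name"], lang_infos['doc_prediction']):
            hyperdata["language_" + key] = value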
@@ -180,11 +202,7 @@ def parse(corpus):
     #skipped_docs
     corpus.hyperdata["skipped_docs"] = list(set(skipped_docs))
     print(len(corpus.hyperdata["skipped_docs"]), "docs skipped")
-    #the not-so-pretty langs
-    skipped_langs = dict(Counter(skipped_languages))
-    #the pretty iso2 codes
-    observed_langs = dict(Counter(observed_languages))
-    # the documents
+
     docs = corpus.children("DOCUMENT").count()
     if docs == 0:
         print("[WARNING] PARSING FAILED!!!!!")
@@ -192,11 +210,18 @@ def parse(corpus):
         #document.save_hyperdata()
     print(docs, "parsed")
-    #LANGUAGES INFO
+
+    # language stats
+    #the not-so-pretty langs
+    skipped_langs = dict(Counter(skipped_languages))    # idem
+    #the pretty iso2 codes
+    observed_langs = dict(Counter(observed_languages))
+
     print("#LANGUAGES OK")
     print(observed_langs)
     print("#LANGUAGES UNKNOWN")
     print(skipped_langs)
+
     top_langs = sorted(observed_langs.items(), key = lambda x: x[1], reverse=True)
     if len(top_langs) > 0:
         corpus.hyperdata["language_id"] = top_langs[0][0]
@@ -204,9 +229,8 @@ def parse(corpus):
         corpus.hyperdata["language_id"] = "__unknown__"
     print("#MAIN language of the CORPUS", corpus.hyperdata["language_id"])
 
-    corpus.hyperdata["languages"] = dict(observed_langs)
+    corpus.hyperdata["languages"] = observed_langs
     corpus.hyperdata["languages"]["__unknown__"] = list(skipped_langs.keys())
-    print("OBSERVED_LANGUAGES", corpus.hyperdata["languages"])
 
     corpus.save_hyperdata()
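
The top_langs sort above just picks the most frequent observed iso2 code; an equivalent formulation using Counter.most_common, with sample counts invented for illustration:

    from collections import Counter

    observed_langs = {"en": 40, "fr": 12}           # sample counts
    top = Counter(observed_langs).most_common(1)    # [('en', 40)]
    language_id = top[0][0] if top else "__unknown__"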