Commit 8d0e31fa authored by c24b's avatar c24b

LANG undeclared DETECTED at parsing => hyperdata

parent f1476df9
from gargantext.constants import *
from langdetect import detect, DetectorFactory
import time
def timing(f):
    """Decorator that prints how long each call to *f* takes.

    The wrapped function is called with exactly the positional and keyword
    arguments it was given, and its return value is passed through unchanged.
    After each successful call one line of the form
    ``function took <ms> ms`` is printed to stdout. If *f* raises, the
    exception propagates and nothing is printed.

    Args:
        f: the callable to time.

    Returns:
        A wrapper with the same name/docstring as *f* (via functools.wraps).
    """
    # Local import so the block does not depend on a new file-level import.
    from functools import wraps

    @wraps(f)
    def wrap(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - start) * 1000.0
        print ('function took %0.3f ms' % elapsed_ms)
        return ret
    return wrap
class Language:
def __init__(self, iso2=None, iso3=None,full_name=None, name=None):
self.iso2 = iso2
self.iso3 = iso3
self.name = name
self.full_name = full_name
self.implemented = iso2 in LANGUAGES
def __str__(self):
......@@ -38,10 +27,10 @@ class Languages(dict):
languages = Languages()
@timing
def detect_lang(text):
# Detect the language of `text` with langdetect's detect() and use the
# detected code as a key into the module-level `languages` mapping.
# The seed is reset on every call; per the langdetect README a fixed seed
# is what makes detection results reproducible.
DetectorFactory.seed = 0
# NOTE(review): the two returns below look like the removed/added pair of a
# diff hunk whose +/- markers were stripped. As written, only the first one
# can execute (returning the `.iso2` attribute); the second is unreachable.
# Confirm against the real file which of the two is the current version.
return languages[detect(text)].iso2
return languages[detect(text)]
import pycountry
pycountry_keys = (
......
......@@ -21,7 +21,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
skipped_languages.append(hyperdata["language_iso2"])
return observed_languages,skipped_languages
observed_languages.append(hyperdata["language_iso2"])
return observed_languages,skipped_languages
return hyperdata, observed_languages,skipped_languages
elif "language_iso3" in hyperdata.keys():
......@@ -32,33 +32,33 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
skipped_languages.append(lang)
return observed_languages,skipped_languages
observed_languages.append(lang)
return observed_languages,skipped_languages
return hyperdata, observed_languages,skipped_languages
except KeyError:
print ("LANG not referenced", (hyperdata["language_iso3"]))
skipped_languages.append(hyperdata["language_iso3"])
return observed_languages,skipped_languages
return hyperdata, observed_languages,skipped_languages
elif "language_fullname" in hyperdata.keys():
elif "language_name" in hyperdata.keys():
try:
#convert
lang = languages[hyperdata["language_fullname"]].iso2
lang = languages[hyperdata["language_name"]].iso2
if lang not in LANGUAGES.keys():
skipped_languages.append(lang)
return observed_languages,skipped_languages
observed_languages.append(lang)
return observed_languages,skipped_languages
return hyperdata, observed_languages,skipped_languages
except KeyError:
print ("LANG Not referenced", (hyperdata["language_fullname"]))
skipped_languages.append(hyperdata["language_fullname"])
return observed_languages,skipped_languages
print ("LANG Not referenced", (hyperdata["language_name"]))
skipped_languages.append(hyperdata["language_name"])
return hyperdata, observed_languages,skipped_languages
else:
print("[WARNING] no language_* found in document [parsing.py]")
if DETECT_LANG is False:
skipped_languages.append("__unknown__")
return observed_languages,skipped_languages
return hyperdata, observed_languages,skipped_languages
#no language have been indexed
#detectlang by joining on the DEFAULT_INDEX_FIELDS
......@@ -69,16 +69,17 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
if len(text) < 10:
hyperdata["error"] = "Error: no TEXT fields to index"
skipped_languages.append("__unknown__")
return observed_languages,skipped_languages
return hyperdata, observed_languages,skipped_languages
else:
#detect_lang return iso2
lang = detect_lang(text)
if lang not in LANGUAGES.keys():
skipped_languages.append(lang)
for k in ["iso2", "iso3", "name"]:
hyperdata["language_"+k] = lang[k]
if lang.iso2 not in LANGUAGES.keys():
skipped_languages.append(lang.iso2)
return observed_languages,skipped_languages
observed_languages.append(lang)
return observed_languages,skipped_languages
observed_languages.append(lang.iso2)
return hyperdata, observed_languages,skipped_languages
def parse(corpus):
......@@ -122,8 +123,8 @@ def parse(corpus):
except Exception as error :
hyperdata["error"] = "Error normalize_chars"
#adding lang into record hyperdata
observed_languages, skipped_languages = add_lang(hyperdata, observed_languages, skipped_languages)
#adding lang into record hyperdata JUST if not declared
hyperdata,observed_languages, skipped_languages = add_lang(hyperdata, observed_languages, skipped_languages)
# save as corpus DB child
# ----------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment