Commit 8d0e31fa authored by c24b

Detect the language at parsing time when a document does not declare one, and store it in hyperdata

parent f1476df9
from gargantext.constants import * from gargantext.constants import *
from langdetect import detect, DetectorFactory from langdetect import detect, DetectorFactory
import time
def timing(f):
    """Decorator that prints how long each call to *f* takes, in milliseconds.

    Positional and keyword arguments are forwarded to *f* unchanged and the
    return value of *f* is passed through. If *f* raises, the exception
    propagates and nothing is printed.
    """
    # Imported locally so the decorator does not depend on a file-level import.
    from functools import wraps

    @wraps(f)  # keep the wrapped function's __name__ / __doc__ on the wrapper
    def wrap(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments the way time.time() can.
        started = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed = time.perf_counter() - started
        print('function took %0.3f ms' % (elapsed * 1000.0))
        return ret

    return wrap
class Language: class Language:
def __init__(self, iso2=None, iso3=None,full_name=None, name=None): def __init__(self, iso2=None, iso3=None,full_name=None, name=None):
self.iso2 = iso2 self.iso2 = iso2
self.iso3 = iso3 self.iso3 = iso3
self.name = name self.name = name
self.full_name = full_name
self.implemented = iso2 in LANGUAGES self.implemented = iso2 in LANGUAGES
def __str__(self): def __str__(self):
...@@ -38,10 +27,10 @@ class Languages(dict): ...@@ -38,10 +27,10 @@ class Languages(dict):
languages = Languages() languages = Languages()
@timing
def detect_lang(text): def detect_lang(text):
DetectorFactory.seed = 0 DetectorFactory.seed = 0
return languages[detect(text)].iso2 return languages[detect(text)]
import pycountry import pycountry
pycountry_keys = ( pycountry_keys = (
......
...@@ -21,7 +21,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages): ...@@ -21,7 +21,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
skipped_languages.append(hyperdata["language_iso2"]) skipped_languages.append(hyperdata["language_iso2"])
return observed_languages,skipped_languages return observed_languages,skipped_languages
observed_languages.append(hyperdata["language_iso2"]) observed_languages.append(hyperdata["language_iso2"])
return observed_languages,skipped_languages return hyperdata, observed_languages,skipped_languages
elif "language_iso3" in hyperdata.keys(): elif "language_iso3" in hyperdata.keys():
...@@ -32,33 +32,33 @@ def add_lang(hyperdata, observed_languages, skipped_languages): ...@@ -32,33 +32,33 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
skipped_languages.append(lang) skipped_languages.append(lang)
return observed_languages,skipped_languages return observed_languages,skipped_languages
observed_languages.append(lang) observed_languages.append(lang)
return observed_languages,skipped_languages return hyperdata, observed_languages,skipped_languages
except KeyError: except KeyError:
print ("LANG not referenced", (hyperdata["language_iso3"])) print ("LANG not referenced", (hyperdata["language_iso3"]))
skipped_languages.append(hyperdata["language_iso3"]) skipped_languages.append(hyperdata["language_iso3"])
return observed_languages,skipped_languages return hyperdata, observed_languages,skipped_languages
elif "language_fullname" in hyperdata.keys(): elif "language_name" in hyperdata.keys():
try: try:
#convert #convert
lang = languages[hyperdata["language_fullname"]].iso2 lang = languages[hyperdata["language_name"]].iso2
if lang not in LANGUAGES.keys(): if lang not in LANGUAGES.keys():
skipped_languages.append(lang) skipped_languages.append(lang)
return observed_languages,skipped_languages return observed_languages,skipped_languages
observed_languages.append(lang) observed_languages.append(lang)
return observed_languages,skipped_languages return hyperdata, observed_languages,skipped_languages
except KeyError: except KeyError:
print ("LANG Not referenced", (hyperdata["language_fullname"])) print ("LANG Not referenced", (hyperdata["language_name"]))
skipped_languages.append(hyperdata["language_fullname"]) skipped_languages.append(hyperdata["language_name"])
return observed_languages,skipped_languages return hyperdata, observed_languages,skipped_languages
else: else:
print("[WARNING] no language_* found in document [parsing.py]") print("[WARNING] no language_* found in document [parsing.py]")
if DETECT_LANG is False: if DETECT_LANG is False:
skipped_languages.append("__unknown__") skipped_languages.append("__unknown__")
return observed_languages,skipped_languages return hyperdata, observed_languages,skipped_languages
#no language have been indexed #no language have been indexed
#detectlang by joining on the DEFAULT_INDEX_FIELDS #detectlang by joining on the DEFAULT_INDEX_FIELDS
...@@ -69,16 +69,17 @@ def add_lang(hyperdata, observed_languages, skipped_languages): ...@@ -69,16 +69,17 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
if len(text) < 10: if len(text) < 10:
hyperdata["error"] = "Error: no TEXT fields to index" hyperdata["error"] = "Error: no TEXT fields to index"
skipped_languages.append("__unknown__") skipped_languages.append("__unknown__")
return observed_languages,skipped_languages return hyperdata, observed_languages,skipped_languages
else: else:
#detect_lang return iso2 #detect_lang return iso2
lang = detect_lang(text) lang = detect_lang(text)
if lang not in LANGUAGES.keys(): for k in ["iso2", "iso3", "name"]:
skipped_languages.append(lang) hyperdata["language_"+k] = lang[k]
if lang.iso2 not in LANGUAGES.keys():
skipped_languages.append(lang.iso2)
return observed_languages,skipped_languages return observed_languages,skipped_languages
observed_languages.append(lang) observed_languages.append(lang.iso2)
return observed_languages,skipped_languages return hyperdata, observed_languages,skipped_languages
def parse(corpus): def parse(corpus):
...@@ -122,8 +123,8 @@ def parse(corpus): ...@@ -122,8 +123,8 @@ def parse(corpus):
except Exception as error : except Exception as error :
hyperdata["error"] = "Error normalize_chars" hyperdata["error"] = "Error normalize_chars"
#adding lang into record hyperdata #adding lang into record hyperdata JUST if not declared
observed_languages, skipped_languages = add_lang(hyperdata, observed_languages, skipped_languages) hyperdata,observed_languages, skipped_languages = add_lang(hyperdata, observed_languages, skipped_languages)
# save as corpus DB child # save as corpus DB child
# ---------------- # ----------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment