Commit 65b8f9b3 authored by c24b

detect lang in CONSTANTS

parent 97eaa774
...@@ -5,6 +5,8 @@ from gargantext.constants import * ...@@ -5,6 +5,8 @@ from gargantext.constants import *
from collections import defaultdict, Counter from collections import defaultdict, Counter
from re import sub from re import sub
from gargantext.util.languages import languages, detect_lang from gargantext.util.languages import languages, detect_lang
import time
def add_lang(hyperdata, observed_languages, skipped_languages): def add_lang(hyperdata, observed_languages, skipped_languages):
'''utility to add lang information '''utility to add lang information
...@@ -52,10 +54,14 @@ def add_lang(hyperdata, observed_languages, skipped_languages): ...@@ -52,10 +54,14 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
else: else:
print("DETECT LANG:", DETECT_LANG)
if DETECT_LANG is False:
skipped_languages.append("__unknown__")
return observed_languages,skipped_languages
print("[WARNING] no language_* found in document [parsing.py]") print("[WARNING] no language_* found in document [parsing.py]")
#no language have been indexed #no language have been indexed
#detectlang by joining on DEFAULT_INDEX_FIELDS #detectlang by joining on the DEFAULT_INDEX_FIELDS
#text_fields = [k for k in DEFAULT_INDEX_FIELDS if k in hyperdata.keys()]
text_fields2 = list(set(DEFAULT_INDEX_FIELDS) & set(hyperdata.keys())) text_fields2 = list(set(DEFAULT_INDEX_FIELDS) & set(hyperdata.keys()))
print(len(text_fields2)) print(len(text_fields2))
...@@ -66,6 +72,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages): ...@@ -66,6 +72,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
return observed_languages,skipped_languages return observed_languages,skipped_languages
else: else:
#detect_lang return iso2 #detect_lang return iso2
lang = detect_lang(text) lang = detect_lang(text)
if lang not in LANGUAGES.keys(): if lang not in LANGUAGES.keys():
skipped_languages.append(lang) skipped_languages.append(lang)
...@@ -77,7 +84,6 @@ def add_lang(hyperdata, observed_languages, skipped_languages): ...@@ -77,7 +84,6 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
def parse(corpus): def parse(corpus):
try: try:
print("PARSING") print("PARSING")
corpus.status('Docs', progress=0) corpus.status('Docs', progress=0)
#1 corpus => 1 or multi resources.path (for crawlers) #1 corpus => 1 or multi resources.path (for crawlers)
resources = corpus.resources() resources = corpus.resources()
...@@ -178,6 +184,7 @@ def parse(corpus): ...@@ -178,6 +184,7 @@ def parse(corpus):
if docs == 0: if docs == 0:
print("[WARNING] PARSING FAILED!!!!!") print("[WARNING] PARSING FAILED!!!!!")
corpus.status('Parsing', error= "No documents parsed") corpus.status('Parsing', error= "No documents parsed")
#document.save_hyperdata() #document.save_hyperdata()
print(docs, "parsed") print(docs, "parsed")
#LANGUAGES INFO #LANGUAGES INFO
...@@ -195,7 +202,6 @@ def parse(corpus): ...@@ -195,7 +202,6 @@ def parse(corpus):
corpus.hyperdata["languages"] = dict(observed_langs) corpus.hyperdata["languages"] = dict(observed_langs)
corpus.hyperdata["languages"]["__unknown__"] = list(skipped_langs.keys()) corpus.hyperdata["languages"]["__unknown__"] = list(skipped_langs.keys())
print("OBSERVED_LANGUAGES", corpus.hyperdata["languages"]) print("OBSERVED_LANGUAGES", corpus.hyperdata["languages"])
corpus.save_hyperdata() corpus.save_hyperdata()
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment