Commit 67cd43b0 authored by c24b

LANG DETECTION IN PARSING

parent b192ddd8
@@ -315,7 +315,9 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
 # "cool example".
 # (all 1 to n-1 length ngrams,
 # at indexing after extraction)
-DEFAULT_INDEX_FIELDS = ('title', 'abstract', ) #Defaults Fields for ngrams extraction
+# Default INDEXED fields for ngrams extraction
+# put the longest field first to make language detection more efficient
+DEFAULT_INDEX_FIELDS = ('abstract', 'title')
 # Grammar rules for chunking
 RULE_JJNN = "{<JJ.*>*<NN.*|>+<JJ.*>*}"
 RULE_JJDTNN = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
......
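For context, the two RULE_* strings above are NLTK tag patterns for noun-phrase chunking. A minimal sketch of how such a rule can be applied with nltk.RegexpParser (the "NP" label and the sample tokens are illustrative, not taken from the repo):

    import nltk

    RULE_JJNN = "{<JJ.*>*<NN.*|>+<JJ.*>*}"

    # RegexpParser expects a grammar of the form "LABEL: {pattern}"
    chunker = nltk.RegexpParser("NP: " + RULE_JJNN)

    # POS-tagged tokens for "nice red apples"
    tagged = [("nice", "JJ"), ("red", "JJ"), ("apples", "NNS")]
    print(chunker.parse(tagged))
    # (S (NP nice/JJ red/JJ apples/NNS))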
@@ -14,6 +14,7 @@ djangorestframework==3.3.2
 html5lib==0.9999999
 jdatetime==1.7.2
 kombu==3.0.33
+langdetect==1.0.6
 lxml==3.5.0
 networkx==1.11
 nltk==3.1
......
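langdetect is the only new dependency; it backs the detect_lang() helper added below. One subtlety the commit accounts for: langdetect's algorithm is randomized, so results on short or ambiguous text can vary between runs unless the factory seed is pinned. A minimal sketch of its behaviour (the sample strings are illustrative):

    from langdetect import detect, DetectorFactory

    DetectorFactory.seed = 0  # pin the seed => deterministic detection
    print(detect("This is an abstract written in English."))  # 'en'
    print(detect("Ceci est un résumé rédigé en français."))   # 'fr'

This is also why longer input helps: the detector is more reliable on longer text, hence the reordering of DEFAULT_INDEX_FIELDS to try 'abstract' before 'title'.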
 from gargantext.constants import *
+from langdetect import detect
+from langdetect import DetectorFactory
 
 class Language:
-    def __init__(self, iso2=None, iso3=None, name=None):
+    def __init__(self, iso2=None, iso3=None, full_name=None, name=None):
         self.iso2 = iso2
         self.iso3 = iso3
+        self.full_name = full_name
         self.name = name
         self.implemented = iso2 in LANGUAGES
 
     def __str__(self):
         result = '<Language'
         for key, value in self.__dict__.items():
@@ -16,6 +18,10 @@ class Language:
         return result
     __repr__ = __str__
 
+def detect_lang(text):
+    DetectorFactory.seed = 0  # fixed seed => deterministic results
+    return languages[detect(text)].iso2
+
 class Languages(dict):
     def __missing__(self, key):
         key = key.lower()
@@ -49,3 +55,4 @@ languages['fre'] = languages['fr']
 languages['ger'] = languages['de']
 languages['Français'] = languages['fr']
 languages['en_US'] = languages['en']
+languages['english'] = languages['en']
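These aliases, together with Languages.__missing__ lower-casing unknown keys, mean a lookup succeeds whatever variant a parser emits. An illustrative sketch of the behaviour (assuming the table is populated as in the module):

    from gargantext.util.languages import languages, detect_lang

    # all of these resolve to the same canonical entry:
    for key in ('fr', 'FR', 'fre', 'Français'):
        print(languages[key].iso2)  # 'fr' each time

    # detect_lang chains langdetect output through the same table
    print(detect_lang("Un long résumé rédigé en français."))  # 'fr'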
@@ -4,6 +4,7 @@ from gargantext.constants import *
 #from gargantext.util.parsers import *
 from collections import defaultdict, Counter
 from re import sub
+from gargantext.util.languages import languages, detect_lang
 
 def parse(corpus):
     try:
@@ -27,8 +28,8 @@ def parse(corpus):
         else:
             # observed languages in the corpus docs
             corpus.languages = defaultdict.fromkeys(source["default_languages"], 0)
-            skipped_languages = []
+            # remember the docs skipped during parsing
+            skipped_languages = []
             corpus.skipped_docs = []
         session.add(corpus)
         session.commit()
@@ -43,34 +44,50 @@ def parse(corpus):
                     hyperdata[k] = normalize_chars(hyperdata[k])
                 except Exception as error:
                     hyperdata["error"] = "Error normalize_chars"
+            indexed = False
             # a simple census to raise language info at corpus level
-            if "language_iso2" in hyperdata.keys():
-                try:
-                    corpus.languages[hyperdata["language_iso2"]] += 1
-                except KeyError:
-                    print("KeyError", hyperdata["language_iso2"])
-                    hyperdata["error"] = "Error: unsupported language"
-                    skipped_languages.append(hyperdata["language_iso2"])
-            elif "language_fullname" in hyperdata.keys():
-                try:
-                    # full => iso2
-                    lang = languages[hyperdata["language_fullname"]].name.lower()
-                    corpus.languages[lang] += 1
-                except KeyError:
-                    print("KeyError", hyperdata["language_fullname"])
-                    hyperdata["error"] = "Error: unsupported language"
-                    skipped_languages.append(lang)
-            else:
-                pass
+            for l in ["iso2", "iso3", "full_name"]:
+                if indexed is True:
+                    break
+                lang_field = "language_" + l
+                if lang_field in hyperdata.keys():
+                    if l == "iso2":
+                        try:
+                            corpus.languages[hyperdata["language_iso2"]] += 1
+                            indexed = True
+                        except KeyError:
+                            hyperdata["error"] = "Error: unsupported language"
+                            skipped_languages.append(hyperdata["language_iso2"])
+                    else:
+                        try:
+                            # iso3 or full name => canonical iso2
+                            lang = languages[hyperdata[lang_field].lower()].iso2
+                            corpus.languages[lang] += 1
+                            indexed = True
+                        except KeyError:
+                            hyperdata["error"] = "Error: unsupported language"
+                            skipped_languages.append(hyperdata[lang_field])
+            if indexed is False:
+                # no language field was present:
+                # detect the language from the indexed fields themselves
+                for k in DEFAULT_INDEX_FIELDS:
+                    if indexed is True:
+                        break
+                    if k in hyperdata.keys():
+                        try:
+                            hyperdata["language_iso2"] = detect_lang(hyperdata[k])
+                            corpus.languages[hyperdata["language_iso2"]] += 1
+                            indexed = True
+                            break
+                        except KeyError:
+                            # detected language not in the languages table
+                            hyperdata["error"] = "Error: unsupported language"
+                            indexed = True
+                        except Exception as error:
+                            print(error)
             # save as DB child
             # ----------------
             document = corpus.add_child(
@@ -96,10 +113,6 @@ def parse(corpus):
         session.add(corpus)
         session.commit()
         # update info about the resource
         resource['extracted'] = True
         # add a corpus-level info about languages adding a __skipped__ info
......
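Taken together, the new parse() resolves each document's language in a fixed cascade: trust an explicit language_iso2, else map language_iso3 / language_full_name through the languages table, and only as a last resort run detection on the indexed fields, longest first. A condensed, self-contained sketch of that cascade (resolve_language and its return convention are illustrative, not the repo's API):

    from langdetect import detect, DetectorFactory

    DetectorFactory.seed = 0

    def resolve_language(hyperdata, languages, index_fields=("abstract", "title")):
        # 1) an explicit iso2 field wins
        if "language_iso2" in hyperdata:
            return hyperdata["language_iso2"]
        # 2) iso3 / full-name fields, normalized through the languages table
        for field in ("language_iso3", "language_full_name"):
            if field in hyperdata:
                return languages[hyperdata[field].lower()].iso2
        # 3) last resort: detect from the longest indexed field available
        for field in index_fields:
            if hyperdata.get(field):
                return languages[detect(hyperdata[field])].iso2
        return None  # caller records the doc in skipped_docs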
File mode changed from 100644 to 100755