Commit cc674dea authored by c24b

[PATCH] lang

parent 5f610771
......@@ -263,13 +263,13 @@ def load_crawler(resource):
LANGUAGES = {
'en': {
#'tagger': 'EnglishMeltTagger',
'tagger': "TurboTagger",
#'tagger': 'NltkTagger',
#'tagger': "TurboTagger",
'tagger': 'NltkTagger',
},
'fr': {
'tagger': "FrenchMeltTagger",
#'tagger': "FrenchMeltTagger",
#'tagger': 'TreeTagger',
#'tagger': 'NltkTagger',
'tagger': 'NltkTagger',
},
}
......@@ -278,11 +278,13 @@ def load_tagger(lang):
given a LANG load the corresponding tagger
lang(str) > Tagger(Object)
'''
filename = LANGUAGES[lang]["tagger"]
module = 'gargantext.util.taggers.%s' %(filename)
module = importlib.import_module(module)
return getattr(module, filename)
try:
filename = LANGUAGES[lang]["tagger"]
module = 'gargantext.util.taggers.%s' %(filename)
module = importlib.import_module(module)
return getattr(module, filename)()
except:
raise ImportError("No tagger for lang %s. TIP: declare a new tagger in LANGUAGES" % lang)
# linguistic extraction parameters ---------------------------------------------
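A minimal usage sketch of the revised load_tagger (assuming, as the wildcard import further below suggests, that it is exported by gargantext.constants; the extract() call mirrors the one used in extract_ngrams):

from gargantext.constants import LANGUAGES, load_tagger

tagger = load_tagger('en')            # now returns an instantiated NltkTagger, per the table above
for ngram in tagger.extract("A short sample sentence."):
    print(ngram)

try:
    load_tagger('xx')                 # unsupported lang
except ImportError as err:
    print(err)                        # "No tagger for lang xx. TIP: ..."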
......
from gargantext.constants import *
from langdetect import detect, DetectorFactory
class Language:
def __init__(self, iso2=None, iso3=None,full_name=None, name=None):
self.iso2 = iso2
......@@ -55,3 +57,5 @@ languages['ger'] = languages['de']
languages['Français'] = languages['fr']
languages['en_US'] = languages['en']
languages['english'] = languages['en']
languages['chi'] = languages['zh']
languages['dut'] = languages['nl'] # 'dut' is Dutch (ISO 639-2/B), not German
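These aliases simply point at existing Language entries, so lookups by ISO-639-2 code or full name resolve to the same object; a quick illustration (assuming the languages table maps 'fr' and 'zh' as referenced above):

from gargantext.util.languages import languages

assert languages['chi'] is languages['zh']      # same Language object
assert languages['Français'].iso2 == 'fr'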
......@@ -12,8 +12,8 @@ from gargantext.models import Node, NodeNgramNgram
from gargantext.util.db import session
from gargantext.util.lists import Translations
# to convert fr => french :/
from gargantext.constants import LANGUAGES
from gargantext.util.languages import languages
from re import split as resplit
from collections import defaultdict, Counter
from nltk.stem.snowball import SnowballStemmer
......@@ -25,8 +25,11 @@ def prepare_stemmers(corpus):
languages have been previously filtered against the supported source languages
and formatted
"""
supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"]\
if lang != "__unknown__" \
if lang in LANGUAGES.keys()]
stemmers = {lang:SnowballStemmer(languages[lang].name.lower()) for lang \
in corpus.hyperdata['languages'].keys() if lang !="__skipped__"}
in supported_stemmers_lang}
stemmers['__unknown__'] = SnowballStemmer("english")
return stemmers
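For reference, a rough sketch of the mapping prepare_stemmers now builds for a corpus whose hyperdata lists 'en' and 'fr' (languages[lang].name.lower() yields the full names SnowballStemmer expects):

from nltk.stem.snowball import SnowballStemmer

stemmers = {
    'en': SnowballStemmer('english'),
    'fr': SnowballStemmer('french'),
    '__unknown__': SnowballStemmer('english'),   # fallback added above
}
print(stemmers['fr'].stem('maisons'))            # -> 'maison'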
......@@ -47,7 +50,9 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
# 1) compute stems/lemmas
# and group if same stem/lemma
stemmers = prepare_stemmers(corpus)
supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"] \
if lang != "__unknown__" \
and lang in LANGUAGES.keys()]
# todo dict {lg => {ngrams_todo} }
todo_ngrams_per_lg = defaultdict(set)
......@@ -57,11 +62,16 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
# preloop per doc to sort ngrams by language
for doc in corpus.children('DOCUMENT'):
if doc.id not in corpus.hyperdata['skipped_docs']:
if ('language_iso2' in doc.hyperdata):
if ('language_iso2' in doc.hyperdata) \
and doc.hyperdata['language_iso2'] in supported_stemmers_lang:
lgid = doc.hyperdata['language_iso2']
else:
lgid = "__unknown__"
document.status("NGRAMS_GROUPS", error="Error: unsupported language for stemming")
document.save_hyperdata()
#corpus.hyperdata["skipped_docs"].append(doc.id)
#corpus.save_hyperdata()
# doc.ngrams is an sql query (ugly but useful intermediate step)
# FIXME: move the counting and stoplist filtering up here
for ngram_pack in doc.ngrams.all():
......
......@@ -9,7 +9,7 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
"""
@param ngrams_data a set like {('single word', 2), ('apple', 1),...}
"""
print('INTEGRATE')
#print('INTEGRATE')
# integrate ngrams
ngrams_ids = bulk_insert_ifnotexists(
model = Ngram,
......@@ -49,18 +49,28 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
source = get_resource(resource["type"])
#load only the docs that have passed the parsing without error
docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.hyperdata["skipped_docs"]]
#load available taggers for source default language
tagger_bots = {lang: load_tagger(lang)() for lang in corpus.hyperdata["languages"] if lang != "__skipped__"}
#load available taggers for the default languages of the platform
#print(LANGUAGES.keys())
tagger_bots = {lang: load_tagger(lang) for lang in corpus.hyperdata["languages"] \
if lang != "__unknown__" and lang in LANGUAGES.keys()}
supported_taggers_lang = tagger_bots.keys()
#sort docs by lang?
# for lang, tagger in tagger_bots.items():
for documents_count, document in enumerate(docs):
language_iso2 = document.hyperdata.get('language_iso2')
if language_iso2 not in supported_taggers_lang:
#print("ERROR NO language_iso2")
document.status("NGRAMS", error="Error: unsupported language for tagging")
session.add(document)
session.commit()
corpus.hyperdata["skipped_docs"].append(document.id)
corpus.save_hyperdata()
continue
else:
if language_iso2 in source["default_languages"]:
#filtering out parsing skipped_docs is probably not necessary here: already filtered out in docs?
#if document.id not in corpus.skipped_docs:
tagger = tagger_bots[language_iso2]
#print(language_iso2)
#>>> romain-stable-patch
#TODO: verify whether the document has any KEYS to index
......@@ -68,7 +78,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
try:
value = document.hyperdata[str(key)]
if not isinstance(value, str):
print("DBG wrong content in doc for key", key)
#print("DBG wrong content in doc for key", key)
continue
# get ngrams
for ngram in tagger.extract(value):
......@@ -92,11 +102,8 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
ngrams_data.add((ngram[:255], len(seqterm), ))
except:
#value not in doc
pass
# except AttributeError:
# print("ERROR NO language_iso2")
# document.status("NGRAMS", error="No lang detected skipped Ngrams")
# corpus.skipped_docs.append(document.id)
continue
# integrate ngrams and nodes-ngrams
if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
......@@ -116,12 +123,11 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
corpus.status('Ngrams', progress=documents_count+1, complete=True)
corpus.save_hyperdata()
session.commit()
except Exception as error:
corpus.status('Ngrams', error=error)
corpus.save_hyperdata()
session.commit()
raise error
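The per-document gate added to extract_ngrams boils down to a small routing step; a standalone sketch of that logic with hypothetical names (route_documents is not part of the codebase):

def route_documents(docs, supported_taggers_lang):
    '''Split documents the way extract_ngrams now does: taggable vs. skipped.'''
    taggable, skipped_ids = [], []
    for doc in docs:
        lang = doc.hyperdata.get('language_iso2')
        if lang in supported_taggers_lang:
            taggable.append(doc)
        else:
            skipped_ids.append(doc.id)   # recorded in corpus.hyperdata["skipped_docs"]
    return taggable, skipped_ids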
......
......@@ -6,7 +6,7 @@ from collections import defaultdict, Counter
from re import sub
from gargantext.util.languages import languages, detect_lang
def add_lang(languages, hyperdata, skipped_languages):
def add_lang(hyperdata, observed_languages, skipped_languages):
'''utility to add lang information
1. on language_iso2
2. on other format language_%f
......@@ -14,69 +14,50 @@ def add_lang(languages, hyperdata, skipped_languages):
'''
if "language_iso2" in hyperdata.keys():
try:
languages[hyperdata["language_iso2"]] +=1
return languages,hyperdata, skipped_languages
except KeyError:
hyperdata["error"] = "Error: unsupported language %s" %hyperdata["language_iso2"]
skipped_languages.append(hyperdata["language_iso2"])
return languages,hyperdata, skipped_languages
# this should be the responsibility of the parserbot
observed_languages.append(hyperdata["language_iso2"])
return observed_languages,skipped_languages
elif "language_iso3" in hyperdata.keys():
#convert
try:
lang = languages[hyperdata["language_iso3"]].iso2
try:
corpus.languages[lang] +=1
return languages,hyperdata, skipped_languages
except KeyError:
hyperdata["error"] = "Error: unsupported language %s" %lang
skipped_languages.append(lang)
return languages,hyperdata, skipped_languages
observed_languages.append(lang)
return observed_languages,skipped_languages
except KeyError:
print ("LANG not referenced", (hyperdata["language_iso3"]))
#skipped_languages.append(hyperdata["language_iso3"])
#hyperdata["error"] = "Error: unsupported language '%s'" %hyperdata["language_fullname"]
return languages,hyperdata, skipped_languages
skipped_languages.append(hyperdata["language_iso3"])
return observed_languages,skipped_languages
elif "language_fullname" in hyperdata.keys():
try:
#convert
lang = languages[hyperdata["language_fullname"]].iso2
try:
corpus.languages[lang] +=1
return corpus, hyperdata, skipped_languages
except KeyError:
hyperdata["error"] = "Error: unsupported language %s" %lang
skipped_languages.append(lang)
return languages,hyperdata, skipped_languages
lang = hyperdata["language_fullname"].iso2
observed_languages.append(lang)
return observed_languages,skipped_languages
except KeyError:
print ("LANG Not referenced", (hyperdata["language_fullname"]))
#hyperdata["error"] = "Error: unsupported language '%s'" %hyperdata["language_fullname"]
return languages,hyperdata, skipped_languages
skipped_languages.append(hyperdata["language_fullname"])
return observed_languages,skipped_languages
else:
print("[WARNING] no language_* found in document [parsing.py]")
#no language has been indexed
#detectlang by index_fields
#detectlang by joining on DEFAULT_INDEX_FIELDS
text = " ".join([getattr(hyperdata, k) for k in DEFAULT_INDEX_FIELDS])
if len(text) < 10:
hyperdata["error"] = "Error: no TEXT fields to index"
skipped_languages.append("__unknown__")
return languages,hyperdata, skipped_languages
#detect_lang return iso2
lang = detect_lang(text)
try:
languages[lang] += 1
return languages,hyperdata, skipped_languages
except KeyError:
hyperdata["error"] = "Error: unsupported language '%s'" %lang
skipped_languages.append(lang)
return languages,hyperdata, skipped_languages
return observed_languages,skipped_languages
else:
#detect_lang returns iso2
lang = detect_lang(text)
observed_languages.append(lang)
return observed_languages,skipped_languages
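With the new signature, callers thread two plain lists through add_lang instead of the old per-language counter dict; a sketch of the call pattern with minimal hyperdata dicts, assuming add_lang is in scope ('ger' is one of the aliases declared above):

observed_languages, skipped_languages = [], []
for hyperdata in ({'language_iso2': 'fr'}, {'language_iso3': 'ger'}):
    observed_languages, skipped_languages = add_lang(hyperdata,
                                                     observed_languages,
                                                     skipped_languages)
# observed_languages -> ['fr', 'de']; skipped_languages stays empty here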
def parse(corpus):
......@@ -97,8 +78,7 @@ def parse(corpus):
raise ValueError("Resource '%s' has no Parser" %resource["name"])
parserbot = load_parser(source)
#observed languages in default languages
languages = defaultdict.fromkeys(source["default_languages"], 0)
observed_languages = []
#skipped_languages
skipped_languages = []
#skipped docs to remember for later processing
......@@ -120,16 +100,12 @@ def parse(corpus):
hyperdata[k] = normalize_chars(hyperdata[k])
except Exception as error :
hyperdata["error"] = "Error normalize_chars"
#else:
#print("[WARNING] No %s field found in hyperdata at parsing.py" %k)
# continue
#adding lang into record hyperdata
languages, hyperdata, skipped_languages = add_lang(languages, hyperdata, skipped_languages)
observed_languages, skipped_languages = add_lang(hyperdata, observed_languages, skipped_languages)
# save as DB child
# save as corpus DB child
# ----------------
#d += 1
#print ("INSERT", d)
document = corpus.add_child(
typename = 'DOCUMENT',
name = hyperdata.get('title', '')[:255],
......@@ -143,9 +119,9 @@ def parse(corpus):
#document.status("error")
document.status('Parsing', error= document.hyperdata["error"])
document.save_hyperdata()
session.add(document)
session.commit()
#adding skipped_docs for later processing
#adding skipped_docs for later processing if error in parsing
skipped_docs.append(document.id)
#documents for this resources
session.add(corpus)
......@@ -155,25 +131,28 @@ def parse(corpus):
#print( "resource n°",i, ":", d, "docs inside this file")
#STORING AGGREGATION INFO (STATS)
#skipped_docs
corpus.skipped_docs = list(set(skipped_docs))
print(len(corpus.skipped_docs), "docs skipped")
corpus.hyperdata["skipped_docs"] = list(set(skipped_docs))
print(len(corpus.hyperdata["skipped_docs"]), "docs skipped")
#unsupported languages
skipped_langs = dict(Counter(skipped_languages))
if len(corpus.hyperdata["skipped_docs"]) > 0:
print ("in which:")
print (sum(skipped_langs.values()), "docs with unsupported lang")
#observed iso2 languages
observed_langs = dict(Counter(observed_languages))
# documents
print(corpus.children("DOCUMENT").count(), "docs parsed")
#languages INFO of corpus
print(languages.items())
corpus.language_id = sorted(languages.items(), key = lambda x: x[1], reverse=True)[0][0]
print("Default MAIN language of CORPUS", corpus.language_id)
corpus.languages = dict(languages)
corpus.languages["__skipped__"] = list(skipped_langs.keys())
print("Languages of CORPUS", corpus.languages)
#LANGUAGES INFO
print("#LANGAGES OK")
print(observed_langs)
print("#LANGUAGES UNKNOWN")
print(skipped_langs)
corpus.language_id = sorted(observed_langs.items(), key = lambda x: x[1], reverse=True)[0][0]
print("#MAIN language of the CORPUS", corpus.language_id)
corpus.hyperdata["languages"] = dict(observed_langs)
corpus.hyperdata["languages"]["__unknown__"] = list(skipped_langs.keys())
print("OBSERVED_LANGUAGES", corpus.hyperdata["languages"])
corpus.save_hyperdata()
session.commit()
#TODO: assign the main lang of the corpus to docs with unsupported languages
# for d_id in corpus.skipped_docs:
# document = session.query(Node).filter(Node.id == d_id, Node.typename == "DOCUMENT").first()
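The corpus-level aggregation above boils down to counting the per-document codes and keeping the most frequent one as the main language; a condensed sketch with made-up counts:

from collections import Counter

observed_langs = dict(Counter(['fr', 'fr', 'en']))   # {'fr': 2, 'en': 1}
skipped_langs = dict(Counter(['ro']))                 # docs with unsupported languages
main_lang = sorted(observed_langs.items(), key=lambda x: x[1], reverse=True)[0][0]
# main_lang == 'fr'; corpus.hyperdata["languages"] stores the counts plus the
# "__unknown__" list of skipped codes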
......