Commit db1b31a2 authored by c24b

Merge remote-tracking branch 'origin/romain-stable-patch' into c24b-stable

parents 7ed3dc0b 570c9fd8
@@ -122,6 +122,7 @@ class Parser:
         if language_key in hyperdata:
             try:
                 language_symbol = hyperdata[language_key]
+                if language_symbol is not None:
                     language = languages[language_symbol]
                 if language:
                     break

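Note on the hunk above: when a parser fills the language key with an explicit None, the old code attempted languages[None] and presumably relied on the surrounding try/except, while the new guard skips the lookup outright and falls through to the next candidate key. A minimal sketch, where hyperdata and languages are hypothetical stand-ins, not the project's real objects:

```python
# Hypothetical stand-ins for a document's hyperdata and the languages registry
hyperdata = {"language_iso2": None, "language_fullname": "french"}
languages = {"fr": "french", "french": "french"}

language = None
for language_key in ("language_iso2", "language_fullname"):
    if language_key in hyperdata:
        try:
            language_symbol = hyperdata[language_key]
            if language_symbol is not None:   # the guard added by this commit
                language = languages[language_symbol]
            if language:
                break
        except KeyError:
            pass

print(language)   # -> 'french', resolved via the second key instead of failing on None
```
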
@@ -26,7 +26,7 @@ def prepare_stemmers(corpus):
     and formatted
     """
     stemmers = {lang:SnowballStemmer(languages[lang].name.lower()) for lang \
-                    in corpus.languages.keys() if lang !="__skipped__"}
+                    in corpus.hyperdata['languages'].keys() if lang !="__skipped__"}
     stemmers['__unknown__'] = SnowballStemmer("english")
     return stemmers

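For reference, a small runnable sketch of what this comprehension builds, assuming the SnowballStemmer here is NLTK's (which the surrounding code suggests); corpus_languages and language_names are hypothetical stand-ins for corpus.hyperdata['languages'] and the languages registry:

```python
from nltk.stem.snowball import SnowballStemmer

corpus_languages = {"en": 12, "fr": 3}               # counts per iso2 code, as in corpus.hyperdata['languages']
language_names = {"en": "english", "fr": "french"}   # what languages[lang].name.lower() would resolve to

stemmers = {lang: SnowballStemmer(language_names[lang])
            for lang in corpus_languages if lang != "__skipped__"}
stemmers["__unknown__"] = SnowballStemmer("english") # fallback for docs with no usable language

print(stemmers["fr"].stem("chanteurs"))              # French stemming, e.g. 'chanteur'
```
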
@@ -56,7 +56,7 @@ def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
     # preloop per doc to sort ngrams by language
     for doc in corpus.children('DOCUMENT'):
-        if doc.id not in corpus.skipped_docs:
+        if doc.id not in corpus.hyperdata['skipped_docs']:
             if ('language_iso2' in doc.hyperdata):
                 lgid = doc.hyperdata['language_iso2']
             else:

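The membership test above now reads from corpus.hyperdata['skipped_docs'], which this commit fills as a dict keyed by document id. A toy version of the pre-loop with stand-in data (no database involved):

```python
from collections import defaultdict

skipped_docs = {2: True}                                   # shape of corpus.hyperdata['skipped_docs']
docs = [(1, {"language_iso2": "en"}), (2, {}), (3, {})]    # (doc id, hyperdata) stand-ins

docs_by_lang = defaultdict(list)
for doc_id, hyperdata in docs:
    if doc_id in skipped_docs:
        continue
    if 'language_iso2' in hyperdata:
        lgid = hyperdata['language_iso2']
    else:
        lgid = "__unknown__"
    docs_by_lang[lgid].append(doc_id)

print(dict(docs_by_lang))   # {'en': [1], '__unknown__': [3]}
```
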
@@ -47,29 +47,60 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
         resource = corpus.resources()[0]
         documents_count = 0
         source = get_resource(resource["type"])
-        #load available taggers for source default langage
-        docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
-        tagger_bots = {lang: load_tagger(lang)() for lang in corpus.languages if lang != "__skipped__"}
-        #sort docs by lang?
-        # for lang, tagger in tagger_bots.items():
+        # preload available taggers for corpus languages
+        tagger_bots = {}
+        skipped_languages = {}
+        for lang in corpus.hyperdata['languages']:
+            try:
+                tagger_bots[lang] = load_tagger(lang)()
+            except KeyError:
+                skipped_languages[lang] = True
+                print("WARNING skipping language:", lang)
+        # the list of todo docs
+        docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.hyperdata['skipped_docs']]
+        # go for the loop
         for documents_count, document in enumerate(docs):
             language_iso2 = document.hyperdata.get('language_iso2')
-            tagger = tagger_bots[language_iso2]
             #print(language_iso2)
+            # skip case if no tagger available
+            if language_iso2 in skipped_languages:
+                corpus.hyperdata['skipped_docs'][document.id] = True
+                corpus.save_hyperdata()
+                document.hyperdata["error"] = "Error: unsupported language"
+                document.save_hyperdata()
+                session.commit()
+                continue
+            # NORMAL CASE
+            tagger = tagger_bots[language_iso2]
             for key in keys:
-                try:
-                    value = document[str(key)]
+                key = str(key)
+                if key not in document.hyperdata:
+                    # print("DBG missing key in doc", key)
+                    # TODO test if document has no keys at all
+                    continue
+                # get a text value
+                value = document[key]
                 if not isinstance(value, str):
+                    print("DBG wrong content in doc for key", key)
                     continue
+                try:
                     # get ngrams
-                    for ngram in tagger.extract(value):
+                    ngrams = tagger.extract(value)
+                    for ngram in ngrams:
                         tokens = tuple(normalize_forms(token[0]) for token in ngram)
                         if do_subngrams:
                             # ex tokens = ["very", "cool", "exemple"]
-                            #    subterms = [['very', 'cool'],
-                            #                ['very', 'cool', 'exemple'],
-                            #                ['cool', 'exemple']]
+                            #    subterms = [['very', 'cool'],...]
                             subterms = subsequences(tokens)
                         else:
                             subterms = [tokens]

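A minimal sketch of the preload-and-skip pattern introduced in this hunk, with a stand-in registry instead of the real load_tagger: languages without a tagger are recorded in skipped_languages so that their documents can be flagged and skipped in the main loop.

```python
# Stand-in registry; the real code calls load_tagger(lang)() from the project's tagger module.
AVAILABLE_TAGGERS = {"en": lambda: "english-tagger", "fr": lambda: "french-tagger"}

def load_tagger(lang):
    return AVAILABLE_TAGGERS[lang]       # raises KeyError for unsupported languages

corpus_languages = ["en", "fr", "xx"]    # e.g. corpus.hyperdata['languages'].keys()

tagger_bots = {}
skipped_languages = {}
for lang in corpus_languages:
    try:
        tagger_bots[lang] = load_tagger(lang)()
    except KeyError:
        skipped_languages[lang] = True
        print("WARNING skipping language:", lang)

print(tagger_bots)         # {'en': 'english-tagger', 'fr': 'french-tagger'}
print(skipped_languages)   # {'xx': True}
```
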
@@ -81,13 +112,11 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                             nodes_ngrams_count[(document.id, ngram)] += 1
                             # add fields : terms n
                             ngrams_data.add((ngram[:255], len(seqterm), ))
-                except:
-                    #value not in doc
+                except Exception as e:
+                    print('NGRAMS EXTRACTION skipping doc %i because of unknown error:' % document.id, str(e))
+                    # TODO add info to document.hyperdata['error']
                     pass
-        # except AttributeError:
-        #     print("ERROR NO language_iso2")
-        #     document.status("NGRAMS", error="No lang detected skipped Ngrams")
-        #     corpus.skipped_docs.append(document.id)
         # integrate ngrams and nodes-ngrams
         if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
             _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)

@@ -105,9 +134,13 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
                 nodes_ngrams_count.clear()
                 ngrams_data.clear()
+        corpus.hyperdata['skipped_languages'] = skipped_languages
+        corpus.save_hyperdata()
         corpus.status('Ngrams', progress=documents_count+1, complete=True)
         corpus.save_hyperdata()
         session.commit()
     except Exception as error:
         corpus.status('Ngrams', error=error)
         corpus.save_hyperdata()

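Taken together, parse() and extract_ngrams() now keep their bookkeeping in corpus.hyperdata instead of transient attributes (corpus.languages, corpus.skipped_docs). A plausible shape of those entries after a run, with made-up values for illustration only:

```python
corpus_hyperdata_excerpt = {
    "languages":         {"en": 240, "fr": 12},     # counts observed while parsing
    "skipped_docs":      {1842: True, 1907: True},  # doc ids flagged during parsing / extraction
    "skipped_languages": {"xx": True},              # languages with no available tagger
}
```

Downstream steps (prepare_stemmers, compute_groups) read these same keys, as the hunks above show.
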
@@ -10,31 +10,30 @@ def parse(corpus):
     try:
         documents_count = 0
         corpus.status('Docs', progress=0)
-        #1 corpus => 1 resource
+        # shortcut to hyperdata's list of added resources (packs of docs)
         resources = corpus.resources()
-        #get the sources capabilities for a given corpus resource
-        sources = [get_resource(resource["type"]) for resource in corpus.resources() if resource["extracted"] is False]
-        if len(sources) == 0:
-            #>>> documents have already been parsed?????
-            return
-        if len(sources) > 0:
-            #>>> necessairement 1 corpus = 1 source dans l'archi actuelle
-            source = sources[0]
-            resource = resources[0]
-        #source.extend(resource)
-        if source["parser"] is None:
+        # vars to gather some infos during parsing (=> will end up in hyperdata)
+        skipped_docs = defaultdict(bool)
+        observed_languages = defaultdict(int)
+        # each resource contains a path to a file with the docs
+        for i, resource in enumerate(resources):
+            # we'll only want the resources that have never been extracted
+            if resource["extracted"]:
+                continue
+            # the sourcetype's infos
+            source_infos = get_resource(resource['type'])
+            if source_infos["parser"] is None:
                 #corpus.status(error)
                 raise ValueError("Resource '%s' has no Parser" %resource["name"])
             else:
-                #observed langages in corpus docs
-                corpus.languages = defaultdict.fromkeys(source["default_languages"], 0)
-                #remember the skipped docs in parsing
-                skipped_languages = []
-                corpus.skipped_docs = []
-                session.add(corpus)
-                session.commit()
-                #load the corresponding parser
-                parserbot = load_parser(source)
+                # load the corresponding parser
+                parserbot = load_parser(source_infos)
                 # extract and insert documents from resource.path into database
                 default_lang_field = ["language_"+l for l in ["iso2", "iso3", "full_name"]]

@@ -47,15 +46,10 @@ def parse(corpus):
                     except Exception as error :
                         hyperdata["error"] = "Error normalize_chars"
-                    #any parser should implement a language_iso2
+                    # any parserbot should implement a language_iso2
                     if "language_iso2" in hyperdata.keys():
-                        try:
-                            corpus.languages[hyperdata["language_iso2"]] +=1
-                        except KeyError:
-                            hyperdata["error"] = "Error: unsupported language"
-                            skipped_languages.append(hyperdata["language_iso2"])
+                        observed_languages[hyperdata["language_iso2"]] +=1
                     # this should be the responsability of the parserbot
                     # elif "language_iso3" in hyperdata.keys():
                     #     try:

@@ -66,22 +60,15 @@ def parse(corpus):
                     else:
                         print("[WARNING] no language_iso2 found in document [parsing.py]")
-                        #no language have been indexed
-                        #detectlang by index_fields
-                        text = " ".join([getattr(hyperdata, k) for k in DEFAULT_INDEX_FIELDS])
+                        # no language has been found by parserbot
+                        # => detectlang on index_fields
+                        text = " ".join([getattr(hyperdata, k, '') for k in DEFAULT_INDEX_FIELDS])
                         if len(text) < 10:
                             hyperdata["error"] = "Error: no TEXT fields to index"
-                            skipped_languages.append("__unknown__")
-                        hyperdata["language_iso2"] = detect_lang(text)
-                        try:
-                            corpus.languages[hyperdata["language_iso2"]] += 1
-                            corpus.languages[hyperdata["language_iso2"]] +=1
-                        except KeyError:
-                            hyperdata["error"] = "Error: unsupported language"
-                            skipped_languages.append(hyperdata["language_iso2"])
+                        else:
+                            predicted_lang = detect_lang(text)
+                            hyperdata["language_iso2"] = predicted_lang
+                            observed_languages[predicted_lang] += 1
                         # save as DB child
                         # ----------------

@@ -97,8 +84,10 @@ def parse(corpus):
                         document.status('Parsing', error= document.hyperdata["error"])
                         document.save_hyperdata()
                         session.commit()
-                        #adding skipped_docs for later processsing
-                        corpus.skipped_docs.append(document.id)
+                        # adding to skipped_docs for later processing
+                        skipped_docs[document.id] = True
                     documents_count += 1
                     # logging

@@ -109,19 +98,27 @@ def parse(corpus):
                 session.commit()
                 # update info about the resource
-                resource['extracted'] = True
-                # add a corpus-level info about languages adding a __skipped__ info
-                corpus.languages['__skipped__'] = Counter(skipped_languages)
+                corpus.hyperdata['resources'][i]['extracted'] = True
+                corpus.save_hyperdata()
+                session.commit()
+        print("PARSING:", len(skipped_docs), "docs skipped")
         print("LANGUES")
-        for n in corpus.languages.items():
+        for n in observed_languages.items():
             print(n)
-        #TO DO: give the main language of the corpus to unsupported lang docs
-        print(len(corpus.skipped_docs), "docs skipped")
+        # add the infos to hyperdata at the end
+        corpus.hyperdata['skipped_docs'] = skipped_docs
+        corpus.hyperdata['languages'] = observed_languages
+        corpus.save_hyperdata()
         # commit all changes
         corpus.status('Docs', progress=documents_count, complete=True)
         corpus.save_hyperdata()
         session.add(corpus)
         session.commit()
     except Exception as error:
         corpus.status('Docs', error=error)
         corpus.save_hyperdata()

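A self-contained sketch of the new language bookkeeping in parse(): languages reported by the parserbot are counted directly; otherwise the indexable text is run through a detector, and documents with nothing to index are flagged. detect_lang is stubbed here, and plain dicts stand in for the real hyperdata objects:

```python
from collections import defaultdict

def detect_lang(text):                  # stand-in for the project's real detector
    return "en"

observed_languages = defaultdict(int)
skipped_docs = defaultdict(bool)

docs = [
    (1, {"language_iso2": "fr", "title": "Bonjour"}),
    (2, {"title": "x"}),                                  # too short to detect anything
    (3, {"title": "a long enough abstract to detect"}),
]

for doc_id, hyperdata in docs:
    if "language_iso2" in hyperdata:
        observed_languages[hyperdata["language_iso2"]] += 1
    else:
        text = " ".join(str(hyperdata.get(k, "")) for k in ("title", "abstract"))
        if len(text) < 10:
            hyperdata["error"] = "Error: no TEXT fields to index"
            skipped_docs[doc_id] = True                   # flagged for later processing
        else:
            lang = detect_lang(text)
            hyperdata["language_iso2"] = lang
            observed_languages[lang] += 1

# at the end of parse() these end up in corpus.hyperdata['languages'] / ['skipped_docs']
print(dict(observed_languages))   # {'fr': 1, 'en': 1}
print(dict(skipped_docs))         # {2: True}
```
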