Commit e5d4e175 authored by c24b

FIX TAGERBOT default_lang

parent dbb66340
@@ -52,62 +52,66 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
         tagger_bots = {lang: load_tagger(lang)() for lang in corpus.languages if lang != "__skipped__"}
         #sort docs by lang?
         # for lang, tagger in tagger_bots.items():
         for documents_count, document in enumerate(docs):
             language_iso2 = document.hyperdata.get('language_iso2')
-            tagger = tagger_bots[language_iso2]
-            #print(language_iso2)
-            for key in keys:
-                try:
-                    value = document[str(key)]
-                    if not isinstance(value, str):
-                        continue
-                    # get ngrams
-                    for ngram in tagger.extract(value):
-                        tokens = tuple(normalize_forms(token[0]) for token in ngram)
-                        if do_subngrams:
-                            # ex tokens = ["very", "cool", "exemple"]
-                            #    subterms = [['very', 'cool'],
-                            #                ['very', 'cool', 'exemple'],
-                            #                ['cool', 'exemple']]
-
-                            subterms = subsequences(tokens)
-                        else:
-                            subterms = [tokens]
-
-                        for seqterm in subterms:
-                            ngram = ' '.join(seqterm)
-                            if len(ngram) > 1:
-                                # doc <=> ngram index
-                                nodes_ngrams_count[(document.id, ngram)] += 1
-                                # add fields : terms n
-                                ngrams_data.add((ngram[:255], len(seqterm), ))
-                except:
-                    #value not in doc
-                    pass
-            # except AttributeError:
-            #     print("ERROR NO language_iso2")
-            #     document.status("NGRAMS", error="No lang detected skipped Ngrams")
-            #     corpus.skipped_docs.append(document.id)
-            # integrate ngrams and nodes-ngrams
-            if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
-                _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
-                nodes_ngrams_count.clear()
-                ngrams_data.clear()
-            if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
-                corpus.status('Ngrams', progress=documents_count+1)
-                corpus.save_hyperdata()
-                session.add(corpus)
-                session.commit()
+            if language_iso2 in source["default_languages"]:
+                #filtering out skipped_docsof parsing
+                #if document.id not in corpus.skipped_docs:
+                tagger = tagger_bots[language_iso2]
+                #print(language_iso2)
+                for key in keys:
+                    try:
+                        value = document[str(key)]
+                        if not isinstance(value, str):
+                            continue
+                        # get ngrams
+                        for ngram in tagger.extract(value):
+                            tokens = tuple(normalize_forms(token[0]) for token in ngram)
+                            if do_subngrams:
+                                # ex tokens = ["very", "cool", "exemple"]
+                                #    subterms = [['very', 'cool'],
+                                #                ['very', 'cool', 'exemple'],
+                                #                ['cool', 'exemple']]
+
+                                subterms = subsequences(tokens)
+                            else:
+                                subterms = [tokens]
+
+                            for seqterm in subterms:
+                                ngram = ' '.join(seqterm)
+                                if len(ngram) > 1:
+                                    # doc <=> ngram index
+                                    nodes_ngrams_count[(document.id, ngram)] += 1
+                                    # add fields : terms n
+                                    ngrams_data.add((ngram[:255], len(seqterm), ))
+                    except:
+                        #value not in doc
+                        pass
+                # except AttributeError:
+                #     print("ERROR NO language_iso2")
+                #     document.status("NGRAMS", error="No lang detected skipped Ngrams")
+                #     corpus.skipped_docs.append(document.id)
+                # integrate ngrams and nodes-ngrams
+                if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
+                    _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
+                    nodes_ngrams_count.clear()
+                    ngrams_data.clear()
+                if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
+                    corpus.status('Ngrams', progress=documents_count+1)
+                    corpus.save_hyperdata()
+                    session.add(corpus)
+                    session.commit()
         # integrate ngrams and nodes-ngrams (le reste)
         if len(nodes_ngrams_count) > 0:
             _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
             nodes_ngrams_count.clear()
             ngrams_data.clear()
         corpus.status('Ngrams', progress=documents_count+1, complete=True)
         corpus.save_hyperdata()
         session.commit()
     except Exception as error:
         corpus.status('Ngrams', error=error)
         corpus.save_hyperdata()
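Note: the fix above gates ngram extraction on the source's supported languages, so the tagger lookup only runs for a language_iso2 that actually has a loaded tagger; before, tagger_bots[language_iso2] was evaluated for every document and could raise a KeyError for languages not in corpus.languages. A minimal, self-contained sketch of that guard; the names and sample data below are illustrative, not from the commit:

from collections import defaultdict

default_languages = {'en', 'fr'}        # stands in for source["default_languages"]
docs_langs = ['en', 'fr', 'nl', 'en']   # stands in for each document's language_iso2
nodes_ngrams_count = defaultdict(int)   # (doc_id, ngram) -> count, as in the diff

for doc_id, language_iso2 in enumerate(docs_langs):
    if language_iso2 in default_languages:
        # safe: a tagger exists for this language
        nodes_ngrams_count[(doc_id, 'dummy ngram')] += 1
    # documents in other languages simply fall through and are skipped

print(dict(nodes_ngrams_count))   # only the 'en'/'fr' documents were indexed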
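For reference, the do_subngrams branch kept above expands each extracted ngram into its contiguous sub-ngrams, as its inline comment illustrates. A hypothetical re-implementation that reproduces exactly the comment's example; the project's real subsequences helper may behave differently (for instance, it may also yield single tokens):

def subsequences(tokens):
    # all contiguous slices of length >= 2, matching the comment's example
    n = len(tokens)
    return [tokens[i:j] for i in range(n) for j in range(i + 2, n + 1)]

print(subsequences(["very", "cool", "exemple"]))
# -> [['very', 'cool'], ['very', 'cool', 'exemple'], ['cool', 'exemple']]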
@@ -146,14 +146,7 @@ def parse(corpus):
                 session.commit()
                 #adding skipped_docs for later processsing
                 skipped_docs.append(document.id)
         #documents for this resources
         session.add(corpus)
         session.commit()
     # update info about the resource
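The context lines above record documents that could not be parsed so they can be handled later. A tiny illustrative sketch of that bookkeeping, with hypothetical data that is not from the commit:

skipped_docs = []
parse_results = [(101, True), (102, False), (103, False)]   # (document id, parsed ok?)

for document_id, parsed_ok in parse_results:
    if not parsed_ok:
        # adding skipped docs for later processing
        skipped_docs.append(document_id)

print(len(skipped_docs), "docs skipped")   # 2 docs skipped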
@@ -161,20 +154,27 @@ def parse(corpus):
         #print( "resource n°",i, ":", d, "docs inside this file")
-    # add a corpus-level info about languages
-    print(len(skipped_docs), "docs skipped")
+    #adding a __skipped__ info
+    #skipped_docs
+    corpus.skipped_docs = list(set(skipped_docs))
+    print(len(corpus.skipped_docs), "docs skipped")
+    skipped_langs = dict(Counter(skipped_languages))
+    if len(corpus.skipped_docs) > 0:
+        print ("INFO in which:")
+        print (sum(skipped_langs.values()), "docs with unsupported lang")
     print(corpus.children("DOCUMENT").count(), "docs parsed")
-    #main language of the corpus
+    #language of corpus
     print(languages.items())
     corpus.language_id = sorted(languages.items(), key = lambda x: x[1], reverse=True)[0][0]
-    print(corpus.language_id)
-    languages['__skipped__'] = dict(Counter(skipped_languages))
-    corpus.languages = languages
-    corpus.skipped_docs = list(set(skipped_docs))
+    print("Default MAIN language of CORPUS", corpus.language_id)
+    corpus.languages = dict(languages)
+    corpus.languages["__skipped__"] = list(skipped_langs.keys())
+    print("Languages of CORPUS", corpus.languages)
     corpus.save_hyperdata()
     session.commit()
-    if len(corpus.skipped_docs) > 0:
-        print (sum(languages["__skipped__"].values()), "docs with unsupported lang")
     #assign main lang of the corpus to unsupported languages docs
     # for d_id in corpus.skipped_docs:
     #     document = session.query(Node).filter(Node.id == d_id, Node.typename == "DOCUMENT").first()
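The reworked tail of parse() deduplicates the skipped documents, counts the skipped languages, and stores them under a __skipped__ key next to the supported-language counts, while the corpus main language stays the most frequent one. A self-contained sketch of the same bookkeeping, using hypothetical counts rather than real parse output:

from collections import Counter

languages = {'en': 12, 'fr': 3}           # docs per supported language, as counted by parse()
skipped_languages = ['nl', 'nl', 'pt']    # language_iso2 of docs with no supported tagger
skipped_docs = [41, 42, 42]               # ids of docs skipped during parsing

skipped_docs = list(set(skipped_docs))               # deduplicate: [41, 42]
skipped_langs = dict(Counter(skipped_languages))     # {'nl': 2, 'pt': 1}

# main language of the corpus = the most frequent one
language_id = sorted(languages.items(), key=lambda x: x[1], reverse=True)[0][0]

corpus_languages = dict(languages)
corpus_languages['__skipped__'] = list(skipped_langs.keys())

print("Default MAIN language of CORPUS", language_id)
print("Languages of CORPUS", corpus_languages)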