Commit cd9b6308 authored by Romain Loth's avatar Romain Loth

fix bulk_insert error on associations: new solution mixing first idea about...

fix bulk_insert error on associations: new solution mixing first idea about remainder of loop and new idea about one 'for' loop too much on the languages
parent ab88ba32
...@@ -51,58 +51,63 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND ...@@ -51,58 +51,63 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs] docs = [doc for doc in corpus.children('DOCUMENT') if doc.id not in corpus.skipped_docs]
tagger_bots = {lang: load_tagger(lang)() for lang in corpus.languages if lang != "__skipped__"} tagger_bots = {lang: load_tagger(lang)() for lang in corpus.languages if lang != "__skipped__"}
#sort docs by lang? #sort docs by lang?
for lang, tagger in tagger_bots.items(): # for lang, tagger in tagger_bots.items():
for documents_count, document in enumerate(docs): for documents_count, document in enumerate(docs):
language_iso2 = document.hyperdata.get('language_iso2', lang) language_iso2 = document.hyperdata.get('language_iso2')
#print(language_iso2) tagger = tagger_bots[language_iso2]
for key in keys: #print(language_iso2)
try: for key in keys:
value = document[str(key)] try:
if not isinstance(value, str): value = document[str(key)]
continue if not isinstance(value, str):
# get ngrams continue
for ngram in tagger.extract(value): # get ngrams
tokens = tuple(normalize_forms(token[0]) for token in ngram) for ngram in tagger.extract(value):
if do_subngrams: tokens = tuple(normalize_forms(token[0]) for token in ngram)
# ex tokens = ["very", "cool", "exemple"] if do_subngrams:
# subterms = [['very', 'cool'], # ex tokens = ["very", "cool", "exemple"]
# ['very', 'cool', 'exemple'], # subterms = [['very', 'cool'],
# ['cool', 'exemple']] # ['very', 'cool', 'exemple'],
# ['cool', 'exemple']]
subterms = subsequences(tokens)
else: subterms = subsequences(tokens)
subterms = [tokens] else:
subterms = [tokens]
for seqterm in subterms:
ngram = ' '.join(seqterm) for seqterm in subterms:
if len(ngram) > 1: ngram = ' '.join(seqterm)
# doc <=> ngram index if len(ngram) > 1:
nodes_ngrams_count[(document.id, ngram)] += 1 # doc <=> ngram index
# add fields : terms n nodes_ngrams_count[(document.id, ngram)] += 1
ngrams_data.add((ngram[:255], len(seqterm), )) # add fields : terms n
except: ngrams_data.add((ngram[:255], len(seqterm), ))
#value not in doc except:
pass #value not in doc
# except AttributeError: pass
# print("ERROR NO language_iso2") # except AttributeError:
# document.status("NGRAMS", error="No lang detected skipped Ngrams") # print("ERROR NO language_iso2")
# corpus.skipped_docs.append(document.id) # document.status("NGRAMS", error="No lang detected skipped Ngrams")
# integrate ngrams and nodes-ngrams # corpus.skipped_docs.append(document.id)
if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE: # integrate ngrams and nodes-ngrams
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor) if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
nodes_ngrams_count.clear() _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
ngrams_data.clear() nodes_ngrams_count.clear()
if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0: ngrams_data.clear()
corpus.status('Ngrams', progress=documents_count+1) if documents_count % BATCH_NGRAMSEXTRACTION_SIZE == 0:
corpus.save_hyperdata() corpus.status('Ngrams', progress=documents_count+1)
session.add(corpus)
session.commit()
else:
# integrate ngrams and nodes-ngrams
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
corpus.status('Ngrams', progress=documents_count+1, complete=True)
corpus.save_hyperdata() corpus.save_hyperdata()
session.add(corpus)
session.commit() session.commit()
# integrate ngrams and nodes-ngrams (le reste)
if len(nodes_ngrams_count) > 0:
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
nodes_ngrams_count.clear()
ngrams_data.clear()
corpus.status('Ngrams', progress=documents_count+1, complete=True)
corpus.save_hyperdata()
session.commit()
except Exception as error: except Exception as error:
corpus.status('Ngrams', error=error) corpus.status('Ngrams', error=error)
corpus.save_hyperdata() corpus.save_hyperdata()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment