diff --git a/gargantext/util/toolchain/parsing.py b/gargantext/util/toolchain/parsing.py index c699df21ac0eb1ea29811c9842e9bbb42d50f489..bc18c6fe1716eacf3ecc368ceb1fe6884c3e207d 100644 --- a/gargantext/util/toolchain/parsing.py +++ b/gargantext/util/toolchain/parsing.py @@ -138,13 +138,20 @@ def parse(corpus): #adding skipped_docs for later processsing if error in parsing skipped_docs.append(document.id) - #documents for this resources - session.add(corpus) - session.commit() + + if documents_count % BATCH_PARSING_SIZE == 0: + corpus.status('Docs', progress=documents_count) + corpus.save_hyperdata() + session.add(corpus) + session.commit() + + # update info about the resource resource['extracted'] = True #print( "resource n°",i, ":", d, "docs inside this file") - + #finally store documents for this corpus + session.add(corpus) + session.commit() #STORING AGREGATIONS INFO (STATS) #skipped_docs