Commit 6815bcfb authored by Romain Loth's avatar Romain Loth

ngrams reindexing: add timer (diagnostic: my function is too slow, negative...

ngrams reindexing: add timer (diagnostic: my function is too slow, with a negative effect on list importing and on adding free annotations)
parent 6fc3df6d
......@@ -23,6 +23,8 @@ from gargantext.util.db import bulk_insert_ifnotexists # £TODO debug
from sqlalchemy import distinct
from re import findall, IGNORECASE
from gargantext.util.toolchain.main import t # timer
# TODO from gargantext.constants import LIST_OF_KEYS_TO_INDEX = title, abstract
def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
......@@ -40,6 +42,8 @@ def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
@param corpus: the CORPUS node
@param keys: the hyperdata fields to index
# FIXME too slow: index_new_ngrams should be faster via tsvector on DB
"""
# retrieve *all* the ngrams from our list
......@@ -56,7 +60,11 @@ def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
node_ngram_to_write = {}
# loop through the docs and their text fields
for doc in corpus.children('DOCUMENT'):
for (i, doc) in enumerate(corpus.children('DOCUMENT')):
if (i % 100 == 0):
print('CORPUS #%d: [%s] ngrams_addition: doc %i' % (corpus.id, t(), i))
print()
# a new empty counting subdict
node_ngram_to_write[doc.id] = {}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment