Commit 323e3d9d authored by Romain Loth

suggestions for faster 'integrate' of ngrams

parent b90eb786
...@@ -8,13 +8,15 @@ from gargantext.util.scheduling import scheduled ...@@ -8,13 +8,15 @@ from gargantext.util.scheduling import scheduled
def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor): def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
""" """
@param ngrams_data a set like {('single word', 2), ('apple', 1),...} @param ngrams_data a set like {('single word', 2), ('apple', 1),...}
£TODO: load whole word dictionary in ram and check existence before inserting to db => sequential insert => probably faster!
""" """
print('INTEGRATE') print('INTEGRATE')
# integrate ngrams # integrate ngrams (aka new words)
ngrams_ids = bulk_insert_ifnotexists( ngrams_ids = bulk_insert_ifnotexists(
model = Ngram, model = Ngram, # todo type should :str ~~> :str|:re) !!!
uniquekey = 'terms', uniquekey = 'terms', # todo col 'terms' should be renamed 'form' ?
fields = ('terms', 'n'), fields = ('terms', 'n'), # todo replace by type ?
data = ngrams_data, data = ngrams_data,
cursor = cursor, cursor = cursor,
) )
...@@ -77,6 +79,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND ...@@ -77,6 +79,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
#print(language_iso2) #print(language_iso2)
#>>> romain-stable-patch #>>> romain-stable-patch
#to do verify if document has no KEYS to index #to do verify if document has no KEYS to index
# eg: use set intersect (+ loop becomes direct! with no continue)
for key in keys: for key in keys:
try: try:
value = document.hyperdata[str(key)] value = document.hyperdata[str(key)]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment