Commit ce9e9de3 authored by Romain Loth's avatar Romain Loth

[FIX] new ngram indexing: fix cases where an ngram was already partially indexed (e.g. if extraction was only correct in some documents) (todo: adapt bulk_insert_ifnotexists to handle cases like this)
parent 85a3f8a0
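
A note on the failure mode this commit addresses (editorial sketch, not part of the diff): `NodeNgram` relations are keyed by the pair `(node_id, ngram_id)`, so blindly re-inserting relations for ngrams that were already indexed in *some* documents violates the unique constraint and aborts the whole batch. A minimal illustration under that assumption (the helper name `naive_reindex` is hypothetical):

```python
# Hypothetical sketch of the failure mode, assuming NodeNgram is keyed
# by the composite (node_id, ngram_id) as the filtering below implies.
from sqlalchemy.exc import IntegrityError

from gargantext.models import NodeNgram
from gargantext.util.db import session

def naive_reindex(rows):
    """rows: iterable of (node_id, ngram_id, weight) tuples."""
    for node_id, ngram_id, weight in rows:
        session.add(NodeNgram(node_id=node_id, ngram_id=ngram_id, weight=weight))
    try:
        session.commit()
    except IntegrityError:
        # raised as soon as any (node_id, ngram_id) pair already exists,
        # e.g. when a previous extraction succeeded for only part of the
        # corpus => the whole batch is rolled back
        session.rollback()
        raise
```

Hence the strategy below: collect the already-indexed pairs first, then filter them out before a plain `bulk_insert`.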
@@ -19,6 +19,7 @@ procedure:
 from gargantext.models import Ngram, Node, NodeNgram
 from gargantext.util.db import session, bulk_insert
+from gargantext.util.db import bulk_insert_ifnotexists   # £TODO debug
 from sqlalchemy import distinct
 from re import findall, IGNORECASE
@@ -41,20 +42,13 @@ def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
     @param keys: the hyperdata fields to index
     """
-    # check the ngrams we won't process (those that were already indexed)
-    indexed_ngrams_subquery = (session
-        .query(distinct(NodeNgram.ngram_id))
-        .join(Node, Node.id == NodeNgram.node_id)
-        .filter(Node.parent_id == corpus.id)
-        .filter(Node.typename == 'DOCUMENT')
-        .subquery()
-    )
-    # retrieve the ngrams from our list, filtering out the already indexed ones
+    # retrieve *all* the ngrams from our list
+    # (even if some relations may be already indexed
+    #  b/c they were perhaps not extracted in all docs
+    #  => we'll use already_indexed later)
     todo_ngrams = (session
        .query(Ngram)
        .filter(Ngram.id.in_(ngram_ids))
-       .filter(~ Ngram.id.in_(indexed_ngrams_subquery))
        .all()
     )
@@ -90,22 +84,49 @@ def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
             else:
                 node_ngram_to_write[doc.id][ngram.id] += n_occs

+    # debug
+    # print("new node_ngrams before filter:", node_ngram_to_write)
+
+    # check the relations we won't insert (those that were already indexed)
+    # NB costly but currently impossible with bulk_insert_ifnotexists
+    #    b/c double uniquekey
+    already_indexed = (session
+        .query(NodeNgram.node_id, NodeNgram.ngram_id)
+        .join(Node, Node.id == NodeNgram.node_id)
+        .filter(Node.parent_id == corpus.id)
+        .filter(Node.typename == 'DOCUMENT')
+        .all()
+    )
+    filter_out = {(nd_id, ng_id) for (nd_id, ng_id) in already_indexed}
+    # POSSIBLE update those that are filtered out if wei_previous != wei
+
     # integrate all at the end
     my_new_rows = []
     add_new_row = my_new_rows.append
     for doc_id in node_ngram_to_write:
         for ngram_id in node_ngram_to_write[doc_id]:
-            wei = node_ngram_to_write[doc_id][ngram_id]
-            add_new_row([doc_id, ngram_id, wei])
+            if (doc_id, ngram_id) not in filter_out:
+                wei = node_ngram_to_write[doc_id][ngram_id]
+                add_new_row([doc_id, ngram_id, wei])
     del node_ngram_to_write

+    # debug
+    # print("new node_ngrams after filter:", my_new_rows)
+
     bulk_insert(
         table = NodeNgram,
         fields = ('node_id', 'ngram_id', 'weight'),
         data = my_new_rows
     )
+    # bulk_insert_ifnotexists(
+    #     model = NodeNgram,
+    #     uniquekey = ('node_id','ngram_id'),   <= currently impossible
+    #     fields = ('node_id', 'ngram_id', 'weight'),
+    #     data = my_new_rows
+    # )

     n_added = len(my_new_rows)
     print("index_new_ngrams: added %i new NodeNgram rows" % n_added)