Commit 24c99bbe authored by Romain Loth's avatar Romain Loth

[FEAT] import/export terms table: previously unindexed ngrams are indexed at import

parent 4cb382da
......@@ -21,6 +21,9 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO
from gargantext.util.toolchain.parsing import normalize_chars
from gargantext.util.toolchain.ngrams_extraction import normalize_forms
# merge will also index the new ngrams in the docs of the corpus
from gargantext.util.toolchain.ngrams_addition import index_new_ngrams
from sqlalchemy.sql import exists
from os import path
from csv import writer, reader, QUOTE_MINIMAL
......@@ -483,7 +486,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
this_row_forms = ''
# string normalizations
this_row_label = normalize_terms(normalize_chars(this_row_label))
this_row_label = normalize_forms(normalize_chars(this_row_label))
# except:
# if i == 0:
......@@ -521,7 +524,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
for raw_term_str in this_row_forms.split(group_delimiter):
# each subform is also like an ngram declaration
term_str = normalize_terms(normalize_chars(raw_term_str))
term_str = normalize_forms(normalize_chars(raw_term_str))
imported_unique_ngramstrs[term_str] = True
imported_nodes_ngrams[this_list_type].append(term_str)
......@@ -559,6 +562,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
# print(new_ngrams_ids)
# print(imported_nodes_ngrams)
# ======== Import into lists =========
# 3 x abstract lists + 1 translations
......@@ -632,11 +636,8 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
seront remis dans la main à la fin)
NB: Uses group_tools.group_union() to merge the synonym links.
FIXME: new terms created at import_ngramlists() can now be added to lists
but are never added to docs
Uses ngrams_addition.index_new_ngrams() to also add new ngrams to the docs
"""
# log to send back to client-side (lines will be joined)
my_log = []
......@@ -656,6 +657,20 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
{'key': 'map', 'name':"MAPLIST"} # lid = 2
]
# ======== Index the new ngrams in the docs =========
all_possibly_new_ngram_ids = []
collect = all_possibly_new_ngram_ids.append
for lid, info in enumerate(linfos):
list_type = info['key']
if list_type in new_lists:
for ng_id in new_lists[list_type].items:
collect(ng_id)
n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
my_log.append("MERGE: added %i new ngram occurrences in docs" % n_added)
# ======== Get the old lists =========
old_lists = {}
......
"""
Module for raw indexing a totally new ngram
=> creates new (doc_node <-> new_ngram) relations in NodeNgram
use cases:
- from annotation view user selects a free segment of text to make a new ngram
- at list import, any new list can contain ngrams that've never been extracted
prerequisite:
- normalize_chars(new_ngram_str)
- normalize_forms(new_ngram_str)
- add the new ngram to `ngrams` table
procedure:
- simple regexp search of the ngram string => addition to NodeNgram
/!\ -> morphological variants are NOT considered (ex plural or declined forms)
"""
from re import compile as re_compile, escape, findall, IGNORECASE

from sqlalchemy import distinct

from gargantext.models import Ngram, Node, NodeNgram
from gargantext.util.db import session, bulk_insert
# TODO from gargantext.constants import LIST_OF_KEYS_TO_INDEX = title, abstract
def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
    """
    Find occurrences of some ngrams for every document of the given corpus
    + insert them in the NodeNgram table.

    @param ngram_ids: a list of ids for Ngram objects
                      (we assume they already went through normalizations
                       and they were already added to the Ngrams table
                       and optionally to some of the lists like MAPLIST)
                      (but we can't know if they were previously indexed
                       in the corpus)
    @param corpus: the CORPUS node
    @param keys: the hyperdata fields to index
    @return: the number of new NodeNgram rows inserted
    """
    # nothing to look for => nothing to insert (skips the DB round-trip)
    if not ngram_ids:
        return 0

    # check the ngrams we won't process (those that were already indexed)
    indexed_ngrams_subquery = (session
                                .query(distinct(NodeNgram.ngram_id))
                                .join(Node, Node.id == NodeNgram.node_id)
                                .filter(Node.parent_id == corpus.id)
                                .filter(Node.typename == 'DOCUMENT')
                                .subquery()
                                )

    # retrieve the ngrams from our list, filtering out the already indexed ones
    todo_ngrams = (session
                    .query(Ngram)
                    .filter(Ngram.id.in_(ngram_ids))
                    .filter(~ Ngram.id.in_(indexed_ngrams_subquery))
                    .all()
                    )

    # build regexp : "british" => r'\bbritish\b'
    # compiled once per ngram for the whole corpus (not once per doc field)
    # escape() protects terms containing regexp metacharacters (ex: "c++")
    todo_patterns = [(ngram.id, re_compile(r'\b%s\b' % escape(ngram.terms),
                                           IGNORECASE))
                     for ngram in todo_ngrams]

    # result dict: {doc_node_id: {ngram_id: occurrence_count}}
    node_ngram_to_write = {}

    # loop through the docs and their text fields
    for doc in corpus.children('DOCUMENT'):
        # a new empty counting subdict
        node_ngram_to_write[doc.id] = {}

        for key in keys:
            # a text field
            text = doc.hyperdata.get(key, None)

            if not isinstance(text, str):
                # print("WARN: doc %i has no text in field %s" % (doc.id, key))
                continue

            for ng_id, pattern in todo_patterns:
                # --------------------------------------- find ---
                n_occs = len(pattern.findall(text))
                # -----------------------------------------------

                # save the count results (summed over the hyperdata fields)
                if n_occs > 0:
                    if ng_id not in node_ngram_to_write[doc.id]:
                        node_ngram_to_write[doc.id][ng_id] = n_occs
                    else:
                        node_ngram_to_write[doc.id][ng_id] += n_occs

    # integrate all at the end: flatten the counting dict into insert rows
    my_new_rows = []
    add_new_row = my_new_rows.append
    for doc_id in node_ngram_to_write:
        for ngram_id in node_ngram_to_write[doc_id]:
            wei = node_ngram_to_write[doc_id][ngram_id]
            add_new_row([doc_id, ngram_id, wei])
    del node_ngram_to_write

    bulk_insert(
        table = NodeNgram,
        fields = ('node_id', 'ngram_id', 'weight'),
        data = my_new_rows
    )

    n_added = len(my_new_rows)
    print("index_new_ngrams: added %i new NodeNgram rows" % n_added)

    return n_added
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment