Commit 24c99bbe authored by Romain Loth's avatar Romain Loth

[FEAT] import/export terms table: previously unindexed ngrams are indexed at import

parent 4cb382da
......@@ -21,6 +21,9 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO
from gargantext.util.toolchain.parsing import normalize_chars
from gargantext.util.toolchain.ngrams_extraction import normalize_forms
# merge will also index the new ngrams in the docs of the corpus
from gargantext.util.toolchain.ngrams_addition import index_new_ngrams
from sqlalchemy.sql import exists
from os import path
from csv import writer, reader, QUOTE_MINIMAL
......@@ -483,7 +486,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
this_row_forms = ''
# string normalizations
this_row_label = normalize_terms(normalize_chars(this_row_label))
this_row_label = normalize_forms(normalize_chars(this_row_label))
# except:
# if i == 0:
......@@ -521,7 +524,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
for raw_term_str in this_row_forms.split(group_delimiter):
# each subform is also like an ngram declaration
term_str = normalize_terms(normalize_chars(raw_term_str))
term_str = normalize_forms(normalize_chars(raw_term_str))
imported_unique_ngramstrs[term_str] = True
imported_nodes_ngrams[this_list_type].append(term_str)
......@@ -559,6 +562,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
# print(new_ngrams_ids)
# print(imported_nodes_ngrams)
# ======== Import into lists =========
# 3 x abstract lists + 1 translations
......@@ -632,11 +636,8 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
seront remis dans la main à la fin)
NB: Uses group_tools.group_union() to merge the synonym links.
FIXME: new terms created at import_ngramlists() can now be added to lists
but are never added to docs
Uses ngrams_addition.index_new_ngrams() to also add new ngrams to the docs
"""
# log to send back to client-side (lines will be joined)
my_log = []
......@@ -656,6 +657,20 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
{'key': 'map', 'name':"MAPLIST"} # lid = 2
]
# ======== Index the new ngrams in the docs =========
all_possibly_new_ngram_ids = []
collect = all_possibly_new_ngram_ids.append
for lid, info in enumerate(linfos):
list_type = info['key']
if list_type in new_lists:
for ng_id in new_lists[list_type].items:
collect(ng_id)
n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
my_log.append("MERGE: added %i new ngram occurrences in docs" % n_added)
# ======== Get the old lists =========
old_lists = {}
......
"""
Module for raw indexing a totally new ngram
=> creates new (doc_node <-> new_ngram) relations in NodeNgram
use cases:
- from annotation view user selects a free segment of text to make a new ngram
- at list import, any new list can contain ngrams that've never been extracted
prerequisite:
- normalize_chars(new_ngram_str)
- normalize_forms(new_ngram_str)
- add the new ngram to `ngrams` table
procedure:
- simple regexp search of the ngram string => addition to NodeNgram
/!\ -> morphological variants are NOT considered (ex plural or declined forms)
"""
from re import compile as re_compile, escape, findall, IGNORECASE

from sqlalchemy import distinct

from gargantext.models import Ngram, Node, NodeNgram
from gargantext.util.db import session, bulk_insert
# TODO from gargantext.constants import LIST_OF_KEYS_TO_INDEX = title, abstract
def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
    """
    Find occurrences of some ngrams for every document of the given corpus
    + insert them in the NodeNgram table.

    @param ngram_ids: a list of ids for Ngram objects
                      (we assume they already went through normalizations
                       and they were already added to the Ngrams table
                       and optionally to some of the lists like MAPLIST)
                      (but we can't know if they were previously indexed
                       in the corpus)
    @param corpus: the CORPUS node
    @param keys: the hyperdata fields to index
    @return: the number of new NodeNgram rows inserted
    """
    # nothing to look for => nothing to insert (skips the DB round-trip)
    if not ngram_ids:
        return 0

    # check the ngrams we won't process (those that were already indexed)
    indexed_ngrams_subquery = (session
                                .query(distinct(NodeNgram.ngram_id))
                                .join(Node, Node.id == NodeNgram.node_id)
                                .filter(Node.parent_id == corpus.id)
                                .filter(Node.typename == 'DOCUMENT')
                                .subquery()
                                )

    # retrieve the ngrams from our list, filtering out the already indexed ones
    todo_ngrams = (session
                    .query(Ngram)
                    .filter(Ngram.id.in_(ngram_ids))
                    .filter(~ Ngram.id.in_(indexed_ngrams_subquery))
                    .all()
                    )

    # build regexp : "british" => r'\bbritish\b'
    # compiled once per ngram for the whole corpus (not once per doc field)
    # escape() protects terms containing regexp metacharacters (ex: "c++")
    todo_patterns = [(ngram.id, re_compile(r'\b%s\b' % escape(ngram.terms),
                                           IGNORECASE))
                     for ngram in todo_ngrams]

    # result dict: {doc_node_id: {ngram_id: occurrence_count}}
    node_ngram_to_write = {}

    # loop through the docs and their text fields
    for doc in corpus.children('DOCUMENT'):
        # a new empty counting subdict
        node_ngram_to_write[doc.id] = {}

        for key in keys:
            # a text field
            text = doc.hyperdata.get(key, None)

            if not isinstance(text, str):
                # print("WARN: doc %i has no text in field %s" % (doc.id, key))
                continue

            for ng_id, pattern in todo_patterns:
                # --------------------------------------- find ---
                n_occs = len(pattern.findall(text))
                # -----------------------------------------------

                # save the count results (summed over the hyperdata fields)
                if n_occs > 0:
                    if ng_id not in node_ngram_to_write[doc.id]:
                        node_ngram_to_write[doc.id][ng_id] = n_occs
                    else:
                        node_ngram_to_write[doc.id][ng_id] += n_occs

    # integrate all at the end: flatten the counting dict into insert rows
    my_new_rows = []
    add_new_row = my_new_rows.append
    for doc_id in node_ngram_to_write:
        for ngram_id in node_ngram_to_write[doc_id]:
            wei = node_ngram_to_write[doc_id][ngram_id]
            add_new_row([doc_id, ngram_id, wei])
    del node_ngram_to_write

    bulk_insert(
        table = NodeNgram,
        fields = ('node_id', 'ngram_id', 'weight'),
        data = my_new_rows
    )

    n_added = len(my_new_rows)
    print("index_new_ngrams: added %i new NodeNgram rows" % n_added)

    return n_added
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment