Commit fae2b9c0 authored by Romain Loth's avatar Romain Loth

import terms table: better stats log and result format

parent 1b1417de
...@@ -86,7 +86,19 @@ class bulk_insert: ...@@ -86,7 +86,19 @@ class bulk_insert:
readline = read readline = read
def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None): def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None, do_stats=False):
"""
Inserts bulk data with an intermediate check on a uniquekey
(ex: Ngram.terms) to see if the row existed before.
If the row already existed we just retrieve its id.
If it didn't exist we create it and retrieve the id.
Returns a dict {uniquekey => id}
Option:
do stats: also returns the number of those that had no previous id
"""
if cursor is None: if cursor is None:
db, cursor = get_cursor() db, cursor = get_cursor()
mustcommit = True mustcommit = True
...@@ -109,6 +121,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None): ...@@ -109,6 +121,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
sourcetable = model.__tablename__, sourcetable = model.__tablename__,
uniquecolumn = uniquekey, uniquecolumn = uniquekey,
)) ))
# insert what has not been found to the real table # insert what has not been found to the real table
cursor.execute(''' cursor.execute('''
INSERT INTO {sourcetable} ({columns}) INSERT INTO {sourcetable} ({columns})
...@@ -119,6 +132,11 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None): ...@@ -119,6 +132,11 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
sourcetable = model.__tablename__, sourcetable = model.__tablename__,
columns = ', '.join(fields), columns = ', '.join(fields),
)) ))
if do_stats:
# remember how many rows we inserted just now
n_new = cursor.rowcount
# retrieve dict associating unique key to id # retrieve dict associating unique key to id
cursor.execute(''' cursor.execute('''
SELECT source.id, source.{uniquecolumn} SELECT source.id, source.{uniquecolumn}
...@@ -130,10 +148,15 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None): ...@@ -130,10 +148,15 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
columns = ', '.join(fields), columns = ', '.join(fields),
)) ))
result = { result = {
# term : new_id
row[1]: row[0] for row in cursor.fetchall() row[1]: row[0] for row in cursor.fetchall()
} }
# this is the end! # this is the end!
cursor.execute('DROP TABLE __tmp__') cursor.execute('DROP TABLE __tmp__')
if mustcommit: if mustcommit:
db.commit() db.commit()
return result
if do_stats:
return result, n_new
else:
return result
...@@ -253,7 +253,6 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False): ...@@ -253,7 +253,6 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
this_corpus_all_rows = map_csv_rows + miam_csv_rows + stop_csv_rows this_corpus_all_rows = map_csv_rows + miam_csv_rows + stop_csv_rows
# choice of output: file or string # choice of output: file or string
print(type(fname))
if fname == None: if fname == None:
out_file = StringIO() out_file = StringIO()
elif type(fname) == str: elif type(fname) == str:
...@@ -317,6 +316,8 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'): ...@@ -317,6 +316,8 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
(creates absent ngrams if necessary) (creates absent ngrams if necessary)
=> use the new ids to map the relations involving the old ones => use the new ids to map the relations involving the old ones
NB: the creation of MAINLIST also adds all elements from the MAPLIST
NB: To merge the imported lists into a corpus node's lists, NB: To merge the imported lists into a corpus node's lists,
chain this function with merge_ngramlists() chain this function with merge_ngramlists()
''' '''
...@@ -346,11 +347,17 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'): ...@@ -346,11 +347,17 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
quoting = QUOTE_MINIMAL quoting = QUOTE_MINIMAL
) )
# for stats
n_read_lines = 0 n_read_lines = 0
n_total_ng = 0
n_added_ng = 0
n_group_relations = 0
# load CSV + initial checks # load CSV + initial checks
for i, csv_row in enumerate(ngrams_csv_rows): for i, csv_row in enumerate(ngrams_csv_rows):
print("---------------READ LINE %i" % i) # fyi
n_read_lines +=1
# print("---------------READ LINE %i" % i)
try: try:
this_ng_oldid = str(csv_row[0]) this_ng_oldid = str(csv_row[0])
this_ng_term = str(csv_row[1]) this_ng_term = str(csv_row[1])
...@@ -398,19 +405,19 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'): ...@@ -398,19 +405,19 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
(this_ng_oldid,external_subform_id) (this_ng_oldid,external_subform_id)
) )
# fyi
n_read_lines +=1
# end of CSV read # end of CSV read
fh.close() fh.close()
# ======== ngram save + id lookup ========= # ======== ngram save + id lookup =========
# returns a dict {term => id} n_total_ng = len(imported_ngrams_dbdata)
new_ngrams_ids = bulk_insert_ifnotexists(
# returns a dict {term => id} and a count of inserted ones
(new_ngrams_ids, n_added_ng) = bulk_insert_ifnotexists(
model = Ngram, model = Ngram,
uniquekey = 'terms', uniquekey = 'terms',
fields = ('terms', 'n'), fields = ('terms', 'n'),
data = imported_ngrams_dbdata data = imported_ngrams_dbdata,
do_stats = True
) )
del imported_ngrams_dbdata del imported_ngrams_dbdata
...@@ -421,34 +428,43 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'): ...@@ -421,34 +428,43 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
del new_ngrams_ids del new_ngrams_ids
del imported_ngrams_oldids del imported_ngrams_oldids
print(old_to_new_id_map) print(old_to_new_id_map)
print(import_nodes_ngrams) # print(import_nodes_ngrams)
# ======== Import into lists ========= # ======== Import into lists =========
# 3 x abstract lists # 3 x abstract lists + 1 translations
new_lists = { result = {
'map': UnweightedList(), 'map': UnweightedList(),
'main': UnweightedList(), 'main': UnweightedList(),
'stop': UnweightedList() 'stop': UnweightedList(),
'groupings' : Translations()
} }
for list_type in import_nodes_ngrams: for list_type in import_nodes_ngrams:
for old_id in import_nodes_ngrams[list_type]: for old_id in import_nodes_ngrams[list_type]:
new_id = old_to_new_id_map[old_id] new_id = old_to_new_id_map[old_id]
# add to the abstract list # add to the abstract list
new_lists[list_type].items.add(new_id) result[list_type].items.add(new_id)
# ======== Synonyms ========= # for main also add map elements
new_groups = Translations() if list_type == 'main':
for old_id in import_nodes_ngrams['map']:
new_id = old_to_new_id_map[old_id]
result['main'].items.add(new_id)
# ======== Synonyms =========
for (x,y) in imported_groupings: for (x,y) in imported_groupings:
new_mainform_id = old_to_new_id_map[x] new_mainform_id = old_to_new_id_map[x]
new_subform_id = old_to_new_id_map[y] new_subform_id = old_to_new_id_map[y]
# /!\ Translations use (subform => mainform) order # /!\ Translations use (subform => mainform) order
new_groups.items[new_subform_id] = new_mainform_id result['groupings'].items[new_subform_id] = new_mainform_id
n_group_relations += 1
# ------------------------------------------------------------------ # ------------------------------------------------------------------
print("IMPORT: read %i lines from the CSV" % n_read_lines) print("IMPORT: read %i lines from the CSV" % n_read_lines)
print("IMPORT: read %i terms (%i added and %i already existing)"
% (n_total_ng, n_added_ng, n_total_ng-n_added_ng) )
print("IMPORT: read %i grouping relations" % n_group_relations)
return (new_lists, new_groups) return result
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment