Commit fae2b9c0 authored by Romain Loth's avatar Romain Loth

import terms table: better stats log and result format

parent 1b1417de
...@@ -86,7 +86,19 @@ class bulk_insert: ...@@ -86,7 +86,19 @@ class bulk_insert:
readline = read readline = read
def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None): def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None, do_stats=False):
"""
Inserts bulk data with an intermediate check on a uniquekey
(ex: Ngram.terms) to see if the row existed before.
If the row already existed we just retrieve its id.
If it didn't exist we create it and retrieve the id.
Returns a dict {uniquekey => id}
Option:
do stats: also returns the number of those that had no previous id
"""
if cursor is None: if cursor is None:
db, cursor = get_cursor() db, cursor = get_cursor()
mustcommit = True mustcommit = True
...@@ -109,6 +121,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None): ...@@ -109,6 +121,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
sourcetable = model.__tablename__, sourcetable = model.__tablename__,
uniquecolumn = uniquekey, uniquecolumn = uniquekey,
)) ))
# insert what has not been found to the real table # insert what has not been found to the real table
cursor.execute(''' cursor.execute('''
INSERT INTO {sourcetable} ({columns}) INSERT INTO {sourcetable} ({columns})
...@@ -119,6 +132,11 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None): ...@@ -119,6 +132,11 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
sourcetable = model.__tablename__, sourcetable = model.__tablename__,
columns = ', '.join(fields), columns = ', '.join(fields),
)) ))
if do_stats:
# remember how many rows we inserted just now
n_new = cursor.rowcount
# retrieve dict associating unique key to id # retrieve dict associating unique key to id
cursor.execute(''' cursor.execute('''
SELECT source.id, source.{uniquecolumn} SELECT source.id, source.{uniquecolumn}
...@@ -130,10 +148,15 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None): ...@@ -130,10 +148,15 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
columns = ', '.join(fields), columns = ', '.join(fields),
)) ))
result = { result = {
# term : new_id
row[1]: row[0] for row in cursor.fetchall() row[1]: row[0] for row in cursor.fetchall()
} }
# this is the end! # this is the end!
cursor.execute('DROP TABLE __tmp__') cursor.execute('DROP TABLE __tmp__')
if mustcommit: if mustcommit:
db.commit() db.commit()
return result
if do_stats:
return result, n_new
else:
return result
...@@ -253,7 +253,6 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False): ...@@ -253,7 +253,6 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
this_corpus_all_rows = map_csv_rows + miam_csv_rows + stop_csv_rows this_corpus_all_rows = map_csv_rows + miam_csv_rows + stop_csv_rows
# choice of output: file or string # choice of output: file or string
print(type(fname))
if fname == None: if fname == None:
out_file = StringIO() out_file = StringIO()
elif type(fname) == str: elif type(fname) == str:
...@@ -317,6 +316,8 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'): ...@@ -317,6 +316,8 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
(creates absent ngrams if necessary) (creates absent ngrams if necessary)
=> use the new ids to map the relations involving the old ones => use the new ids to map the relations involving the old ones
NB: the creation of MAINLIST also adds all elements from the MAPLIST
NB: To merge the imported lists into a corpus node's lists, NB: To merge the imported lists into a corpus node's lists,
chain this function with merge_ngramlists() chain this function with merge_ngramlists()
''' '''
...@@ -346,11 +347,17 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'): ...@@ -346,11 +347,17 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
quoting = QUOTE_MINIMAL quoting = QUOTE_MINIMAL
) )
# for stats
n_read_lines = 0 n_read_lines = 0
n_total_ng = 0
n_added_ng = 0
n_group_relations = 0
# load CSV + initial checks # load CSV + initial checks
for i, csv_row in enumerate(ngrams_csv_rows): for i, csv_row in enumerate(ngrams_csv_rows):
print("---------------READ LINE %i" % i) # fyi
n_read_lines +=1
# print("---------------READ LINE %i" % i)
try: try:
this_ng_oldid = str(csv_row[0]) this_ng_oldid = str(csv_row[0])
this_ng_term = str(csv_row[1]) this_ng_term = str(csv_row[1])
...@@ -398,19 +405,19 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'): ...@@ -398,19 +405,19 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
(this_ng_oldid,external_subform_id) (this_ng_oldid,external_subform_id)
) )
# fyi
n_read_lines +=1
# end of CSV read # end of CSV read
fh.close() fh.close()
# ======== ngram save + id lookup ========= # ======== ngram save + id lookup =========
# returns a dict {term => id} n_total_ng = len(imported_ngrams_dbdata)
new_ngrams_ids = bulk_insert_ifnotexists(
# returns a dict {term => id} and a count of inserted ones
(new_ngrams_ids, n_added_ng) = bulk_insert_ifnotexists(
model = Ngram, model = Ngram,
uniquekey = 'terms', uniquekey = 'terms',
fields = ('terms', 'n'), fields = ('terms', 'n'),
data = imported_ngrams_dbdata data = imported_ngrams_dbdata,
do_stats = True
) )
del imported_ngrams_dbdata del imported_ngrams_dbdata
...@@ -421,34 +428,43 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'): ...@@ -421,34 +428,43 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
del new_ngrams_ids del new_ngrams_ids
del imported_ngrams_oldids del imported_ngrams_oldids
print(old_to_new_id_map) print(old_to_new_id_map)
print(import_nodes_ngrams) # print(import_nodes_ngrams)
# ======== Import into lists ========= # ======== Import into lists =========
# 3 x abstract lists # 3 x abstract lists + 1 translations
new_lists = { result = {
'map': UnweightedList(), 'map': UnweightedList(),
'main': UnweightedList(), 'main': UnweightedList(),
'stop': UnweightedList() 'stop': UnweightedList(),
'groupings' : Translations()
} }
for list_type in import_nodes_ngrams: for list_type in import_nodes_ngrams:
for old_id in import_nodes_ngrams[list_type]: for old_id in import_nodes_ngrams[list_type]:
new_id = old_to_new_id_map[old_id] new_id = old_to_new_id_map[old_id]
# add to the abstract list # add to the abstract list
new_lists[list_type].items.add(new_id) result[list_type].items.add(new_id)
# ======== Synonyms ========= # for main also add map elements
new_groups = Translations() if list_type == 'main':
for old_id in import_nodes_ngrams['map']:
new_id = old_to_new_id_map[old_id]
result['main'].items.add(new_id)
# ======== Synonyms =========
for (x,y) in imported_groupings: for (x,y) in imported_groupings:
new_mainform_id = old_to_new_id_map[x] new_mainform_id = old_to_new_id_map[x]
new_subform_id = old_to_new_id_map[y] new_subform_id = old_to_new_id_map[y]
# /!\ Translations use (subform => mainform) order # /!\ Translations use (subform => mainform) order
new_groups.items[new_subform_id] = new_mainform_id result['groupings'].items[new_subform_id] = new_mainform_id
n_group_relations += 1
# ------------------------------------------------------------------ # ------------------------------------------------------------------
print("IMPORT: read %i lines from the CSV" % n_read_lines) print("IMPORT: read %i lines from the CSV" % n_read_lines)
print("IMPORT: read %i terms (%i added and %i already existing)"
% (n_total_ng, n_added_ng, n_total_ng-n_added_ng) )
print("IMPORT: read %i grouping relations" % n_group_relations)
return (new_lists, new_groups) return result
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment