From b79f438f258122c66ef61eaa8b61f179af0c7fd4 Mon Sep 17 00:00:00 2001 From: rloth <romain.loth@iscpif.fr> Date: Thu, 19 Jan 2017 18:22:50 +0100 Subject: [PATCH] WIP2 comex_shared DB (luid and hashtags in create/update user) --- doc/table_specifications.md | 2 +- services/db.py | 231 +++++++++++------- services/main.py | 93 +++---- .../comex2_mysql_server/init_comex_shared.sql | 2 +- 4 files changed, 199 insertions(+), 129 deletions(-) diff --git a/doc/table_specifications.md b/doc/table_specifications.md index 8a6697d..76574e9 100755 --- a/doc/table_specifications.md +++ b/doc/table_specifications.md @@ -11,7 +11,7 @@ CREATE TABLE scholars ( -- local uid necessary for users who still have no doors_uid luid int(15) not null auto_increment unique primary key, -- doors uid common to all lab's services - doors_uid char(36) not null unique, + doors_uid char(36) unique, last_modified_date char(24) not null, email varchar(255) not null unique, country varchar(60) not null, diff --git a/services/db.py b/services/db.py index 433b5a9..e94f382 100644 --- a/services/db.py +++ b/services/db.py @@ -19,7 +19,8 @@ else: # sorted columns as declared in DB, as a tuple USER_COLS = [ # NAME, NOT NULL, N or MAXCHARS (if applicable) - ("doors_uid", True, 36), + ("luid", True, 15), + ("doors_uid", False, 36), ("last_modified_date", True, 24), ("email", True, 255), ("country", True, 60), @@ -31,7 +32,6 @@ USER_COLS = [ ("position", False, 30), ("hon_title", False, 30), ("interests_text", False, 1200), - ("community_hashtags", False, 350), ("gender", False, 1), ("job_looking_date", False, 24), ("home_url", False, 120), @@ -49,14 +49,15 @@ ORG_COLS = [ FIELDS_FRONTEND_TO_SQL = { "keywords": "keywords.kwstr", + "tags": "hashtags.htstr", + "countries": "scholars.country", + "gender": "scholars.gender", + "organizations": "affiliations.org", "laboratories": "affiliations.team_lab", - "tags": "scholars.community_hashtags", - - # new - "gender": "scholars.gender", "cities": "affiliations.org_city", + "linked": "linked_ids.ext_id_type" } @@ -139,7 +140,7 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])): FROM scholars -- 0 or 1 LEFT JOIN linked_ids - ON scholars.doors_uid = linked_ids.uid + ON scholars.luid = linked_ids.uid GROUP BY %(col)s ) AS allcounts %(filter)s @@ -153,8 +154,8 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])): FROM scholars -- 0 or many LEFT JOIN sch_kw - ON scholars.doors_uid = sch_kw.uid - JOIN keywords + ON scholars.luid = sch_kw.uid + LEFT JOIN keywords ON sch_kw.kwid = keywords.kwid GROUP BY %(col)s ) AS allcounts @@ -162,6 +163,21 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])): ORDER BY occs DESC """ % {'col': sql_col, 'filter': where_clause} + elif sql_tab == 'hashtags': + stmt = """ + SELECT * FROM ( + SELECT %(col)s AS x, COUNT(*) AS occs + FROM scholars + -- 0 or many + LEFT JOIN sch_ht + ON scholars.luid = sch_ht.uid + LEFT JOIN hashtags + ON sch_ht.htid = hashtags.htid + GROUP BY %(col)s + ) AS allcounts + %(filter)s + ORDER BY occs DESC + """ % {'col': sql_col, 'filter': where_clause} mlog("DEBUGSQL", "get_field_aggs STATEMENT:\n-- SQL\n%s\n-- /SQL" % stmt) @@ -179,6 +195,9 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])): def get_full_scholar(uid): """ + uid : str + local user id aka luid + Autonomous function to be used by User class => Retrieves one line from *scholars* table, with joined optional concatenated *affiliations*, *keywords* and 
*linked_ids* => Parse it all into a structured python user info dict @@ -195,7 +214,7 @@ def get_full_scholar(uid): one_usr_stmt = """ SELECT - sch_n_aff_n_kws.*, + sch_n_aff_n_kws_n_hts.*, -- linked_ids info condensed -- (format : "type1:ID1,type2:ID2,...") @@ -205,49 +224,68 @@ def get_full_scholar(uid): COUNT(linked_ids.ext_id) AS linked_ids_nb FROM ( - SELECT - sch_n_aff.*, - -- kws info condensed - COUNT(keywords.kwid) AS keywords_nb, - -- GROUP_CONCAT(keywords.kwid) AS kwids, - GROUP_CONCAT(keywords.kwstr) AS keywords + SELECT + sch_n_aff_n_kws.*, + -- hts info condensed + COUNT(hashtags.htid) AS hashtags_nb, + -- GROUP_CONCAT(hashtags.htid) AS htids, + GROUP_CONCAT(hashtags.htstr) AS hashtags - FROM ( + FROM ( SELECT - scholars.*, - -- for debug replace scholars.* by - -- scholars.doors_uid, - -- scholars.email, - -- scholars.last_modified_date, - -- scholars.initials, + sch_n_aff.*, - affiliations.* + -- kws info condensed + COUNT(keywords.kwid) AS keywords_nb, + -- GROUP_CONCAT(keywords.kwid) AS kwids, + GROUP_CONCAT(keywords.kwstr) AS keywords - FROM scholars + FROM ( + SELECT + -- scholars.*, + -- for debug replace scholars.* by + scholars.luid, + scholars.doors_uid, + scholars.email, + -- scholars.last_modified_date, + -- scholars.initials, - LEFT JOIN affiliations - ON scholars.affiliation_id = affiliations.affid + affiliations.* + + FROM scholars + + LEFT JOIN affiliations + ON scholars.affiliation_id = affiliations.affid + + GROUP BY luid + + ) AS sch_n_aff - GROUP BY doors_uid + -- two step JOIN for keywords + LEFT JOIN sch_kw + ON sch_kw.uid = luid + LEFT JOIN keywords + ON sch_kw.kwid = keywords.kwid + GROUP BY luid - ) AS sch_n_aff + ) AS sch_n_aff_n_kws - -- two step JOIN for keywords - LEFT JOIN sch_kw - ON sch_n_aff.doors_uid = sch_kw.uid - LEFT JOIN keywords - ON sch_kw.kwid = keywords.kwid - GROUP BY doors_uid + -- also two step JOIN for hashtags + LEFT JOIN sch_ht + ON sch_ht.uid = luid + LEFT JOIN hashtags + ON sch_ht.htid = hashtags.htid + GROUP BY luid - ) AS sch_n_aff_n_kws + ) AS sch_n_aff_n_kws_n_hts LEFT JOIN linked_ids - ON linked_ids.uid = sch_n_aff_n_kws.doors_uid + ON linked_ids.uid = luid -- WHERE our user UID - WHERE doors_uid = "%s" - GROUP BY doors_uid + WHERE luid = "%s" + GROUP BY luid """ % str(uid) mlog("DEBUGSQL", "DB get_full_scholar STATEMENT:\n-- SQL\n%s\n-- /SQL" % one_usr_stmt) @@ -289,16 +327,17 @@ def get_full_scholar(uid): # post-treatments # --------------- - # 1/ split concatenated kw lists and check correct length - if urow_dict['keywords_nb'] == 0: - urow_dict['keywords'] = [] - else: - kws_array = urow_dict['keywords'].split(',') - - if len(kws_array) != urow_dict['keywords_nb']: - raise ValueError("Can't correctly split keywords for user %s" % uid) + # 1/ split concatenated kw an ht lists and check correct length + for toktype in ['keywords', 'hashtags']: + if urow_dict[toktype+'_nb'] == 0: + urow_dict[toktype] = [] else: - urow_dict['keywords'] = kws_array + tokarray = urow_dict[toktype].split(',') + + if len(tokarray) != urow_dict[toktype+'_nb']: + raise ValueError("Can't correctly split %s for user %s" % (toktype, uid)) + else: + urow_dict[toktype] = tokarray # 2/ also split and parse all linked_ids if urow_dict['linked_ids_nb'] == 0: @@ -326,21 +365,21 @@ def get_full_scholar(uid): return urow_dict -def save_scholar(uid, date, safe_recs, reg_db, uactive=True, update_flag=False): +def save_scholar(safe_recs, reg_db, uactive=True, update_luid=None): """ For new registration: - -> add to *scholars* table + -> add to *scholars* 
table, return new local uid - For profile change (just toggle update_flag to True) + For profile change (just pass previous local uid in update_luid) -> *update* scholars table see also COLS variable and doc/table_specifications.md """ - # we already have the first two columns - db_tgtcols = ['doors_uid', 'last_modified_date'] - db_qstrvals = ["'"+str(uid)+"'", "'"+str(date)+"'"] - actual_len_dbg = 2 + # column names and column quoted values + db_tgtcols = [] + db_qstrvals = [] + actual_len_dbg = 0 # REMARK: # => In theory should be possible to execute(statment, values) to insert all @@ -353,8 +392,7 @@ def save_scholar(uid, date, safe_recs, reg_db, uactive=True, update_flag=False): # ------------- ----------- # and then we execute(full_statmt) :-) - - for colinfo in USER_COLS[2:]: + for colinfo in USER_COLS: colname = colinfo[0] # NB: each val already contains no quotes because of sanitize() @@ -385,7 +423,7 @@ def save_scholar(uid, date, safe_recs, reg_db, uactive=True, update_flag=False): reg_db_c = reg_db.cursor() - if not update_flag: + if not update_luid: # expected colnames "(doors_uid, last_modified_date, email, ...)" db_tgtcols_str = ','.join(db_tgtcols) @@ -404,63 +442,82 @@ def save_scholar(uid, date, safe_recs, reg_db, uactive=True, update_flag=False): set_full_str = ','.join([db_tgtcols[i] + '=' + db_qstrvals[i] for i in range(len(db_tgtcols))]) # UPDATE: full_statement with formated values - full_statmt = 'UPDATE scholars SET %s WHERE doors_uid = "%s"' % ( + full_statmt = 'UPDATE scholars SET %s WHERE luid = "%s"' % ( set_full_str, - uid + update_luid ) - mlog("DEBUG", "UPDATE" if update_flag else "INSERT", "SQL statement:", full_statmt) + mlog("DEBUG", "UPDATE" if update_luid else "INSERT", "SQL statement:", full_statmt) reg_db_c.execute(full_statmt) + if not update_luid: + luid = reg_db_c.lastrowid + else: + luid = update_luid reg_db.commit() + return luid -def save_pairs_sch_kw(pairings_list, comex_db): +def save_pairs_sch_tok(pairings_list, comex_db, map_table='sch_kw'): """ - Simply save all pairings (uid, kwid) in the list + Simply save all pairings (luid, kwid) or (luid, htid) in the list """ db_cursor = comex_db.cursor() for id_pair in pairings_list: - db_cursor.execute('INSERT INTO sch_kw VALUES %s' % str(id_pair)) + db_cursor.execute('INSERT INTO %s VALUES %s' % (map_table, str(id_pair))) comex_db.commit() - mlog("DEBUG", "Keywords: saved %s pair" % str(id_pair)) + mlog("DEBUG", "%s: saved %s pair" % (map_table, str(id_pair))) -def delete_pairs_sch_kw(uid, comex_db): +def delete_pairs_sch_tok(uid, comex_db, map_table='sch_kw'): """ - Simply deletes all pairings (uid, *) in the table + Simply deletes all pairings (luid, *) in the table """ + if map_table not in ['sch_kw', 'sch_ht']: + raise TypeError('ERROR: Unknown map_table') db_cursor = comex_db.cursor() - n = db_cursor.execute('DELETE FROM sch_kw WHERE uid = "%s"' % uid) + n = db_cursor.execute('DELETE FROM %s WHERE uid = "%s"' % (map_table, uid)) comex_db.commit() - mlog("DEBUG", "Keywords: DELETED %i pairings for %s" % (n, str(uid))) + mlog("DEBUG", "%s: DELETED %i pairings for %s" % (map_table, n, str(uid))) -def get_or_create_keywords(kw_list, comex_db): +def get_or_create_tokitems(tok_list, comex_db, tok_table='keywords'): """ kw_str -> lookup/add to *keywords* table -> kw_id + ht_str -> lookup/add to *hashtags* table -> ht_id ------------------------------------------------- - kw_list is an array of strings + tok_list is an array of strings NB keywords are mandatory: each registration should provide at 
least MIN_KW - + hashtags aren't for loop - 1) query to *keywords* table (exact match) + 1) query to *keywords* or *hashtags* table (exact match) 2) return id - => if a keyword matches return kwid - => if no keyword matches create new and return kwid + => if a keyword/tag matches return kwid/htid + => if no keyword/tag matches create new and return kwid/htid """ + # sql names + fill = {'tb': tok_table} + if tok_table == 'keywords': + fill['idc'] = 'kwid' + fill['strc']= 'kwstr' + elif tok_table == 'hashtags': + fill['idc'] = 'htid' + fill['strc']= 'htstr' + db_cursor = comex_db.cursor() found_ids = [] - for kw_str in kw_list: + for tok_str in tok_list: # TODO better string normalization here or in read_record - kw_str = kw_str.lower() + tok_str = tok_str.lower() + fill['q'] = tok_str - n_matched = db_cursor.execute('SELECT kwid FROM keywords WHERE kwstr = "%s"' % kw_str) + # ex: SELECT kwid FROM keywords WHERE kwstr = "complexity" + n_matched = db_cursor.execute('SELECT %(idc)s FROM %(tb)s WHERE %(strc)s = "%(q)s"' % fill) # ok existing keyword => row id if n_matched == 1: @@ -468,15 +525,17 @@ def get_or_create_keywords(kw_list, comex_db): # no matching keyword => add => row id elif n_matched == 0: - db_cursor.execute('INSERT INTO keywords(kwstr) VALUES ("%s")' % kw_str) + + # ex: INSERT INTO keywords(kwstr) VALUES ("complexity") + db_cursor.execute('INSERT INTO %(tb)s(%(strc)s) VALUES ("%(q)s")' % fill) comex_db.commit() - mlog("INFO", "Added keyword '%s'" % kw_str) + mlog("INFO", "Added '%s' to %s table" % (tok_str, tok_table)) found_ids.append(db_cursor.lastrowid) else: - raise Exception("ERROR: non-unique keyword '%s'" % kw_str) + raise Exception("ERROR: non-unique token '%s'" % tok_str) return found_ids @@ -489,9 +548,11 @@ def get_or_create_affiliation(org_info, comex_db): 1) query to *affiliations* table 2) return id => TODO if institution almost matches send suggestion - => TODO unicity constraint on institution + lab + => unicity constraint on institution + lab + org_type => if an institution matches return affid => if no institution matches create new and return affid + + TODO test more """ the_aff_id = None @@ -512,11 +573,9 @@ def get_or_create_affiliation(org_info, comex_db): db_qstrvals.append(quotedstrval) # for select - if colname != 'org_type': - db_constraints.append("%s = %s" % (colname, quotedstrval)) + db_constraints.append("%s = %s" % (colname, quotedstrval)) else: - if colname != 'org_type': - db_constraints.append("%s IS NULL" % colname) + db_constraints.append("%s IS NULL" % colname) db_cursor = comex_db.cursor() diff --git a/services/main.py b/services/main.py index 76055a8..009cbdd 100755 --- a/services/main.py +++ b/services/main.py @@ -43,7 +43,7 @@ if __package__ == 'services': from services.user import User, login_manager, doors_login, UCACHE from services.text import keywords from services.tools import restparse, mlog, re_hash, REALCONFIG - from services.db import connect_db, get_or_create_keywords, save_pairs_sch_kw, delete_pairs_sch_kw, get_or_create_affiliation, save_scholar, get_field_aggs + from services.db import connect_db, get_or_create_tokitems, save_pairs_sch_tok, delete_pairs_sch_tok, get_or_create_affiliation, save_scholar, get_field_aggs from services.db_to_tina_api.extractDataCustom import MyExtractor as MySQL else: # when this script is run directly @@ -51,7 +51,7 @@ else: from user import User, login_manager, doors_login, UCACHE from text import keywords from tools import restparse, mlog, re_hash, REALCONFIG - from db import connect_db, 
get_or_create_keywords, save_pairs_sch_kw, delete_pairs_sch_kw, get_or_create_affiliation, save_scholar, get_field_aggs + from db import connect_db, get_or_create_tokitems, save_pairs_sch_tok, delete_pairs_sch_tok, get_or_create_affiliation, save_scholar, get_field_aggs from db_to_tina_api.extractDataCustom import MyExtractor as MySQL # ============= app creation ============ @@ -61,7 +61,7 @@ app = Flask("services", static_folder=path.join(config['HOME'],"static"), template_folder=path.join(config['HOME'],"templates")) -app.config['DEBUG'] = (config['LOG_LEVEL'] == "DEBUG") +app.config['DEBUG'] = (config['LOG_LEVEL'] in ["DEBUG","DEBUGSQL"]) app.config['SECRET_KEY'] = 'TODO fill secret key for sessions for login' # for SSL @@ -82,6 +82,7 @@ login_manager.init_app(app) # all inputs as they are declared in form, as a couple SOURCE_FIELDS = [ # NAME, SANITIZE? + ("luid", False ), ("doors_uid", False ), ("last_modified_date", False ), # ex 2016-11-16T17:47:07.308Z ("email", True ), @@ -95,8 +96,7 @@ SOURCE_FIELDS = [ ("position", True ), ("hon_title", True ), ("interests_text", True ), - ("community_hashtags", True ), - ("gender", True ), # M|F + ("gender", False ), # M|F ("job_looking_date", True ), # def null: not looking for a job ("home_url", True ), # scholar's homepage ("pic_url", True ), @@ -110,8 +110,11 @@ SOURCE_FIELDS = [ ("org_city", True ), # => for *affiliations* table - ("keywords", True ) + ("keywords", True ), # => for *keywords* table (after split str) + + ("community_hashtags", True ) + # => for *hashtags* table (after split str) ] # NB password values have already been sent by ajax to Doors @@ -442,10 +445,11 @@ def save_form(request_form, request_files, update_flag=False): """ # only safe values clean_records = {} - kw_array = [] # 1) handles all the inputs from form, no matter what target table - (duuid, rdate, kw_array, clean_records) = read_record(request_form) + clean_records = read_record(request_form) + + mlog("DEBUG", "===== clean_records =====", clean_records) # 2) handles the pic_file if present if 'pic_file' in request_files: @@ -461,28 +465,42 @@ def save_form(request_form, request_files, update_flag=False): # B) read/fill the affiliation table to get associated id clean_records['affiliation_id'] = get_or_create_affiliation(clean_records, reg_db) - # C) create record into the primary user table - # --------------------------------------------- + # C) create/update record into the primary user table + # ---------------------------------------------------- # TODO class User method !! - save_scholar(duuid, rdate, clean_records, reg_db, update_flag=update_flag) + luid = None + if update_flag: + luid = clean_records['luid'] + save_scholar(clean_records, reg_db, update_luid=luid) + else: + luid = save_scholar(clean_records, reg_db) + # D) read/fill each keyword and save the (uid <=> kwid) pairings - kwids = get_or_create_keywords(kw_array, reg_db) + # read/fill each hashtag and save the (uid <=> htid) pairings + for intables in [['keywords', 'keywords', 'sch_kw'], + ['community_hashtags', 'hashtags', 'sch_ht']]: + tok_field = intables[0] + if tok_field in clean_records: + tok_table = intables[1] + map_table = intables[2] - # TODO class User method !! - # POSS selective delete ? - if update_flag: - delete_pairs_sch_kw(duuid, reg_db) + tokids = get_or_create_tokitems(clean_records[tok_field], reg_db, tok_table) - save_pairs_sch_kw([(duuid, kwid) for kwid in kwids], reg_db) + # TODO class User method !! + # POSS selective delete ? 
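+            # (NB the two calls below implement a full replace: on update we
+            #  first wipe this scholar's old rows in the mapping table, then
+            #  re-save one (luid, tokid) pair per submitted token)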
+ if update_flag: + delete_pairs_sch_tok(luid, reg_db, map_table) - # clear cache concerning this scholar - # TODO class User method !! - if duuid in UCACHE: UCACHE.pop(duuid) + save_pairs_sch_tok([(luid, tokid) for tokid in tokids], reg_db, map_table) - # E) end connection + # F) end connection reg_db.close() + # clear cache concerning this scholar + # TODO class User method !! + if luid in UCACHE: UCACHE.pop(luid) + return clean_records @@ -492,14 +510,9 @@ def read_record(incoming_data): - custom made for regcomex/templates/base_form - uses SOURCE_FIELDS """ - # init var clean_records = {} - # read in + sanitize values - duuid = None - rdate = None - # we should have all the mandatory fields (checked in client-side js) # TODO recheck b/c if post comes from elsewhere for field_info in SOURCE_FIELDS: @@ -513,31 +526,29 @@ def read_record(incoming_data): else: # mysql will want None instead of '' val = None - - # these 2 fields already validated and useful separately - elif field == 'doors_uid': - duuid = incoming_data[field] - elif field == 'last_modified_date': - rdate = incoming_data[field] - # any other fields that don't need sanitization (ex: menu options) else: clean_records[field] = incoming_data[field] + # special treatment for "other" subquestions if 'org_type' in clean_records: if clean_records['org_type'] == 'other' and 'other_org_type' in clean_records: clean_records['org_type'] = clean_records['other_org_type'] - # split for kw_array - kw_array = [] - if 'keywords' in clean_records: - for kw in clean_records['keywords'].split(','): - kw = sanitize(kw) - if kw != '': - kw_array.append(kw) + # splits for kw_array and ht_array + for tok_field in ['keywords', 'community_hashtags']: + if tok_field in clean_records: + print(tok_field, "in clean_records") + temp_array = [] + for tok in clean_records[tok_field].split(','): + tok = sanitize(tok) + if tok != '': + temp_array.append(tok) + # replace str by array + clean_records[tok_field] = temp_array - return (duuid, rdate, kw_array, clean_records) + return clean_records # TODO move to text submodules diff --git a/setup/dockers/comex2_mysql_server/init_comex_shared.sql b/setup/dockers/comex2_mysql_server/init_comex_shared.sql index 6dd0111..9f13890 100644 --- a/setup/dockers/comex2_mysql_server/init_comex_shared.sql +++ b/setup/dockers/comex2_mysql_server/init_comex_shared.sql @@ -11,7 +11,7 @@ CREATE TABLE scholars ( -- local uid necessary for users who still have no doors_uid luid int(15) not null auto_increment unique primary key, -- doors uid common to all lab's services - doors_uid char(36) not null unique, + doors_uid char(36) unique, last_modified_date char(24) not null, email varchar(255) not null unique, country varchar(60) not null, -- 2.21.0
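
For illustration, with the new "tags" mapping (hashtags.htstr in
FIELDS_FRONTEND_TO_SQL) the hashtags branch of get_field_aggs renders its
template into a statement like the sketch below; the `WHERE occs > 1` line
assumes the pre-existing where_clause as built when HAPAX_THRESHOLD = 1:

    stmt = """
    SELECT * FROM (
        SELECT hashtags.htstr AS x, COUNT(*) AS occs
        FROM scholars
        -- 0 or many
        LEFT JOIN sch_ht
               ON scholars.luid = sch_ht.uid
        LEFT JOIN hashtags
               ON sch_ht.htid = hashtags.htid
        GROUP BY hashtags.htstr
    ) AS allcounts
    WHERE occs > 1
    ORDER BY occs DESC
    """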
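The rewritten one_usr_stmt in get_full_scholar nests one derived table per
0-to-many relation, so that each GROUP_CONCAT aggregates exactly one join and
row counts stay correct. A condensed skeleton of the real statement (column
lists elided as ".."):

    skeleton = """
    SELECT sch_n_aff_n_kws_n_hts.*,
           GROUP_CONCAT(..) AS linked_ids_list           -- 4) linked ids
    FROM (
      SELECT sch_n_aff_n_kws.*,
             GROUP_CONCAT(hashtags.htstr) AS hashtags    -- 3) hashtags
      FROM (
        SELECT sch_n_aff.*,
               GROUP_CONCAT(keywords.kwstr) AS keywords  -- 2) keywords
        FROM (
          SELECT scholars.luid, .., affiliations.*       -- 1) 0-or-1 affiliation
          FROM scholars
          LEFT JOIN affiliations
                 ON scholars.affiliation_id = affiliations.affid
          GROUP BY luid
        ) AS sch_n_aff
        LEFT JOIN sch_kw   ON sch_kw.uid = luid
        LEFT JOIN keywords ON sch_kw.kwid = keywords.kwid
        GROUP BY luid
      ) AS sch_n_aff_n_kws
      LEFT JOIN sch_ht   ON sch_ht.uid = luid
      LEFT JOIN hashtags ON sch_ht.htid = hashtags.htid
      GROUP BY luid
    ) AS sch_n_aff_n_kws_n_hts
    LEFT JOIN linked_ids ON linked_ids.uid = luid
    WHERE luid = "%s"
    GROUP BY luid
    """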
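save_scholar's new return value gives callers the scholar's local id in both
branches; this sketch follows how save_form consumes it (connect_db and a
sanitized clean_records dict as in services/main.py):

    reg_db = connect_db()
    if update_flag:
        # profile edit: the luid is already known from the form
        luid = save_scholar(clean_records, reg_db,
                            update_luid=clean_records['luid'])
    else:
        # new registration: luid is the AUTO_INCREMENT id (cursor.lastrowid)
        luid = save_scholar(clean_records, reg_db)
    # luid then keys the sch_kw/sch_ht pairings and the UCACHE eviction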
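get_or_create_tokitems interpolates the looked-up value straight into the SQL
string. Below is a minimal standalone sketch of the same lookup-or-insert
pattern using driver-level parameter binding instead; it assumes a
MySQLdb/pymysql-style connection (paramstyle with %s placeholders), and the
identifiers still come from a whitelist because they cannot be bound:

    TOK_TABLES = {'keywords': ('kwid', 'kwstr'),   # (id column, string column)
                  'hashtags': ('htid', 'htstr')}

    def get_or_create_tokitem(tok_str, comex_db, tok_table='keywords'):
        """Return the row id of one token string, inserting it if new."""
        idc, strc = TOK_TABLES[tok_table]          # KeyError on unknown table
        cur = comex_db.cursor()
        # only the value is bound; table/column names come from the whitelist
        cur.execute('SELECT %s FROM %s WHERE %s = %%s' % (idc, tok_table, strc),
                    (tok_str.lower(),))
        row = cur.fetchone()
        if row is not None:
            return row[0]
        cur.execute('INSERT INTO %s(%s) VALUES (%%s)' % (tok_table, strc),
                    (tok_str.lower(),))
        comex_db.commit()
        return cur.lastrowid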