Commit b79f438f authored by Romain Loth

WIP2 comex_shared DB (luid and hashtags in create/update user)

parent ae898647
......@@ -11,7 +11,7 @@ CREATE TABLE scholars (
-- local uid necessary for users who still have no doors_uid
luid int(15) not null auto_increment unique primary key,
-- doors uid common to all lab's services
doors_uid char(36) not null unique,
doors_uid char(36) unique,
last_modified_date char(24) not null,
email varchar(255) not null unique,
country varchar(60) not null,
......
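With `doors_uid` relaxed from `not null unique` to plain `unique`, a scholars row can now exist before the user has any Doors account, and the auto_increment `luid` becomes the canonical key. A minimal sketch of such a local-only insert (MySQLdb-style connection as used by `connect_db`; connection parameters, column subset and values are illustrative, the table's other NOT NULL columns are elided):

```python
import MySQLdb

# illustrative connection parameters, not the project's real config
db = MySQLdb.connect(host="localhost", db="comex_shared", user="root", passwd="***")
cur = db.cursor()

# doors_uid may now be NULL: the row is keyed by the auto_increment luid
cur.execute(
    "INSERT INTO scholars(doors_uid, last_modified_date, email, country) "
    "VALUES (NULL, %s, %s, %s)",
    ("2016-11-16T17:47:07.308Z", "jane@example.org", "France")
)
db.commit()
luid = cur.lastrowid   # new local uid, reusable in sch_kw, sch_ht, linked_ids
```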
......@@ -19,7 +19,8 @@ else:
# sorted columns as declared in DB, as a tuple
USER_COLS = [
# NAME, NOT NULL, N or MAXCHARS (if applicable)
("doors_uid", True, 36),
("luid", True, 15),
("doors_uid", False, 36),
("last_modified_date", True, 24),
("email", True, 255),
("country", True, 60),
......@@ -31,7 +32,6 @@ USER_COLS = [
("position", False, 30),
("hon_title", False, 30),
("interests_text", False, 1200),
("community_hashtags", False, 350),
("gender", False, 1),
("job_looking_date", False, 24),
("home_url", False, 120),
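Each `USER_COLS` entry is a `(NAME, NOT NULL, MAXCHARS)` triple, so a validation pass can be driven directly from the declaration. A sketch of what that could look like (this `check_record` helper is hypothetical, not part of the diff):

```python
def check_record(record, user_cols=USER_COLS):
    """Hypothetical validator driven by the (NAME, NOT NULL, MAXCHARS) triples."""
    for colname, required, maxchars in user_cols:
        val = record.get(colname)
        if val is None:
            if required:
                raise ValueError("missing mandatory field '%s'" % colname)
        elif maxchars is not None and len(str(val)) > maxchars:
            raise ValueError("'%s' too long (max %i chars)" % (colname, maxchars))
```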
......@@ -49,14 +49,15 @@ ORG_COLS = [
FIELDS_FRONTEND_TO_SQL = {
"keywords": "keywords.kwstr",
"tags": "hashtags.htstr",
"countries": "scholars.country",
"gender": "scholars.gender",
"organizations": "affiliations.org",
"laboratories": "affiliations.team_lab",
"tags": "scholars.community_hashtags",
# new
"gender": "scholars.gender",
"cities": "affiliations.org_city",
"linked": "linked_ids.ext_id_type"
}
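The mapping resolves each frontend filter name to its `table.column`, and the table prefix is what selects the JOIN strategy in `get_field_aggs` below (illustrative):

```python
sql_col = FIELDS_FRONTEND_TO_SQL["tags"]   # now "hashtags.htstr" (was scholars.community_hashtags)
sql_tab = sql_col.split(".")[0]            # "hashtags" => two-step JOIN via sch_ht
```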
......@@ -139,7 +140,7 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])):
FROM scholars
-- 0 or 1
LEFT JOIN linked_ids
ON scholars.doors_uid = linked_ids.uid
ON scholars.luid = linked_ids.uid
GROUP BY %(col)s
) AS allcounts
%(filter)s
......@@ -153,8 +154,8 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])):
FROM scholars
-- 0 or many
LEFT JOIN sch_kw
ON scholars.doors_uid = sch_kw.uid
JOIN keywords
ON scholars.luid = sch_kw.uid
LEFT JOIN keywords
ON sch_kw.kwid = keywords.kwid
GROUP BY %(col)s
) AS allcounts
......@@ -162,6 +163,21 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])):
ORDER BY occs DESC
""" % {'col': sql_col, 'filter': where_clause}
elif sql_tab == 'hashtags':
stmt = """
SELECT * FROM (
SELECT %(col)s AS x, COUNT(*) AS occs
FROM scholars
-- 0 or many
LEFT JOIN sch_ht
ON scholars.luid = sch_ht.uid
LEFT JOIN hashtags
ON sch_ht.htid = hashtags.htid
GROUP BY %(col)s
) AS allcounts
%(filter)s
ORDER BY occs DESC
""" % {'col': sql_col, 'filter': where_clause}
mlog("DEBUGSQL", "get_field_aggs STATEMENT:\n-- SQL\n%s\n-- /SQL" % stmt)
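For reference, after `%` substitution the new `hashtags` branch produces a statement of this shape (the hapax filter is rendered here as `occs > 2`; the exact `where_clause` text may differ):

```python
# illustrative rendering, with sql_col = "hashtags.htstr"
example_stmt = """
SELECT * FROM (
    SELECT hashtags.htstr AS x, COUNT(*) AS occs
    FROM scholars
    LEFT JOIN sch_ht   ON scholars.luid = sch_ht.uid
    LEFT JOIN hashtags ON sch_ht.htid = hashtags.htid
    GROUP BY hashtags.htstr
) AS allcounts
WHERE occs > 2
ORDER BY occs DESC
"""
```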
......@@ -179,6 +195,9 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])):
def get_full_scholar(uid):
"""
uid : str
local user id aka luid
Autonomous function to be used by User class
=> Retrieves one line from *scholars* table, with joined optional concatenated *affiliations*, *keywords* and *linked_ids*
=> Parse it all into a structured python user info dict
......@@ -195,7 +214,7 @@ def get_full_scholar(uid):
one_usr_stmt = """
SELECT
sch_n_aff_n_kws.*,
sch_n_aff_n_kws_n_hts.*,
-- linked_ids info condensed
-- (format : "type1:ID1,type2:ID2,...")
......@@ -204,6 +223,15 @@ def get_full_scholar(uid):
) AS linked_ids,
COUNT(linked_ids.ext_id) AS linked_ids_nb
FROM (
SELECT
sch_n_aff_n_kws.*,
-- hts info condensed
COUNT(hashtags.htid) AS hashtags_nb,
-- GROUP_CONCAT(hashtags.htid) AS htids,
GROUP_CONCAT(hashtags.htstr) AS hashtags
FROM (
SELECT
sch_n_aff.*,
......@@ -215,10 +243,11 @@ def get_full_scholar(uid):
FROM (
SELECT
scholars.*,
-- scholars.*,
-- for debug replace scholars.* by
-- scholars.doors_uid,
-- scholars.email,
scholars.luid,
scholars.doors_uid,
scholars.email,
-- scholars.last_modified_date,
-- scholars.initials,
......@@ -229,25 +258,34 @@ def get_full_scholar(uid):
LEFT JOIN affiliations
ON scholars.affiliation_id = affiliations.affid
GROUP BY doors_uid
GROUP BY luid
) AS sch_n_aff
-- two step JOIN for keywords
LEFT JOIN sch_kw
ON sch_n_aff.doors_uid = sch_kw.uid
ON sch_kw.uid = luid
LEFT JOIN keywords
ON sch_kw.kwid = keywords.kwid
GROUP BY doors_uid
GROUP BY luid
) AS sch_n_aff_n_kws
-- also two step JOIN for hashtags
LEFT JOIN sch_ht
ON sch_ht.uid = luid
LEFT JOIN hashtags
ON sch_ht.htid = hashtags.htid
GROUP BY luid
) AS sch_n_aff_n_kws_n_hts
LEFT JOIN linked_ids
ON linked_ids.uid = sch_n_aff_n_kws.doors_uid
ON linked_ids.uid = luid
-- WHERE our user UID
WHERE doors_uid = "%s"
GROUP BY doors_uid
WHERE luid = "%s"
GROUP BY luid
""" % str(uid)
mlog("DEBUGSQL", "DB get_full_scholar STATEMENT:\n-- SQL\n%s\n-- /SQL" % one_usr_stmt)
......@@ -289,16 +327,17 @@ def get_full_scholar(uid):
# post-treatments
# ---------------
# 1/ split concatenated kw lists and check correct length
if urow_dict['keywords_nb'] == 0:
urow_dict['keywords'] = []
    # 1/ split concatenated kw and ht lists and check correct length
for toktype in ['keywords', 'hashtags']:
if urow_dict[toktype+'_nb'] == 0:
urow_dict[toktype] = []
else:
kws_array = urow_dict['keywords'].split(',')
tokarray = urow_dict[toktype].split(',')
if len(kws_array) != urow_dict['keywords_nb']:
raise ValueError("Can't correctly split keywords for user %s" % uid)
if len(tokarray) != urow_dict[toktype+'_nb']:
raise ValueError("Can't correctly split %s for user %s" % (toktype, uid))
else:
urow_dict['keywords'] = kws_array
urow_dict[toktype] = tokarray
# 2/ also split and parse all linked_ids
if urow_dict['linked_ids_nb'] == 0:
......@@ -326,21 +365,21 @@ def get_full_scholar(uid):
return urow_dict
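After these post-treatments the returned dict mixes the scalar scholars columns with the parsed arrays; roughly (all values illustrative):

```python
# illustrative shape of get_full_scholar(uid)'s return value
urow_dict = {
    "luid": 42,
    "doors_uid": None,            # may stay NULL for local-only accounts
    "email": "jane@example.org",
    "keywords_nb": 2,
    "keywords": ["complexity", "networks"],    # split from GROUP_CONCAT
    "hashtags_nb": 1,
    "hashtags": ["#iscpif"],
    "linked_ids_nb": 1,
    # + linked_ids parsed from the "type1:ID1,type2:ID2,..." condensed string
}
```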
def save_scholar(uid, date, safe_recs, reg_db, uactive=True, update_flag=False):
def save_scholar(safe_recs, reg_db, uactive=True, update_luid=None):
"""
For new registration:
-> add to *scholars* table
-> add to *scholars* table, return new local uid
For profile change (just toggle update_flag to True)
For profile change (just pass previous local uid in update_luid)
-> *update* scholars table
see also COLS variable and doc/table_specifications.md
"""
# we already have the first two columns
db_tgtcols = ['doors_uid', 'last_modified_date']
db_qstrvals = ["'"+str(uid)+"'", "'"+str(date)+"'"]
actual_len_dbg = 2
# column names and column quoted values
db_tgtcols = []
db_qstrvals = []
actual_len_dbg = 0
# REMARK:
    # => In theory should be possible to execute(statement, values) to insert all
......@@ -353,8 +392,7 @@ def save_scholar(uid, date, safe_recs, reg_db, uactive=True, update_flag=False):
# ------------- -----------
# and then we execute(full_statmt) :-)
for colinfo in USER_COLS[2:]:
for colinfo in USER_COLS:
colname = colinfo[0]
# NB: each val already contains no quotes because of sanitize()
......@@ -385,7 +423,7 @@ def save_scholar(uid, date, safe_recs, reg_db, uactive=True, update_flag=False):
reg_db_c = reg_db.cursor()
if not update_flag:
if not update_luid:
# expected colnames "(doors_uid, last_modified_date, email, ...)"
db_tgtcols_str = ','.join(db_tgtcols)
......@@ -404,63 +442,82 @@ def save_scholar(uid, date, safe_recs, reg_db, uactive=True, update_flag=False):
set_full_str = ','.join([db_tgtcols[i] + '=' + db_qstrvals[i] for i in range(len(db_tgtcols))])
# UPDATE: full_statement with formated values
full_statmt = 'UPDATE scholars SET %s WHERE doors_uid = "%s"' % (
full_statmt = 'UPDATE scholars SET %s WHERE luid = "%s"' % (
set_full_str,
uid
update_luid
)
mlog("DEBUG", "UPDATE" if update_flag else "INSERT", "SQL statement:", full_statmt)
mlog("DEBUG", "UPDATE" if update_luid else "INSERT", "SQL statement:", full_statmt)
reg_db_c.execute(full_statmt)
if not update_luid:
luid = reg_db_c.lastrowid
else:
luid = update_luid
reg_db.commit()
return luid
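Usage then looks like this for the two paths (mirroring the caller in `save_form` below):

```python
# new registration: luid comes from auto_increment via lastrowid
luid = save_scholar(clean_records, reg_db)

# profile update: pass the existing local uid instead
save_scholar(clean_records, reg_db, update_luid=clean_records['luid'])
```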
def save_pairs_sch_kw(pairings_list, comex_db):
def save_pairs_sch_tok(pairings_list, comex_db, map_table='sch_kw'):
"""
Simply save all pairings (uid, kwid) in the list
Simply save all pairings (luid, kwid) or (luid, htid) in the list
"""
db_cursor = comex_db.cursor()
for id_pair in pairings_list:
db_cursor.execute('INSERT INTO sch_kw VALUES %s' % str(id_pair))
db_cursor.execute('INSERT INTO %s VALUES %s' % (map_table, str(id_pair)))
comex_db.commit()
mlog("DEBUG", "Keywords: saved %s pair" % str(id_pair))
mlog("DEBUG", "%s: saved %s pair" % (map_table, str(id_pair)))
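The pairings are plain `(luid, tokid)` tuples, e.g. once `get_or_create_tokitems` has returned the keyword ids:

```python
save_pairs_sch_tok([(luid, kwid) for kwid in kwids], comex_db, map_table='sch_kw')
```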
def delete_pairs_sch_kw(uid, comex_db):
def delete_pairs_sch_tok(uid, comex_db, map_table='sch_kw'):
"""
Simply deletes all pairings (uid, *) in the table
Simply deletes all pairings (luid, *) in the table
"""
if map_table not in ['sch_kw', 'sch_ht']:
raise TypeError('ERROR: Unknown map_table')
db_cursor = comex_db.cursor()
n = db_cursor.execute('DELETE FROM sch_kw WHERE uid = "%s"' % uid)
n = db_cursor.execute('DELETE FROM %s WHERE uid = "%s"' % (map_table, uid))
comex_db.commit()
mlog("DEBUG", "Keywords: DELETED %i pairings for %s" % (n, str(uid)))
mlog("DEBUG", "%s: DELETED %i pairings for %s" % (map_table, n, str(uid)))
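On profile update the current strategy is wipe-and-rewrite rather than a selective delete (cf. the POSS note in `save_form`):

```python
# replace all of a scholar's hashtag pairings
delete_pairs_sch_tok(luid, comex_db, map_table='sch_ht')
save_pairs_sch_tok([(luid, htid) for htid in htids], comex_db, map_table='sch_ht')
```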
def get_or_create_keywords(kw_list, comex_db):
def get_or_create_tokitems(tok_list, comex_db, tok_table='keywords'):
"""
kw_str -> lookup/add to *keywords* table -> kw_id
ht_str -> lookup/add to *hashtags* table -> ht_id
-------------------------------------------------
kw_list is an array of strings
tok_list is an array of strings
NB keywords are mandatory: each registration should provide at least MIN_KW
hashtags aren't
for loop
1) query to *keywords* table (exact match)
1) query to *keywords* or *hashtags* table (exact match)
2) return id
=> if a keyword matches return kwid
=> if no keyword matches create new and return kwid
=> if a keyword/tag matches return kwid/htid
=> if no keyword/tag matches create new and return kwid/htid
"""
# sql names
fill = {'tb': tok_table}
if tok_table == 'keywords':
fill['idc'] = 'kwid'
fill['strc']= 'kwstr'
elif tok_table == 'hashtags':
fill['idc'] = 'htid'
fill['strc']= 'htstr'
db_cursor = comex_db.cursor()
found_ids = []
for kw_str in kw_list:
for tok_str in tok_list:
# TODO better string normalization here or in read_record
kw_str = kw_str.lower()
tok_str = tok_str.lower()
fill['q'] = tok_str
n_matched = db_cursor.execute('SELECT kwid FROM keywords WHERE kwstr = "%s"' % kw_str)
# ex: SELECT kwid FROM keywords WHERE kwstr = "complexity"
n_matched = db_cursor.execute('SELECT %(idc)s FROM %(tb)s WHERE %(strc)s = "%(q)s"' % fill)
# ok existing keyword => row id
if n_matched == 1:
......@@ -468,15 +525,17 @@ def get_or_create_keywords(kw_list, comex_db):
# no matching keyword => add => row id
elif n_matched == 0:
db_cursor.execute('INSERT INTO keywords(kwstr) VALUES ("%s")' % kw_str)
# ex: INSERT INTO keywords(kwstr) VALUES ("complexity")
db_cursor.execute('INSERT INTO %(tb)s(%(strc)s) VALUES ("%(q)s")' % fill)
comex_db.commit()
mlog("INFO", "Added keyword '%s'" % kw_str)
mlog("INFO", "Added '%s' to %s table" % (tok_str, tok_table))
found_ids.append(db_cursor.lastrowid)
else:
raise Exception("ERROR: non-unique keyword '%s'" % kw_str)
raise Exception("ERROR: non-unique token '%s'" % tok_str)
return found_ids
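An illustrative round trip, with one existing tag and one new one (ids made up):

```python
htids = get_or_create_tokitems(["#dh", "#openscience"], comex_db, tok_table='hashtags')
# => [3, 17]  (3 matched an existing row, 17 is the lastrowid of the new INSERT)
```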
......@@ -489,9 +548,11 @@ def get_or_create_affiliation(org_info, comex_db):
1) query to *affiliations* table
2) return id
=> TODO if institution almost matches send suggestion
=> TODO unicity constraint on institution + lab
=> unicity constraint on institution + lab + org_type
=> if an institution matches return affid
=> if no institution matches create new and return affid
TODO test more
"""
the_aff_id = None
......@@ -512,10 +573,8 @@ def get_or_create_affiliation(org_info, comex_db):
db_qstrvals.append(quotedstrval)
# for select
if colname != 'org_type':
db_constraints.append("%s = %s" % (colname, quotedstrval))
else:
if colname != 'org_type':
db_constraints.append("%s IS NULL" % colname)
db_cursor = comex_db.cursor()
......
......@@ -43,7 +43,7 @@ if __package__ == 'services':
from services.user import User, login_manager, doors_login, UCACHE
from services.text import keywords
from services.tools import restparse, mlog, re_hash, REALCONFIG
from services.db import connect_db, get_or_create_keywords, save_pairs_sch_kw, delete_pairs_sch_kw, get_or_create_affiliation, save_scholar, get_field_aggs
from services.db import connect_db, get_or_create_tokitems, save_pairs_sch_tok, delete_pairs_sch_tok, get_or_create_affiliation, save_scholar, get_field_aggs
from services.db_to_tina_api.extractDataCustom import MyExtractor as MySQL
else:
# when this script is run directly
......@@ -51,7 +51,7 @@ else:
from user import User, login_manager, doors_login, UCACHE
from text import keywords
from tools import restparse, mlog, re_hash, REALCONFIG
from db import connect_db, get_or_create_keywords, save_pairs_sch_kw, delete_pairs_sch_kw, get_or_create_affiliation, save_scholar, get_field_aggs
from db import connect_db, get_or_create_tokitems, save_pairs_sch_tok, delete_pairs_sch_tok, get_or_create_affiliation, save_scholar, get_field_aggs
from db_to_tina_api.extractDataCustom import MyExtractor as MySQL
# ============= app creation ============
......@@ -61,7 +61,7 @@ app = Flask("services",
static_folder=path.join(config['HOME'],"static"),
template_folder=path.join(config['HOME'],"templates"))
app.config['DEBUG'] = (config['LOG_LEVEL'] == "DEBUG")
app.config['DEBUG'] = (config['LOG_LEVEL'] in ["DEBUG","DEBUGSQL"])
app.config['SECRET_KEY'] = 'TODO fill secret key for sessions for login'
# for SSL
......@@ -82,6 +82,7 @@ login_manager.init_app(app)
# all inputs as they are declared in the form, as (NAME, SANITIZE?) pairs
SOURCE_FIELDS = [
# NAME, SANITIZE?
("luid", False ),
("doors_uid", False ),
("last_modified_date", False ), # ex 2016-11-16T17:47:07.308Z
("email", True ),
......@@ -95,8 +96,7 @@ SOURCE_FIELDS = [
("position", True ),
("hon_title", True ),
("interests_text", True ),
("community_hashtags", True ),
("gender", True ), # M|F
("gender", False ), # M|F
("job_looking_date", True ), # def null: not looking for a job
("home_url", True ), # scholar's homepage
("pic_url", True ),
......@@ -110,8 +110,11 @@ SOURCE_FIELDS = [
("org_city", True ),
# => for *affiliations* table
("keywords", True )
("keywords", True ),
# => for *keywords* table (after split str)
("community_hashtags", True )
# => for *hashtags* table (after split str)
]
# NB password values have already been sent by ajax to Doors
......@@ -442,10 +445,11 @@ def save_form(request_form, request_files, update_flag=False):
"""
# only safe values
clean_records = {}
kw_array = []
# 1) handles all the inputs from form, no matter what target table
(duuid, rdate, kw_array, clean_records) = read_record(request_form)
clean_records = read_record(request_form)
mlog("DEBUG", "===== clean_records =====", clean_records)
# 2) handles the pic_file if present
if 'pic_file' in request_files:
......@@ -461,27 +465,41 @@ def save_form(request_form, request_files, update_flag=False):
# B) read/fill the affiliation table to get associated id
clean_records['affiliation_id'] = get_or_create_affiliation(clean_records, reg_db)
# C) create record into the primary user table
# ---------------------------------------------
# C) create/update record into the primary user table
# ----------------------------------------------------
# TODO class User method !!
save_scholar(duuid, rdate, clean_records, reg_db, update_flag=update_flag)
luid = None
if update_flag:
luid = clean_records['luid']
save_scholar(clean_records, reg_db, update_luid=luid)
else:
luid = save_scholar(clean_records, reg_db)
# D) read/fill each keyword and save the (uid <=> kwid) pairings
kwids = get_or_create_keywords(kw_array, reg_db)
# read/fill each hashtag and save the (uid <=> htid) pairings
for intables in [['keywords', 'keywords', 'sch_kw'],
['community_hashtags', 'hashtags', 'sch_ht']]:
tok_field = intables[0]
if tok_field in clean_records:
tok_table = intables[1]
map_table = intables[2]
tokids = get_or_create_tokitems(clean_records[tok_field], reg_db, tok_table)
# TODO class User method !!
# POSS selective delete ?
if update_flag:
delete_pairs_sch_kw(duuid, reg_db)
delete_pairs_sch_tok(luid, reg_db, map_table)
save_pairs_sch_kw([(duuid, kwid) for kwid in kwids], reg_db)
save_pairs_sch_tok([(luid, tokid) for tokid in tokids], reg_db, map_table)
# F) end connection
reg_db.close()
# clear cache concerning this scholar
# TODO class User method !!
if duuid in UCACHE: UCACHE.pop(duuid)
# E) end connection
reg_db.close()
if luid in UCACHE: UCACHE.pop(luid)
return clean_records
......@@ -492,14 +510,9 @@ def read_record(incoming_data):
- custom made for regcomex/templates/base_form
- uses SOURCE_FIELDS
"""
# init var
clean_records = {}
# read in + sanitize values
duuid = None
rdate = None
# we should have all the mandatory fields (checked in client-side js)
# TODO recheck b/c if post comes from elsewhere
for field_info in SOURCE_FIELDS:
......@@ -513,31 +526,29 @@ def read_record(incoming_data):
else:
# mysql will want None instead of ''
val = None
# these 2 fields already validated and useful separately
elif field == 'doors_uid':
duuid = incoming_data[field]
elif field == 'last_modified_date':
rdate = incoming_data[field]
# any other fields that don't need sanitization (ex: menu options)
else:
clean_records[field] = incoming_data[field]
# special treatment for "other" subquestions
if 'org_type' in clean_records:
if clean_records['org_type'] == 'other' and 'other_org_type' in clean_records:
clean_records['org_type'] = clean_records['other_org_type']
# split for kw_array
kw_array = []
if 'keywords' in clean_records:
for kw in clean_records['keywords'].split(','):
kw = sanitize(kw)
if kw != '':
kw_array.append(kw)
# splits for kw_array and ht_array
for tok_field in ['keywords', 'community_hashtags']:
if tok_field in clean_records:
    mlog("DEBUG", "read_record: %s in clean_records" % tok_field)
temp_array = []
for tok in clean_records[tok_field].split(','):
tok = sanitize(tok)
if tok != '':
temp_array.append(tok)
# replace str by array
clean_records[tok_field] = temp_array
return (duuid, rdate, kw_array, clean_records)
return clean_records
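Net effect on the token fields: the comma-separated form strings are replaced in place by sanitized arrays, e.g.:

```python
# plus the other mandatory form fields, elided here
incoming = {"keywords": "complexity, networks",
            "community_hashtags": "#iscpif"}
clean = read_record(incoming)
# clean["keywords"]           == ["complexity", "networks"]
# clean["community_hashtags"] == ["#iscpif"]
```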
# TODO move to text submodules
......