From b79f438f258122c66ef61eaa8b61f179af0c7fd4 Mon Sep 17 00:00:00 2001 From: rloth <romain.loth@iscpif.fr> Date: Thu, 19 Jan 2017 18:22:50 +0100 Subject: [PATCH] WIP2 comex_shared DB (luid and hashtags in create/update user) --- doc/table_specifications.md | 2 +- services/db.py | 231 +++++++++++------- services/main.py | 93 +++---- .../comex2_mysql_server/init_comex_shared.sql | 2 +- 4 files changed, 199 insertions(+), 129 deletions(-) diff --git a/doc/table_specifications.md b/doc/table_specifications.md index 8a6697d..76574e9 100755 --- a/doc/table_specifications.md +++ b/doc/table_specifications.md @@ -11,7 +11,7 @@ CREATE TABLE scholars ( -- local uid necessary for users who still have no doors_uid luid int(15) not null auto_increment unique primary key, -- doors uid common to all lab's services - doors_uid char(36) not null unique, + doors_uid char(36) unique, last_modified_date char(24) not null, email varchar(255) not null unique, country varchar(60) not null, diff --git a/services/db.py b/services/db.py index 433b5a9..e94f382 100644 --- a/services/db.py +++ b/services/db.py @@ -19,7 +19,8 @@ else: # sorted columns as declared in DB, as a tuple USER_COLS = [ # NAME, NOT NULL, N or MAXCHARS (if applicable) - ("doors_uid", True, 36), + ("luid", True, 15), + ("doors_uid", False, 36), ("last_modified_date", True, 24), ("email", True, 255), ("country", True, 60), @@ -31,7 +32,6 @@ USER_COLS = [ ("position", False, 30), ("hon_title", False, 30), ("interests_text", False, 1200), - ("community_hashtags", False, 350), ("gender", False, 1), ("job_looking_date", False, 24), ("home_url", False, 120), @@ -49,14 +49,15 @@ ORG_COLS = [ FIELDS_FRONTEND_TO_SQL = { "keywords": "keywords.kwstr", + "tags": "hashtags.htstr", + "countries": "scholars.country", + "gender": "scholars.gender", + "organizations": "affiliations.org", "laboratories": "affiliations.team_lab", - "tags": "scholars.community_hashtags", - - # new - "gender": "scholars.gender", "cities": "affiliations.org_city", + "linked": "linked_ids.ext_id_type" } @@ -139,7 +140,7 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])): FROM scholars -- 0 or 1 LEFT JOIN linked_ids - ON scholars.doors_uid = linked_ids.uid + ON scholars.luid = linked_ids.uid GROUP BY %(col)s ) AS allcounts %(filter)s @@ -153,8 +154,8 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])): FROM scholars -- 0 or many LEFT JOIN sch_kw - ON scholars.doors_uid = sch_kw.uid - JOIN keywords + ON scholars.luid = sch_kw.uid + LEFT JOIN keywords ON sch_kw.kwid = keywords.kwid GROUP BY %(col)s ) AS allcounts @@ -162,6 +163,21 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])): ORDER BY occs DESC """ % {'col': sql_col, 'filter': where_clause} + elif sql_tab == 'hashtags': + stmt = """ + SELECT * FROM ( + SELECT %(col)s AS x, COUNT(*) AS occs + FROM scholars + -- 0 or many + LEFT JOIN sch_ht + ON scholars.luid = sch_ht.uid + LEFT JOIN hashtags + ON sch_ht.htid = hashtags.htid + GROUP BY %(col)s + ) AS allcounts + %(filter)s + ORDER BY occs DESC + """ % {'col': sql_col, 'filter': where_clause} mlog("DEBUGSQL", "get_field_aggs STATEMENT:\n-- SQL\n%s\n-- /SQL" % stmt) @@ -179,6 +195,9 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])): def get_full_scholar(uid): """ + uid : str + local user id aka luid + Autonomous function to be used by User class => Retrieves one line from *scholars* table, with joined optional concatenated *affiliations*, *keywords* and 
*linked_ids* => Parse it all into a structured python user info dict @@ -195,7 +214,7 @@ def get_full_scholar(uid): one_usr_stmt = """ SELECT - sch_n_aff_n_kws.*, + sch_n_aff_n_kws_n_hts.*, -- linked_ids info condensed -- (format : "type1:ID1,type2:ID2,...") @@ -205,49 +224,68 @@ def get_full_scholar(uid): COUNT(linked_ids.ext_id) AS linked_ids_nb FROM ( - SELECT - sch_n_aff.*, - -- kws info condensed - COUNT(keywords.kwid) AS keywords_nb, - -- GROUP_CONCAT(keywords.kwid) AS kwids, - GROUP_CONCAT(keywords.kwstr) AS keywords + SELECT + sch_n_aff_n_kws.*, + -- hts info condensed + COUNT(hashtags.htid) AS hashtags_nb, + -- GROUP_CONCAT(hashtags.htid) AS htids, + GROUP_CONCAT(hashtags.htstr) AS hashtags - FROM ( + FROM ( SELECT - scholars.*, - -- for debug replace scholars.* by - -- scholars.doors_uid, - -- scholars.email, - -- scholars.last_modified_date, - -- scholars.initials, + sch_n_aff.*, - affiliations.* + -- kws info condensed + COUNT(keywords.kwid) AS keywords_nb, + -- GROUP_CONCAT(keywords.kwid) AS kwids, + GROUP_CONCAT(keywords.kwstr) AS keywords - FROM scholars + FROM ( + SELECT + -- scholars.*, + -- for debug replace scholars.* by + scholars.luid, + scholars.doors_uid, + scholars.email, + -- scholars.last_modified_date, + -- scholars.initials, - LEFT JOIN affiliations - ON scholars.affiliation_id = affiliations.affid + affiliations.* + + FROM scholars + + LEFT JOIN affiliations + ON scholars.affiliation_id = affiliations.affid + + GROUP BY luid + + ) AS sch_n_aff - GROUP BY doors_uid + -- two step JOIN for keywords + LEFT JOIN sch_kw + ON sch_kw.uid = luid + LEFT JOIN keywords + ON sch_kw.kwid = keywords.kwid + GROUP BY luid - ) AS sch_n_aff + ) AS sch_n_aff_n_kws - -- two step JOIN for keywords - LEFT JOIN sch_kw - ON sch_n_aff.doors_uid = sch_kw.uid - LEFT JOIN keywords - ON sch_kw.kwid = keywords.kwid - GROUP BY doors_uid + -- also two step JOIN for hashtags + LEFT JOIN sch_ht + ON sch_ht.uid = luid + LEFT JOIN hashtags + ON sch_ht.htid = hashtags.htid + GROUP BY luid - ) AS sch_n_aff_n_kws + ) AS sch_n_aff_n_kws_n_hts LEFT JOIN linked_ids - ON linked_ids.uid = sch_n_aff_n_kws.doors_uid + ON linked_ids.uid = luid -- WHERE our user UID - WHERE doors_uid = "%s" - GROUP BY doors_uid + WHERE luid = "%s" + GROUP BY luid """ % str(uid) mlog("DEBUGSQL", "DB get_full_scholar STATEMENT:\n-- SQL\n%s\n-- /SQL" % one_usr_stmt) @@ -289,16 +327,17 @@ def get_full_scholar(uid): # post-treatments # --------------- - # 1/ split concatenated kw lists and check correct length - if urow_dict['keywords_nb'] == 0: - urow_dict['keywords'] = [] - else: - kws_array = urow_dict['keywords'].split(',') - - if len(kws_array) != urow_dict['keywords_nb']: - raise ValueError("Can't correctly split keywords for user %s" % uid) + # 1/ split concatenated kw an ht lists and check correct length + for toktype in ['keywords', 'hashtags']: + if urow_dict[toktype+'_nb'] == 0: + urow_dict[toktype] = [] else: - urow_dict['keywords'] = kws_array + tokarray = urow_dict[toktype].split(',') + + if len(tokarray) != urow_dict[toktype+'_nb']: + raise ValueError("Can't correctly split %s for user %s" % (toktype, uid)) + else: + urow_dict[toktype] = tokarray # 2/ also split and parse all linked_ids if urow_dict['linked_ids_nb'] == 0: @@ -326,21 +365,21 @@ def get_full_scholar(uid): return urow_dict -def save_scholar(uid, date, safe_recs, reg_db, uactive=True, update_flag=False): +def save_scholar(safe_recs, reg_db, uactive=True, update_luid=None): """ For new registration: - -> add to *scholars* table + -> add to *scholars* 
table, return new local uid - For profile change (just toggle update_flag to True) + For profile change (just pass previous local uid in update_luid) -> *update* scholars table see also COLS variable and doc/table_specifications.md """ - # we already have the first two columns - db_tgtcols = ['doors_uid', 'last_modified_date'] - db_qstrvals = ["'"+str(uid)+"'", "'"+str(date)+"'"] - actual_len_dbg = 2 + # column names and column quoted values + db_tgtcols = [] + db_qstrvals = [] + actual_len_dbg = 0 # REMARK: # => In theory should be possible to execute(statment, values) to insert all @@ -353,8 +392,7 @@ def save_scholar(uid, date, safe_recs, reg_db, uactive=True, update_flag=False): # ------------- ----------- # and then we execute(full_statmt) :-) - - for colinfo in USER_COLS[2:]: + for colinfo in USER_COLS: colname = colinfo[0] # NB: each val already contains no quotes because of sanitize() @@ -385,7 +423,7 @@ def save_scholar(uid, date, safe_recs, reg_db, uactive=True, update_flag=False): reg_db_c = reg_db.cursor() - if not update_flag: + if not update_luid: # expected colnames "(doors_uid, last_modified_date, email, ...)" db_tgtcols_str = ','.join(db_tgtcols) @@ -404,63 +442,82 @@ def save_scholar(uid, date, safe_recs, reg_db, uactive=True, update_flag=False): set_full_str = ','.join([db_tgtcols[i] + '=' + db_qstrvals[i] for i in range(len(db_tgtcols))]) # UPDATE: full_statement with formated values - full_statmt = 'UPDATE scholars SET %s WHERE doors_uid = "%s"' % ( + full_statmt = 'UPDATE scholars SET %s WHERE luid = "%s"' % ( set_full_str, - uid + update_luid ) - mlog("DEBUG", "UPDATE" if update_flag else "INSERT", "SQL statement:", full_statmt) + mlog("DEBUG", "UPDATE" if update_luid else "INSERT", "SQL statement:", full_statmt) reg_db_c.execute(full_statmt) + if not update_luid: + luid = reg_db_c.lastrowid + else: + luid = update_luid reg_db.commit() + return luid -def save_pairs_sch_kw(pairings_list, comex_db): +def save_pairs_sch_tok(pairings_list, comex_db, map_table='sch_kw'): """ - Simply save all pairings (uid, kwid) in the list + Simply save all pairings (luid, kwid) or (luid, htid) in the list """ db_cursor = comex_db.cursor() for id_pair in pairings_list: - db_cursor.execute('INSERT INTO sch_kw VALUES %s' % str(id_pair)) + db_cursor.execute('INSERT INTO %s VALUES %s' % (map_table, str(id_pair))) comex_db.commit() - mlog("DEBUG", "Keywords: saved %s pair" % str(id_pair)) + mlog("DEBUG", "%s: saved %s pair" % (map_table, str(id_pair))) -def delete_pairs_sch_kw(uid, comex_db): +def delete_pairs_sch_tok(uid, comex_db, map_table='sch_kw'): """ - Simply deletes all pairings (uid, *) in the table + Simply deletes all pairings (luid, *) in the table """ + if map_table not in ['sch_kw', 'sch_ht']: + raise TypeError('ERROR: Unknown map_table') db_cursor = comex_db.cursor() - n = db_cursor.execute('DELETE FROM sch_kw WHERE uid = "%s"' % uid) + n = db_cursor.execute('DELETE FROM %s WHERE uid = "%s"' % (map_table, uid)) comex_db.commit() - mlog("DEBUG", "Keywords: DELETED %i pairings for %s" % (n, str(uid))) + mlog("DEBUG", "%s: DELETED %i pairings for %s" % (map_table, n, str(uid))) -def get_or_create_keywords(kw_list, comex_db): +def get_or_create_tokitems(tok_list, comex_db, tok_table='keywords'): """ kw_str -> lookup/add to *keywords* table -> kw_id + ht_str -> lookup/add to *hashtags* table -> ht_id ------------------------------------------------- - kw_list is an array of strings + tok_list is an array of strings NB keywords are mandatory: each registration should provide at 
least MIN_KW - + hashtags aren't for loop - 1) query to *keywords* table (exact match) + 1) query to *keywords* or *hashtags* table (exact match) 2) return id - => if a keyword matches return kwid - => if no keyword matches create new and return kwid + => if a keyword/tag matches return kwid/htid + => if no keyword/tag matches create new and return kwid/htid """ + # sql names + fill = {'tb': tok_table} + if tok_table == 'keywords': + fill['idc'] = 'kwid' + fill['strc']= 'kwstr' + elif tok_table == 'hashtags': + fill['idc'] = 'htid' + fill['strc']= 'htstr' + db_cursor = comex_db.cursor() found_ids = [] - for kw_str in kw_list: + for tok_str in tok_list: # TODO better string normalization here or in read_record - kw_str = kw_str.lower() + tok_str = tok_str.lower() + fill['q'] = tok_str - n_matched = db_cursor.execute('SELECT kwid FROM keywords WHERE kwstr = "%s"' % kw_str) + # ex: SELECT kwid FROM keywords WHERE kwstr = "complexity" + n_matched = db_cursor.execute('SELECT %(idc)s FROM %(tb)s WHERE %(strc)s = "%(q)s"' % fill) # ok existing keyword => row id if n_matched == 1: @@ -468,15 +525,17 @@ def get_or_create_keywords(kw_list, comex_db): # no matching keyword => add => row id elif n_matched == 0: - db_cursor.execute('INSERT INTO keywords(kwstr) VALUES ("%s")' % kw_str) + + # ex: INSERT INTO keywords(kwstr) VALUES ("complexity") + db_cursor.execute('INSERT INTO %(tb)s(%(strc)s) VALUES ("%(q)s")' % fill) comex_db.commit() - mlog("INFO", "Added keyword '%s'" % kw_str) + mlog("INFO", "Added '%s' to %s table" % (tok_str, tok_table)) found_ids.append(db_cursor.lastrowid) else: - raise Exception("ERROR: non-unique keyword '%s'" % kw_str) + raise Exception("ERROR: non-unique token '%s'" % tok_str) return found_ids @@ -489,9 +548,11 @@ def get_or_create_affiliation(org_info, comex_db): 1) query to *affiliations* table 2) return id => TODO if institution almost matches send suggestion - => TODO unicity constraint on institution + lab + => unicity constraint on institution + lab + org_type => if an institution matches return affid => if no institution matches create new and return affid + + TODO test more """ the_aff_id = None @@ -512,11 +573,9 @@ def get_or_create_affiliation(org_info, comex_db): db_qstrvals.append(quotedstrval) # for select - if colname != 'org_type': - db_constraints.append("%s = %s" % (colname, quotedstrval)) + db_constraints.append("%s = %s" % (colname, quotedstrval)) else: - if colname != 'org_type': - db_constraints.append("%s IS NULL" % colname) + db_constraints.append("%s IS NULL" % colname) db_cursor = comex_db.cursor() diff --git a/services/main.py b/services/main.py index 76055a8..009cbdd 100755 --- a/services/main.py +++ b/services/main.py @@ -43,7 +43,7 @@ if __package__ == 'services': from services.user import User, login_manager, doors_login, UCACHE from services.text import keywords from services.tools import restparse, mlog, re_hash, REALCONFIG - from services.db import connect_db, get_or_create_keywords, save_pairs_sch_kw, delete_pairs_sch_kw, get_or_create_affiliation, save_scholar, get_field_aggs + from services.db import connect_db, get_or_create_tokitems, save_pairs_sch_tok, delete_pairs_sch_tok, get_or_create_affiliation, save_scholar, get_field_aggs from services.db_to_tina_api.extractDataCustom import MyExtractor as MySQL else: # when this script is run directly @@ -51,7 +51,7 @@ else: from user import User, login_manager, doors_login, UCACHE from text import keywords from tools import restparse, mlog, re_hash, REALCONFIG - from db import connect_db, 
get_or_create_keywords, save_pairs_sch_kw, delete_pairs_sch_kw, get_or_create_affiliation, save_scholar, get_field_aggs + from db import connect_db, get_or_create_tokitems, save_pairs_sch_tok, delete_pairs_sch_tok, get_or_create_affiliation, save_scholar, get_field_aggs from db_to_tina_api.extractDataCustom import MyExtractor as MySQL # ============= app creation ============ @@ -61,7 +61,7 @@ app = Flask("services", static_folder=path.join(config['HOME'],"static"), template_folder=path.join(config['HOME'],"templates")) -app.config['DEBUG'] = (config['LOG_LEVEL'] == "DEBUG") +app.config['DEBUG'] = (config['LOG_LEVEL'] in ["DEBUG","DEBUGSQL"]) app.config['SECRET_KEY'] = 'TODO fill secret key for sessions for login' # for SSL @@ -82,6 +82,7 @@ login_manager.init_app(app) # all inputs as they are declared in form, as a couple SOURCE_FIELDS = [ # NAME, SANITIZE? + ("luid", False ), ("doors_uid", False ), ("last_modified_date", False ), # ex 2016-11-16T17:47:07.308Z ("email", True ), @@ -95,8 +96,7 @@ SOURCE_FIELDS = [ ("position", True ), ("hon_title", True ), ("interests_text", True ), - ("community_hashtags", True ), - ("gender", True ), # M|F + ("gender", False ), # M|F ("job_looking_date", True ), # def null: not looking for a job ("home_url", True ), # scholar's homepage ("pic_url", True ), @@ -110,8 +110,11 @@ SOURCE_FIELDS = [ ("org_city", True ), # => for *affiliations* table - ("keywords", True ) + ("keywords", True ), # => for *keywords* table (after split str) + + ("community_hashtags", True ) + # => for *hashtags* table (after split str) ] # NB password values have already been sent by ajax to Doors @@ -442,10 +445,11 @@ def save_form(request_form, request_files, update_flag=False): """ # only safe values clean_records = {} - kw_array = [] # 1) handles all the inputs from form, no matter what target table - (duuid, rdate, kw_array, clean_records) = read_record(request_form) + clean_records = read_record(request_form) + + mlog("DEBUG", "===== clean_records =====", clean_records) # 2) handles the pic_file if present if 'pic_file' in request_files: @@ -461,28 +465,42 @@ def save_form(request_form, request_files, update_flag=False): # B) read/fill the affiliation table to get associated id clean_records['affiliation_id'] = get_or_create_affiliation(clean_records, reg_db) - # C) create record into the primary user table - # --------------------------------------------- + # C) create/update record into the primary user table + # ---------------------------------------------------- # TODO class User method !! - save_scholar(duuid, rdate, clean_records, reg_db, update_flag=update_flag) + luid = None + if update_flag: + luid = clean_records['luid'] + save_scholar(clean_records, reg_db, update_luid=luid) + else: + luid = save_scholar(clean_records, reg_db) + # D) read/fill each keyword and save the (uid <=> kwid) pairings - kwids = get_or_create_keywords(kw_array, reg_db) + # read/fill each hashtag and save the (uid <=> htid) pairings + for intables in [['keywords', 'keywords', 'sch_kw'], + ['community_hashtags', 'hashtags', 'sch_ht']]: + tok_field = intables[0] + if tok_field in clean_records: + tok_table = intables[1] + map_table = intables[2] - # TODO class User method !! - # POSS selective delete ? - if update_flag: - delete_pairs_sch_kw(duuid, reg_db) + tokids = get_or_create_tokitems(clean_records[tok_field], reg_db, tok_table) - save_pairs_sch_kw([(duuid, kwid) for kwid in kwids], reg_db) + # TODO class User method !! + # POSS selective delete ? 
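+            # (NB the two calls below implement a full replace: on update we
+            #  first wipe this scholar's old rows in the mapping table, then
+            #  re-save one (luid, tokid) pair per submitted token)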
+ if update_flag: + delete_pairs_sch_tok(luid, reg_db, map_table) - # clear cache concerning this scholar - # TODO class User method !! - if duuid in UCACHE: UCACHE.pop(duuid) + save_pairs_sch_tok([(luid, tokid) for tokid in tokids], reg_db, map_table) - # E) end connection + # F) end connection reg_db.close() + # clear cache concerning this scholar + # TODO class User method !! + if luid in UCACHE: UCACHE.pop(luid) + return clean_records @@ -492,14 +510,9 @@ def read_record(incoming_data): - custom made for regcomex/templates/base_form - uses SOURCE_FIELDS """ - # init var clean_records = {} - # read in + sanitize values - duuid = None - rdate = None - # we should have all the mandatory fields (checked in client-side js) # TODO recheck b/c if post comes from elsewhere for field_info in SOURCE_FIELDS: @@ -513,31 +526,29 @@ def read_record(incoming_data): else: # mysql will want None instead of '' val = None - - # these 2 fields already validated and useful separately - elif field == 'doors_uid': - duuid = incoming_data[field] - elif field == 'last_modified_date': - rdate = incoming_data[field] - # any other fields that don't need sanitization (ex: menu options) else: clean_records[field] = incoming_data[field] + # special treatment for "other" subquestions if 'org_type' in clean_records: if clean_records['org_type'] == 'other' and 'other_org_type' in clean_records: clean_records['org_type'] = clean_records['other_org_type'] - # split for kw_array - kw_array = [] - if 'keywords' in clean_records: - for kw in clean_records['keywords'].split(','): - kw = sanitize(kw) - if kw != '': - kw_array.append(kw) + # splits for kw_array and ht_array + for tok_field in ['keywords', 'community_hashtags']: + if tok_field in clean_records: + print(tok_field, "in clean_records") + temp_array = [] + for tok in clean_records[tok_field].split(','): + tok = sanitize(tok) + if tok != '': + temp_array.append(tok) + # replace str by array + clean_records[tok_field] = temp_array - return (duuid, rdate, kw_array, clean_records) + return clean_records # TODO move to text submodules diff --git a/setup/dockers/comex2_mysql_server/init_comex_shared.sql b/setup/dockers/comex2_mysql_server/init_comex_shared.sql index 6dd0111..9f13890 100644 --- a/setup/dockers/comex2_mysql_server/init_comex_shared.sql +++ b/setup/dockers/comex2_mysql_server/init_comex_shared.sql @@ -11,7 +11,7 @@ CREATE TABLE scholars ( -- local uid necessary for users who still have no doors_uid luid int(15) not null auto_increment unique primary key, -- doors uid common to all lab's services - doors_uid char(36) not null unique, + doors_uid char(36) unique, last_modified_date char(24) not null, email varchar(255) not null unique, country varchar(60) not null, -- 2.21.0
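
For illustration, with the new "tags" mapping (hashtags.htstr in
FIELDS_FRONTEND_TO_SQL) the hashtags branch of get_field_aggs renders its
template into a statement like the sketch below; the `WHERE occs > 1` line
assumes the pre-existing where_clause as built when HAPAX_THRESHOLD = 1:

    stmt = """
    SELECT * FROM (
        SELECT hashtags.htstr AS x, COUNT(*) AS occs
        FROM scholars
        -- 0 or many
        LEFT JOIN sch_ht
               ON scholars.luid = sch_ht.uid
        LEFT JOIN hashtags
               ON sch_ht.htid = hashtags.htid
        GROUP BY hashtags.htstr
    ) AS allcounts
    WHERE occs > 1
    ORDER BY occs DESC
    """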
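The rewritten one_usr_stmt in get_full_scholar nests one derived table per
0-to-many relation, so that each GROUP_CONCAT aggregates exactly one join and
row counts stay correct. A condensed skeleton of the real statement (column
lists elided as ".."):

    skeleton = """
    SELECT sch_n_aff_n_kws_n_hts.*,
           GROUP_CONCAT(..) AS linked_ids_list           -- 4) linked ids
    FROM (
      SELECT sch_n_aff_n_kws.*,
             GROUP_CONCAT(hashtags.htstr) AS hashtags    -- 3) hashtags
      FROM (
        SELECT sch_n_aff.*,
               GROUP_CONCAT(keywords.kwstr) AS keywords  -- 2) keywords
        FROM (
          SELECT scholars.luid, .., affiliations.*       -- 1) 0-or-1 affiliation
          FROM scholars
          LEFT JOIN affiliations
                 ON scholars.affiliation_id = affiliations.affid
          GROUP BY luid
        ) AS sch_n_aff
        LEFT JOIN sch_kw   ON sch_kw.uid = luid
        LEFT JOIN keywords ON sch_kw.kwid = keywords.kwid
        GROUP BY luid
      ) AS sch_n_aff_n_kws
      LEFT JOIN sch_ht   ON sch_ht.uid = luid
      LEFT JOIN hashtags ON sch_ht.htid = hashtags.htid
      GROUP BY luid
    ) AS sch_n_aff_n_kws_n_hts
    LEFT JOIN linked_ids ON linked_ids.uid = luid
    WHERE luid = "%s"
    GROUP BY luid
    """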
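save_scholar's new return value gives callers the scholar's local id in both
branches; this sketch follows how save_form consumes it (connect_db and a
sanitized clean_records dict as in services/main.py):

    reg_db = connect_db()
    if update_flag:
        # profile edit: the luid is already known from the form
        luid = save_scholar(clean_records, reg_db,
                            update_luid=clean_records['luid'])
    else:
        # new registration: luid is the AUTO_INCREMENT id (cursor.lastrowid)
        luid = save_scholar(clean_records, reg_db)
    # luid then keys the sch_kw/sch_ht pairings and the UCACHE eviction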
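get_or_create_tokitems interpolates the looked-up value straight into the SQL
string. Below is a minimal standalone sketch of the same lookup-or-insert
pattern using driver-level parameter binding instead; it assumes a
MySQLdb/pymysql-style connection (paramstyle with %s placeholders), and the
identifiers still come from a whitelist because they cannot be bound:

    TOK_TABLES = {'keywords': ('kwid', 'kwstr'),   # (id column, string column)
                  'hashtags': ('htid', 'htstr')}

    def get_or_create_tokitem(tok_str, comex_db, tok_table='keywords'):
        """Return the row id of one token string, inserting it if new."""
        idc, strc = TOK_TABLES[tok_table]          # KeyError on unknown table
        cur = comex_db.cursor()
        # only the value is bound; table/column names come from the whitelist
        cur.execute('SELECT %s FROM %s WHERE %s = %%s' % (idc, tok_table, strc),
                    (tok_str.lower(),))
        row = cur.fetchone()
        if row is not None:
            return row[0]
        cur.execute('INSERT INTO %s(%s) VALUES (%%s)' % (tok_table, strc),
                    (tok_str.lower(),))
        comex_db.commit()
        return cur.lastrowid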