adapt dbcrud for new org table structure (profile, registration)

a8ac0399 · Romain Loth · 326c2ed8 · a8ac0399 · a8ac0399 · a8ac0399
Commit a8ac0399 authored Mar 13, 2017 by Romain Loth
15 changed files
--- a/doc/cascade_full_scholar_info.sql
+++ b/doc/cascade_full_scholar_info.sql
@@ -25,19 +25,23 @@ SELECT * FROM (
                    FROM scholars
                    LEFT JOIN sch_org AS map_labs
                        ON map_labs.uid = luid
-                    JOIN orgs AS labs
+                    LEFT JOIN (
+                        -- class constraint can't appear later,
+                        -- it would give no scholar when empty
+                        SELECT * FROM orgs WHERE class='lab'
+                    ) AS labs
                        ON map_labs.orgid = labs.orgid
                    WHERE (record_status = 'active'
                            OR (record_status = 'legacy' AND valid_date >= NOW()))
-                    AND labs.class = 'lab'
                    GROUP BY luid
                    ) AS scholars_and_labs
                LEFT JOIN sch_org AS map_insts
                    ON map_insts.uid = luid
-                JOIN orgs AS insts
+                LEFT JOIN (
+                    SELECT * FROM orgs WHERE class='inst'
+                ) AS insts
                    ON map_insts.orgid = insts.orgid
-                AND insts.class = 'inst'
                GROUP BY luid
        ) AS scholars_and_orgs

--- a/doc/data_mining_exemples/org_to_orgs.sql
+++ b/doc/data_mining_exemples/org_to_orgs.sql
+-- we pass through scholars
+--   org1 => scholars => orgs2
+-- (for suggestions and/or mapping)
+SELECT orgs.*,
+       -- '%%%'-separated institution labels, highest tgt_freq first
+       GROUP_CONCAT(inst_links.tgt_tostring
+                    ORDER BY inst_links.tgt_freq DESC
+                    SEPARATOR '%%%')
+        AS related_insts
+FROM orgs
+LEFT JOIN (
+    -- one row per (source org, target institution) pair that is linked
+    -- through at least one common scholar; tgt_freq counts those links
+    SELECT src_map.orgid    AS src_orgid,
+           tgt_map.orgid    AS tgt_orgid,
+           tgt_org.tostring AS tgt_tostring,
+           COUNT(*)         AS tgt_freq
+    FROM sch_org AS src_map
+    -- self-join on the scholar: org1 => scholar => org2
+    INNER JOIN sch_org AS tgt_map
+        ON tgt_map.uid = src_map.uid
+    INNER JOIN orgs AS tgt_org
+        ON tgt_org.orgid = tgt_map.orgid
+    WHERE tgt_org.class = 'inst'
+      AND src_map.orgid <> tgt_map.orgid
+    GROUP BY src_map.orgid, tgt_map.orgid
+    ) AS inst_links
+    ON inst_links.src_orgid = orgs.orgid
+WHERE orgs.orgid IN ( {$ids_str} )
+  AND orgs.name <> '_NULL'
+GROUP BY orgs.orgid
+ORDER BY orgs.name, orgs.acro
+;
+-- a POSSible alternative would be to create an org_org table:
+-- relationship organizations <=> organizations
+-- formally many-to-many but one could say many-to-few :)
+-- directed mapping table: one row per (source org, target org) pair
+CREATE TABLE org_org(
+   -- source organization (@class 'lab')
+   orgid_src          int(15) not null,
+   -- target organization (@class 'inst')
+   orgid_tgt          int(15) not null,
+   -- how often the pair is declared in sch records
+   -- (useful if unsure main parent org)
+   sch_freq           int(15) default 0,
+   -- a given pair can only be stored once
+   PRIMARY KEY (orgid_src, orgid_tgt),
+   -- deleting an org also deletes every mapping it takes part in
+   FOREIGN KEY (orgid_src) REFERENCES orgs(orgid) ON DELETE CASCADE,
+   FOREIGN KEY (orgid_tgt) REFERENCES orgs(orgid) ON DELETE CASCADE
+);
+-- NB: the +/-1 updates of the org -> org frequency in org_org would be triggered indirectly by new scholar rows, so they would be done while saving a profile, at middleware level (dbcrud.py)
--- a/doc/dev_setup.md
+++ b/doc/dev_setup.md
 ## dev overview
 comex app contains:
@@ -34,13 +33,10 @@ cd $INSTALL_DIR
 sudo pip3 install -r setup/requirements.txt
 ```
-Then to run the comex2 services in the simplest way just do:
+Then to run the comex2 server just do:
 ```
-cd services
+bash comex-run.sh
-python3 comex_main_backend.py
 ```
-The form server is then accessible locally on `0.0.0.0:5000/services/user`
-The tina api server is on `0.0.0.0:5000/services/api`
 Check the parameters in `config/parametres_comex.ini`
@@ -49,7 +45,7 @@ Finally, simply configure the serving of your php|www documentroot in nginx (cf
 -------
-#### Advanced dev config
+#### Full dev config
  1. external mysql database
  2. external doors (or simulated by docker)
  3. gunicorn webserver (linked to 1 & 2 via `$SQL_HOST` and `$DOORS_HOST`)
@@ -92,15 +88,8 @@ nano config/parametres_comex.ini
 ###### If you have no doors server
-For tests you can use a `minidoors` container
+For tests you can use a self-deployed doors container, available on [this repository](https://github.com/ISCPIF/doors-docker)
-```
-# build the docker image (once)
-cd setup/dockers
-docker build -t minidoors:latest minidoors/
-# run the container (each time)
-docker run -it -p 32789:8989 --name doors_test minidoors
-```
 ##### 3) Run the regomex app with gunicorn
 ```

--- a/doc/table_specifications.md
+++ b/doc/table_specifications.md
@@ -46,7 +46,6 @@ CREATE TABLE scholars (
 ) ;
 CREATE TABLE locs(
    locname             varchar(120),
    lat                 float(6,4),
@@ -54,12 +53,11 @@ CREATE TABLE locs(
    PRIMARY KEY (locname)
 ) ;
 -- table for all organization classes (team, lab, large institution)
 CREATE TABLE orgs(
    orgid               int(15) not null auto_increment,
    name                varchar(120),   -- full name
-    acro                varchar(20),    -- acronym or short name
+    acro                varchar(30),    -- acronym or short name
    class               varchar(25),   -- "team|lab|inst"
                                    -- like the calibre of the organization
@@ -78,15 +76,14 @@ CREATE TABLE orgs(
    -- address...          (...)      -- address elements POSS NOT IMPLEMENTED
    reserved            varchar(30),
-    -- generated column, often useful for autocompletes etc
+    -- tostring: generated column
    -- ex "Instituto de Fisica de Cantabria (IFCA), Santander, Spain"
-    tostring            varchar(800) AS (CONCAT(
+    -- searchable + human readable, often useful for autocompletes etc
-                                         name, ' (', acro, ')',
+    tostring            varchar(800)
-                                         IF(locname IS NOT NULL ,
+        AS (CONCAT_WS( '',
-                                                 CONCAT(', ', locname),
+                       CONCAT(name, ' '),
-                                                 '')
+                       CONCAT('(',acro,')'),
-                                     )),
+                       CONCAT(', ', locname)) ),
    PRIMARY KEY (orgid),
    UNIQUE KEY full_org (name, acro, locname)
@@ -106,6 +103,7 @@ CREATE TABLE sch_org(
 -- POSS: relationship organizations <=> keywords
 -- POSS: relationship organizations <=> organizations
+-- cf. doc/data_mining_exemples/org_to_orgs.sql
 -- keyword/subject terms

--- a/print_directory.php
+++ b/print_directory.php
@@ -303,19 +303,21 @@ SELECT * FROM (
                    FROM scholars
                    LEFT JOIN sch_org AS map_labs
                        ON map_labs.uid = luid
-                    JOIN orgs AS labs
+                    LEFT JOIN (
+                        SELECT * FROM orgs WHERE class='lab'
+                    ) AS labs
                        ON map_labs.orgid = labs.orgid
                    WHERE (record_status = 'active'
                            OR (record_status = 'legacy' AND valid_date >= NOW()))
-                    AND labs.class = 'lab'
                    GROUP BY luid
                    ) AS scholars_and_labs
                LEFT JOIN sch_org AS map_insts
                    ON map_insts.uid = luid
-                JOIN orgs AS insts
+                LEFT JOIN (
+                    SELECT * FROM orgs WHERE class='inst'
+                ) AS insts
                    ON map_insts.orgid = insts.orgid
-                AND insts.class = 'inst'
                GROUP BY luid
        ) AS scholars_and_orgs

--- a/services/dbcrud.py
+++ b/services/dbcrud.py
@@ -10,10 +10,12 @@ from MySQLdb.cursors  import DictCursor
 if __package__ == 'services':
    # when we're run via import
-    from services.tools import mlog, REALCONFIG
+    from services.tools      import mlog, REALCONFIG
+    from services.text.utils import normalize_chars, normalize_forms
 else:
    # when this script is run directly
    from tools          import mlog, REALCONFIG
+    from text.utils     import normalize_chars, normalize_forms
 # sorted columns as declared in DB, as a tuple
@@ -21,14 +23,12 @@ USER_COLS = [
 #          NAME,               NOT NULL,  N or MAXCHARS (if applicable)
         ("luid",                   True,        15),
         ("doors_uid",             False,        36),
-        #  ("last_modified",          True,      None),  # autoset on update
         ("email",                  True,       255),
         ("country",                True,        60),
         ("first_name",             True,        30),
         ("middle_name",           False,        30),
         ("last_name",              True,        50),
         ("initials",               True,         7),
-         ("affiliation_id",        False,      None),   # from db_get_or_create_affiliation
         ("position",              False,        30),
         ("hon_title",             False,        30),
         ("interests_text",        False,      1200),
@@ -43,10 +43,19 @@ USER_COLS = [
      ]
 ORG_COLS = [
-         ("org",                   False,       120),
+         ("class",                 False,        25),  # "lab" or "inst"
-         ("org_type",              False,        50),
+         ("name",                  False,       120),
-         ("team_lab",               True,       120),
+         ("acro",                  False,        30),  # acronym or short name
-         ("org_city",              False,        50)
+         ("locname",              False,        120),
+         ("inst_type",             False,        50),
+         ("lab_code",              False,        25),  # not in GUI yet
+         ("url",                  False,        180),  # not in GUI yet
+         ("contact_name",         False,         80),  # not in GUI yet
+         ("contact_email",        False,        255)   # not in GUI yet
+         # also in concatenations:
+         #  label    = name + acro
+         #  tostring = name + acro + locname
    ]
@@ -156,6 +165,10 @@ def get_full_scholar(uid, cmx_db = None):
        db = connect_db()
    db_c = db.cursor(DictCursor)
+    print('DBG', 'uid', uid)
+    print('DBG', 'type(uid)', type(uid))
    # one user + all linked infos concatenated in one row
    #                                   <= 3 LEFT JOINS sequentially GROUPed
    #                                     (b/c if simultaneous, loses unicity)
@@ -182,7 +195,7 @@ def get_full_scholar(uid, cmx_db = None):
            FROM (
                    SELECT
-                        sch_n_aff.*,
+                        sch_n_orgs.*,
                        -- kws info condensed
                        COUNT(keywords.kwid) AS keywords_nb,
@@ -191,24 +204,39 @@ def get_full_scholar(uid, cmx_db = None):
                    FROM (
                        SELECT
-                            scholars.*,
+                            sch_n_labs.*,
-                            -- for debug replace scholars.* by
+                            COUNT(insts.orgid) AS insts_ids_nb,
-                            -- scholars.luid,
+                            GROUP_CONCAT(insts.orgid) AS insts_ids
-                            -- scholars.doors_uid,
-                            -- scholars.email,
+                        FROM (
-                            -- scholars.last_modified_date,
+                            SELECT
-                            -- scholars.initials,
+                                scholars.*,
+                                COUNT(labs.orgid) AS labs_ids_nb,
-                            affiliations.*
+                                GROUP_CONCAT(labs.orgid) AS labs_ids
-                        FROM scholars
+                            FROM scholars
-                        LEFT JOIN affiliations
+                            LEFT JOIN sch_org AS map_labs
-                            ON scholars.affiliation_id = affiliations.affid
+                                ON map_labs.uid = luid
+                            LEFT JOIN (
+                                -- class constraint can't appear later,
+                                -- it would give no scholar when empty
+                                SELECT * FROM orgs WHERE class='lab'
+                            ) AS labs
+                                ON map_labs.orgid = labs.orgid
+                            GROUP BY luid
+                            ) AS sch_n_labs
+                        LEFT JOIN sch_org AS map_insts
+                            ON map_insts.uid = luid
+                        LEFT JOIN (
+                            SELECT * FROM orgs WHERE class='inst'
+                        ) AS insts
+                            ON map_insts.orgid = insts.orgid
                        GROUP BY luid
+                        ) AS sch_n_orgs
-                        ) AS sch_n_aff
                    -- two step JOIN for keywords
                    LEFT JOIN sch_kw
@@ -232,9 +260,9 @@ def get_full_scholar(uid, cmx_db = None):
            ON linked_ids.uid = luid
        -- WHERE our user UID
-        WHERE  luid = "%s"
+        WHERE  luid = %i
        GROUP BY luid
-    """ % str(uid)
+    """ % int(uid)
    mlog("DEBUGSQL", "DB get_full_scholar STATEMENT:\n-- SQL\n%s\n-- /SQL" % one_usr_stmt)
@@ -246,10 +274,6 @@ def get_full_scholar(uid, cmx_db = None):
    urow_dict = db_c.fetchone()
-    # we won't use the connect
-    if not cmx_db:
-        db.close()
    # break with None if no results
    if urow_dict is None:
        mlog("WARNING", "DB get_full_scholar attempt got no rows for: %s" % uid)
@@ -258,9 +282,9 @@ def get_full_scholar(uid, cmx_db = None):
    # normal case <=> exactly one row
-    # Exemple data in urow_dict
+    # Exemple initial data in urow_dict
-    # --------------------------
+    # ----------------------------------
-    # {'affid': 1, 'affiliation_id': 1, 'hashtags': '#something, #another',
+    # {'hashtags': '#something, #another',
    #  'country': 'France', 'doors_uid': '5e3adbc1-bcfb-42da-a2c4-4af006fe2b91',
    #  'email': 'jfk@usa.com', 'first_name': 'John', 'gender': 'M',
    #  'home_url': 'http://localhost/regcomex/', 'hon_title': 'Student',
@@ -268,19 +292,19 @@ def get_full_scholar(uid, cmx_db = None):
    #  'job_looking_date': datetime.date(2019, 9, 28),
    #  'hashtags': '#eccs15', 'hashtags_nb': 1,
    #  'keywords': 'complex networks,complex systems,text mining,machine learning', 'keywords_nb': 4,
+    #  'labs_ids': '3888,3444', 'labs_ids_nb': 2,
+    #  'insts_ids': '3295', 'insts_ids_nb': 1,
    #  'last_modified_date': datetime.datetime(2017, 2, 22, 12, 25, 59),
    #  'last_name': 'Kennedy',
    #  'linked_ids': 'twitter:@jfk,yoyo:42,foobar:XWING', 'linked_ids_nb': 3,
    #  'middle_name': 'Fitzgerald',
-    #  'org': 'Centre National de la Recherche Scientifique (CNRS)',
-    #  'org_city': 'Paris', 'org_type': 'public R&D org',
    #  'pic_fname': '12345.jpg', 'pic_url': None, 'position': 'Research Fellow',
    #  'record_status': 'legacy', 'valid_date': datetime.date(2017, 5, 22)}
    # post-treatments
    # ---------------
-    # 1/ split concatenated kw an ht lists and check correct length
+    # 1/ split concatenated kw, ht, lab id, inst id lists and check correct length
-    for toktype in ['keywords', 'hashtags']:
+    for toktype in ['keywords', 'hashtags', 'labs_ids', 'insts_ids']:
        if urow_dict[toktype+'_nb'] == 0:
            urow_dict[toktype] = []
        else:
@@ -291,7 +315,33 @@ def get_full_scholar(uid, cmx_db = None):
            else:
                urow_dict[toktype] = tokarray
-    # 2/ also split and parse all linked_ids
+    # 2/ must do a secondary SELECT for detailed org info
+    #       dict['labs_ids']: [id1,    id2    ..]
+    #     => dict['labs']   : [{info1},{info2}..]
+    for orgclass in ['labs', 'insts']:
+        id_list = urow_dict[orgclass+"_ids"]  # <- ! naming convention
+        if not len(id_list):
+            urow_dict[orgclass] = []
+        else:
+            org_info = """SELECT name, acro, locname,
+                                 inst_type, lab_code,
+                                 tostring
+                            FROM orgs WHERE orgid IN (%s)""" % ','.join(id_list)
+            mlog('DEBUGSQL', "org_info stmt :", org_info)
+            new_cursor = db.cursor(DictCursor)
+            new_cursor.execute(org_info)
+            urow_dict[orgclass] = new_cursor.fetchall()
+    # print('===urow_dict with orgs[]===')
+    # print(urow_dict)
+    # print('==/urow_dict with orgs[]===')
+    # 3/ also split and parse all linked_ids
    if urow_dict['linked_ids_nb'] == 0:
        urow_dict['linked_ids'] = {}
    else:
@@ -313,6 +363,9 @@ def get_full_scholar(uid, cmx_db = None):
    mlog("INFO", "get_full_scholar %s: OK" % uid)
+    if not cmx_db:
+        db.close()
    # full user info as a dict
    return urow_dict
@@ -540,34 +593,64 @@ def get_or_create_tokitems(tok_list, cmx_db, tok_table='keywords'):
    return found_ids
-def get_or_create_affiliation(org_info, cmx_db):
+def record_sch_org_link(luid, orgid, cmx_db = None):
+    """
+    Records one (scholar <=> organization) mapping as a new *sch_org* row.
+
+    @luid:   id of the scholar (anything int() accepts)
+    @orgid:  id of the organization (anything int() accepts)
+    @cmx_db: optional open DB connection to reuse
+             (if absent, a connection is opened here and closed at the end)
+
+    Raises ValueError/TypeError if an id is not numeric, and lets any
+    DB error from the INSERT propagate to the caller.
+    """
+    # validate ids first, so a bad value can't leave a fresh connection open
+    luid = int(luid)
+    orgid = int(orgid)
+
+    if cmx_db:
+        db = cmx_db
+    else:
+        db = connect_db()
+
+    try:
+        db_c = db.cursor(DictCursor)
+        # values are bound by the driver rather than pasted into the SQL
+        db_c.execute(
+            'INSERT INTO sch_org(uid,orgid) VALUES (%s,%s)', (luid, orgid)
+        )
+        # persist the new row (get_or_create_org also commits its own writes)
+        db.commit()
+    finally:
+        # only close a connection that was opened here
+        if not cmx_db:
+            db.close()
+def record_org_org_link(orgid_src, orgid_tgt, cmx_db = None):
+    """
+    new mapping or freq++ if mapping already exists
+    TODO LATER (not a priority)
+               method cf. php_library/directory_content.php/$labs
    """
-    (parent organization + lab) ---> lookup/add to *affiliations* table -> affid
+    pass
-    org_info should contain properties like in ORG_COLS names
+def get_or_create_org(org_info, cmx_db = None):
+    """
+    (scholar's parent org(s)) ---> lookup/add to *orgs* table -> orgid
-     1) query to *affiliations* table
+     1) query to *orgs* table
     2) return id
-        => TODO if institution almost matches send suggestion
+        => TODO if institution almost matches API to send suggestion
        => unicity constraint on institution + lab + org_type
-        => if an institution matches return affid
+        => if an institution matches return orgid
-        => if no institution matches create new and return affid
+        => if no institution matches create new and return orgid
-        TODO test more
+        ! WIP !
    """
+    if cmx_db:
+        db = cmx_db
+    else:
+        db = connect_db()
+    db_c = db.cursor(DictCursor)
    the_aff_id = None
    db_tgtcols = []
    db_qstrvals = []
    db_constraints = []
+    mlog("INFO", "get_or_create_org, org_info:", org_info)
    for colinfo in ORG_COLS:
        colname = colinfo[0]
        val = org_info.get(colname, None)
        if val != None:
-             # TODO better string normalization but not lowercase for acronyms...
+            val = str(normalize_forms(normalize_chars(val, rm_qt=True)))
-            quotedstrval = "'"+str(val)+"'"
+            quotedstrval = "'"+val+"'"
            # for insert
            db_tgtcols.append(colname)
@@ -580,28 +663,33 @@ def get_or_create_affiliation(org_info, cmx_db):
    db_cursor = cmx_db.cursor()
+    mlog("DEBUGSQL", "SELECT org.. WHERE %s" % ("\n AND ".join(db_constraints)))
    n_matched = db_cursor.execute(
-                    'SELECT affid FROM affiliations WHERE %s' %
+                    'SELECT orgid FROM orgs WHERE %s' %
                                        " AND ".join(db_constraints)
                )
    # ok existing affiliation => row id
    if n_matched == 1:
        the_aff_id = db_cursor.fetchone()[0]
-        mlog("DEBUG", "Found affiliation (affid %i) (WHERE %s)" % (the_aff_id, " AND ".join(db_constraints)))
+        mlog("DEBUG", "Found affiliation (orgid %i) (WHERE %s)" % (the_aff_id, " AND ".join(db_constraints)))
    # no matching affiliation => add => row id
    elif n_matched == 0:
-        db_cursor.execute('INSERT INTO affiliations(%s) VALUES (%s)' % (
+        db_cursor.execute('INSERT INTO orgs(%s) VALUES (%s)' % (
                            ','.join(db_tgtcols),
                            ','.join(db_qstrvals)
                           )
                         )
        the_aff_id = db_cursor.lastrowid
        cmx_db.commit()
-        mlog("DEBUG", "Added affiliation '%s'" % str(db_qstrvals))
+        mlog("DEBUG", "dbcrud: added org '%s'" % str(db_qstrvals))
    else:
-        raise Exception("ERROR: non-unique affiliation '%s'" % str(db_qstrvals))
+        raise Exception("ERROR: get_or_create_org non-unique match '%s'" % str(db_qstrvals))
+    if not cmx_db:
+        db.close()
    return the_aff_id

--- a/services/main.py
+++ b/services/main.py
@@ -25,7 +25,7 @@ __status__    = "Dev"
 # ============== imports ==============
-from re           import sub
+from re           import sub, match
 from os           import path, remove
 from json         import dumps
 from datetime     import timedelta
@@ -101,12 +101,12 @@ SOURCE_FIELDS = [
         ("pic_file",              False,        None),   # saved separately
         # => for *scholars* table (optional)
-         ("org",                    True,        None),
+         ("lab_label",              True,        None),   # ~ /acro (name)/
-         ("org_type",              False,        None),   # predefined values
+         ("lab_locname",               True,        None),   #  'Paris, France'
-         (  "other_org_type",       True,        None),   # +=> org_type
+         ("inst_label",             True,        None),   # ~ /acro (name)/
-         ("team_lab",               True,        None),
+         ("inst_type",             False,        None),   # predefined values
-         ("org_city",               True,        None),
+         (  "other_inst_type",      True,        None),   # +=> org_type
-         # => for *affiliations* table
+         # => for *orgs* table via sort_affiliation_records
         ("keywords",               True,        None),
         # => for *keywords* table (after split str)
@@ -752,6 +752,84 @@ def show_privacy():
 ########### SUBS ###########
+def sort_affiliation_records(clean_records):
+    """
+    Transform GUI side input data into at most 2 orgs objects for DB
+
+    In general:
+        1) the front-end inputs are less free than the DB structure
+            (DB could save an array of orgids but in the inputs they're
+             only allowed max 2 atm : lab and inst)
+        2) each org has its microstructure:
+            - name, acronym, class, location (base properties)
+            - inst_type (specific to institutions)
+            - lab_code, url, contact <= not fillable in GUI yet
+        3) between themselves 2 orgs can have org_org relationships
+            TODO LATER (not a priority)
+        4) we want at least one of lab_label or inst_label to be NOT NULL
+           (not enforced here: an org without label just stays None)
+
+    Choices:
+        - values are already sanitized by read_record_from_request
+        - We call label the concatenated name + acronym information,
+          handling here the possibilities for the input via str analysis
+          (just short name, just long name, both)
+        - The keys of each submap are the column names listed in
+          dbcrud.ORG_COLS ('class', 'name', 'acro', 'locname',
+          'inst_type', 'lab_code', 'url', 'contact_name', 'contact_email')
+          because get_or_create_org looks the values up under those names
+
+    @clean_records: dict of form fields, named <org_class>_<detail>
+                    (ex: 'lab_label', 'lab_locname', 'inst_type')
+
+    Returns a map {'lab': submap or None, 'inst': submap or None}
+    """
+    new_orgs = {'lab': None, 'inst': None}
+    for org_class in new_orgs:
+        clean_input = clean_records.get(org_class+"_label")
+
+        # can't create org without some kind of label
+        if not clean_input:
+            continue
+
+        # submap, starting with the class ('lab' or 'inst'): without it
+        # the saved org could never match a "WHERE class = ..." selection
+        new_org_info = {'class': org_class}
+
+        # 1) label analysis
+        # custom split attempt
+        # eg 'CNRS (Centre National de la Recherche Scientifique)'
+        #     vvvv  vvvvvvvvvv
+        #     acro     name
+        test_two_groups = match(
+                            r'([^\(]{1,30}) \(([^\)]+)\)',
+                            clean_input
+                          )
+        if test_two_groups:
+            new_org_info['acro'] = test_two_groups.groups()[0]
+            new_org_info['name'] = test_two_groups.groups()[1]
+        # fallback cases: a short label is taken as acronym...
+        elif len(clean_input) < 30:
+            new_org_info['acro'] = clean_input
+        # ...and a long one as full name
+        else:
+            new_org_info['name'] = clean_input
+
+        # 2) enrich with any other optional org info
+        for detail_col in ['type', 'code', 'locname',
+                           'url', 'contact_email', 'contact_name']:
+            # this is a convention in our templates
+            org_detail = org_class + '_' + detail_col
+            val = clean_records.get(org_detail)
+            if not val:
+                continue
+            # the type and code columns keep their class prefix in the DB
+            # (inst_type, lab_code), the other details are stored bare
+            if detail_col in ('type', 'code'):
+                new_org_info[org_detail] = val
+            else:
+                new_org_info[detail_col] = val
+
+        # 3) keep
+        new_orgs[org_class] = new_org_info
+
+    return new_orgs
 def save_form(clean_records, update_flag=False, previous_user_info=None):
    """
    wrapper function for save profile/register (all DB-related form actions)
@@ -767,11 +845,23 @@ def save_form(clean_records, update_flag=False, previous_user_info=None):
    # A) a new DB connection
    reg_db = dbcrud.connect_db(config)
-    # B) read/fill the affiliation table to get associated id
+    # B1) re-group the org fields into at most 2 org 'objects'
-    clean_records['affiliation_id'] = dbcrud.get_or_create_affiliation(
+    declared_orgs = sort_affiliation_records(clean_records)
-        clean_records,
-        reg_db
+    # B2) check our constraint (cf. also E.)
-    )
+    # refuse only when *both* are missing: a single org (lab alone or
+    # institution alone) is a valid declaration
+    if (declared_orgs['lab'] is None and declared_orgs['inst'] is None):
+        raise ValueError("At least 1 org (lab or institution) must be filled")
+    # B3) for each, read/fill the orgs table to get associated id(s) in DB
+    orgids = []
+    for oclass in ['lab', 'inst']:
+        if (declared_orgs[oclass]):
+            orgids.append(
+                dbcrud.get_or_create_org(declared_orgs[oclass], reg_db)
+            )
+    # B4) save the org <=> org mappings TODO LATER (not a priority)
+    # dbcrud.record_org_org_link(src_orgid, tgt_orgid, reg_db)
    # C) create/update record into the primary user table
    # ----------------------------------------------------
@@ -824,6 +914,10 @@ def save_form(clean_records, update_flag=False, previous_user_info=None):
                map_table
            )
+    # E) save the (uid <=> orgid) mapping(s)
+    for orgid in orgids:
+        dbcrud.record_sch_org_link(luid, orgid, reg_db)
    # F) end connection
    reg_db.close()
@@ -872,9 +966,9 @@ def read_record_from_request(request):
                clean_records[field] = request.form[field]
    # special treatment for "other" subquestions
-    if 'org_type' in clean_records:
+    if 'inst_type' in clean_records:
-        if clean_records['org_type'] == 'other' and 'other_org_type' in clean_records:
+        if clean_records['inst_type'] == 'other' and 'other_inst_type' in clean_records:
-            clean_records['org_type'] = clean_records['other_org_type']
+            clean_records['inst_type'] = clean_records['other_inst_type']
    # splits for kw_array and ht_array
    for tok_field in ['keywords', 'hashtags']:

--- a/services/text/utils.py
+++ b/services/text/utils.py
@@ -140,3 +140,162 @@ class CountryConverter:
        self.connDBLP.close()
        return fails
+#! /usr/bin/python3
+from re  import sub
+from sys import stdin, stderr
+# settings
+dont_touch_first_column = False
+NCOLS = 1
+# functions
+def normalize_chars(my_str, rm_qt=False):
+    """
+    Simplifies the character strings that go into the DB, by normalizing:
+        > spaces (control chars and alternative spaces => plain space)
+        > dashes
+        > quotation marks
+
+    Goal: normalize input values more like ascii will be easier to process
+
+    @my_str: the string to normalize
+    @rm_qt:  if True, each remaining single quote becomes a double quote
+
+    Returns the normalized string.
+    """
+    text = my_str
+
+    # -----------
+    # S P A C E S
+    # -----------
+    space_like_patterns = (
+        # all control characters (incl. \t = U+0009 and \r = U+000D)
+        # but not \n = U+000A
+        r'[\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000B\u000C\u000D\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\u007F]',
+        # line separator, paragraph separator
+        r'\u2028',
+        r'\u2029',
+        # U+0092: sometimes a quote, sometimes a control character
+        r'\u0092',
+        # all alternative spaces
+        r'[\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u3000\uFEFF]',
+    )
+    for space_like in space_like_patterns:
+        text = sub(space_like, ' ', text)
+
+    # a few common bullets (only after leading whitespace)
+    bullet_patterns = (
+        r'^\s+►',
+        r'^\s+●',
+        r'^\s+◘',
+        r'^\s+→',
+        r'^\s+▪',
+        r'^\s+·',
+        r'^\s+\*',
+    )
+    for bullet in bullet_patterns:
+        text = sub(bullet, '   • ', text)
+
+    # finally squeeze repeated spaces and drop leading/trailing ones
+    text = sub(r' +', ' ', text)
+    text = sub(r'^ +', '', text)
+    text = sub(r' +$', '', text)
+
+    # ---------------------
+    # P U N C T U A T I O N
+    # ---------------------
+    # most alternative dashes --> normal hyphen-minus (U+002D)
+    # (in order U+2010 U+2011 U+2012 U+2013 U+2014 U+2015 U+2212 U+FE63)
+    text = sub(r'[‐‑‒–—―−﹣]','-', text)
+    # the macron too is sometimes used as a dash
+    text = sub(r'\u00af','-', text)
+
+    # Quotation marks
+    # ---------------
+    # most single quotes --> ' APOSTROPHE
+    # (U+2018 U+2019 U+201A, the backtick, U+201B)
+    text = sub(r"[‘’‚`‛]", "'", text)
+    text = sub(r'‹ ?',"'", text)    # U+2039 plus optional space after
+    text = sub(r' ?›',"'", text)    # U+203A plus optional space before
+
+    # most double quotes --> " QUOTATION MARK
+    # (U+201C U+201D U+201E U+201F)
+    text = sub(r'[“”„‟]', '"', text)
+    # NB the guillemets « » (U+00AB, U+00BB) are left as they are
+
+    # two single quotes (as prepared above) => one double quote
+    text = sub(r"''", '"', text)
+
+    # if asked, no single quote is left: each becomes a double quote
+    if rm_qt:
+        text = sub(r"'", '"', text)
+
+    return text
+def normalize_forms(term_str, do_lowercase=False):
+    """
+    Cleans the edges of a term and squeezes repeated punctuation:
+      - leading and trailing commas, semicolons, spaces and © are removed
+      - runs of double quotes, of slashes or of single quotes are each
+        reduced to one character
+      - optionally everything is put to lowercase
+
+    ex /©""ecosystem services"";/ => /"ecosystem services"/
+
+    (benefits from normalize_chars upstream so there's less cases to consider)
+
+    @term_str:     the string to clean
+    @do_lowercase: if True, the result is lowercased
+
+    Returns the cleaned string.
+    """
+    # edges first: beginning of the string, then its end
+    cleaned = sub(r'^[,; ©]+', '', term_str)
+    cleaned = sub(r'[,; ©]+$', '', cleaned)
+
+    # then each kind of repeated character => one occurrence
+    for repeated, single in ((r'"+', '"'), (r'/+', '/'), (r"'+", "'")):
+        cleaned = sub(repeated, single, cleaned)
+
+    return cleaned.lower() if do_lowercase else cleaned
+if __name__ == "__main__":
+    # filter mode: reads tab-separated lines on stdin and prints them on
+    # stdout with each field normalized ('%%%' inside a field is handled
+    # as a separator between sub-lines)
+    for lineno, raw_line in enumerate(stdin):
+        fields = raw_line.rstrip().split('\t')
+
+        # lines with more columns than expected are reported and dropped
+        if len(fields) > NCOLS:
+            print ("skipping line %i (%s)" % (lineno, fields), file=stderr)
+            continue
+
+        if dont_touch_first_column:
+            # some ID supposed in 1st col => kept unchanged
+            clean_fields = [fields[0]]
+            todo_fields = fields[1:]
+        else:
+            # normalize in all columns
+            clean_fields = []
+            todo_fields = fields
+
+        for field in todo_fields:
+            clean_lines = []
+            last_line = None
+            for subline in field.split('%%%'):
+                # normalized once, then reused for the test and the output
+                clean_line = normalize_forms(normalize_chars(subline))
+                # keep at most one empty sub-line in a row
+                if not (clean_line == '' and last_line == ''):
+                    clean_lines.append(clean_line)
+                last_line = clean_line
+
+            # remove trailing empty sub-lines
+            while clean_lines and not clean_lines[-1]:
+                clean_lines.pop()
+
+            clean_fields.append('%%%'.join(clean_lines))
+
+        # OUTPUT
+        print("\t".join(clean_fields))
--- a/setup/dockers/comex2_mysql_server/init_comex_shared.sql
+++ b/setup/dockers/comex2_mysql_server/init_comex_shared.sql
@@ -69,15 +69,14 @@ CREATE TABLE orgs(
    -- address...          (...)      -- address elements POSS NOT IMPLEMENTED
    reserved            varchar(30),
-    -- generated column, often useful for autocompletes etc
+    -- tostring: generated column (NULL acro / locname parts are skipped)
    -- ex "Instituto de Fisica de Cantabria (IFCA), Santander, Spain"
-    tostring            varchar(800) AS (CONCAT(
+    -- searchable + human readable, often useful for autocompletes etc
-                                         name, ' (', acro, ')',
+    tostring            varchar(800)
-                                         IF(locname IS NOT NULL ,
+        AS (CONCAT_WS( ', ',
-                                                 CONCAT(', ', locname),
+                       CONCAT_WS(' ', name,
-                                                 '')
+                                 CONCAT('(',acro,')')),
-                                     )),
+                       locname) ),
    PRIMARY KEY (orgid),
    UNIQUE KEY full_org (name, acro, locname)
@@ -97,6 +96,7 @@ CREATE TABLE sch_org(
 -- POSS: relationship organizations <=> keywords
 -- POSS: relationship organizations <=> organizations
+-- cf. doc/data_mining_exemples/org_to_orgs.sql
 -- keyword/subject terms

--- a/static/js/comex_page_profile_controllers.js
+++ b/static/js/comex_page_profile_controllers.js
@@ -14,10 +14,10 @@
 */
 // 3 exposed vars for inline js controls
- var teamCityDiv = document.getElementById('team_city_div')
+ var teamCityDiv = document.getElementById('lab_locname_div')
- var otherInstDiv = document.getElementById('other_org_div')
+ var otherInstDiv = document.getElementById('other_inst_div')
- // TODO make relative to org_type and move inline snippet to extended form obj
+ // TODO make relative to inst_type and move inline snippet to extended form obj
- var otherOrgTypeInput = document.getElementById('other_org_type')
+ var otherOrgTypeInput = document.getElementById('other_inst_type')
 // reselecting current_user's info choices
 function setupSavedItems(uinfo) {

--- a/static/js/comex_page_reg_controllers.js
+++ b/static/js/comex_page_reg_controllers.js
@@ -39,8 +39,8 @@ var validateWithMessage = false
 var shortRegVersion = true
 var ignoredFields = []
 if (shortRegVersion) {
-    ignoredFields = ['gender', 'home_url', 'org',
+    ignoredFields = ['gender', 'home_url', 'inst_label',
-                     'hon_title', 'position', 'org_type',
+                     'hon_title', 'position', 'inst_type',
                     'hashtags']
 }
@@ -68,7 +68,7 @@ function testAsYouGo() {
  }
 }
-var teamCityDivStyle = document.getElementById('team_city_div').style
+var teamCityDivStyle = document.getElementById('lab_locname_div').style
 function registerDoorsAndSubmit(){
    regfo.elMainMessage.innerHTML = "Registering with the test login portal<br/> and sending validation email..."

--- a/static/js/comex_user_shared.js
+++ b/static/js/comex_user_shared.js
@@ -61,11 +61,25 @@ var cmxClt = (function() {
        ["pic_file",              false,       "pref"   , "f",  "other_infos"],
        // ==> *scholars* table
-        ["org",                   false,       "plsfill", "t", "org_infos"],
-        ["org_type",              false,       "plsfill", "m", "org_infos"],
+        // org field
-        ["team_lab",               true,       "plsfill", "t", "org_infos"],
+        //   => name, acro in one field "label": #lab_label, #inst_label
-        ["org_city",              false,       "pref"   , "t", "org_infos"]
+        //   => all other fields
-        // ==> *affiliations* table
+        //        - are optional
+        //        - if present, should be named: lab|inst + '_' + colname
+        //   => TODO org details suggestions
+        //      url, loc should have autofill when name or acro is chosen
+        //   => POSS org <-> org suggestions
+        //      once a lab is filled, we could propose the institution
+        ["lab_label",              false,       "plsfill", "t", "org_infos"],
+        ["lab_locname",            false,       "pref", "t", "org_infos"],
+        ["inst_label",             false,       "pref", "t", "org_infos"],
+        ["inst_type",              false,       "pref", "m", "org_infos"],
+        // ["lab_code",            false,       "pref", "t", "org_infos"],
+        // ["lab_url",             false,       "pref", "t", "org_infos"],
+        // ["inst_locname",        false,       "pref"   , "t", "org_infos"],
+        // ["inst_url",            false,       "pref"   , "t", "org_infos"],
+        // ==> *orgs* table via pretreatment org is inst or org is lab
    ]
    // group "auto"    === filled by controllers

--- a/templates/profile.html
+++ b/templates/profile.html
@@ -347,73 +347,15 @@
                 class="panel-body ccsection-uform-body panel-collapse collapse out"
                 role="tabpanel" aria-expanded="false">
-                 <div class="question input-group">
+                <div class="question input-group">
-                   <label for="position" class="smlabel input-group-addon">* Job Position</label>
+                <label for="position" class="smlabel input-group-addon">* Job Position</label>
-                   <input id="position" name="position" maxlength="30"
+                <input id="position" name="position" maxlength="30"
-                          type="text" class="form-control autocomp" placeholder="titre"
+                      type="text" class="form-control autocomp" placeholder="titre"
-                          onblur="cmxClt.makeBold(this)" onfocus="cmxClt.makeNormal(this)"
+                      onblur="cmxClt.makeBold(this)" onfocus="cmxClt.makeNormal(this)"
-                          value="{{ current_user.info.position }}">
+                      value="{{ current_user.info.position }}">
-                 </div>
+                </div>
-                 <!-- ORG QUESTIONS -->
-                 <div class="question">
-                   <div class="input-group">
-                     <label for="org" class="smlabel input-group-addon">Parent Institution</label>
-                     <input id="org" name="org" maxlength="120"
-                            type="text" class="form-control autocomp" placeholder='eg "CNRS" or "University of Oxford"'
-                            value="{{ current_user.info.org }}">
-                   </div>
-                 </div>
-                 <div class="question">
-                   <div class="input-group">
-                     <label for="org_type" class="smlabel input-group-addon">Institution Type</label>
-                     <select id="org_type" name="org_type"
-                             class="custom-select form-control"
-                             onchange="if(this.value=='other'){otherInstDiv.style.display = 'block'} else {otherInstDiv.style.display='none';otherOrgTypeInput.value=''}">
-                       <option selected disabled value="">Please select</option>
-                       <option value="university">University</option>
-                       <option value="public R&amp;D org">Public sector R&amp;D organization</option>
-                       <option value="public other org">Other public sector organization</option>
-                       <option value="private org">Private sector organization</option>
-                       <option value="none">None at the moment</option>
-                       <option value="other"
-                               onclick="otherInstDiv.style.display = 'block'"
-                               >Other</option>
-                     </select>
-                   </div>
-                   <!-- Other institution type <=> only if previous choice == 5 -->
-                   <div class="question conditional-q" id="other_org_div">
-                     <div class="input-group">
-                       <label for="other_org_type" class="smlabel input-group-addon">Other type</label>
-                       <input id="other_org_type" name="other_org_type" maxlength="120"
-                               type="text" class="form-control" placeholder="Clarify here the type of your parent institution">
-                     </div>
-                   </div>
-                 </div>
-                  <!-- TEAM QUESTIONS -->
-                 <div class="question">
-                   <div class="input-group">
-                     <label for="team_lab" class="smlabel input-group-addon">* Lab / Team / Dept</label>
-                     <input id="team_lab" name="team_lab" maxlength="120"
-                            type="text" class="form-control" placeholder="More detailed affiliation, if relevant"
-                            value="{{ current_user.info.team_lab }}">
-                   </div>
-                 </div>
-                 <!-- Lab city <=> only for France -->
-                 <div class="question conditional-q" id="team_city_div">
-                   <div class="input-group">
-                     <label for="org_city" class="smlabel input-group-addon">Lab city</label>
-                     <input id="org_city" name="org_city" maxlength="50"
-                            type="text" class="form-control" placeholder="Ville de votre institution"
-                            value="{{ current_user.info.org_city }}">
-                   </div>
-                 </div>
+                {% include 'questions/org_details.html' %}
             </div> <!-- /panel-body -->
            <div class="panel-footer ccsection-footer">&nbsp;</div>

--- a/templates/questions/org_details.html
+++ b/templates/questions/org_details.html
+<!-- ORG QUESTIONS -->
+{# Shared form fragment for the lab and parent-institution questions.
+   Reads current_user.info.labs[0] (tostring, locname) and
+   current_user.info.insts[0] (tostring) to prefill the inputs when present.
+   The inline handlers below use the page-level JS variables otherInstDiv and
+   otherOrgTypeInput, so the element ids here must stay in sync with the
+   getElementById() calls of the page controller. #}
+<!-- lab or team and details -->
+<div class="question">
+    <div class="input-group">
+     <label for="lab_label" class="smlabel input-group-addon">* Lab / Team / Dept</label>
+     <input id="lab_label" name="lab_label" maxlength="250"
+            type="text" class="form-control" placeholder="More detailed affiliation, if relevant"
+            value="{{ current_user.info.labs[0].tostring if current_user.info.labs|length > 0 }}">
+    </div>
+</div>
+<!-- lab locname <=> only for France -->
+<div class="question conditional-q" id="lab_locname_div">
+    <div class="input-group">
+     <label for="lab_locname" class="smlabel input-group-addon">Lab city</label>
+     <input id="lab_locname" name="lab_locname" maxlength="50"
+            type="text" class="form-control" placeholder="Ville de votre institution"
+            value="{{ current_user.info.labs[0].locname if current_user.info.labs|length > 0 }}">
+    </div>
+</div>
+<!-- larger institution and details -->
+<div class="question">
+    <div class="input-group">
+     <label for="inst_label" class="smlabel input-group-addon">Parent Institution</label>
+     <input id="inst_label" name="inst_label" maxlength="250"
+            type="text" class="form-control autocomp" placeholder='eg "CNRS" or "University of Oxford"'
+            value="{{ current_user.info.insts[0].tostring if current_user.info.insts|length > 0 }}">
+    </div>
+</div>
+<div class="question">
+    <div class="input-group">
+      <label for="inst_type" class="smlabel input-group-addon">Institution Type</label>
+      <select id="inst_type" name="inst_type"
+              class="custom-select form-control"
+              onchange="if(this.value=='other'){otherInstDiv.style.display = 'block'} else {otherInstDiv.style.display='none';otherOrgTypeInput.value=''}">
+        <option selected disabled value="">Please select</option>
+        <option value="university">University</option>
+        <option value="public R&amp;D org">Public sector R&amp;D organization</option>
+        <option value="public other org">Other public sector organization</option>
+        <option value="private org">Private sector organization</option>
+        <option value="none">None at the moment</option>
+        <option value="other"
+                onclick="otherInstDiv.style.display = 'block'"
+                >Other</option>
+      </select>
+    </div>
+    <!-- Other institution type <=> only shown when "other" is selected above -->
+    <!-- id must match getElementById('other_inst_div') in the profile controller -->
+    <div class="question conditional-q" id="other_inst_div">
+      <div class="input-group">
+        <label for="other_inst_type" class="smlabel input-group-addon">Other type</label>
+        <input id="other_inst_type" name="other_inst_type" maxlength="120"
+                type="text" class="form-control" placeholder="Clarify here the type of your parent institution">
+      </div>
+    </div>
+</div>
--- a/templates/registration_super_short_form.html
+++ b/templates/registration_super_short_form.html
@@ -160,20 +160,20 @@
               <div class="question">
                 <div class="input-group">
-                   <label for="team_lab" class="smlabel input-group-addon">* Lab / Team / Dept</label>
+                   <label for="lab_label" class="smlabel input-group-addon">* Lab / Team / Dept</label>
-                   <input id="team_lab" name="team_lab" maxlength="120"
+                   <input id="lab_label" name="lab_label" maxlength="120"
                          type="text" class="form-control" placeholder="Your lab"
-                          placeholder="team_lab">
+                          >
                 </div>
               </div>
               <!-- Lab city <=> only for France -->
-               <div class="question conditional-q" id="team_city_div">
+               <div class="question conditional-q" id="lab_locname_div">
                 <div class="input-group">
-                   <label for="org_city" class="smlabel input-group-addon">Lab city</label>
+                   <label for="lab_locname" class="smlabel input-group-addon">Lab city</label>
-                   <input id="org_city" name="org_city" maxlength="50"
+                   <input id="lab_locname" name="lab_locname" maxlength="50"
                          type="text" class="form-control" placeholder="Ville de votre institution"
-                          placeholder="org_city">
+                          >
                 </div>
               </div>