Commit a8ac0399 authored by Romain Loth's avatar Romain Loth

adapt dbcrud for new org table structure (profile, registration)

parent 326c2ed8
...@@ -25,19 +25,23 @@ SELECT * FROM ( ...@@ -25,19 +25,23 @@ SELECT * FROM (
FROM scholars FROM scholars
LEFT JOIN sch_org AS map_labs LEFT JOIN sch_org AS map_labs
ON map_labs.uid = luid ON map_labs.uid = luid
JOIN orgs AS labs LEFT JOIN (
-- class constraint can't appear later,
-- it would give no scholar when empty
SELECT * FROM orgs WHERE class='lab'
) AS labs
ON map_labs.orgid = labs.orgid ON map_labs.orgid = labs.orgid
WHERE (record_status = 'active' WHERE (record_status = 'active'
OR (record_status = 'legacy' AND valid_date >= NOW())) OR (record_status = 'legacy' AND valid_date >= NOW()))
AND labs.class = 'lab'
GROUP BY luid GROUP BY luid
) AS scholars_and_labs ) AS scholars_and_labs
LEFT JOIN sch_org AS map_insts LEFT JOIN sch_org AS map_insts
ON map_insts.uid = luid ON map_insts.uid = luid
JOIN orgs AS insts LEFT JOIN (
SELECT * FROM orgs WHERE class='inst'
) AS insts
ON map_insts.orgid = insts.orgid ON map_insts.orgid = insts.orgid
AND insts.class = 'inst'
GROUP BY luid GROUP BY luid
) AS scholars_and_orgs ) AS scholars_and_orgs
......
-- Related institutions for a given set of organizations.
-- We pass through scholars:
--     org1 => scholars => orgs2
-- (usable for suggestions and/or for mapping)
--
-- {$ids_str} is a placeholder for a comma-separated list of orgids that the
-- calling code interpolates before execution.
-- NOTE(review): presumably the caller guarantees these are integers; confirm.
SELECT
    orgs.*,
    -- one '%%%'-separated string of institution labels, most frequent first
    GROUP_CONCAT(tgt_tostring ORDER BY tgt_freq DESC SEPARATOR '%%%')
        AS related_insts
FROM orgs
LEFT JOIN (
    -- every (source org, target institution) pair that shares at least one
    -- sch_org uid, with the number of such shared rows as frequency
    SELECT
        sch_org.orgid   AS src_orgid,
        sch_org2.orgid  AS tgt_orgid,
        orgs2.tostring  AS tgt_tostring,
        COUNT(*)        AS tgt_freq
    FROM sch_org
    -- self-join on the scholar; rows without a counterpart could never pass
    -- the join to orgs2 below, so an inner join yields the same result
    INNER JOIN sch_org AS sch_org2
        ON sch_org.uid = sch_org2.uid
    INNER JOIN orgs AS orgs2
        ON sch_org2.orgid = orgs2.orgid
    WHERE orgs2.class = 'inst'
      AND sch_org.orgid <> sch_org2.orgid   -- an org is not related to itself
    GROUP BY
        sch_org.orgid,
        sch_org2.orgid
) AS lab_relationship_to_inst_via_scholars
    ON src_orgid = orgs.orgid
WHERE orgs.orgid IN ( {$ids_str} )
  AND orgs.name <> '_NULL'
GROUP BY orgs.orgid
ORDER BY
    orgs.name,
    orgs.acro
;
-- A possible alternative would be to create an org_org table:
-- relationship organizations <=> organizations
-- (formally many-to-many, but one could say many-to-few :)
CREATE TABLE org_org (
    orgid_src INT(15) NOT NULL,     -- @class 'lab'
    orgid_tgt INT(15) NOT NULL,     -- @class 'inst'
    sch_freq  INT(15) DEFAULT 0,    -- how often declared in sch records
                                    -- (useful if unsure of the main parent org)

    PRIMARY KEY (orgid_src, orgid_tgt),
    FOREIGN KEY (orgid_src) REFERENCES orgs (orgid) ON DELETE CASCADE,
    FOREIGN KEY (orgid_tgt) REFERENCES orgs (orgid) ON DELETE CASCADE
);
-- NB: the +/-1 updates of the org -> org frequency in org_org would be
--     triggered indirectly by new scholars rows, so they would be done when
--     saving a profile, at the middleware level (dbcrud.py).
## dev overview ## dev overview
comex app contains: comex app contains:
...@@ -34,13 +33,10 @@ cd $INSTALL_DIR ...@@ -34,13 +33,10 @@ cd $INSTALL_DIR
sudo pip3 install -r setup/requirements.txt sudo pip3 install -r setup/requirements.txt
``` ```
Then to run the comex2 services in the simplest way just do: Then to run the comex2 server just do:
``` ```
cd services bash comex-run.sh
python3 comex_main_backend.py
``` ```
The form server is then accessible locally on `0.0.0.0:5000/services/user`
The tina api server is on `0.0.0.0:5000/services/api`
Check the parameters in `config/parametres_comex.ini` Check the parameters in `config/parametres_comex.ini`
...@@ -49,7 +45,7 @@ Finally, simply configure the serving of your php|www documentroot in nginx (cf ...@@ -49,7 +45,7 @@ Finally, simply configure the serving of your php|www documentroot in nginx (cf
------- -------
#### Advanced dev config #### Full dev config
1. external mysql database 1. external mysql database
2. external doors (or simulated by docker) 2. external doors (or simulated by docker)
3. gunicorn webserver (linked to 1 & 2 via `$SQL_HOST` and `$DOORS_HOST`) 3. gunicorn webserver (linked to 1 & 2 via `$SQL_HOST` and `$DOORS_HOST`)
...@@ -92,15 +88,8 @@ nano config/parametres_comex.ini ...@@ -92,15 +88,8 @@ nano config/parametres_comex.ini
###### If you have no doors server ###### If you have no doors server
For tests you can use a `minidoors` container For tests you can use a self-deployed doors container, available on [this repository](https://github.com/ISCPIF/doors-docker)
```
# build the docker image (once)
cd setup/dockers
docker build -t minidoors:latest minidoors/
# run the container (each time)
docker run -it -p 32789:8989 --name doors_test minidoors
```
##### 3) Run the regomex app with gunicorn ##### 3) Run the regomex app with gunicorn
``` ```
......
...@@ -46,7 +46,6 @@ CREATE TABLE scholars ( ...@@ -46,7 +46,6 @@ CREATE TABLE scholars (
) ; ) ;
CREATE TABLE locs( CREATE TABLE locs(
locname varchar(120), locname varchar(120),
lat float(6,4), lat float(6,4),
...@@ -54,12 +53,11 @@ CREATE TABLE locs( ...@@ -54,12 +53,11 @@ CREATE TABLE locs(
PRIMARY KEY (locname) PRIMARY KEY (locname)
) ; ) ;
-- table for all organization classes (team, lab, large institution) -- table for all organization classes (team, lab, large institution)
CREATE TABLE orgs( CREATE TABLE orgs(
orgid int(15) not null auto_increment, orgid int(15) not null auto_increment,
name varchar(120), -- full name name varchar(120), -- full name
acro varchar(20), -- acronym or short name acro varchar(30), -- acronym or short name
class varchar(25), -- "team|lab|inst" class varchar(25), -- "team|lab|inst"
-- like the calibre of the organization -- like the calibre of the organization
...@@ -78,15 +76,14 @@ CREATE TABLE orgs( ...@@ -78,15 +76,14 @@ CREATE TABLE orgs(
-- address... (...) -- address elements POSS NOT IMPLEMENTED -- address... (...) -- address elements POSS NOT IMPLEMENTED
reserved varchar(30), reserved varchar(30),
-- generated column, often useful for autocompletes etc -- tostring: generated column
-- ex "Instituto de Fisica de Cantabria (IFCA), Santander, Spain" -- ex "Instituto de Fisica de Cantabria (IFCA), Santander, Spain"
tostring varchar(800) AS (CONCAT( -- searchable + human readable, often useful for autocompletes etc
name, ' (', acro, ')', tostring varchar(800)
IF(locname IS NOT NULL , AS (CONCAT_WS( '',
CONCAT(', ', locname), CONCAT(name, ' '),
'') CONCAT('(',acro,')'),
)), CONCAT(', ', locname)) ),
PRIMARY KEY (orgid), PRIMARY KEY (orgid),
UNIQUE KEY full_org (name, acro, locname) UNIQUE KEY full_org (name, acro, locname)
...@@ -106,6 +103,7 @@ CREATE TABLE sch_org( ...@@ -106,6 +103,7 @@ CREATE TABLE sch_org(
-- POSS: relationship organizations <=> keywords -- POSS: relationship organizations <=> keywords
-- POSS: relationship organizations <=> organizations -- POSS: relationship organizations <=> organizations
-- cf. doc/data_mining_exemples/org_to_orgs.sql
-- keyword/subject terms -- keyword/subject terms
......
...@@ -303,19 +303,21 @@ SELECT * FROM ( ...@@ -303,19 +303,21 @@ SELECT * FROM (
FROM scholars FROM scholars
LEFT JOIN sch_org AS map_labs LEFT JOIN sch_org AS map_labs
ON map_labs.uid = luid ON map_labs.uid = luid
JOIN orgs AS labs LEFT JOIN (
SELECT * FROM orgs WHERE class='lab'
) AS labs
ON map_labs.orgid = labs.orgid ON map_labs.orgid = labs.orgid
WHERE (record_status = 'active' WHERE (record_status = 'active'
OR (record_status = 'legacy' AND valid_date >= NOW())) OR (record_status = 'legacy' AND valid_date >= NOW()))
AND labs.class = 'lab'
GROUP BY luid GROUP BY luid
) AS scholars_and_labs ) AS scholars_and_labs
LEFT JOIN sch_org AS map_insts LEFT JOIN sch_org AS map_insts
ON map_insts.uid = luid ON map_insts.uid = luid
JOIN orgs AS insts LEFT JOIN (
SELECT * FROM orgs WHERE class='inst'
) AS insts
ON map_insts.orgid = insts.orgid ON map_insts.orgid = insts.orgid
AND insts.class = 'inst'
GROUP BY luid GROUP BY luid
) AS scholars_and_orgs ) AS scholars_and_orgs
......
...@@ -10,10 +10,12 @@ from MySQLdb.cursors import DictCursor ...@@ -10,10 +10,12 @@ from MySQLdb.cursors import DictCursor
if __package__ == 'services': if __package__ == 'services':
# when we're run via import # when we're run via import
from services.tools import mlog, REALCONFIG from services.tools import mlog, REALCONFIG
from services.text.utils import normalize_chars, normalize_forms
else: else:
# when this script is run directly # when this script is run directly
from tools import mlog, REALCONFIG from tools import mlog, REALCONFIG
from text.utils import normalize_chars, normalize_forms
# sorted columns as declared in DB, as a tuple # sorted columns as declared in DB, as a tuple
...@@ -21,14 +23,12 @@ USER_COLS = [ ...@@ -21,14 +23,12 @@ USER_COLS = [
# NAME, NOT NULL, N or MAXCHARS (if applicable) # NAME, NOT NULL, N or MAXCHARS (if applicable)
("luid", True, 15), ("luid", True, 15),
("doors_uid", False, 36), ("doors_uid", False, 36),
# ("last_modified", True, None), # autoset on update
("email", True, 255), ("email", True, 255),
("country", True, 60), ("country", True, 60),
("first_name", True, 30), ("first_name", True, 30),
("middle_name", False, 30), ("middle_name", False, 30),
("last_name", True, 50), ("last_name", True, 50),
("initials", True, 7), ("initials", True, 7),
("affiliation_id", False, None), # from db_get_or_create_affiliation
("position", False, 30), ("position", False, 30),
("hon_title", False, 30), ("hon_title", False, 30),
("interests_text", False, 1200), ("interests_text", False, 1200),
...@@ -43,10 +43,19 @@ USER_COLS = [ ...@@ -43,10 +43,19 @@ USER_COLS = [
] ]
ORG_COLS = [ ORG_COLS = [
("org", False, 120), ("class", False, 25), # "lab" or "inst"
("org_type", False, 50), ("name", False, 120),
("team_lab", True, 120), ("acro", False, 30), # acronym or short name
("org_city", False, 50) ("locname", False, 120),
("inst_type", False, 50),
("lab_code", False, 25), # not in GUI yet
("url", False, 180), # not in GUI yet
("contact_name", False, 80), # not in GUI yet
("contact_email", False, 255) # not in GUI yet
# also in concatenations:
# label = name + acro
# tostring = name + acro + locname
] ]
...@@ -156,6 +165,10 @@ def get_full_scholar(uid, cmx_db = None): ...@@ -156,6 +165,10 @@ def get_full_scholar(uid, cmx_db = None):
db = connect_db() db = connect_db()
db_c = db.cursor(DictCursor) db_c = db.cursor(DictCursor)
print('DBG', 'uid', uid)
print('DBG', 'type(uid)', type(uid))
# one user + all linked infos concatenated in one row # one user + all linked infos concatenated in one row
# <= 3 LEFT JOINS sequentially GROUPed # <= 3 LEFT JOINS sequentially GROUPed
# (b/c if simultaneous, loses unicity) # (b/c if simultaneous, loses unicity)
...@@ -182,7 +195,7 @@ def get_full_scholar(uid, cmx_db = None): ...@@ -182,7 +195,7 @@ def get_full_scholar(uid, cmx_db = None):
FROM ( FROM (
SELECT SELECT
sch_n_aff.*, sch_n_orgs.*,
-- kws info condensed -- kws info condensed
COUNT(keywords.kwid) AS keywords_nb, COUNT(keywords.kwid) AS keywords_nb,
...@@ -191,24 +204,39 @@ def get_full_scholar(uid, cmx_db = None): ...@@ -191,24 +204,39 @@ def get_full_scholar(uid, cmx_db = None):
FROM ( FROM (
SELECT SELECT
scholars.*, sch_n_labs.*,
-- for debug replace scholars.* by COUNT(insts.orgid) AS insts_ids_nb,
-- scholars.luid, GROUP_CONCAT(insts.orgid) AS insts_ids
-- scholars.doors_uid,
-- scholars.email, FROM (
-- scholars.last_modified_date, SELECT
-- scholars.initials, scholars.*,
COUNT(labs.orgid) AS labs_ids_nb,
affiliations.* GROUP_CONCAT(labs.orgid) AS labs_ids
FROM scholars FROM scholars
LEFT JOIN affiliations LEFT JOIN sch_org AS map_labs
ON scholars.affiliation_id = affiliations.affid ON map_labs.uid = luid
LEFT JOIN (
-- class constraint can't appear later,
-- it would give no scholar when empty
SELECT * FROM orgs WHERE class='lab'
) AS labs
ON map_labs.orgid = labs.orgid
GROUP BY luid
) AS sch_n_labs
LEFT JOIN sch_org AS map_insts
ON map_insts.uid = luid
LEFT JOIN (
SELECT * FROM orgs WHERE class='inst'
) AS insts
ON map_insts.orgid = insts.orgid
GROUP BY luid GROUP BY luid
) AS sch_n_orgs
) AS sch_n_aff
-- two step JOIN for keywords -- two step JOIN for keywords
LEFT JOIN sch_kw LEFT JOIN sch_kw
...@@ -232,9 +260,9 @@ def get_full_scholar(uid, cmx_db = None): ...@@ -232,9 +260,9 @@ def get_full_scholar(uid, cmx_db = None):
ON linked_ids.uid = luid ON linked_ids.uid = luid
-- WHERE our user UID -- WHERE our user UID
WHERE luid = "%s" WHERE luid = %i
GROUP BY luid GROUP BY luid
""" % str(uid) """ % int(uid)
mlog("DEBUGSQL", "DB get_full_scholar STATEMENT:\n-- SQL\n%s\n-- /SQL" % one_usr_stmt) mlog("DEBUGSQL", "DB get_full_scholar STATEMENT:\n-- SQL\n%s\n-- /SQL" % one_usr_stmt)
...@@ -246,10 +274,6 @@ def get_full_scholar(uid, cmx_db = None): ...@@ -246,10 +274,6 @@ def get_full_scholar(uid, cmx_db = None):
urow_dict = db_c.fetchone() urow_dict = db_c.fetchone()
# we won't use the connect
if not cmx_db:
db.close()
# break with None if no results # break with None if no results
if urow_dict is None: if urow_dict is None:
mlog("WARNING", "DB get_full_scholar attempt got no rows for: %s" % uid) mlog("WARNING", "DB get_full_scholar attempt got no rows for: %s" % uid)
...@@ -258,9 +282,9 @@ def get_full_scholar(uid, cmx_db = None): ...@@ -258,9 +282,9 @@ def get_full_scholar(uid, cmx_db = None):
# normal case <=> exactly one row # normal case <=> exactly one row
# Exemple data in urow_dict # Exemple initial data in urow_dict
# -------------------------- # ----------------------------------
# {'affid': 1, 'affiliation_id': 1, 'hashtags': '#something, #another', # {'hashtags': '#something, #another',
# 'country': 'France', 'doors_uid': '5e3adbc1-bcfb-42da-a2c4-4af006fe2b91', # 'country': 'France', 'doors_uid': '5e3adbc1-bcfb-42da-a2c4-4af006fe2b91',
# 'email': 'jfk@usa.com', 'first_name': 'John', 'gender': 'M', # 'email': 'jfk@usa.com', 'first_name': 'John', 'gender': 'M',
# 'home_url': 'http://localhost/regcomex/', 'hon_title': 'Student', # 'home_url': 'http://localhost/regcomex/', 'hon_title': 'Student',
...@@ -268,19 +292,19 @@ def get_full_scholar(uid, cmx_db = None): ...@@ -268,19 +292,19 @@ def get_full_scholar(uid, cmx_db = None):
# 'job_looking_date': datetime.date(2019, 9, 28), # 'job_looking_date': datetime.date(2019, 9, 28),
# 'hashtags': '#eccs15', 'hashtags_nb': 1, # 'hashtags': '#eccs15', 'hashtags_nb': 1,
# 'keywords': 'complex networks,complex systems,text mining,machine learning', 'keywords_nb': 4, # 'keywords': 'complex networks,complex systems,text mining,machine learning', 'keywords_nb': 4,
# 'labs_ids': '3888,3444', 'labs_ids_nb': 2,
# 'insts_ids': '3295', 'insts_ids_nb': 1,
# 'last_modified_date': datetime.datetime(2017, 2, 22, 12, 25, 59), # 'last_modified_date': datetime.datetime(2017, 2, 22, 12, 25, 59),
# 'last_name': 'Kennedy', # 'last_name': 'Kennedy',
# 'linked_ids': 'twitter:@jfk,yoyo:42,foobar:XWING', 'linked_ids_nb': 3, # 'linked_ids': 'twitter:@jfk,yoyo:42,foobar:XWING', 'linked_ids_nb': 3,
# 'middle_name': 'Fitzgerald', # 'middle_name': 'Fitzgerald',
# 'org': 'Centre National de la Recherche Scientifique (CNRS)',
# 'org_city': 'Paris', 'org_type': 'public R&D org',
# 'pic_fname': '12345.jpg', 'pic_url': None, 'position': 'Research Fellow', # 'pic_fname': '12345.jpg', 'pic_url': None, 'position': 'Research Fellow',
# 'record_status': 'legacy', 'valid_date': datetime.date(2017, 5, 22)} # 'record_status': 'legacy', 'valid_date': datetime.date(2017, 5, 22)}
# post-treatments # post-treatments
# --------------- # ---------------
# 1/ split concatenated kw an ht lists and check correct length # 1/ split concatenated kw, ht, lab id, inst id lists and check correct length
for toktype in ['keywords', 'hashtags']: for toktype in ['keywords', 'hashtags', 'labs_ids', 'insts_ids']:
if urow_dict[toktype+'_nb'] == 0: if urow_dict[toktype+'_nb'] == 0:
urow_dict[toktype] = [] urow_dict[toktype] = []
else: else:
...@@ -291,7 +315,33 @@ def get_full_scholar(uid, cmx_db = None): ...@@ -291,7 +315,33 @@ def get_full_scholar(uid, cmx_db = None):
else: else:
urow_dict[toktype] = tokarray urow_dict[toktype] = tokarray
# 2/ also split and parse all linked_ids # 2/ must do a secondary SELECT for detailed org info
# dict['labs_ids']: [id1, id2 ..]
# => dict['labs'] : [{info1},{info2}..]
for orgclass in ['labs', 'insts']:
id_list = urow_dict[orgclass+"_ids"] # <- ! naming convention
if not len(id_list):
urow_dict[orgclass] = []
else:
org_info = """SELECT name, acro, locname,
inst_type, lab_code,
tostring
FROM orgs WHERE orgid IN (%s)""" % ','.join(id_list)
mlog('DEBUGSQL', "org_info stmt :", org_info)
new_cursor = db.cursor(DictCursor)
new_cursor.execute(org_info)
urow_dict[orgclass] = new_cursor.fetchall()
# print('===urow_dict with orgs[]===')
# print(urow_dict)
# print('==/urow_dict with orgs[]===')
# 3/ also split and parse all linked_ids
if urow_dict['linked_ids_nb'] == 0: if urow_dict['linked_ids_nb'] == 0:
urow_dict['linked_ids'] = {} urow_dict['linked_ids'] = {}
else: else:
...@@ -313,6 +363,9 @@ def get_full_scholar(uid, cmx_db = None): ...@@ -313,6 +363,9 @@ def get_full_scholar(uid, cmx_db = None):
mlog("INFO", "get_full_scholar %s: OK" % uid) mlog("INFO", "get_full_scholar %s: OK" % uid)
if not cmx_db:
db.close()
# full user info as a dict # full user info as a dict
return urow_dict return urow_dict
...@@ -540,34 +593,64 @@ def get_or_create_tokitems(tok_list, cmx_db, tok_table='keywords'): ...@@ -540,34 +593,64 @@ def get_or_create_tokitems(tok_list, cmx_db, tok_table='keywords'):
return found_ids return found_ids
def get_or_create_affiliation(org_info, cmx_db): def record_sch_org_link(luid, orgid, cmx_db = None):
if cmx_db:
db = cmx_db
else:
db = connect_db()
db_c = db.cursor(DictCursor)
luid = int(luid)
orgid = int(orgid)
db_c.execute(
'INSERT INTO sch_org(uid,orgid) VALUES (%i,%i)' % (luid, orgid)
)
if not cmx_db:
db.close()
def record_org_org_link(orgid_src, orgid_tgt, cmx_db = None):
"""
new mapping or freq++ if mapping already exists
TODO LATER (not a priority)
method cf. php_library/directory_content.php/$labs
""" """
(parent organization + lab) ---> lookup/add to *affiliations* table -> affid pass
org_info should contain properties like in ORG_COLS names def get_or_create_org(org_info, cmx_db = None):
"""
(scholar's parent org(s)) ---> lookup/add to *orgs* table -> orgid
1) query to *affiliations* table 1) query to *orgs* table
2) return id 2) return id
=> TODO if institution almost matches send suggestion => TODO if institution almost matches API to send suggestion
=> unicity constraint on institution + lab + org_type => unicity constraint on institution + lab + org_type
=> if an institution matches return affid => if an institution matches return orgid
=> if no institution matches create new and return affid => if no institution matches create new and return orgid
TODO test more ! WIP !
""" """
if cmx_db:
db = cmx_db
else:
db = connect_db()
db_c = db.cursor(DictCursor)
the_aff_id = None the_aff_id = None
db_tgtcols = [] db_tgtcols = []
db_qstrvals = [] db_qstrvals = []
db_constraints = [] db_constraints = []
mlog("INFO", "get_or_create_org, org_info:", org_info)
for colinfo in ORG_COLS: for colinfo in ORG_COLS:
colname = colinfo[0] colname = colinfo[0]
val = org_info.get(colname, None) val = org_info.get(colname, None)
if val != None: if val != None:
# TODO better string normalization but not lowercase for acronyms... val = str(normalize_forms(normalize_chars(val, rm_qt=True)))
quotedstrval = "'"+str(val)+"'" quotedstrval = "'"+val+"'"
# for insert # for insert
db_tgtcols.append(colname) db_tgtcols.append(colname)
...@@ -580,28 +663,33 @@ def get_or_create_affiliation(org_info, cmx_db): ...@@ -580,28 +663,33 @@ def get_or_create_affiliation(org_info, cmx_db):
db_cursor = cmx_db.cursor() db_cursor = cmx_db.cursor()
mlog("DEBUGSQL", "SELECT org.. WHERE %s" % ("\n AND ".join(db_constraints)))
n_matched = db_cursor.execute( n_matched = db_cursor.execute(
'SELECT affid FROM affiliations WHERE %s' % 'SELECT orgid FROM orgs WHERE %s' %
" AND ".join(db_constraints) " AND ".join(db_constraints)
) )
# ok existing affiliation => row id # ok existing affiliation => row id
if n_matched == 1: if n_matched == 1:
the_aff_id = db_cursor.fetchone()[0] the_aff_id = db_cursor.fetchone()[0]
mlog("DEBUG", "Found affiliation (affid %i) (WHERE %s)" % (the_aff_id, " AND ".join(db_constraints))) mlog("DEBUG", "Found affiliation (orgid %i) (WHERE %s)" % (the_aff_id, " AND ".join(db_constraints)))
# no matching affiliation => add => row id # no matching affiliation => add => row id
elif n_matched == 0: elif n_matched == 0:
db_cursor.execute('INSERT INTO affiliations(%s) VALUES (%s)' % ( db_cursor.execute('INSERT INTO orgs(%s) VALUES (%s)' % (
','.join(db_tgtcols), ','.join(db_tgtcols),
','.join(db_qstrvals) ','.join(db_qstrvals)
) )
) )
the_aff_id = db_cursor.lastrowid the_aff_id = db_cursor.lastrowid
cmx_db.commit() cmx_db.commit()
mlog("DEBUG", "Added affiliation '%s'" % str(db_qstrvals)) mlog("DEBUG", "dbcrud: added org '%s'" % str(db_qstrvals))
else: else:
raise Exception("ERROR: non-unique affiliation '%s'" % str(db_qstrvals)) raise Exception("ERROR: get_or_create_org non-unique match '%s'" % str(db_qstrvals))
if not cmx_db:
db.close()
return the_aff_id return the_aff_id
......
...@@ -25,7 +25,7 @@ __status__ = "Dev" ...@@ -25,7 +25,7 @@ __status__ = "Dev"
# ============== imports ============== # ============== imports ==============
from re import sub from re import sub, match
from os import path, remove from os import path, remove
from json import dumps from json import dumps
from datetime import timedelta from datetime import timedelta
...@@ -101,12 +101,12 @@ SOURCE_FIELDS = [ ...@@ -101,12 +101,12 @@ SOURCE_FIELDS = [
("pic_file", False, None), # saved separately ("pic_file", False, None), # saved separately
# => for *scholars* table (optional) # => for *scholars* table (optional)
("org", True, None), ("lab_label", True, None), # ~ /acro (name)/
("org_type", False, None), # predefined values ("lab_locname", True, None), # 'Paris, France'
( "other_org_type", True, None), # +=> org_type ("inst_label", True, None), # ~ /acro (name)/
("team_lab", True, None), ("inst_type", False, None), # predefined values
("org_city", True, None), ( "other_inst_type", True, None), # +=> org_type
# => for *affiliations* table # => for *orgs* table via sort_affiliation_records
("keywords", True, None), ("keywords", True, None),
# => for *keywords* table (after split str) # => for *keywords* table (after split str)
...@@ -752,6 +752,84 @@ def show_privacy(): ...@@ -752,6 +752,84 @@ def show_privacy():
########### SUBS ########### ########### SUBS ###########
def sort_affiliation_records(clean_records):
    """
    Transform GUI side input data into at most 2 orgs objects for DB

    In general:
      1) the front-end inputs are less free than the DB structure
         (DB could save an array of orgids but in the inputs they're
          only allowed max 2 atm : lab and inst)

      2) each org has its microstructure:
         - name, acronym, class, location (base properties)
         - inst_type (specific to institutions)
         - lab_code, url, contact <= not fillable in GUI yet

      3) between themselves 2 orgs can have org_org relationships
         TODO LATER (not a priority)

      4) we want at least one of lab_label or inst_label to be NOT NULL

    Choices:
      - values are already sanitized by read_record_from_request
      - We call label the concatenated name + acronym information,
        handling here the possibilities for the input via str analysis
        (just short name, just long name, both)

    Args:
      clean_records: dict of form values, keyed '<class>_<detail>'
                     (e.g. 'lab_label', 'lab_locname', 'inst_type').

    Returns:
      dict with exactly the keys 'lab' and 'inst'. Each value is None when
      no label was given for that class, otherwise a dict keyed by orgs
      column names ('class', 'acro' and/or 'name', plus any optional
      details that were filled in).
    """
    new_orgs = {'lab': None, 'inst': None}

    for org_class in new_orgs:
        clean_input = clean_records.get(org_class + "_label")

        # can't create org without some kind of label
        # (also covers a None value, which len() would choke on)
        if not clean_input:
            continue

        # submap; the class is part of the org record itself: without it
        # the row can't be told apart as 'lab' or 'inst' later on
        new_org_info = {'class': org_class}

        # 1) label analysis
        # custom split attempt
        # eg 'CNRS (Centre National de la Recherche Scientifique)'
        #     vvvv  vvvvvvvvvv
        #     acro     name
        test_two_groups = match(
            r'([^\(]{1,30}) \(([^\)]+)\)',
            clean_input
        )
        if test_two_groups:
            acro, name = test_two_groups.groups()
            new_org_info['acro'] = acro
            new_org_info['name'] = name
        # fallback cases: a short label is taken as acronym, a long one as name
        elif len(clean_input) < 30:
            new_org_info['acro'] = clean_input
        else:
            new_org_info['name'] = clean_input

        # 2) enrich with any other optional org info
        for detail_col in ['type', 'code', 'locname',
                           'url', 'contact_email', 'contact_name']:
            # this is a convention in our templates
            org_detail = org_class + '_' + detail_col

            val = clean_records.get(org_detail)
            if not val:
                continue

            # type and code keep their class prefix in the org columns
            # (inst_type, lab_code); the other details are stored unprefixed
            if detail_col in ('type', 'code'):
                new_org_info[org_detail] = val
            else:
                new_org_info[detail_col] = val

        # 3) keep
        new_orgs[org_class] = new_org_info

    return new_orgs
def save_form(clean_records, update_flag=False, previous_user_info=None): def save_form(clean_records, update_flag=False, previous_user_info=None):
""" """
wrapper function for save profile/register (all DB-related form actions) wrapper function for save profile/register (all DB-related form actions)
...@@ -767,11 +845,23 @@ def save_form(clean_records, update_flag=False, previous_user_info=None): ...@@ -767,11 +845,23 @@ def save_form(clean_records, update_flag=False, previous_user_info=None):
# A) a new DB connection # A) a new DB connection
reg_db = dbcrud.connect_db(config) reg_db = dbcrud.connect_db(config)
# B) read/fill the affiliation table to get associated id # B1) re-group the org fields into at most 2 org 'objects'
clean_records['affiliation_id'] = dbcrud.get_or_create_affiliation( declared_orgs = sort_affiliation_records(clean_records)
clean_records,
reg_db # B2) check our constraint (cf. also E.)
) if (declared_orgs['lab'] is None or declared_orgs['inst'] is None):
raise ValueError("At least 1 org (lab or institution) must be filled")
# B3) for each, read/fill the orgs table to get associated id(s) in DB
orgids = []
for oclass in ['lab', 'inst']:
if (declared_orgs[oclass]):
orgids.append(
dbcrud.get_or_create_org(declared_orgs[oclass], reg_db)
)
# B4) save the org <=> org mappings TODO LATER (not a priority)
# dbcrud.record_org_org_link(src_orgid, tgt_orgid, reg_db)
# C) create/update record into the primary user table # C) create/update record into the primary user table
# ---------------------------------------------------- # ----------------------------------------------------
...@@ -824,6 +914,10 @@ def save_form(clean_records, update_flag=False, previous_user_info=None): ...@@ -824,6 +914,10 @@ def save_form(clean_records, update_flag=False, previous_user_info=None):
map_table map_table
) )
# E) save the (uid <=> orgid) mapping(s)
for orgid in orgids:
dbcrud.record_sch_org_link(luid, orgid, reg_db)
# F) end connection # F) end connection
reg_db.close() reg_db.close()
...@@ -872,9 +966,9 @@ def read_record_from_request(request): ...@@ -872,9 +966,9 @@ def read_record_from_request(request):
clean_records[field] = request.form[field] clean_records[field] = request.form[field]
# special treatment for "other" subquestions # special treatment for "other" subquestions
if 'org_type' in clean_records: if 'inst_type' in clean_records:
if clean_records['org_type'] == 'other' and 'other_org_type' in clean_records: if clean_records['inst_type'] == 'other' and 'other_inst_type' in clean_records:
clean_records['org_type'] = clean_records['other_org_type'] clean_records['inst_type'] = clean_records['other_inst_type']
# splits for kw_array and ht_array # splits for kw_array and ht_array
for tok_field in ['keywords', 'hashtags']: for tok_field in ['keywords', 'hashtags']:
......
...@@ -140,3 +140,162 @@ class CountryConverter: ...@@ -140,3 +140,162 @@ class CountryConverter:
self.connDBLP.close() self.connDBLP.close()
return fails return fails
#! /usr/bin/python3
from re import sub
from sys import stdin, stderr
# settings
dont_touch_first_column = False
NCOLS = 1
# functions
def normalize_chars(my_str, rm_qt=False):
    """
    Simplify character strings on their way into the DB:
      - normalization of
          > spaces
          > dashes
          > quotation marks

    Goal: normalize input values more like ascii will be easier to process

    NOTE(review): the original notes also announce de-ligaturing, but no
    ligature substitution is visible in this function.

    Args:
      my_str: the string to clean
      rm_qt:  if True, every remaining single quote is turned into a
              double quote at the very end

    Returns:
      the cleaned string
    """
    # (pattern, replacement) pairs, applied strictly in this order
    ordered_rules = (
        # -----------
        # S P A C E S
        # -----------
        # all control characters (incl. \t = U+0009 and \r = U+000D) --> space
        # but not \n = U+000A
        (r'[\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000B\u000C\u000D\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\u007F]', ' '),

        # line separator / paragraph separator
        (r'\u2028', ' '),
        (r'\u2029', ' '),

        # U+0092: sometimes a quote, sometimes a control character
        (r'\u0092', ' '),

        # all the alternative spaces --> space
        (r'[\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u3000\uFEFF]', ' '),

        # a few common bullets
        (r'^\s+►', ' • '),
        (r'^\s+●', ' • '),
        (r'^\s+◘', ' • '),
        (r'^\s+→', ' • '),
        (r'^\s+▪', ' • '),
        (r'^\s+·', ' • '),
        (r'^\s+\*', ' • '),

        # finally squeeze superfluous spaces and strip both ends
        (r' +', ' '),
        (r'^ +', ''),
        (r' +$', ''),

        # ---------------------
        # P U N C T U A T I O N
        # ---------------------
        # most alternative dashes --> plain hyphen-minus
        # (in order U+002D U+2010 U+2011 U+2012 U+2013 U+2014 U+2015 U+2212 U+FE63)
        (r'[‐‑‒–—―−﹣]', '-'),

        # the macron is sometimes used as a dash too
        (r'\u00af', '-'),

        # most single quotes --> ' APOSTROPHE
        (r"[‘’‚`‛]", "'"),   # U+2018 U+2019 U+201a U+201b
        (r'‹ ?', "'"),       # U+2039 plus optional space after
        (r' ?›', "'"),       # U+203A plus optional space before

        # most double quotes --> " QUOTATION MARK
        (r'[“”„‟]', '"'),    # U+201C U+201D U+201E U+201F
        # (the « » substitutions are commented out in the original
        #  and remain disabled here)

        # two single quotes (as prepared above) => one double quote
        (r"''", '"'),
    )

    for pattern, replacement in ordered_rules:
        my_str = sub(pattern, replacement, my_str)

    # if we need to remove single quotes
    if rm_qt:
        my_str = sub(r"'", '"', my_str)

    return my_str
def normalize_forms(term_str, do_lowercase=False):
    """
    Removes unwanted trailing punctuation
    AND optionally puts everything to lowercase

    ex /©""ecosystem services"";/ => /"ecosystem services"/

    (benefits from normalize_chars upstream so there's less cases to consider)

    largely inadequate to the enormity of the task

    Args:
      term_str:     the term to clean
      do_lowercase: if True the result is lowercased

    Returns:
      the cleaned term
    """
    # strip commas, semicolons, spaces and © at the start, then at the end
    for edge_pattern in (r'^[,; ©]+', r'[,; ©]+$'):
        term_str = sub(edge_pattern, '', term_str)

    # squeeze any run of ", / or ' into a single occurrence
    for repeated, single in ((r'"+', '"'), (r'/+', '/'), (r"'+", "'")):
        term_str = sub(repeated, single, term_str)

    return term_str.lower() if do_lowercase else term_str
if __name__ == "__main__":
    # Filter mode: read tab-separated lines on stdin, normalize each field
    # (itself a '%%%'-separated list of sub-lines) and print the result.
    for line_no, raw_line in enumerate(stdin):
        fields = raw_line.rstrip().split('\t')
        if len(fields) > NCOLS:
            print ("skipping line %i (%s)" % (line_no, fields), file=stderr)
            continue

        if dont_touch_first_column:
            # some ID supposed in 1st col => kept unchanged
            clean_fields = [fields[0]]
            todo_fields = fields[1:]
        else:
            # normalize in all columns
            clean_fields = []
            todo_fields = fields

        for field in todo_fields:
            clean_lines = []
            last_line = None
            for subline in field.split('%%%'):
                # normalize once and reuse the result
                clean_line = normalize_forms(normalize_chars(subline))

                # consecutive empty sub-lines are collapsed into one
                if not (clean_line == '' and last_line == ''):
                    clean_lines.append(clean_line)
                last_line = clean_line

            # remove trailing empty lines (the first element is always kept)
            # TODO test if instead s/(?:%%%)+$// on clean_fields later is faster
            while len(clean_lines) > 1 and not len(clean_lines[-1]):
                clean_lines.pop()

            clean_fields.append('%%%'.join(clean_lines))

        # OUTPUT
        print("\t".join(clean_fields))
...@@ -69,15 +69,14 @@ CREATE TABLE orgs( ...@@ -69,15 +69,14 @@ CREATE TABLE orgs(
-- address... (...) -- address elements POSS NOT IMPLEMENTED -- address... (...) -- address elements POSS NOT IMPLEMENTED
reserved varchar(30), reserved varchar(30),
-- generated column, often useful for autocompletes etc -- tostring: generated column
-- ex "Instituto de Fisica de Cantabria (IFCA), Santander, Spain" -- ex "Instituto de Fisica de Cantabria (IFCA), Santander, Spain"
tostring varchar(800) AS (CONCAT( -- searchable + human readable, often useful for autocompletes etc
name, ' (', acro, ')', tostring varchar(800)
IF(locname IS NOT NULL , AS (CONCAT_WS( '',
CONCAT(', ', locname), CONCAT(name, ' '),
'') CONCAT('(',acro,')'),
)), CONCAT(', ', locname)) ),
PRIMARY KEY (orgid), PRIMARY KEY (orgid),
UNIQUE KEY full_org (name, acro, locname) UNIQUE KEY full_org (name, acro, locname)
...@@ -97,6 +96,7 @@ CREATE TABLE sch_org( ...@@ -97,6 +96,7 @@ CREATE TABLE sch_org(
-- POSS: relationship organizations <=> keywords -- POSS: relationship organizations <=> keywords
-- POSS: relationship organizations <=> organizations -- POSS: relationship organizations <=> organizations
-- cf. doc/data_mining_exemples/org_to_orgs.sql
-- keyword/subject terms -- keyword/subject terms
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
*/ */
// 3 exposed vars for inline js controls // 3 exposed vars for inline js controls
var teamCityDiv = document.getElementById('team_city_div') var teamCityDiv = document.getElementById('lab_locname_div')
var otherInstDiv = document.getElementById('other_org_div') var otherInstDiv = document.getElementById('other_inst_div')
// TODO make relative to org_type and move inline snippet to extended form obj // TODO make relative to inst_type and move inline snippet to extended form obj
var otherOrgTypeInput = document.getElementById('other_org_type') var otherOrgTypeInput = document.getElementById('other_inst_type')
// reselecting current_user's info choices // reselecting current_user's info choices
function setupSavedItems(uinfo) { function setupSavedItems(uinfo) {
......
...@@ -39,8 +39,8 @@ var validateWithMessage = false ...@@ -39,8 +39,8 @@ var validateWithMessage = false
var shortRegVersion = true var shortRegVersion = true
var ignoredFields = [] var ignoredFields = []
if (shortRegVersion) { if (shortRegVersion) {
ignoredFields = ['gender', 'home_url', 'org', ignoredFields = ['gender', 'home_url', 'inst_label',
'hon_title', 'position', 'org_type', 'hon_title', 'position', 'inst_type',
'hashtags'] 'hashtags']
} }
...@@ -68,7 +68,7 @@ function testAsYouGo() { ...@@ -68,7 +68,7 @@ function testAsYouGo() {
} }
} }
var teamCityDivStyle = document.getElementById('team_city_div').style var teamCityDivStyle = document.getElementById('lab_locname_div').style
function registerDoorsAndSubmit(){ function registerDoorsAndSubmit(){
regfo.elMainMessage.innerHTML = "Registering with the test login portal<br/> and sending validation email..." regfo.elMainMessage.innerHTML = "Registering with the test login portal<br/> and sending validation email..."
......
...@@ -61,11 +61,25 @@ var cmxClt = (function() { ...@@ -61,11 +61,25 @@ var cmxClt = (function() {
["pic_file", false, "pref" , "f", "other_infos"], ["pic_file", false, "pref" , "f", "other_infos"],
// ==> *scholars* table // ==> *scholars* table
["org", false, "plsfill", "t", "org_infos"],
["org_type", false, "plsfill", "m", "org_infos"], // org field
["team_lab", true, "plsfill", "t", "org_infos"], // => name, acro in one field "label": #lab_label, #inst_label
["org_city", false, "pref" , "t", "org_infos"] // => all other fields
// ==> *affiliations* table // - are optional
// - if present, should be named: lab|inst + '_' + colname
// => TODO org details suggestions
// url, loc should have autofill when name or acro is chosen
// => POSS org <-> org suggestions
// once a lab is filled, we could propose the institution
["lab_label", false, "plsfill", "t", "org_infos"],
["lab_locname", false, "pref", "t", "org_infos"],
["inst_label", false, "pref", "t", "org_infos"],
["inst_type", false, "pref", "m", "org_infos"],
// ["lab_code", false, "pref", "t", "org_infos"],
// ["lab_url", false, "pref", "t", "org_infos"],
// ["inst_locname", false, "pref" , "t", "org_infos"],
// ["inst_url", false, "pref" , "t", "org_infos"],
// ==> *orgs* table via pretreatment org is inst or org is lab
] ]
// group "auto" === filled by controllers // group "auto" === filled by controllers
......
...@@ -347,73 +347,15 @@ ...@@ -347,73 +347,15 @@
class="panel-body ccsection-uform-body panel-collapse collapse out" class="panel-body ccsection-uform-body panel-collapse collapse out"
role="tabpanel" aria-expanded="false"> role="tabpanel" aria-expanded="false">
<div class="question input-group"> <div class="question input-group">
<label for="position" class="smlabel input-group-addon">* Job Position</label> <label for="position" class="smlabel input-group-addon">* Job Position</label>
<input id="position" name="position" maxlength="30" <input id="position" name="position" maxlength="30"
type="text" class="form-control autocomp" placeholder="titre" type="text" class="form-control autocomp" placeholder="titre"
onblur="cmxClt.makeBold(this)" onfocus="cmxClt.makeNormal(this)" onblur="cmxClt.makeBold(this)" onfocus="cmxClt.makeNormal(this)"
value="{{ current_user.info.position }}"> value="{{ current_user.info.position }}">
</div> </div>
<!-- ORG QUESTIONS -->
<div class="question">
<div class="input-group">
<label for="org" class="smlabel input-group-addon">Parent Institution</label>
<input id="org" name="org" maxlength="120"
type="text" class="form-control autocomp" placeholder='eg "CNRS" or "University of Oxford"'
value="{{ current_user.info.org }}">
</div>
</div>
<div class="question">
<div class="input-group">
<label for="org_type" class="smlabel input-group-addon">Institution Type</label>
<select id="org_type" name="org_type"
class="custom-select form-control"
onchange="if(this.value=='other'){otherInstDiv.style.display = 'block'} else {otherInstDiv.style.display='none';otherOrgTypeInput.value=''}">
<option selected disabled value="">Please select</option>
<option value="university">University</option>
<option value="public R&amp;D org">Public sector R&amp;D organization</option>
<option value="public other org">Other public sector organization</option>
<option value="private org">Private sector organization</option>
<option value="none">None at the moment</option>
<option value="other"
onclick="otherInstDiv.style.display = 'block'"
>Other</option>
</select>
</div>
<!-- Other institution type <=> only if previous choice == 5 -->
<div class="question conditional-q" id="other_org_div">
<div class="input-group">
<label for="other_org_type" class="smlabel input-group-addon">Other type</label>
<input id="other_org_type" name="other_org_type" maxlength="120"
type="text" class="form-control" placeholder="Clarify here the type of your parent institution">
</div>
</div>
</div>
<!-- TEAM QUESTIONS -->
<div class="question">
<div class="input-group">
<label for="team_lab" class="smlabel input-group-addon">* Lab / Team / Dept</label>
<input id="team_lab" name="team_lab" maxlength="120"
type="text" class="form-control" placeholder="More detailed affiliation, if relevant"
value="{{ current_user.info.team_lab }}">
</div>
</div>
<!-- Lab city <=> only for France -->
<div class="question conditional-q" id="team_city_div">
<div class="input-group">
<label for="org_city" class="smlabel input-group-addon">Lab city</label>
<input id="org_city" name="org_city" maxlength="50"
type="text" class="form-control" placeholder="Ville de votre institution"
value="{{ current_user.info.org_city }}">
</div>
</div>
{% include 'questions/org_details.html' %}
</div> <!-- /panel-body --> </div> <!-- /panel-body -->
<div class="panel-footer ccsection-footer">&nbsp;</div> <div class="panel-footer ccsection-footer">&nbsp;</div>
......
<!-- ORG QUESTIONS -->
<!-- lab or team and details -->
<div class="question">
<div class="input-group">
<label for="lab_label" class="smlabel input-group-addon">* Lab / Team / Dept</label>
<input id="lab_label" name="lab_label" maxlength="250"
type="text" class="form-control" placeholder="More detailed affiliation, if relevant"
value="{{ current_user.info.labs[0].tostring if current_user.info.labs|length > 0 }}">
</div>
</div>
<!-- lab locname <=> only for France -->
<div class="question conditional-q" id="lab_locname_div">
<div class="input-group">
<label for="lab_locname" class="smlabel input-group-addon">Lab city</label>
<input id="lab_locname" name="lab_locname" maxlength="50"
type="text" class="form-control" placeholder="Ville de votre institution"
value="{{ current_user.info.labs[0].locname if current_user.info.labs|length > 0 }}">
</div>
</div>
<!-- larger institution and details -->
<div class="question">
<div class="input-group">
<label for="inst_label" class="smlabel input-group-addon">Parent Institution</label>
<input id="inst_label" name="inst_label" maxlength="250"
type="text" class="form-control autocomp" placeholder='eg "CNRS" or "University of Oxford"'
value="{{ current_user.info.insts[0].tostring if current_user.info.insts|length > 0 }}">
</div>
</div>
<div class="question">
<div class="input-group">
<label for="inst_type" class="smlabel input-group-addon">Institution Type</label>
<select id="inst_type" name="inst_type"
class="custom-select form-control"
onchange="if(this.value=='other'){otherInstDiv.style.display = 'block'} else {otherInstDiv.style.display='none';otherOrgTypeInput.value=''}">
<option selected disabled value="">Please select</option>
<option value="university">University</option>
<option value="public R&amp;D org">Public sector R&amp;D organization</option>
<option value="public other org">Other public sector organization</option>
<option value="private org">Private sector organization</option>
<option value="none">None at the moment</option>
<option value="other"
onclick="otherInstDiv.style.display = 'block'"
>Other</option>
</select>
</div>
<!-- Other institution type <=> shown only when the inst_type choice is "other" -->
<!-- NB: the id must be "other_inst_div": the page script resolves this element
     with document.getElementById('other_inst_div') and the inst_type select's
     onchange handler toggles its display through that reference -->
<div class="question conditional-q" id="other_inst_div">
  <div class="input-group">
    <label for="other_inst_type" class="smlabel input-group-addon">Other type</label>
    <input id="other_inst_type" name="other_inst_type" maxlength="120"
           type="text" class="form-control" placeholder="Clarify here the type of your parent institution">
  </div>
</div>
</div>
...@@ -160,20 +160,20 @@ ...@@ -160,20 +160,20 @@
<div class="question"> <div class="question">
<div class="input-group"> <div class="input-group">
<label for="team_lab" class="smlabel input-group-addon">* Lab / Team / Dept</label> <label for="lab_label" class="smlabel input-group-addon">* Lab / Team / Dept</label>
<input id="team_lab" name="team_lab" maxlength="120" <input id="lab_label" name="lab_label" maxlength="120"
type="text" class="form-control" placeholder="Your lab" type="text" class="form-control" placeholder="Your lab"
placeholder="team_lab"> placeholder="lab_label">
</div> </div>
</div> </div>
<!-- Lab city <=> only for France --> <!-- Lab city <=> only for France -->
<div class="question conditional-q" id="team_city_div"> <div class="question conditional-q" id="lab_locname_div">
<div class="input-group"> <div class="input-group">
<label for="org_city" class="smlabel input-group-addon">Lab city</label> <label for="lab_locname" class="smlabel input-group-addon">Lab city</label>
<input id="org_city" name="org_city" maxlength="50" <input id="lab_locname" name="lab_locname" maxlength="50"
type="text" class="form-control" placeholder="Ville de votre institution" type="text" class="form-control" placeholder="Ville de votre institution"
placeholder="org_city"> placeholder="lab_locname">
</div> </div>
</div> </div>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment