Commit 72859dbf authored by Romain Loth's avatar Romain Loth

adapt dbdatapi for new org tables (+ gui autocomplete 1/2)

parent 9fb58f01
......@@ -189,34 +189,42 @@ $loop = 0;
// all lab orgids except _NULL
$ids_str = implode(',', array_keys($lab_counts));
// print_r("all lab ids here:");
// print_r($ids_str);
// print_r("<br/>");
$lab_ids = array_filter(array_keys($lab_counts));
sort($lab_ids);
// all lab infos to retrieve
$labs = array();
// normal query would be enough for everything except parent org
// POSS page the request in nb of ids >= mysql technical limit for IN
// $sql = 'SELECT * FROM orgs WHERE orgid IN ('.$ids_str.') ORDER BY name, acro' ;
// variant with parent org
// unique org1 (=> unique pairs (sch_org => sch_org2)
// => org2 info)
//
// it's much longer in code but fast because of indexes
//
// a POSS alternative would be to
// create an org_org table
// at record time
$sql = <<< LABSQLEXTENDED
SELECT orgs.*,
// paging
$step = 2000;
$n_steps = ceil(count($lab_ids)/$step);
for($i = 0; $i < $n_steps; $i++) {
$batch = array_slice($lab_ids, $step * $i, $step);
$ids_str = implode(',', $batch);
// print_r("<br>step: ".$i." / ids_str".$ids_str."<br>");
// normal query would be enough for everything except parent org
// POSS page the request in nb of ids >= mysql technical limit for IN
// $sql = 'SELECT * FROM orgs WHERE orgid IN ('.$ids_str.')'; // ORDER BY name, acro' ;
// variant query with parent org
// unique org1 (=> unique pairs (sch_org => sch_org2)
// => org2 info)
//
// it's much longer in code but fast because of indexes
//
// a POSS alternative would be to
// create an org_org table
// at record time
//
$sql = <<< LABSQLEXTENDED
SELECT orgs.*,
GROUP_CONCAT( tgt_tostring ORDER BY tgt_freq DESC SEPARATOR '%%%')
AS related_insts
FROM orgs
LEFT JOIN (
FROM orgs
LEFT JOIN (
SELECT sch_org.orgid AS src_orgid,
sch_org2.orgid AS tgt_orgid,
orgs2.tostring AS tgt_tostring,
......@@ -230,15 +238,15 @@ LEFT JOIN (
AND sch_org.orgid != sch_org2.orgid
GROUP BY sch_org.orgid, sch_org2.orgid
) AS lab_relationship_to_inst_via_scholars ON src_orgid = orgs.orgid
WHERE orgs.orgid IN ( {$ids_str} )
AND orgs.name != '_NULL'
GROUP BY orgs.orgid
ORDER BY orgs.name, orgs.acro
WHERE orgs.orgid IN ( {$ids_str} )
AND orgs.name != '_NULL'
GROUP BY orgs.orgid
ORDER BY orgs.name, orgs.acro
LABSQLEXTENDED;
// print_r($sql);
// print_r($sql);
foreach ($base->query($sql) as $row) {
foreach ($base->query($sql) as $row) {
$info = array();
$info['unique_id'] = $row['orgid'];
......@@ -272,7 +280,13 @@ foreach ($base->query($sql) as $row) {
}
// print_r($info);
$labs[$row['orgid']] = $info;
// finished batch
}
// finished all labs
}
//
// print_r("all labs here:");
// print_r($labs);
......@@ -284,7 +298,7 @@ foreach ($base->query($sql) as $row) {
// debug
// $content .= var_dump($all_orga_list) ;
//
// $organiz = array();
// sort($all_orga_list);
// foreach ($all_orga_list as $name) {
......@@ -318,7 +332,7 @@ foreach ($base->query($sql) as $row) {
// }
//
// }
//
///////////////////////////////////////////////////////////////
......
......@@ -6,17 +6,19 @@
*/
// parameters : threshold to display orgs (labs / institutions) diagrams
$MIN_DISTINCT_LABS = 1 ;
$MIN_DISTINCT_LABS_SCHOLARS_SHARE = .25;
// paramters : threshold to display orgs (labs / institutions) diagrams
$MIN_DISTINCT_LABS = 5 ;
$MIN_DISTINCT_INSTS = 4 ;
$MIN_DISTINCT_INSTS = 1 ;
$MIN_DISTINCT_INSTS_SCHOLARS_SHARE = .20;
// main vars
$country_list = array();
$position_list = array();
$title_list = array();
// not needed already factorized in lab_counts, inst_counts
// $labs_list = array();
// $insts_list = array();
......@@ -129,16 +131,18 @@ asort($inst_counts);
// TODO factorize all this
// we are creating highcharts' arguments for pie chart
// eg position_data: data: [["senior researcher",11],["assistant professor",23],["lecturer",25],["engineer",26],["associate professor",28],["student",28],["post-doc",48],["professor",51],["phd student",53],["research director",64],["researcher",68],["Missing data",467],["Others",210]]
// NB escaping: no need to do htmlspecialchars($key, ENT_HTML5 | ENT_QUOTES, 'UTF-8'); because the target language is js (doesn't need html entities)
// données des pays
$country_data = "data: [";
foreach ($country_list as $key => $value) {
$key = addslashes($key);
if ($value > min(9, count($country_list) / 10)) {
$country_data.='["' . $key . '",' . $value . '],';
$thresh = min(9, count($country_list) / 10);
if ($value > $thresh) {
$country_data.='["' . addslashes($key) . '",' . $value . '],';
} else {
$other_country+=$value;
}
......@@ -162,9 +166,9 @@ $country_data.=']';
// données des position
$position_data = "data: [";
foreach ($position_list as $key => $value) {
$key = addslashes($key);
if ($value > min(9, count($position_list) / 10)) {
$position_data.='["' . $key . '",' . $value . '],';
$thresh = min(9, count($position_list) / 10);
if ($value > $thresh) {
$position_data.='["' . addslashes($key) . '",' . $value . '],';
} else {
$other_position+=$value;
}
......@@ -207,16 +211,26 @@ $title_data.=']';
$labs_data = "data: [";
$n_labs = count($lab_counts);
$n_shown_labs = 0 ;
$tot_shown_labs = 0;
$labs_total_responses = 0;
foreach ($lab_counts as $key => $value) {
// $key is the orgid, but we need the name
$label = $org_id_to_label[$key];
$thresh = min(9, $n_labs / 15);
$key = addslashes($key);
if ($value > min(9, $n_labs / 15)) {
$labs_data.='["' . $key . '",' . $value . '],';
if (!$label || $label == "_NULL") {
$missing_labs += $value;
}
elseif ($value > $thresh) {
$labs_data.='["' . addslashes($label) . '",' . $value . '],';
$n_shown_labs += 1;
$tot_shown_labs += $value;
} else {
$other_labs+=$value;
}
# doesn't include missing, but we can compare to n_scholars to know
$labs_total_responses += $value;
}
if ($missing_labs>0){
$labs_data.='["Missing data",' . $missing_labs . '],';
......@@ -230,19 +244,31 @@ if ($other_labs>0){
$labs_data.=']';
// $share_of_shown_labs = sprintf("%.6f", $tot_shown_labs/$labs_total_responses);
$share_of_shown_labs = sprintf("%.6f", $tot_shown_labs/count($scholars));
$insts_data = "data: [";
$n_insts = count($inst_counts);
$n_shown_insts = 0 ;
$tot_shown_insts = 0;
$insts_total_responses = 0;
foreach ($inst_counts as $key => $value) {
$label = $org_id_to_label[$key];
$thresh = min(9, $n_insts / 15);
$key = addslashes($key);
if ($value > min(9, $n_insts / 15)) {
$insts_data.='["' . $key . '",' . $value . '],';
if (!$label) {
$missing_insts += $value;
}
elseif ($value > $thresh) {
$insts_data.='["' . addslashes($label) . '",' . $value . '],';
$n_shown_insts += 1;
$tot_shown_insts += $value;
} else {
$other_insts+=$value;
}
$insts_total_responses+=$value;
}
if ($missing_insts>0){
$insts_data.='["Missing data",' . $missing_insts . '],';
......@@ -256,7 +282,10 @@ if ($other_labs>0){
$insts_data.=']';
// $share_of_shown_insts = sprintf("%.6f", $tot_shown_insts/$insts_total_responses);
$share_of_shown_insts = sprintf("%.6f", $tot_shown_insts/count($scholars));
// print_r("shown_insts_total % ".$share_of_shown_insts);
// TODO separate this Highcharts js to factorize and expose as functions
// (or replace it by D3 and also separate)
......@@ -372,7 +401,13 @@ $(document).ready(function() {
'}]
});
if (parseInt('.$n_shown_labs.') >= parseInt('.$MIN_DISTINCT_LABS.')) {
var MIN_DISTINCT_LABS = parseInt('.$MIN_DISTINCT_LABS.')
var MIN_DISTINCT_LABS_SCHOLARS_SHARE = parseFloat('.$MIN_DISTINCT_LABS_SCHOLARS_SHARE.')
if (
parseInt('.$n_shown_labs.') >= MIN_DISTINCT_LABS
&& parseFloat('.$share_of_shown_labs.') >= MIN_DISTINCT_LABS_SCHOLARS_SHARE
) {
labs= new Highcharts.Chart({
chart: {
......@@ -412,7 +447,12 @@ $(document).ready(function() {
document.getElementById("labs_div").style.display = "none"
}
if (parseInt('.$n_shown_insts.') >= parseInt('.$MIN_DISTINCT_INSTS.')) {
var MIN_DISTINCT_INSTS = parseInt('.$MIN_DISTINCT_INSTS.')
var MIN_DISTINCT_INSTS_SCHOLARS_SHARE = parseFloat('.$MIN_DISTINCT_INSTS_SCHOLARS_SHARE.')
if ( parseInt('.$n_shown_insts.') >= MIN_DISTINCT_INSTS
&& parseFloat('.$share_of_shown_insts.') >= MIN_DISTINCT_INSTS_SCHOLARS_SHARE
) {
insts= new Highcharts.Chart({
chart: {
......
......@@ -3,8 +3,6 @@ include ("php_library/comex_library.php");
include ("php_library/parametres.php");
include ("php_library/normalize.php");
//include("../common/library/fonctions_php.php");
$meta = '<!DOCTYPE html>
<html lang="en">
<head>
......@@ -86,6 +84,7 @@ function objectToArray($d) {
$data = objectToArray($data);
// REST query params
$categorya = $data["categorya"] ?? [];
$categoryb = $data["categoryb"] ?? [];
$countries = $data["countries"] ?? [];
......@@ -195,9 +194,6 @@ if ($countries) {
$f .= ") ";
}
// £TODO_ORGS FILTER x 2
if ($laboratories) {
// debug
// echo '<p style="color:white">MATCHING ON labs<p>';
......@@ -211,7 +207,7 @@ if ($laboratories) {
if ($lab == "") continue;
if ($i > 0)
$f .= " OR ";
$f .= 'team_lab LIKE "%' . $lab . '%" ';
$f .= 'labs_list LIKE "%' . $lab . '%" ';
$query_details.=$lab.', ';
$i++;
}
......@@ -230,11 +226,11 @@ if ($organizations) {
foreach ($organizations as $org) {
// echo '<p style="color:white">========> org =====> '. $org ."<p>";
$org = sanitize_input(trim(strtolower($org)));
if ($org == "") continue;
if ($i > 0)
$f .= " OR ";
$f .= 'insts_list LIKE "%' . $org . '%" ';
$query_details.=$org.', ';
$f .= 'org LIKE "%' . $org . '%" ';
//'affiliation LIKE "%' . $org . '% OR affiliation2 LIKE "%' . $org . '%"';
$i++;
}
$f .= ") ";
......@@ -242,6 +238,9 @@ if ($organizations) {
$query_details.='</ul>';
// debug SQL filters
// print_r("query filters: ". $f);
$base = new PDO($dsn, $user, $pass, $opt);
$termsMatrix = array(); // liste des termes présents chez les scholars avec leurs cooc avec les autres termes
$scholarsMatrix = array(); // liste des scholars avec leurs cooc avec les autres termes
......@@ -338,7 +337,7 @@ SELECT * FROM (
END_QUERY;
// debug
// echo '<p style="color:white">query:'. $sql ."<p>";
// echo '<p style="color:grey;">query:<br>'. $sql ."<p>";
// liste des chercheurs
$scholars = array();
......@@ -491,11 +490,8 @@ $header = '<div class="row" id="welcome">
<br/>
<br/>
<p>
This directory presents the profiles of <a href="#scholars">'. count($scholars).' scholars</a> and <a href="#labs">'. count($labs).' labs</a> in the field of Complex Systems';
This directory presents the profiles of <a href="#scholars">'. count($scholars).' scholars</a>, <a href="#labs">'. count($labs).' labs</a> and <a href="#orga">'.$orga_count.' organizations</a> in the field of Complex Systems';
// TODO restore old version before duplicate lab/orga
// This directory presents the profiles of <a href="#scholars">'. count($scholars).' scholars</a>, <a href="#labs">'. count($labs).' labs</a> and <a href="#orga">'.$orga_count.' organizations</a> in the field of Complex Systems';
if (strlen(trim($query_details))>3){
......
......@@ -70,6 +70,13 @@ $base = new PDO($dsn, $user, $pass, $opt);
// liste des chercheurs
$scholars = array();
// these stats are useful BOTH in stat-prep and directory_content
// => should be prepared right now (the label mapping contain all orgs ie both labs and institutions)
$lab_counts = array();
$inst_counts = array();
$org_id_to_label = array();
if ($userid) {
// query idea:
......@@ -118,27 +125,30 @@ if ($userid) {
FROM (
SELECT
scholars.*,
-- GROUP_CONCAT(labs.orgid SEPARATOR ',') AS labs_ids,
GROUP_CONCAT(labs.orgid SEPARATOR ',') AS labs_ids,
GROUP_CONCAT(labs.tostring SEPARATOR '%%%') AS labs_list
FROM scholars
LEFT JOIN sch_org AS map_labs
ON map_labs.uid = luid
JOIN orgs AS labs
LEFT JOIN (
SELECT * FROM orgs WHERE class='lab'
) AS labs
ON map_labs.orgid = labs.orgid
WHERE (record_status = 'active'
OR (record_status = 'legacy' AND valid_date >= NOW()))
AND labs.class = 'lab'
GROUP BY luid
) AS scholars_and_labs
LEFT JOIN sch_org AS map_insts
ON map_insts.uid = luid
JOIN orgs AS insts
LEFT JOIN (
SELECT * FROM orgs WHERE class='inst'
) AS insts
ON map_insts.orgid = insts.orgid
AND insts.class = 'inst'
GROUP BY luid
) AS scholars_and_orgs
-- expansion (+kw info)
LEFT JOIN sch_kw AS second_level
ON second_level.uid = scholars_and_orgs.luid
JOIN sch_kw ON sch_kw.kwid = second_level.kwid
......@@ -172,18 +182,14 @@ HERE_QUERY;
$info['country'] = $row['country'];
$info['homepage'] = $row['home_url'];
// TODO recreate difference between lab and org --------->8--------
// recreated arrays
$info['labs'] = explode('%%%', $row['labs_list'] ?? "") ;
$info['institutions'] = explode('%%%', $row['insts_list'] ?? "") ;
// right now duplicate treatment short-circuited like this
// (effect visible in stat-prep_from_array)
$info['affiliation'] = $row['org'] . $row['team_lab'];
$info['affiliation_id'] = $row['affiliation_id'];
// ----------------------------------------------------->8---------
// $info['lab2'] = $row['lab2'];
// $info['affiliation2'] = $row['affiliation2'];
$info['labs_ids'] = explode(',', $row['labs_ids'] ?? "") ;
$info['insts_ids'] = explode(',', $row['insts_ids'] ?? "") ;
$info['title'] = $row['hon_title'];
$info['position'] = $row['position'];
$info['pic_src'] = $row['pic_fname'] ? '/data/shared_user_img/'.$row['pic_fname'] : $row['pic_url'] ;
......@@ -196,11 +202,53 @@ HERE_QUERY;
// $info['fax'] = $row['fax'];
// $info['affiliation_acronym'] = $row['affiliation_acronym'];
$scholars[$row['luid']] = $info;
// we prepare the aggregated lab stats in this loop too
foreach ( array(
array('labs','labs_ids', &$lab_counts),
array('institutions','insts_ids', &$inst_counts)
) as $cat) {
// var_dump($cat);
$namekey = $cat[0];
$idkey = $cat[1];
$counthash_ref = &$cat[2];
// £TODO_ORGS we'll need a missing_labs
$j = -1 ;
foreach ($info[$idkey] as $org_id) {
$j++;
$org_label = $info[$namekey][$j];
$org_label = trim($org_label);
if (strcmp($org_label, "") == 0) {
$org_label = null;
} else {
$org_label = weedout_alt_nulls($org_label);
}
// all non-values are there as null
$org_id_to_label[$org_id] = $org_label;
if (array_key_exists($org_id, $counthash_ref)) {
$counthash_ref[$org_id]+=1;
} else {
$counthash_ref[$org_id] = 1;
}
}
}
}
}
// both our stats have been filled
// var_dump($lab_counts) ;
// var_dump($inst_counts) ;
// creates js for stats visualisations
// creates js for stats visualisations and counts (we re-use the orgs counts)
include ("php_library/stat-prep_from_array.php");
// debug
......@@ -211,8 +259,6 @@ include ("php_library/directory_content.php");
$content .= '</div>';
$content .= '</div>
<footer style="color:white">
......@@ -261,12 +307,10 @@ $header = '<div class="row" id="welcome">
<br/>
<br/>
<p>
This directory presents the profiles of <a href="#scholars">'. count($scholars).' scholars</a> and <a href="#labs">'. count($labs).' labs</a> in the field of Complex Systems
This directory presents the profiles of <a href="#scholars">'. count($scholars).' scholars</a>, <a href="#labs">'. count($labs).' labs</a> and <a href="#orga">'.$orga_count.' organizations</a> in the field of Complex Systems
<br/>
Scholars have been selected from the complex systems directory when sharing common keywords with '.$target_name.'
<!-- TODO restore old version before duplicate lab/orga with $orga_count -->
</p>
<h4>About the complex systems directory</h4>
<p>
......@@ -287,7 +331,11 @@ Contributions and ideas are welcome to improve this directory.
<div id="country" style="width: 800px; height: 300px; margin: 0 auto"></div>
<div id="title" style="width: 800px; height: 300px; margin: 0 auto"></div>
<div id="position" style="width: 800px; height: 300px; margin: 0 auto"></div>
<div id="organizations" style="width: 800px; height: 300px; margin: 0 auto"></div>
<!-- these two are displayed only if the distribution has
at least 3 big groups (cf. n_shown in stats-prep) -->
<div id="labs_div" style="width: 800px; height: 300px; margin: 0 auto"></div>
<div id="insts_div" style="width: 800px; height: 300px; margin: 0 auto"></div>
<br/>
......@@ -301,6 +349,8 @@ Contributions and ideas are welcome to improve this directory.
echo $meta.' '.$stats.'</head>';
if (count($scholars)==0){
// TODO message in modal panel
echo '<h2>Sorry, '.$target_name.' did not mention any keywords ... we cannot process its network.</h2><br/>
If you are '.$target_name.', you can <a href="/services/user/profile" target="_BLANK">modify your profile</a> and see your
network in few minutes.';
......@@ -308,5 +358,6 @@ echo '<h2>Sorry, '.$target_name.' did not mention any keywords ... we cannot pr
echo $header;
echo $content;
}
exit(0);
?>
......@@ -14,6 +14,7 @@ from math import floor, log, log1p
from cgi import escape
from re import sub, match
from traceback import format_tb
from json import loads
if __package__ == 'services':
from services.tools import mlog, REALCONFIG
......@@ -257,6 +258,24 @@ def find_scholar(some_key, some_str_value, cmx_db = None):
return luid
class Org:
    """
    Tiny helper class to serialize/deserialize orgs. TODO use more OOP :)

    Built from an array [name, acronym, location, ...] (extra items are
    ignored); any of the three values may be None/empty.

    Attributes:
        name, acro, locname: the three first items of org_array
        org_class: free label passed by the caller (eg 'lab')
        any:       acronym if there is one, otherwise the name
        tostring:  display label "name (acro), locname", where each missing
                   segment is dropped together with its separator
    """

    def __init__(self, org_array, org_class=None):
        if len(org_array) < 3:
            raise ValueError("Org is implemented for at least [name, acr, loc]")
        self.name = org_array[0]
        self.acro = org_array[1]
        self.locname = org_array[2]
        self.org_class = org_class

        # DB specifications say that at least one of name||acr is NOT NULL
        self.any = self.acro if self.acro else self.name

        # The acronym only goes in parentheses when there is a name before
        # it: an acronym-only org gives "ACRO" and not " (ACRO)"
        if self.name and self.acro:
            label = self.name + ' (' + self.acro + ')'
        else:
            label = self.name or self.acro or ""

        # same idea for the location: no leading ", " when the label is empty
        self.tostring = ", ".join(
            part for part in (label, self.locname) if part
        )

    def __repr__(self):
        # readable form for debug logs instead of "<Org object at 0x...>"
        return "Org(%r, org_class=%r)" % (
            [self.name, self.acro, self.locname], self.org_class
        )
class BipartiteExtractor:
"""
......@@ -405,7 +424,6 @@ class BipartiteExtractor:
(record_status = 'legacy' AND valid_date >= NOW())
)
"""
else:
# query is a set of filters like: key <=> array of values
# (expressed as rest parameters: "keyA[]=valA1&keyB[]=valB1&keyB[]=valB2")
......@@ -426,41 +444,28 @@ class BipartiteExtractor:
continue
else:
known_filter = key
sql_column = FIELDS_FRONTEND_TO_SQL[key]['col']
sql_field = FIELDS_FRONTEND_TO_SQL[key]['grouped']
# "LIKE_relation" or "EQ_relation"
rel_type = FIELDS_FRONTEND_TO_SQL[key]['type']
# pre-treatment: rewrite tables' names if they're inside the sub-query
# example:
# scholars.country ~~~~~> scholars_n_hashtags.country
# hashtags.htstr ~~~~~> scholars_n_hashtags.htstr
# (see cascaded join below for explanation)
if match("scholars", sql_column):
(sql_table, sql_field) = sql_column.split('.')
sql_column = 'scholars_n_hashtags.'+sql_field
mlog('DBG', "rewrote sql col", sql_column)
elif match("hashtags.htstr", sql_column):
sql_column = 'scholars_n_hashtags.hashtags_list'
mlog('DBG', "rewrote sql col", sql_column)
# now create the constraints
val = filter_dict[known_filter]
if len(val):
# clause type clause is full
# IN (val1, val2) False
# "= val" False
# "col LIKE '%val%'" True
clause_is_full = False
rhsclause = ""
fullclause = ""
if (isinstance(val, list) or isinstance(val, tuple)):
# clause examples
# "col IN (val1, val2)"
# "col = val"
# "col LIKE '%escapedval%'"
if (not isinstance(val, list)
and not isinstance(val, tuple)):
mlog("WARNING", "direct graph api query without tina")
clause = sql_field + type_to_sql_filter(val)
# normal case
# tina sends an array of str filters
else:
tested_array = [x for x in val if x]
mlog("DEBUG", "tested_array", tested_array)
if len(tested_array):
......@@ -468,33 +473,20 @@ class BipartiteExtractor:
qwliststr = repr(tested_array)
qwliststr = sub(r'^\[', '(', qwliststr)
qwliststr = sub(r'\]$', ')', qwliststr)
clause = 'IN '+qwliststr
clause = sql_field + ' IN '+qwliststr
# ex: country IN ('France', 'USA')
elif rel_type == "LIKE_relation":
like_clauses = []
for singleval in tested_array:
if type(singleval) == str and len(singleval):
like_clauses.append(
sql_column+" LIKE '%"+singleval+"%'"
sql_field+" LIKE '%"+quotestr(singleval)+"%'"
)
clause = " OR ".join(like_clauses)
# clause already includes col name
clause_is_full = True
elif isinstance(val, int):
clause = '= %i' % val
elif isinstance(val, float):
clause = '= %f' % val
# elif isinstance(val, str):
# clause = '= "%s"' % val
elif isinstance(val, str):
clause = 'LIKE "%'+val+'%"'
clause_is_full = True
if len(clause):
if clause_is_full:
sql_constraints.append("(%s)" % clause)
else:
sql_constraints.append("(%s %s)" % (sql_column, clause))
# debug
mlog("INFO", "SELECTing active users with sql_constraints", sql_constraints)
......@@ -502,44 +494,48 @@ class BipartiteExtractor:
# use constraints as WHERE-clause
# NB we must cascade join because
# both hashtags and keywords are one-to-many
# => it renames scholars and hashtag tables
# into 'scholars_n_hashtags'
# orgs, hashtags and keywords are one-to-many
# => it renames tables into 'full_scholar'
sql_query = """
SELECT * FROM (
SELECT
scholars_n_hashtags.luid,
scholars_n_hashtags.affiliation_id,
sch_org_n_tags.*,
-- kws info
GROUP_CONCAT(keywords.kwstr) AS keywords_list
FROM (
SELECT
scholars.*,
scholars_and_orgs.*,
-- hts info
GROUP_CONCAT(hashtags.htstr) AS hashtags_list
FROM (
SELECT scholars.*,
-- org info
-- GROUP_CONCAT(orgs.orgid) AS orgs_ids_list,
GROUP_CONCAT(orgs_set.tostring) AS orgs_list
FROM scholars
LEFT JOIN sch_org ON luid = sch_org.uid
LEFT JOIN (
SELECT * FROM orgs
) AS orgs_set ON sch_org.orgid = orgs_set.orgid
GROUP BY luid
) AS scholars_and_orgs
LEFT JOIN sch_ht
ON uid = luid
JOIN hashtags
ON sch_ht.htid = hashtags.htid
GROUP BY luid
) AS scholars_n_hashtags
) AS sch_org_n_tags
-- two step JOIN for keywords
LEFT JOIN sch_kw
ON uid = luid
JOIN keywords
ON sch_kw.kwid = keywords.kwid
-- we still must keep affiliations in case it's used in the WHERE-clause...
LEFT JOIN affiliations
ON affiliation_id = affid
-- our filtering constraints fit here
WHERE %s
AND (
WHERE (
record_status = 'active'
OR
(record_status = 'legacy' AND valid_date >= NOW())
......@@ -547,7 +543,13 @@ class BipartiteExtractor:
GROUP BY luid
""" % (" AND ".join(sql_constraints))
) AS full_scholar
-- our filtering constraints fit here
WHERE %s
""" % " AND ".join(sql_constraints)
mlog("DEBUGSQL", "getScholarsList SELECT: ", sql_query)
# in both cases "*" or constraints
self.cursor.execute(sql_query)
......@@ -573,32 +575,55 @@ class BipartiteExtractor:
Adding each connected scholar per unique_id
(getting details for selected scholars into graph object)
# TODO do it along with previous step getScholarsList
# POSS if filters, could do it along with previous step getScholarsList
# (less modular but a lot faster)
NB here scholar_array is actually a dict :/ ...
"""
# debug
# mlog("DEBUG", "MySQL extract scholar_array:", scholar_array)
# scholar_array = list(scholar_array.keys())[0:3]
# TODO loop could be after SELECT
for scholar_id in scholar_array:
sql3='''
SELECT
scholars_and_affiliations.*,
scholars_and_orgs.*,
COUNT(keywords.kwid) AS keywords_nb,
GROUP_CONCAT(keywords.kwid) AS keywords_ids,
GROUP_CONCAT(kwstr) AS keywords_list
FROM (
SELECT
scholars_and_insts.*,
-- small serializations here to avoid 2nd query
GROUP_CONCAT(
JSON_ARRAY(labs.name, labs.acro, labs.locname)
) AS labs_list
FROM (
SELECT
scholars.*,
affiliations.*
FROM scholars
LEFT JOIN affiliations
ON scholars.affiliation_id = affiliations.affid
GROUP_CONCAT(
JSON_ARRAY(insts.name, insts.acro, insts.locname)
) AS insts_list
FROM
scholars
LEFT JOIN sch_org ON luid = sch_org.uid
LEFT JOIN (
SELECT * FROM orgs WHERE class = 'inst'
) AS insts ON sch_org.orgid = insts.orgid
WHERE (record_status = 'active'
OR (record_status = 'legacy' AND valid_date >= NOW()))
) AS scholars_and_affiliations
GROUP BY luid
) AS scholars_and_insts
LEFT JOIN sch_org ON luid = sch_org.uid
LEFT JOIN (
SELECT * FROM orgs WHERE class = 'lab'
) AS labs ON sch_org.orgid = labs.orgid
GROUP BY luid
) AS scholars_and_orgs
LEFT JOIN sch_kw
ON sch_kw.uid = scholars_and_affiliations.luid
ON sch_kw.uid = scholars_and_orgs.luid
LEFT JOIN keywords
ON sch_kw.kwid = keywords.kwid
WHERE luid = %s
......@@ -623,6 +648,23 @@ class BipartiteExtractor:
else:
pic_src = ''
# NB instead of secondary query for orgs.*, we can
# simply parse orgs infos
# and take labs[0] and insts[0]
labs = list(map(
lambda arr: Org(arr, org_class='lab'),
loads('['+res3['labs_list'] +']')
))
insts = list(map(
lambda arr: Org(arr, org_class='insts'),
loads('['+res3['insts_list']+']')
))
mlog("DEBUGSQL", "main lab:", labs[0])
mlog("DEBUGSQL", "main inst:", insts[0])
# each lab is an array [name, acronym, location]
# all detailed node data
ide="D::"+res3['initials']+("/%05i"%int(res3['luid']));
info['id'] = ide;
info['luid'] = res3['luid'];
......@@ -635,13 +677,16 @@ class BipartiteExtractor:
info['keywords_ids'] = res3['keywords_ids'].split(',') if res3['keywords_ids'] else [];
info['keywords_list'] = res3['keywords_list'];
info['country'] = res3['country'];
# info['ACR'] = res3['org_acronym'] # TODO create
info['ACR'] = labs[0].acro if labs[0].acro else labs[0].any
#info['CC'] = res3['norm_country'];
info['home_url'] = res3['home_url'];
info['team_lab'] = res3['team_lab'];
info['org'] = res3['org'];
# info['lab2'] = res3['lab2']; # TODO restore
# info['affiliation2'] = res3['affiliation2'];
info['team_lab'] = labs[0].tostring;
info['org'] = insts[0].tostring;
if len(labs) > 1:
info['lab2'] = labs[1].tostring
if len(insts) > 1:
info['affiliation2'] = insts[1].tostring
info['hon_title'] = res3['hon_title'] if res3['hon_title'] else ""
info['position'] = res3['position'];
info['job_looking'] = res3['job_looking'];
......@@ -975,14 +1020,13 @@ class BipartiteExtractor:
content += '<b>Position: </b>' +self.scholars[idNode]['position'].replace("&"," and ")+ '</br>'
affiliation=""
if self.scholars[idNode]['team_lab'] and self.scholars[idNode]['team_lab'] != "":
if self.scholars[idNode]['team_lab'] and self.scholars[idNode]['team_lab'] not in ["", "_NULL"]:
affiliation += self.scholars[idNode]['team_lab']+ ','
if self.scholars[idNode]['org'] and self.scholars[idNode]['org'] != "":
affiliation += self.scholars[idNode]['org']
# TODO restore if not redundant with org
# if self.scholars[idNode]['affiliation'] != "" or self.scholars[idNode]['lab'] != "":
# content += '<b>Affiliation: </b>' + affiliation.replace("&"," and ") + '</br>'
if affiliation != "":
content += '<b>Affiliation: </b>' + escape(affiliation) + '</br>'
if len(self.scholars[idNode]['keywords_list']) > 3:
content += '<b>Keywords: </b>' + self.scholars[idNode]['keywords_list'].replace(",",", ")+'.</br>'
......@@ -1009,8 +1053,6 @@ class BipartiteExtractor:
else: node["CC"]="-"
# Affiliation
# TODO restore with org_acronym
# node["ACR"] = self.scholars[idNode]["ACR"]
node["ACR"] = self.scholars[idNode]["org"]
if node["ACR"]=="": node["ACR"]="-"
......@@ -1089,3 +1131,23 @@ class BipartiteExtractor:
# mlog("DEBUG", "nodes2",edgesB)
# mlog("DEBUG", "bipartite",edgesAB)
return graph
def quotestr(a_str):
    """
    Helper function if we need to quote values ourselves.

    Escapes a string so it can be placed inside a single-quoted SQL string
    literal: backslashes are doubled first, then single quotes are
    backslash-escaped. Doing the backslashes first matters: otherwise a
    value ending with a backslash (or containing an escaped backslash just
    before a quote) would neutralize the closing quote of the literal.

    NB: LIKE wildcards (% and _) are left untouched on purpose.
    POSS: prefer parameterized queries (cursor.execute(sql, params)) wherever
          the calling code allows it.

    :param a_str: the raw string value
    :return: the escaped string (without surrounding quotes)
    """
    return a_str.replace("\\", "\\\\").replace("'", "\\'")
def type_to_sql_filter(val):
    """
    Helper function if we need to build test filters ourselves.

    Builds the right-hand side of a SQL condition for a single value:
      - int   => " = <val>"
      - float => " = <val>"   (formatted with %f)
      - str   => " LIKE '%<escaped val>%'"

    The result starts with a space so that it can be appended directly to
    a column name (sql_field + type_to_sql_filter(val)).

    The string case uses a single-quoted literal because quotestr() only
    escapes for single-quoted literals.

    :param val: an int, float or str filter value
    :return: the SQL fragment to append after the column name
    :raises TypeError: if val is of any other type
    """
    if isinstance(val, int):
        return ' = %i' % val
    if isinstance(val, float):
        return ' = %f' % val
    if isinstance(val, str):
        return " LIKE '%" + quotestr(val) + "%'"

    # previously this case failed later with an obscure UnboundLocalError
    raise TypeError(
        "type_to_sql_filter: unsupported filter value type %s"
        % type(val).__name__
    )
-- Sample queries comparing several ways to serialize an org row
-- (name, acro, locname) into a single string.
-- Each query shows 10 random rows of the orgs table.

-- 1) if serialization must be parsable, separators need to be absent tokens
SELECT
-- our convention (eg in dbdatapi.extract)
-- NB: MySQL's CONCAT() returns NULL as soon as one argument is NULL
CONCAT(name, '((', acro, '))', ";;", locname)
FROM orgs
ORDER BY RAND()
LIMIT 10;
-- 2) if serialization is just for display: for human-readable labels
-- with CONCAT_WS => nice because it removes null segments eg '('+NULL+')'
-- (each inner CONCAT becomes NULL when its column is NULL, and CONCAT_WS skips NULLs)
SELECT
name,
acro,
locname,
CONCAT_WS( '',
CONCAT(name, ' '),
CONCAT('(',acro,')'),
CONCAT(', ', locname) )
FROM orgs
ORDER BY RAND()
LIMIT 10;
-- 3) with CASE: label format is "acro (name), locname"
SELECT
name,
acro,
locname,
-- 3 vars NULL or not => 8 cases
-- but by definition either acro or name is not null => 7 cases
-- (the first branch below is a fallback for rows where all three are NULL)
CASE
WHEN name IS NULL AND acro IS NULL AND locname IS NULL
THEN "_NULL"
WHEN name IS NULL AND locname IS NULL
THEN acro
WHEN acro IS NULL AND locname IS NULL
THEN name
WHEN locname IS NULL
THEN CONCAT (acro, ' (' ,name,')')
-- locname cases
WHEN name IS NULL
THEN CONCAT (acro, ', ', locname)
WHEN acro IS NULL
THEN CONCAT (name, ', ', locname)
-- eg "I3S (Laboratoire d'Informatique, Signaux et Systèmes), Sophia Antipolis, France"
ELSE CONCAT (acro, ' (' ,name,'), ', locname)
END AS tostring
FROM orgs
ORDER BY RAND()
LIMIT 10;
-- EXAMPLES:
-- +-----------------------------------------------------+-------------+--------------------------+----------------------------------------------------------------------------------+
-- | name | acro | locname | tostring |
-- +-----------------------------------------------------+-------------+--------------------------+----------------------------------------------------------------------------------+
-- | Dynamiques et écologie des paysages agroforestiers | DYNAFOR | NULL | DYNAFOR (Dynamiques et écologie des paysages agroforestiers) |
-- | University of Waterloo | NULL | Waterloo, Canada | University of Waterloo, Waterloo, Canada |
-- | University of Arizona | NULL | Tucson, Arizona, USA | University of Arizona, Tucson, Arizona, USA |
-- | Laboratoire d'Informatique, Signaux et Systèmes | I3S | Sophia Antipolis, France | I3S (Laboratoire d'Informatique, Signaux et Systèmes), Sophia Antipolis, France |
-- | Visvesvaraya National Institute of Technology | NULL | NULL | Visvesvaraya National Institute of Technology |
-- | Sciences Po | NULL | Paris, France | Sciences Po, Paris, France |
-- | School of Human Evolution and Social Change | SHESC | NULL | SHESC (School of Human Evolution and Social Change) |
-- | NULL | DSSCQ | NULL | DSSCQ |
-- +-----------------------------------------------------+-------------+--------------------------+----------------------------------------------------------------------------------+
......@@ -123,6 +123,11 @@
onclick='$(this).parents(".dropdown-menu").toggle();'>
Filter by laboratory</a>
</li>
<li>
<a id="addfilterorganization" href="#"
onclick='$(this).parents(".dropdown-menu").toggle();'>
Filter by organization</a>
</li>
</ul>
</li>
<li class="comex-nav-item">
......
......@@ -160,6 +160,5 @@
// ---------
var uinfo = {{ (current_user.json_info | safe) if current_user.info else ("null" | safe) }};
</script>
<script src="{{ url_for('static', filename='js/comex_page_rootindex.js') }}"></script>
{% endblock %}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment