Commit 123c5b92 authored by Romain Loth

custom 'users_status' param for api/aggs

parent 791106e9
...@@ -101,18 +101,38 @@ def doors_uid_to_luid(doors_uid): ...@@ -101,18 +101,38 @@ def doors_uid_to_luid(doors_uid):
return luid return luid
def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])): def get_field_aggs(a_field,
hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD']),
users_status = "active"):
""" """
Use case: api/aggs?field=a_field Use case: api/aggs?field=a_field
---------------------------------
=> Retrieves distinct field values and count having it => Retrieves distinct field values and count having it
=> about *n* vs *occs*: => about *n* vs *occs*:
- for tables != keywords count is scholar count - for tables != keywords count is scholar count
- for table keywords count is occurrences count - for table keywords count is occurrences count
NB relies on FIELDS_FRONTEND_TO_SQL mapping Parameters
POSS: allow other fields than those in the mapping ----------
if they are already in sql table.col format? a_field: str
a front-end fieldname to aggregate, like "keywords" "countries"
(allowed values cf. FIELDS_FRONTEND_TO_SQL)
POSS: allow other fields than those in the mapping
if they are already in sql table.col format?
hapax_threshold: int
for all data_types, categories with a total equal to or below this will be excluded from results
TODO: put them in an 'others' category
POSS: have a different threshold by type
users_status: str
defines the perimeter (set of scholars over which we work),
(allowed values are ['active', 'test', 'legacy', 'ALL'])
NB: if the param is 'legacy' here, set is indifferent to call_date
(because aggs useful for *entire* legacy group)
""" """
agg_rows = [] agg_rows = []
...@@ -127,82 +147,113 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])): ...@@ -127,82 +147,113 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])):
db = connect_db() db = connect_db()
db_c = db.cursor(DictCursor) db_c = db.cursor(DictCursor)
if type(hapax_threshold) == int and hapax_threshold > 0: # constraints 1, if any
count_col = 'occs' if sql_tab == 'keywords' else 'n' prefilters = []
where_clause = "WHERE %s > %i" % (count_col, hapax_threshold) if users_status != 'ALL':
prefilters.append( "scholars.record_status = \"%s\"" % users_status)
if len(prefilters):
pre_where = "WHERE "+" AND ".join(
['('+f+')' for f in prefilters]
)
else:
pre_where = ""
# constraints 2, if any
postfilters = []
if hapax_threshold > 0:
count_col = 'occs' if sql_tab in ['keywords', 'hashtags'] else 'n'
postfilters.append( "%s > %i" % (count_col, hapax_threshold) )
if len(postfilters):
post_where = "WHERE "+" AND ".join(
['('+f+')' for f in postfilters]
)
else: else:
where_clause = "" post_where = ""
# retrieval cases
if sql_tab == 'scholars': if sql_tab == 'scholars':
stmt = """ stmt = """
SELECT * FROM ( SELECT x, n FROM (
SELECT %(col)s AS x, COUNT(*) AS n SELECT %(col)s AS x, COUNT(*) AS n, record_status
FROM scholars FROM scholars
%(pre_filter)s
GROUP BY %(col)s GROUP BY %(col)s
) AS allcounts ) AS allcounts
%(filter)s %(post_filter)s
ORDER BY n DESC ORDER BY n DESC
""" % {'col': sql_col, 'filter': where_clause} """ % {'col': sql_col, 'pre_filter': pre_where,
'post_filter': post_where}
elif sql_tab == 'affiliations': elif sql_tab == 'affiliations':
stmt = """ stmt = """
SELECT * FROM ( SELECT x, n FROM (
SELECT %(col)s AS x, COUNT(*) AS n SELECT %(col)s AS x, COUNT(*) AS n, record_status
FROM scholars FROM scholars
-- 0 or 1 -- 0 or 1
LEFT JOIN affiliations LEFT JOIN affiliations
ON scholars.affiliation_id = affiliations.affid ON scholars.affiliation_id = affiliations.affid
%(pre_filter)s
GROUP BY %(col)s GROUP BY %(col)s
) AS allcounts ) AS allcounts
%(filter)s %(post_filter)s
ORDER BY n DESC ORDER BY n DESC
""" % {'col': sql_col, 'filter': where_clause} """ % {'col': sql_col, 'pre_filter': pre_where,
'post_filter': post_where}
elif sql_tab == 'linked_ids': elif sql_tab == 'linked_ids':
stmt = """ stmt = """
SELECT * FROM ( SELECT x, n FROM (
SELECT %(col)s AS x, COUNT(*) AS n SELECT %(col)s AS x, COUNT(*) AS n, record_status
FROM scholars FROM scholars
-- 0 or 1 -- 0 or 1
LEFT JOIN linked_ids LEFT JOIN linked_ids
ON scholars.luid = linked_ids.uid ON scholars.luid = linked_ids.uid
%(pre_filter)s
GROUP BY %(col)s GROUP BY %(col)s
) AS allcounts ) AS allcounts
%(filter)s %(post_filter)s
ORDER BY n DESC ORDER BY n DESC
""" % {'col': sql_col, 'filter': where_clause} """ % {'col': sql_col, 'pre_filter': pre_where,
'post_filter': post_where}
elif sql_tab == 'keywords': elif sql_tab == 'keywords':
stmt = """ stmt = """
SELECT * FROM ( SELECT x, occs FROM (
SELECT %(col)s AS x, COUNT(*) AS occs SELECT %(col)s AS x, COUNT(*) AS occs, record_status
FROM scholars FROM scholars
-- 0 or many -- 0 or many
LEFT JOIN sch_kw LEFT JOIN sch_kw
ON scholars.luid = sch_kw.uid ON scholars.luid = sch_kw.uid
LEFT JOIN keywords LEFT JOIN keywords
ON sch_kw.kwid = keywords.kwid ON sch_kw.kwid = keywords.kwid
%(pre_filter)s
GROUP BY %(col)s GROUP BY %(col)s
) AS allcounts ) AS allcounts
%(filter)s %(post_filter)s
ORDER BY occs DESC ORDER BY occs DESC
""" % {'col': sql_col, 'filter': where_clause} """ % {'col': sql_col, 'pre_filter': pre_where,
'post_filter': post_where}
elif sql_tab == 'hashtags': elif sql_tab == 'hashtags':
stmt = """ stmt = """
SELECT * FROM ( SELECT x, occs FROM (
SELECT %(col)s AS x, COUNT(*) AS occs SELECT %(col)s AS x, COUNT(*) AS occs, record_status
FROM scholars FROM scholars
-- 0 or many -- 0 or many
LEFT JOIN sch_ht LEFT JOIN sch_ht
ON scholars.luid = sch_ht.uid ON scholars.luid = sch_ht.uid
LEFT JOIN hashtags LEFT JOIN hashtags
ON sch_ht.htid = hashtags.htid ON sch_ht.htid = hashtags.htid
%(pre_filter)s
GROUP BY %(col)s GROUP BY %(col)s
) AS allcounts ) AS allcounts
%(filter)s %(post_filter)s
ORDER BY occs DESC ORDER BY occs DESC
""" % {'col': sql_col, 'filter': where_clause} """ % {'col': sql_col, 'pre_filter': pre_where,
'post_filter': post_where}
mlog("DEBUGSQL", "get_field_aggs STATEMENT:\n-- SQL\n%s\n-- /SQL" % stmt) mlog("DEBUGSQL", "get_field_aggs STATEMENT:\n-- SQL\n%s\n-- /SQL" % stmt)
...@@ -214,7 +265,8 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])): ...@@ -214,7 +265,8 @@ def get_field_aggs(a_field, hapax_threshold=int(REALCONFIG['HAPAX_THRESHOLD'])):
db.close() db.close()
mlog('INFO', agg_rows) # mlog('DEBUG', "aggregation over %s: result rows =" % a_field, agg_rows)
return agg_rows return agg_rows
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment