Merge pull request #5614 from sysown/v3.0-pgsql-monitor-reschedule-on-interval-change

fix(pgsql-monitor): clamp next_*_at on LOAD PGSQL VARIABLES TO RUNTIME
pull/5591/head
René Cannaò 1 month ago committed by GitHub
commit f7ee5cd028
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -2533,14 +2533,83 @@ void* PgSQL_monitor_scheduler_thread() {
// Check variable version changes; refresh if needed
unsigned int glover = GloPTH->get_global_version();
bool vars_refreshed = false;
if (PgSQL_Thread__variables_version < glover) {
PgSQL_Thread__variables_version = glover;
pgsql_thread->refresh_variables();
vars_refreshed = true;
}
// Fetch config for next task scheduling
tasks_conf_t tasks_conf { fetch_updated_conf(GloPgMon, PgHGM) };
// When runtime variables were just refreshed (i.e. someone did
// SET pgsql-monitor_*_interval=<smaller_value>; LOAD PGSQL
// VARIABLES TO RUNTIME;), the newly-read interval values may be
// smaller than the ones we used the last time we recomputed
// next_intvs. Without the clamp below, each next_<type>_at still
// points at the cycle that was scheduled under the OLD (larger)
// interval, so a LOAD doesn't visibly take effect until the old
// cycle elapses — which can be up to ~2 minutes for connect
// checks under the default pgsql-monitor_connect_interval=120000.
//
// Example of the surprise this avoids:
//
// T=0 proxysql starts with monitor_connect_interval=120000,
// next_connect_at is scheduled for T=120000ms.
// T=5 user runs SET pgsql-monitor_connect_interval=2000;
// LOAD PGSQL VARIABLES TO RUNTIME; expecting the next
// connect cycle within ~2 seconds.
// T=7 without this clamp, next_connect_at is still 120000
// — no new connect fires for another 113 seconds.
//
// The fix: after every successful refresh, shrink any
// next_<type>_at that's now farther in the future than
// (now + new_interval). We never push next_<type>_at further
// into the future — growing the interval should not delay an
// already-imminent check — so the direction of the clamp is
// one-way (min()). Types whose interval is 0 (disabled) are
// skipped and handled by the existing compute_next_intvs()
// which sets them to ULONG_MAX.
if (vars_refreshed) {
if (tasks_conf.ping.params.interval > 0) {
const uint64_t clamped = cur_intv_start + tasks_conf.ping.params.interval;
if (next_intvs.next_ping_at > clamped) {
proxy_debug(PROXY_DEBUG_MONITOR, 5,
"Clamped next_ping_at old=%lu new=%lu interval=%d\n",
next_intvs.next_ping_at, clamped, tasks_conf.ping.params.interval);
next_intvs.next_ping_at = clamped;
}
}
if (tasks_conf.connect.params.interval > 0) {
const uint64_t clamped = cur_intv_start + tasks_conf.connect.params.interval;
if (next_intvs.next_connect_at > clamped) {
proxy_debug(PROXY_DEBUG_MONITOR, 5,
"Clamped next_connect_at old=%lu new=%lu interval=%d\n",
next_intvs.next_connect_at, clamped, tasks_conf.connect.params.interval);
next_intvs.next_connect_at = clamped;
}
}
if (tasks_conf.readonly.params.interval > 0) {
const uint64_t clamped = cur_intv_start + tasks_conf.readonly.params.interval;
if (next_intvs.next_readonly_at > clamped) {
proxy_debug(PROXY_DEBUG_MONITOR, 5,
"Clamped next_readonly_at old=%lu new=%lu interval=%d\n",
next_intvs.next_readonly_at, clamped, tasks_conf.readonly.params.interval);
next_intvs.next_readonly_at = clamped;
}
}
if (tasks_conf.repl_lag.params.interval > 0) {
const uint64_t clamped = cur_intv_start + tasks_conf.repl_lag.params.interval;
if (next_intvs.next_repl_lag_at > clamped) {
proxy_debug(PROXY_DEBUG_MONITOR, 5,
"Clamped next_repl_lag_at old=%lu new=%lu interval=%d\n",
next_intvs.next_repl_lag_at, clamped, tasks_conf.repl_lag.params.interval);
next_intvs.next_repl_lag_at = clamped;
}
}
}
// Perform table maintenance
maint_mon_tables(tasks_conf, next_intvs, cur_intv_start);

@ -648,16 +648,25 @@ int main(int argc, char** argv) {
// Configure monitor: use 'postgres' user which is accepted by the
// backend, set a tight connect_interval so cycles run within the test
// wait window. Restore original values afterwards.
//
// We deliberately do NOT change pgsql-monitor_username /
// pgsql-monitor_password. An earlier version of this test set them
// to 'postgres'/'postgres' on the assumption that the backend had a
// postgres user with password 'postgres', but the actual CI infra
// (docker-pgsql16-single) randomizes POSTGRES_PASSWORD per container
// startup (e.g. "05e792e51d"), so the hardcoded credentials never
// authenticated and every monitor connect failed with
// FATAL: password authentication failed for user "postgres"
// The default monitor/monitor user works against the infra's
// pg_hba.conf and is what the initial (pre-test) connect check
// already uses successfully.
long original_connect_interval = getConnectInterval(a);
std::string original_monitor_username = exec_scalar(a,
// Read current monitor username for diagnostic logging only.
std::string monitor_username = exec_scalar(a,
"SELECT Variable_Value FROM global_variables WHERE Variable_Name='pgsql-monitor_username'");
std::string original_monitor_password = exec_scalar(a,
"SELECT Variable_Value FROM global_variables WHERE Variable_Name='pgsql-monitor_password'");
diag("Original monitor: user=%s interval=%ld ms",
original_monitor_username.c_str(), original_connect_interval);
diag("Current monitor: user=%s interval=%ld ms",
monitor_username.c_str(), original_connect_interval);
exec_ok(a, "SET pgsql-monitor_username='postgres'");
exec_ok(a, "SET pgsql-monitor_password='postgres'");
setConnectInterval(a, 2000);
exec_ok(a, "UPDATE pgsql_servers SET use_ssl=1");
exec_ok(a, "LOAD PGSQL SERVERS TO RUNTIME");
@ -666,15 +675,9 @@ int main(int argc, char** argv) {
test_monitor_ssl_with_per_server_params(a);
test_monitor_uses_per_server_row(a);
// Restore original monitor settings
{
std::stringstream q;
q << "SET pgsql-monitor_username='" << original_monitor_username << "'";
exec_ok(a, q.str().c_str());
q.str("");
q << "SET pgsql-monitor_password='" << original_monitor_password << "'";
exec_ok(a, q.str().c_str());
}
// Restore original connect interval. Monitor username/password are
// no longer touched by this test - see comment above - so there's
// nothing to restore there.
setConnectInterval(a, (int)original_connect_interval);
// Part 3: Cluster query support

Loading…
Cancel
Save