mirror of https://github.com/sysown/proxysql
Extract monitor health decision logic into pure functions (Phase 3.3, #5491)
New files:
- include/MonitorHealthDecision.h: declarations for 4 pure functions
- lib/MonitorHealthDecision.cpp: implementations

Extracted functions:
- should_shun_on_connect_errors(): mirrors the MySrvC::connect_error() threshold logic (min(shun_on_failures, connect_retries+1))
- can_unshun_server(): mirrors the MyHGC recovery loop with time check, connect_timeout_max cap, minimum 1s floor, and kill_all drain check
- should_shun_on_replication_lag(): mirrors the MySQL_HostGroups_Manager lag check with consecutive count threshold
- can_recover_from_replication_lag(): mirrors the lag recovery check

All functions are pure (no global state, no I/O) and are added to libproxysql.a via lib/Makefile.

pull/5507/head
parent
b0889e1313
commit
59654e48a3
@ -0,0 +1,95 @@
|
||||
/**
|
||||
* @file MonitorHealthDecision.h
|
||||
* @brief Pure decision functions for monitor health state transitions.
|
||||
*
|
||||
* Extracted from MySQL_Monitor, MySrvC, and MyHGC for unit testability.
|
||||
* These functions have no global state dependencies — all inputs are
|
||||
* passed as parameters.
|
||||
*
|
||||
* @see Phase 3.3 (GitHub issue #5491)
|
||||
*/
|
||||
|
||||
#ifndef MONITOR_HEALTH_DECISION_H
|
||||
#define MONITOR_HEALTH_DECISION_H
|
||||
|
||||
#include <ctime>
|
||||
|
||||
/**
|
||||
* @brief Determine if a server should be shunned based on connect errors.
|
||||
*
|
||||
* Mirrors the logic in MySrvC::connect_error() — a server is shunned
|
||||
* when errors in the current second reach min(shun_on_failures,
|
||||
* connect_retries_on_failure + 1).
|
||||
*
|
||||
* @param errors_this_second Number of connect errors in the current second.
|
||||
* @param shun_on_failures Config: mysql-shun_on_failures.
|
||||
* @param connect_retries Config: mysql-connect_retries_on_failure.
|
||||
* @return true if the error count meets or exceeds the shunning threshold.
|
||||
*/
|
||||
bool should_shun_on_connect_errors(
|
||||
unsigned int errors_this_second,
|
||||
int shun_on_failures,
|
||||
int connect_retries
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief Determine if a shunned server can be brought back online.
|
||||
*
|
||||
* Mirrors the recovery logic in MyHGC's server scan loop. A server
|
||||
* can be unshunned when:
|
||||
* 1. Enough time has elapsed since the last detected error.
|
||||
* 2. If shunned_and_kill_all_connections is true, all connections
|
||||
* (both used and free) must be fully drained first.
|
||||
*
|
||||
* @param time_last_error Timestamp of the last detected error.
|
||||
* @param current_time Current time.
|
||||
* @param shun_recovery_time_sec Config: mysql-shun_recovery_time_sec (0 disables recovery: the function always returns false).
|
||||
* @param connect_timeout_max_ms Config: mysql-connect_timeout_server_max (milliseconds).
|
||||
* @param kill_all_conns Whether shunned_and_kill_all_connections is set.
|
||||
* @param connections_used Number of in-use connections.
|
||||
* @param connections_free Number of idle connections.
|
||||
* @return true if the server can be unshunned.
|
||||
*/
|
||||
bool can_unshun_server(
|
||||
time_t time_last_error,
|
||||
time_t current_time,
|
||||
int shun_recovery_time_sec,
|
||||
int connect_timeout_max_ms,
|
||||
bool kill_all_conns,
|
||||
unsigned int connections_used,
|
||||
unsigned int connections_free
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief Determine if a server should be shunned for replication lag.
|
||||
*
|
||||
* Mirrors the replication lag check in MySQL_HostGroups_Manager.
|
||||
* A server is shunned when its replication lag exceeds max_replication_lag
|
||||
* for N consecutive checks (where N = monitor_replication_lag_count).
|
||||
*
|
||||
* @param current_lag Measured replication lag in seconds (any negative value = unknown; never shuns).
|
||||
* @param max_replication_lag Configured max lag threshold (0 = disabled).
|
||||
* @param consecutive_count Number of consecutive checks exceeding threshold, already including the current check (the caller increments it before calling).
|
||||
* @param count_threshold Config: mysql-monitor_replication_lag_count.
|
||||
* @return true if the server should be shunned for replication lag.
|
||||
*/
|
||||
bool should_shun_on_replication_lag(
|
||||
int current_lag,
|
||||
unsigned int max_replication_lag,
|
||||
unsigned int consecutive_count,
|
||||
int count_threshold
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief Determine if a server shunned for replication lag can be recovered.
|
||||
*
|
||||
* @param current_lag Measured replication lag in seconds (any negative value = unknown; never recovers).
|
||||
* @param max_replication_lag Configured max lag threshold.
|
||||
* @return true if the server's lag is now within acceptable bounds.
|
||||
*/
|
||||
bool can_recover_from_replication_lag(
|
||||
int current_lag,
|
||||
unsigned int max_replication_lag
|
||||
);
|
||||
|
||||
#endif // MONITOR_HEALTH_DECISION_H
|
||||
@ -0,0 +1,105 @@
|
||||
/**
|
||||
* @file MonitorHealthDecision.cpp
|
||||
* @brief Implementation of pure monitor health decision functions.
|
||||
*
|
||||
* These functions extract the decision logic from MySrvC::connect_error(),
|
||||
* MyHGC's unshun recovery loop, and MySQL_HostGroups_Manager's replication
|
||||
* lag check. They are intentionally free of global state and I/O.
|
||||
*
|
||||
* @see MonitorHealthDecision.h
|
||||
* @see Phase 3.3 (GitHub issue #5491)
|
||||
*/
|
||||
|
||||
#include "MonitorHealthDecision.h"
|
||||
|
||||
/**
 * Decide whether the connect-error count seen in the current second is
 * high enough to shun the server.
 *
 * The threshold is the smaller of shun_on_failures and
 * connect_retries + 1. It is converted to unsigned for the comparison,
 * so a negative threshold becomes a very large value.
 *
 * @param errors_this_second  Connect errors counted in the current second.
 * @param shun_on_failures    Configured failure limit.
 * @param connect_retries     Configured number of connect retries.
 * @return true when errors_this_second is at or above the threshold.
 */
bool should_shun_on_connect_errors(
	unsigned int errors_this_second,
	int shun_on_failures,
	int connect_retries)
{
	// Start from the configured limit and lower it to the retry budget
	// (retries plus the initial attempt) when that is smaller.
	int threshold = shun_on_failures;
	const int retry_budget = connect_retries + 1;
	if (retry_budget < threshold) {
		threshold = retry_budget;
	}

	if (errors_this_second < (unsigned int)threshold) {
		return false;
	}
	return true;
}
|
||||
|
||||
/**
 * Decide whether a shunned server may be brought back online.
 *
 * Returns true only when every condition below holds:
 *   - recovery is enabled (shun_recovery_time_sec != 0);
 *   - current_time is strictly later than time_last_error and the elapsed
 *     seconds are strictly greater than the effective wait;
 *   - when kill_all_conns is set, connections_used and connections_free
 *     are both zero.
 *
 * The effective wait is shun_recovery_time_sec, unless that value expressed
 * in milliseconds is >= connect_timeout_max_ms, in which case it is
 * connect_timeout_max_ms / 1000 - 1. The result is never less than 1 second.
 *
 * @param time_last_error         Timestamp of the last detected error.
 * @param current_time            Current time.
 * @param shun_recovery_time_sec  Recovery interval in seconds (0 = disabled).
 * @param connect_timeout_max_ms  Upper bound for the wait, in milliseconds.
 * @param kill_all_conns          Require all connections to be drained first.
 * @param connections_used        Number of in-use connections.
 * @param connections_free        Number of idle connections.
 * @return true if the server can be unshunned.
 */
bool can_unshun_server(
	time_t time_last_error,
	time_t current_time,
	int shun_recovery_time_sec,
	int connect_timeout_max_ms,
	bool kill_all_conns,
	unsigned int connections_used,
	unsigned int connections_free)
{
	if (shun_recovery_time_sec == 0) {
		return false; // recovery disabled
	}

	// Widen before converting to milliseconds: the product does not fit in
	// an int once the recovery time exceeds roughly 2.1 million seconds,
	// and signed overflow is undefined behaviour.
	const long long recovery_ms = (long long)shun_recovery_time_sec * 1000LL;

	int max_wait_sec;
	if (recovery_ms >= (long long)connect_timeout_max_ms) {
		// Recovery interval is not shorter than the connect timeout cap:
		// wait just under the cap instead.
		max_wait_sec = connect_timeout_max_ms / 1000 - 1;
	} else {
		max_wait_sec = shun_recovery_time_sec;
	}
	if (max_wait_sec < 1) {
		max_wait_sec = 1; // never wait less than one second
	}

	// Time check: the clock must have moved past the error, and by more
	// than the effective wait.
	if (current_time <= time_last_error) {
		return false;
	}
	if ((current_time - time_last_error) <= max_wait_sec) {
		return false;
	}

	// Connection drain check for kill-all mode.
	if (kill_all_conns) {
		if (connections_used != 0 || connections_free != 0) {
			return false; // connections still draining
		}
	}

	return true;
}
|
||||
|
||||
/**
 * Decide whether a server should be shunned for replication lag.
 *
 * Returns false when the lag is unknown (negative), when the check is
 * disabled (max_replication_lag == 0), or when the lag is within the
 * threshold. Otherwise returns whether consecutive_count has reached
 * count_threshold.
 *
 * @param current_lag          Measured lag in seconds; negative = unknown.
 * @param max_replication_lag  Lag threshold in seconds; 0 = disabled.
 * @param consecutive_count    Consecutive over-threshold checks; the caller
 *                             is expected to have already counted the
 *                             current check.
 * @param count_threshold      Required number of consecutive checks.
 * @return true if the server should be shunned for replication lag.
 */
bool should_shun_on_replication_lag(
	int current_lag,
	unsigned int max_replication_lag,
	unsigned int consecutive_count,
	int count_threshold)
{
	if (current_lag < 0) {
		return false; // lag unknown, don't shun
	}
	if (max_replication_lag == 0) {
		return false; // lag check disabled
	}

	// Compare in unsigned: current_lag is non-negative here, so the
	// conversion is lossless, whereas narrowing the threshold to int would
	// turn values above INT_MAX into negatives and make every lag "exceed".
	if ((unsigned int)current_lag <= max_replication_lag) {
		return false; // within threshold
	}

	// Lag exceeds threshold: shun once enough consecutive checks have
	// failed.
	return (consecutive_count >= (unsigned int)count_threshold);
}
|
||||
|
||||
/**
 * Decide whether a server shunned for replication lag can be recovered.
 *
 * @param current_lag          Measured lag in seconds; negative = unknown.
 * @param max_replication_lag  Lag threshold in seconds.
 * @return true when the lag is known and does not exceed the threshold;
 *         false when the lag is unknown or still above the threshold.
 */
bool can_recover_from_replication_lag(
	int current_lag,
	unsigned int max_replication_lag)
{
	if (current_lag < 0) {
		return false; // unknown lag, don't recover
	}

	// Compare in unsigned: current_lag is non-negative here, so the
	// conversion is lossless, whereas narrowing the threshold to int would
	// turn values above INT_MAX into negatives and block recovery forever.
	return ((unsigned int)current_lag <= max_replication_lag);
}
|
||||
Loading…
Reference in new issue