diff --git a/include/MonitorHealthDecision.h b/include/MonitorHealthDecision.h
new file mode 100644
index 000000000..b186fec87
--- /dev/null
+++ b/include/MonitorHealthDecision.h
@@ -0,0 +1,95 @@
+/**
+ * @file MonitorHealthDecision.h
+ * @brief Pure decision functions for monitor health state transitions.
+ *
+ * Extracted from MySQL_Monitor, MySrvC, and MyHGC for unit testability.
+ * These functions have no global state dependencies — all inputs are
+ * passed as parameters.
+ *
+ * @see Phase 3.3 (GitHub issue #5491)
+ */
+
+#ifndef MONITOR_HEALTH_DECISION_H
+#define MONITOR_HEALTH_DECISION_H
+
+#include <ctime>
+
+/**
+ * @brief Determine if a server should be shunned based on connect errors.
+ *
+ * Mirrors the logic in MySrvC::connect_error() — a server is shunned
+ * when errors in the current second reach min(shun_on_failures,
+ * connect_retries_on_failure + 1).
+ *
+ * @param errors_this_second Number of connect errors in the current second.
+ * @param shun_on_failures Config: mysql-shun_on_failures.
+ * @param connect_retries Config: mysql-connect_retries_on_failure.
+ * @return true if the error count meets or exceeds the shunning threshold.
+ */
+bool should_shun_on_connect_errors(
+	unsigned int errors_this_second,
+	int shun_on_failures,
+	int connect_retries
+);
+
+/**
+ * @brief Determine if a shunned server can be brought back online.
+ *
+ * Mirrors the recovery logic in MyHGC's server scan loop. A server
+ * can be unshunned when:
+ *   1. Enough time has elapsed since the last detected error.
+ *   2. If shunned_and_kill_all_connections is true, all connections
+ *      (both used and free) must be fully drained first.
+ *
+ * @param time_last_error Timestamp of the last detected error.
+ * @param current_time Current time.
+ * @param shun_recovery_time_sec Config: mysql-shun_recovery_time_sec (0 = recovery disabled).
+ * @param connect_timeout_max_ms Config: mysql-connect_timeout_server_max (milliseconds).
+ * @param kill_all_conns Whether shunned_and_kill_all_connections is set.
+ * @param connections_used Number of in-use connections.
+ * @param connections_free Number of idle connections.
+ * @return true if the server can be unshunned.
+ */
+bool can_unshun_server(
+	time_t time_last_error,
+	time_t current_time,
+	int shun_recovery_time_sec,
+	int connect_timeout_max_ms,
+	bool kill_all_conns,
+	unsigned int connections_used,
+	unsigned int connections_free
+);
+
+/**
+ * @brief Determine if a server should be shunned for replication lag.
+ *
+ * Mirrors the replication lag check in MySQL_HostGroups_Manager.
+ * A server is shunned when its replication lag exceeds max_replication_lag
+ * for N consecutive checks (where N = monitor_replication_lag_count).
+ *
+ * @param current_lag Measured replication lag in seconds (negative = unknown).
+ * @param max_replication_lag Configured max lag threshold (0 = disabled).
+ * @param consecutive_count Number of consecutive checks exceeding threshold.
+ * @param count_threshold Config: mysql-monitor_replication_lag_count.
+ * @return true if the server should be shunned for replication lag.
+ */
+bool should_shun_on_replication_lag(
+	int current_lag,
+	unsigned int max_replication_lag,
+	unsigned int consecutive_count,
+	int count_threshold
+);
+
+/**
+ * @brief Determine if a server shunned for replication lag can be recovered.
+ *
+ * @param current_lag Measured replication lag in seconds (negative = unknown).
+ * @param max_replication_lag Configured max lag threshold.
+ * @return true if the lag is known and does not exceed max_replication_lag.
+ */
+bool can_recover_from_replication_lag(
+	int current_lag,
+	unsigned int max_replication_lag
+);
+
+#endif // MONITOR_HEALTH_DECISION_H
diff --git a/lib/Makefile b/lib/Makefile
index 932cc386e..f7f24075c 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -105,6 +105,7 @@ _OBJ_CXX := ProxySQL_GloVars.oo network.oo debug.oo configfile.oo Query_Cache.oo
 	PgSQL_Variables_Validator.oo PgSQL_ExplicitTxnStateMgr.oo \
 	PgSQL_PreparedStatement.oo PgSQL_Extended_Query_Message.oo \
 	pgsql_tokenizer.oo \
+	MonitorHealthDecision.oo \
 	proxy_sqlite3_symbols.oo
 
 # TSDB object files
diff --git a/lib/MonitorHealthDecision.cpp b/lib/MonitorHealthDecision.cpp
new file mode 100644
index 000000000..451537e30
--- /dev/null
+++ b/lib/MonitorHealthDecision.cpp
@@ -0,0 +1,110 @@
+/**
+ * @file MonitorHealthDecision.cpp
+ * @brief Implementation of pure monitor health decision functions.
+ *
+ * These functions extract the decision logic from MySrvC::connect_error(),
+ * MyHGC's unshun recovery loop, and MySQL_HostGroups_Manager's replication
+ * lag check. They are intentionally free of global state and I/O.
+ *
+ * @see MonitorHealthDecision.h
+ * @see Phase 3.3 (GitHub issue #5491)
+ */
+
+#include "MonitorHealthDecision.h"
+
+bool should_shun_on_connect_errors(
+	unsigned int errors_this_second,
+	int shun_on_failures,
+	int connect_retries)
+{
+	// Mirror MySrvC::connect_error() threshold logic:
+	// max_failures = min(shun_on_failures, connect_retries + 1)
+	int connect_retries_plus_1 = connect_retries + 1;
+	int max_failures = (shun_on_failures > connect_retries_plus_1)
+		? connect_retries_plus_1
+		: shun_on_failures;
+
+	return (errors_this_second >= (unsigned int)max_failures);
+}
+
+bool can_unshun_server(
+	time_t time_last_error,
+	time_t current_time,
+	int shun_recovery_time_sec,
+	int connect_timeout_max_ms,
+	bool kill_all_conns,
+	unsigned int connections_used,
+	unsigned int connections_free)
+{
+	if (shun_recovery_time_sec == 0) {
+		return false; // recovery disabled
+	}
+
+	// Mirror MyHGC recovery: compute max_wait_sec with timeout cap.
+	// The product is computed in 64 bits so a large recovery time cannot
+	// overflow int (signed overflow is undefined behavior).
+	int max_wait_sec;
+	if ((long long)shun_recovery_time_sec * 1000 >= connect_timeout_max_ms) {
+		max_wait_sec = connect_timeout_max_ms / 1000 - 1;
+	} else {
+		max_wait_sec = shun_recovery_time_sec;
+	}
+	if (max_wait_sec < 1) {
+		max_wait_sec = 1;
+	}
+
+	// Time check
+	if (current_time <= time_last_error) {
+		return false;
+	}
+	if ((current_time - time_last_error) <= max_wait_sec) {
+		return false;
+	}
+
+	// Connection drain check for kill-all mode
+	if (kill_all_conns) {
+		if (connections_used != 0 || connections_free != 0) {
+			return false; // connections still draining
+		}
+	}
+
+	return true;
+}
+
+bool should_shun_on_replication_lag(
+	int current_lag,
+	unsigned int max_replication_lag,
+	unsigned int consecutive_count,
+	int count_threshold)
+{
+	// Mirror MySQL_HostGroups_Manager replication lag logic
+	if (current_lag < 0) {
+		return false; // lag unknown, don't shun
+	}
+	if (max_replication_lag == 0) {
+		return false; // lag check disabled
+	}
+	// current_lag is known to be non-negative here, so compare as unsigned:
+	// casting max_replication_lag to int would go negative above INT_MAX.
+	if ((unsigned int)current_lag <= max_replication_lag) {
+		return false; // within threshold
+	}
+
+	// Lag exceeds threshold — check consecutive count
+	// The caller is expected to have incremented consecutive_count
+	// before calling this function
+	return (consecutive_count >= (unsigned int)count_threshold);
+}
+
+bool can_recover_from_replication_lag(
+	int current_lag,
+	unsigned int max_replication_lag)
+{
+	// Mirror MySQL_HostGroups_Manager unshun for replication lag:
+	// recover when lag drops to <= max_replication_lag
+	if (current_lag < 0) {
+		return false; // unknown lag, don't recover
+	}
+	// Non-negative lag: compare as unsigned so thresholds above INT_MAX work.
+	return ((unsigned int)current_lag <= max_replication_lag);
+}