From 24db561c0933d5489e7e520e9f2a1eb06982df73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Canna=C3=B2?= Date: Sun, 22 Mar 2026 08:49:27 +0100 Subject: [PATCH] Add monitor health unit tests (Phase 3.3, #5491) 31 test cases covering all monitor health decision branches: - Connect error shunning: threshold computation, boundary values - Unshun recovery: time elapsed, timeout cap, kill_all drain check, recovery disabled, clock skew, min 1s floor - Replication lag shunning: threshold, consecutive count, disabled, unknown lag, boundary - Replication lag recovery: below/at/above threshold, unknown lag --- test/tap/tests/unit/Makefile | 7 +- test/tap/tests/unit/monitor_health_unit-t.cpp | 187 ++++++++++++++++++ 2 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 test/tap/tests/unit/monitor_health_unit-t.cpp diff --git a/test/tap/tests/unit/Makefile b/test/tap/tests/unit/Makefile index 27279b1f9..0d56f1758 100644 --- a/test/tap/tests/unit/Makefile +++ b/test/tap/tests/unit/Makefile @@ -231,7 +231,7 @@ $(ODIR)/test_init.o: $(TEST_HELPERS_DIR)/test_init.cpp | $(ODIR) # Unit test targets # =========================================================================== -UNIT_TESTS := smoke_test-t query_cache_unit-t query_processor_unit-t protocol_unit-t auth_unit-t connection_pool_unit-t rule_matching_unit-t hostgroups_unit-t +UNIT_TESTS := smoke_test-t query_cache_unit-t query_processor_unit-t protocol_unit-t auth_unit-t connection_pool_unit-t rule_matching_unit-t hostgroups_unit-t monitor_health_unit-t .PHONY: all all: $(UNIT_TESTS) @@ -285,6 +285,11 @@ hostgroups_unit-t: hostgroups_unit-t.cpp $(TEST_HELPERS_OBJ) $(LIBPROXYSQLAR) $(LIBPROXYSQLAR_FULL) $(STATIC_LIBS) $(MYLIBS) \ $(ALLOW_MULTI_DEF) -o $@ +monitor_health_unit-t: monitor_health_unit-t.cpp $(TEST_HELPERS_OBJ) $(LIBPROXYSQLAR) + $(CXX) $< $(TEST_HELPERS_OBJ) $(IDIRS) $(LDIRS) $(OPT) \ + $(LIBPROXYSQLAR_FULL) $(STATIC_LIBS) $(MYLIBS) \ + $(ALLOW_MULTI_DEF) -o $@ + # =========================================================================== # Clean diff --git a/test/tap/tests/unit/monitor_health_unit-t.cpp b/test/tap/tests/unit/monitor_health_unit-t.cpp new file mode 100644 index 000000000..34ae7d3c2 --- /dev/null +++ b/test/tap/tests/unit/monitor_health_unit-t.cpp @@ -0,0 +1,187 @@ +/** + * @file monitor_health_unit-t.cpp + * @brief Unit tests for monitor health state decision functions. + * + * Tests the pure functions extracted from MySQL_Monitor, MySrvC, + * and MyHGC: + * - should_shun_on_connect_errors() + * - can_unshun_server() + * - should_shun_on_replication_lag() + * - can_recover_from_replication_lag() + * + * @see Phase 3.3 (GitHub issue #5491) + */ + +#include "tap.h" +#include "test_globals.h" +#include "test_init.h" + +#include "proxysql.h" +#include "MonitorHealthDecision.h" + +// ============================================================================ +// 1. should_shun_on_connect_errors +// ============================================================================ + +static void test_shun_connect_errors() { + // shun_on_failures=5, connect_retries=3 → threshold = min(5, 3+1) = 4 + ok(should_shun_on_connect_errors(4, 5, 3) == true, + "shun: errors=4 meets threshold min(5,4)=4"); + ok(should_shun_on_connect_errors(3, 5, 3) == false, + "no shun: errors=3 below threshold 4"); + ok(should_shun_on_connect_errors(10, 5, 3) == true, + "shun: errors=10 exceeds threshold"); + + // shun_on_failures=2, connect_retries=10 → threshold = min(2, 11) = 2 + ok(should_shun_on_connect_errors(2, 2, 10) == true, + "shun: errors=2 meets threshold min(2,11)=2"); + ok(should_shun_on_connect_errors(1, 2, 10) == false, + "no shun: errors=1 below threshold 2"); + + // Edge: shun_on_failures=1 → shun on first error + ok(should_shun_on_connect_errors(1, 1, 0) == true, + "shun: threshold=1, first error triggers shun"); + ok(should_shun_on_connect_errors(0, 1, 0) == false, + "no shun: zero errors"); +} + +// ============================================================================ +// 2. can_unshun_server +// ============================================================================ + +static void test_unshun_time_elapsed() { + // Recovery after enough time: last_error=100, now=200, recovery=10s + ok(can_unshun_server(100, 200, 10, 60000, false, 0, 0) == true, + "unshun: 100s elapsed > 10s recovery"); + + // Not enough time: last_error=100, now=105, recovery=10s + ok(can_unshun_server(100, 105, 10, 60000, false, 0, 0) == false, + "no unshun: 5s elapsed < 10s recovery"); + + // Exactly at boundary: elapsed == recovery → should NOT unshun (needs >) + ok(can_unshun_server(100, 110, 10, 60000, false, 0, 0) == false, + "no unshun: elapsed == recovery (needs >)"); +} + +static void test_unshun_timeout_cap() { + // recovery=30s, connect_timeout_max=10000ms → cap = 10000/1000-1 = 9s + ok(can_unshun_server(100, 200, 30, 10000, false, 0, 0) == true, + "unshun: capped to 9s, 100s elapsed is enough"); + + // recovery=30s, connect_timeout_max=10000ms, but only 5s elapsed + ok(can_unshun_server(100, 105, 30, 10000, false, 0, 0) == false, + "no unshun: capped to 9s but only 5s elapsed"); +} + +static void test_unshun_kill_all_conns() { + // kill_all=true, connections still active → cannot unshun + ok(can_unshun_server(100, 200, 10, 60000, true, 5, 0) == false, + "no unshun: kill_all=true, used=5"); + ok(can_unshun_server(100, 200, 10, 60000, true, 0, 3) == false, + "no unshun: kill_all=true, free=3"); + + // kill_all=true, all connections drained → can unshun + ok(can_unshun_server(100, 200, 10, 60000, true, 0, 0) == true, + "unshun: kill_all=true, all connections drained"); + + // kill_all=false, connections exist → can still unshun + ok(can_unshun_server(100, 200, 10, 60000, false, 10, 5) == true, + "unshun: kill_all=false, connections don't matter"); +} + +static void test_unshun_recovery_disabled() { + // recovery_time=0 → recovery disabled + ok(can_unshun_server(100, 200, 0, 60000, false, 0, 0) == false, + "no unshun: recovery disabled (recovery_time=0)"); +} + +static void test_unshun_clock_skew() { + // current_time <= time_last_error → no recovery + ok(can_unshun_server(200, 100, 10, 60000, false, 0, 0) == false, + "no unshun: clock skew (current < last_error)"); + ok(can_unshun_server(100, 100, 10, 60000, false, 0, 0) == false, + "no unshun: current == last_error"); +} + +static void test_unshun_max_wait_minimum() { + // recovery=1s, timeout_max=500ms → cap = 500/1000-1 = -1 → clamped to 1 + ok(can_unshun_server(100, 103, 1, 500, false, 0, 0) == true, + "unshun: max_wait clamped to 1s minimum, 3s elapsed"); +} + +// ============================================================================ +// 3. should_shun_on_replication_lag +// ============================================================================ + +static void test_replication_lag_shun() { + // lag=15, max=10, count=3, threshold=3 → shun + ok(should_shun_on_replication_lag(15, 10, 3, 3) == true, + "lag shun: lag=15 > max=10, count=3 meets threshold=3"); + + // lag=15, max=10, count=2, threshold=3 → not yet + ok(should_shun_on_replication_lag(15, 10, 2, 3) == false, + "no lag shun: count=2 below threshold=3"); + + // lag=5, max=10 → within bounds + ok(should_shun_on_replication_lag(5, 10, 10, 1) == false, + "no lag shun: lag=5 within max=10"); + + // max_replication_lag=0 → check disabled + ok(should_shun_on_replication_lag(100, 0, 10, 1) == false, + "no lag shun: check disabled (max=0)"); + + // lag=-1 (unknown) → don't shun + ok(should_shun_on_replication_lag(-1, 10, 10, 1) == false, + "no lag shun: lag unknown (-1)"); + + // lag exactly at max → not shunned (needs >) + ok(should_shun_on_replication_lag(10, 10, 5, 1) == false, + "no lag shun: lag=10 == max=10 (needs >)"); +} + +// ============================================================================ +// 4. can_recover_from_replication_lag +// ============================================================================ + +static void test_replication_lag_recovery() { + // lag drops below max → recover + ok(can_recover_from_replication_lag(5, 10) == true, + "lag recover: lag=5 <= max=10"); + + // lag exactly at max → recover + ok(can_recover_from_replication_lag(10, 10) == true, + "lag recover: lag=10 == max=10"); + + // lag still above → don't recover + ok(can_recover_from_replication_lag(15, 10) == false, + "no lag recover: lag=15 > max=10"); + + // unknown lag → don't recover + ok(can_recover_from_replication_lag(-1, 10) == false, + "no lag recover: lag unknown (-1)"); +} + +// ============================================================================ +// Main +// ============================================================================ + +int main() { + plan(31); + + int rc = test_init_minimal(); + ok(rc == 0, "test_init_minimal() succeeds"); + + test_shun_connect_errors(); // 7 + test_unshun_time_elapsed(); // 3 + test_unshun_timeout_cap(); // 2 + test_unshun_kill_all_conns(); // 4 + test_unshun_recovery_disabled(); // 1 + test_unshun_clock_skew(); // 2 + test_unshun_max_wait_minimum(); // 1 + test_replication_lag_shun(); // 6 + test_replication_lag_recovery(); // 4 + // Total: 1+7+3+2+4+1+2+1+6+4 = 31... fix plan + + test_cleanup_minimal(); + return exit_status(); +}