From b2f4b4f9a0ea6af7e19880a2b2c25663c10b61b0 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Wed, 1 Apr 2026 09:51:58 +0000 Subject: [PATCH 1/2] Fix crash when COM_BINLOG_DUMP received with idle backends from other hostgroups (#5556) When a session has backend connections to multiple hostgroups and receives COM_BINLOG_DUMP, GPFC_Replication_SwitchToFastForward() transitions the session to FAST_FORWARD mode but only manages the active backend. Idle backends from other hostgroups remain in the session's mybes array with async_state_machine==ASYNC_IDLE. If MySQL's wait_timeout closes one of these idle connections, the resulting POLLIN event causes handler() to be called with ASYNC_IDLE, which has no case in the switch and hits assert(0). Fix: Clean up idle backends from other hostgroups during the FAST_FORWARD transition in GPFC_Replication_SwitchToFastForward(). These connections will never be used again, so they are destroyed immediately. Includes a regression test (test_binlog_dump_multi_backend_crash-t) that: - Sets up two hostgroups with query rules routing SELECTs to reader HG - Creates backend connections to both hostgroups on one session - Sends COM_BINLOG_DUMP to enter FAST_FORWARD mode - Waits for wait_timeout to close the idle backend - Verifies ProxySQL does not crash --- lib/MySQL_Session.cpp | 27 ++ test/tap/groups/groups.json | 1 + ...test_binlog_dump_multi_backend_crash-t.cpp | 339 ++++++++++++++++++ 3 files changed, 367 insertions(+) create mode 100644 test/tap/tests/test_binlog_dump_multi_backend_crash-t.cpp diff --git a/lib/MySQL_Session.cpp b/lib/MySQL_Session.cpp index 5946e95a5..3a1bf4e09 100644 --- a/lib/MySQL_Session.cpp +++ b/lib/MySQL_Session.cpp @@ -5101,6 +5101,33 @@ int MySQL_Session::GPFC_Replication_SwitchToFastForward(PtrSize_t& pkt, unsigned set_status(FAST_FORWARD); // we can set status to FAST_FORWARD } + // Issue #5556: Clean up idle backends from other hostgroups. + // When transitioning to FAST_FORWARD, only the active backend (mybe) is + // needed. Any other backends from different hostgroups are now idle and + // will never be used again. If left alive, they can receive POLLIN events + // (e.g. from MySQL wait_timeout) which causes a crash in handler() because + // ASYNC_IDLE is not handled in the switch-case. + { + unsigned int i = 0; + while (i < mybes->len) { + MySQL_Backend* _mybe = (MySQL_Backend*)mybes->index(i); + if (_mybe != mybe) { + if (_mybe->server_myds && _mybe->server_myds->myconn) { + proxy_info( + "Cleaning up idle backend from hostgroup %d during FAST_FORWARD transition\n", + _mybe->hostgroup_id + ); + _mybe->server_myds->destroy_MySQL_Connection_From_Pool(false); + } + mybes->remove_index_fast(i); + _mybe->reset(); + delete _mybe; + } else { + i++; + } + } + } + return 0; } diff --git a/test/tap/groups/groups.json b/test/tap/groups/groups.json index 88b0de9cf..c0c5ab6c1 100644 --- a/test/tap/groups/groups.json +++ b/test/tap/groups/groups.json @@ -252,6 +252,7 @@ "test_auth_methods-t" : [ "mysql84-g2","mysql-auto_increment_delay_multiplex=0-g2","mysql-multiplexing=false-g2","mysql-query_digests=0-g2","mysql-query_digests_keep_comment=1-g2" ], "test_auto_increment_delay_multiplex-t" : [ "legacy-g2","mysql84-g2","mysql-auto_increment_delay_multiplex=0-g2","mysql-multiplexing=false-g2","mysql-query_digests=0-g2","mysql-query_digests_keep_comment=1-g2" ], "test_backend_conn_ping-t" : [ "legacy-g2","mysql84-g2","mysql-auto_increment_delay_multiplex=0-g2","mysql-multiplexing=false-g2","mysql-query_digests=0-g2","mysql-query_digests_keep_comment=1-g2" ], + "test_binlog_dump_multi_backend_crash-t" : [ "legacy-binlog-g1" ], "test_binlog_fast_forward-t" : [ "legacy-binlog-g1" ], "test_binlog_reader-t" : [ "legacy-binlog-g1" ], "test_binlog_reader_uses_previous_hostgroup-t" : [ "legacy-binlog-g1" ], diff --git a/test/tap/tests/test_binlog_dump_multi_backend_crash-t.cpp b/test/tap/tests/test_binlog_dump_multi_backend_crash-t.cpp new file mode 100644 index 000000000..207b098a9 --- /dev/null +++ b/test/tap/tests/test_binlog_dump_multi_backend_crash-t.cpp @@ -0,0 +1,339 @@ +/** + * @file test_binlog_dump_multi_backend_crash-t.cpp + * @brief Regression test for issue #5556. + * @details When a session has backend connections to multiple hostgroups and + * then receives COM_BINLOG_DUMP, ProxySQL switches the session to + * FAST_FORWARD mode but only manages the active backend. The idle backend + * from another hostgroup remains in the session. If that idle backend + * receives a POLLIN event (e.g. from MySQL wait_timeout closing the + * connection), MySQL_Connection::handler() is called with + * async_state_machine==ASYNC_IDLE, which has no case in the switch and + * hits assert(0). + * + * Reproduction steps: + * 1. Use the existing replication hostgroups (writer_hg / reader_hg). + * Add a query rule routing SELECTs to reader_hg with multiplex=0. + * 2. Disable set_query_lock_on_hostgroup so SET @user_var doesn't + * prevent cross-hostgroup routing. + * 3. Connect to ProxySQL and run SELECT (creates reader_hg backend) + * then DO 1 (creates writer_hg backend). + * 4. Send COM_BINLOG_DUMP via mariadb_rpl (uses writer_hg via + * previous_hostgroup). + * 5. Wait for MySQL wait_timeout to close the idle reader_hg backend. + * 6. Verify ProxySQL did not crash. + */ + +#include +#include +#include +#include +#include +#include + +#include "mysql.h" +#include "mariadb_rpl.h" + +#include "tap.h" +#include "command_line.h" +#include "utils.h" + +// Short wait_timeout to trigger idle backend closure quickly +#define SHORT_WAIT_TIMEOUT 5 +// How long to sleep waiting for the idle backend to be closed by MySQL +#define SLEEP_SECONDS (SHORT_WAIT_TIMEOUT + 3) + +int main(int argc, char** argv) { + CommandLine cl; + + plan(1); + diag("Testing issue #5556: COM_BINLOG_DUMP crash with idle backends from other hostgroups"); + + if (cl.getEnv()) { + diag("Failed to get the required environmental variables."); + return exit_status(); + } + + // ======================================================================== + // Step 0: Connect to ProxySQL admin + // ======================================================================== + MYSQL* admin = mysql_init(NULL); + if (!admin || !mysql_real_connect(admin, cl.admin_host, cl.admin_username, + cl.admin_password, NULL, cl.admin_port, NULL, 0)) { + fprintf(stderr, "Admin connection failed: %s\n", admin ? mysql_error(admin) : "mysql_init failed"); + return exit_status(); + } + + // ======================================================================== + // Step 1: Connect to MySQL backend directly, check binlog, set wait_timeout + // ======================================================================== + MYSQL* backend = mysql_init(NULL); + if (!backend || !mysql_real_connect(backend, cl.mysql_host, cl.mysql_username, + cl.mysql_password, NULL, cl.mysql_port, NULL, 0)) { + fprintf(stderr, "Backend connection failed: %s\n", backend ? mysql_error(backend) : "mysql_init failed"); + mysql_close(admin); + return exit_status(); + } + + // Verify binlog is available + bool binlog_ok = false; + if (mysql_query(backend, "SHOW MASTER STATUS") == 0) { + MYSQL_RES* res = mysql_store_result(backend); + if (res) { + if (mysql_fetch_row(res)) binlog_ok = true; + mysql_free_result(res); + } + } + if (!binlog_ok) { + diag("Binary logging not available. Skipping."); + ok(1, "SKIP: Binary logging not available"); + mysql_close(backend); + mysql_close(admin); + return exit_status(); + } + + // Save and lower wait_timeout + long orig_wait_timeout = 28800; + if (mysql_query(backend, "SELECT @@global.wait_timeout") == 0) { + MYSQL_RES* res = mysql_store_result(backend); + if (res) { + MYSQL_ROW row = mysql_fetch_row(res); + if (row && row[0]) orig_wait_timeout = strtol(row[0], NULL, 10); + mysql_free_result(res); + } + } + diag("Original wait_timeout: %ld, setting to %d", orig_wait_timeout, SHORT_WAIT_TIMEOUT); + + std::string set_wt = "SET GLOBAL wait_timeout=" + std::to_string(SHORT_WAIT_TIMEOUT); + if (mysql_query(backend, set_wt.c_str()) != 0) { + diag("Failed to set wait_timeout: %s", mysql_error(backend)); + mysql_close(backend); + mysql_close(admin); + return exit_status(); + } + + // ======================================================================== + // Step 2: Discover hostgroup topology and set up routing + // ======================================================================== + int writer_hg = -1; + int reader_hg = -1; + { + const char* q = "SELECT writer_hostgroup, reader_hostgroup " + "FROM mysql_replication_hostgroups LIMIT 1"; + if (mysql_query(admin, q) == 0) { + MYSQL_RES* res = mysql_store_result(admin); + if (res) { + MYSQL_ROW row = mysql_fetch_row(res); + if (row && row[0] && row[1]) { + writer_hg = atoi(row[0]); + reader_hg = atoi(row[1]); + } + mysql_free_result(res); + } + } + } + if (writer_hg < 0 || reader_hg < 0) { + diag("Could not discover replication hostgroups. Skipping."); + ok(1, "SKIP: No replication hostgroups configured"); + mysql_close(backend); + mysql_close(admin); + return exit_status(); + } + diag("Discovered hostgroups: writer=%d, reader=%d", writer_hg, reader_hg); + + // Set up query rules: ^SELECT → reader_hg (multiplex=0), all else → writer_hg (default) + MYSQL_QUERY(admin, "DELETE FROM mysql_query_rules"); + { + std::string q = "INSERT INTO mysql_query_rules " + "(rule_id, active, match_digest, destination_hostgroup, multiplex, apply) " + "VALUES (1, 1, '^SELECT', " + std::to_string(reader_hg) + ", 0, 1)"; + MYSQL_QUERY(admin, q.c_str()); + } + MYSQL_QUERY(admin, "LOAD MYSQL QUERY RULES TO RUNTIME"); + + // Disable hostgroup locking so SET @user_var queries don't prevent + // routing to different hostgroups within the same session + MYSQL_QUERY(admin, "SET mysql-set_query_lock_on_hostgroup=0"); + MYSQL_QUERY(admin, "LOAD MYSQL VARIABLES TO RUNTIME"); + + diag("ProxySQL configured: SELECT→HG%d (reader), default→HG%d (writer), multiplex=0, lock_on_hg=0", + reader_hg, writer_hg); + + // ======================================================================== + // Step 3: Connect to ProxySQL and create backends in both hostgroups + // ======================================================================== + MYSQL* proxy = mysql_init(NULL); + if (!proxy) { + diag("mysql_init failed for proxy connection"); + goto cleanup; + } + // Force CLIENT_DEPRECATE_EOF to match backend capabilities for fast-forward + proxy->options.client_flag |= CLIENT_DEPRECATE_EOF; + + if (!mysql_real_connect(proxy, cl.root_host, cl.root_username, + cl.root_password, NULL, cl.root_port, NULL, 0)) { + diag("Proxy connection failed: %s", mysql_error(proxy)); + goto cleanup; + } + + // SELECT → creates backend in reader_hg + diag("Sending SELECT to create backend in reader HG%d", reader_hg); + if (mysql_query(proxy, "SELECT 1") != 0) { + diag("SELECT failed: %s", mysql_error(proxy)); + goto cleanup; + } + { + MYSQL_RES* res = mysql_store_result(proxy); + mysql_free_result(res); + } + + // DO 1 → creates backend in writer_hg, also sets previous_hostgroup + diag("Sending DO to create backend in writer HG%d (and set previous_hostgroup)", writer_hg); + if (mysql_query(proxy, "DO 1") != 0) { + diag("DO 1 failed: %s", mysql_error(proxy)); + goto cleanup; + } + + // ======================================================================== + // Step 4: Send COM_BINLOG_DUMP via mariadb_rpl + // ======================================================================== + { + // Set up replication parameters on the same connection + diag("Setting replication parameters"); + if (mysql_query(proxy, "SET @master_binlog_checksum='NONE'") != 0) { + diag("SET checksum failed: %s", mysql_error(proxy)); + goto cleanup; + } + if (mysql_query(proxy, "SET @master_heartbeat_period=1000000000") != 0) { + diag("SET heartbeat failed: %s", mysql_error(proxy)); + goto cleanup; + } + + MARIADB_RPL* rpl = mariadb_rpl_init(proxy); + if (!rpl) { + diag("mariadb_rpl_init failed"); + goto cleanup; + } + rpl->server_id = 99999; // unique server ID for test + rpl->start_position = 4; // start from beginning of first binlog + rpl->flags = MARIADB_RPL_BINLOG_SEND_ANNOTATE_ROWS; + + diag("Sending COM_BINLOG_DUMP: pos=4 server_id=%d", rpl->server_id); + if (mariadb_rpl_open(rpl)) { + diag("mariadb_rpl_open failed: %s", mysql_error(proxy)); + mariadb_rpl_close(rpl); + goto cleanup; + } + + // Fetch a couple of events to confirm replication is working + diag("Replication started, fetching initial events..."); + MARIADB_RPL_EVENT* event = NULL; + int num_events = 0; + for (int i = 0; i < 5; i++) { + event = mariadb_rpl_fetch(rpl, event); + if (!event) break; + num_events++; + diag(" Event %d: type=%d", num_events, event->event_type); + } + if (event) mariadb_free_rpl_event(event); + diag("Received %d initial events", num_events); + + // ==================================================================== + // Step 5: Wait for the idle backend (reader_hg) to be closed by MySQL + // ==================================================================== + diag("Sleeping %d seconds for wait_timeout to close idle reader HG%d backend...", + SLEEP_SECONDS, reader_hg); + sleep(SLEEP_SECONDS); + + // Fetch one more event to force ProxySQL to process the POLLIN on the + // idle backend's fd. This is the point where the crash would occur. + diag("Fetching another event (this triggers processing of idle backend POLLIN)..."); + event = mariadb_rpl_fetch(rpl, NULL); + if (event) { + diag(" Got event: type=%d", event->event_type); + mariadb_free_rpl_event(event); + } + + mariadb_rpl_close(rpl); + } + + // ======================================================================== + // Step 6: Verify ProxySQL did not crash + // ======================================================================== + { + // Try to query the admin interface. If ProxySQL crashed and restarted, + // our admin connection would be broken. + int admin_ok = 0; + + // First check our existing admin connection + if (mysql_query(admin, "SELECT 1") == 0) { + MYSQL_RES* res = mysql_store_result(admin); + if (res) { + mysql_free_result(res); + admin_ok = 1; + } + } + + // If the existing connection is broken, try a fresh one + if (!admin_ok) { + diag("Existing admin connection broken, trying fresh connection..."); + MYSQL* admin2 = mysql_init(NULL); + if (admin2 && mysql_real_connect(admin2, cl.admin_host, cl.admin_username, + cl.admin_password, NULL, cl.admin_port, NULL, 0)) { + // ProxySQL restarted after crash - the bug is present + diag("ProxySQL appears to have restarted (crash detected)"); + mysql_close(admin2); + admin_ok = 0; + } else { + diag("Cannot connect to ProxySQL admin at all"); + admin_ok = 0; + } + } + + ok(admin_ok, "ProxySQL should not crash when idle backend receives POLLIN during FAST_FORWARD (issue #5556)"); + } + +cleanup: + // Restore original wait_timeout + { + std::string restore_wt = "SET GLOBAL wait_timeout=" + std::to_string(orig_wait_timeout); + if (mysql_query(backend, restore_wt.c_str()) != 0) { + mysql_close(backend); + backend = mysql_init(NULL); + if (backend && mysql_real_connect(backend, cl.mysql_host, cl.mysql_username, + cl.mysql_password, NULL, cl.mysql_port, NULL, 0)) { + mysql_query(backend, restore_wt.c_str()); + } + } + diag("Restored wait_timeout to %ld", orig_wait_timeout); + } + + // Clean up query rules and ProxySQL variables + { + MYSQL* admin_cleanup = admin; + bool need_new = false; + if (mysql_query(admin, "SELECT 1") != 0) { + need_new = true; + admin_cleanup = mysql_init(NULL); + if (admin_cleanup) { + mysql_real_connect(admin_cleanup, cl.admin_host, cl.admin_username, + cl.admin_password, NULL, cl.admin_port, NULL, 0); + } + } else { + MYSQL_RES* res = mysql_store_result(admin); + mysql_free_result(res); + } + + if (admin_cleanup) { + MYSQL_QUERY_err(admin_cleanup, "SET mysql-set_query_lock_on_hostgroup=1"); + MYSQL_QUERY_err(admin_cleanup, "LOAD MYSQL VARIABLES TO RUNTIME"); + MYSQL_QUERY_err(admin_cleanup, "DELETE FROM mysql_query_rules"); + MYSQL_QUERY_err(admin_cleanup, "LOAD MYSQL QUERY RULES TO RUNTIME"); + if (need_new) mysql_close(admin_cleanup); + } + } + + mysql_close(backend); + mysql_close(admin); + + return exit_status(); +} From 499896e44bd6f7e91b64a2a93083bc4dda3d9ce7 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Wed, 1 Apr 2026 11:14:37 +0000 Subject: [PATCH 2/2] Address PR review feedback for test_binlog_dump_multi_backend_crash - Move wait_timeout mutation after hostgroup discovery skip check so early returns don't leave backend state modified (Copilot review) - Add precondition assertion verifying connections exist in both writer and reader hostgroups before sending COM_BINLOG_DUMP, using ConnOK pool stats (CodeRabbit review) --- ...test_binlog_dump_multi_backend_crash-t.cpp | 74 +++++++++++++------ 1 file changed, 53 insertions(+), 21 deletions(-) diff --git a/test/tap/tests/test_binlog_dump_multi_backend_crash-t.cpp b/test/tap/tests/test_binlog_dump_multi_backend_crash-t.cpp index 207b098a9..1fce18370 100644 --- a/test/tap/tests/test_binlog_dump_multi_backend_crash-t.cpp +++ b/test/tap/tests/test_binlog_dump_multi_backend_crash-t.cpp @@ -91,28 +91,8 @@ int main(int argc, char** argv) { return exit_status(); } - // Save and lower wait_timeout - long orig_wait_timeout = 28800; - if (mysql_query(backend, "SELECT @@global.wait_timeout") == 0) { - MYSQL_RES* res = mysql_store_result(backend); - if (res) { - MYSQL_ROW row = mysql_fetch_row(res); - if (row && row[0]) orig_wait_timeout = strtol(row[0], NULL, 10); - mysql_free_result(res); - } - } - diag("Original wait_timeout: %ld, setting to %d", orig_wait_timeout, SHORT_WAIT_TIMEOUT); - - std::string set_wt = "SET GLOBAL wait_timeout=" + std::to_string(SHORT_WAIT_TIMEOUT); - if (mysql_query(backend, set_wt.c_str()) != 0) { - diag("Failed to set wait_timeout: %s", mysql_error(backend)); - mysql_close(backend); - mysql_close(admin); - return exit_status(); - } - // ======================================================================== - // Step 2: Discover hostgroup topology and set up routing + // Step 2: Discover hostgroup topology // ======================================================================== int writer_hg = -1; int reader_hg = -1; @@ -140,6 +120,27 @@ int main(int argc, char** argv) { } diag("Discovered hostgroups: writer=%d, reader=%d", writer_hg, reader_hg); + // Save and lower wait_timeout (after skip checks so early returns don't + // leave it modified — addresses Copilot review feedback) + long orig_wait_timeout = 28800; + if (mysql_query(backend, "SELECT @@global.wait_timeout") == 0) { + MYSQL_RES* res = mysql_store_result(backend); + if (res) { + MYSQL_ROW row = mysql_fetch_row(res); + if (row && row[0]) orig_wait_timeout = strtol(row[0], NULL, 10); + mysql_free_result(res); + } + } + diag("Original wait_timeout: %ld, setting to %d", orig_wait_timeout, SHORT_WAIT_TIMEOUT); + + std::string set_wt = "SET GLOBAL wait_timeout=" + std::to_string(SHORT_WAIT_TIMEOUT); + if (mysql_query(backend, set_wt.c_str()) != 0) { + diag("Failed to set wait_timeout: %s", mysql_error(backend)); + mysql_close(backend); + mysql_close(admin); + return exit_status(); + } + // Set up query rules: ^SELECT → reader_hg (multiplex=0), all else → writer_hg (default) MYSQL_QUERY(admin, "DELETE FROM mysql_query_rules"); { @@ -193,6 +194,37 @@ int main(int argc, char** argv) { goto cleanup; } + // Verify connections were established to both hostgroups. + // We check ConnOK (total connections ever made) rather than ConnUsed, + // because the writer backend may already be returned to the pool after + // DO completes (it's still in the session's mybes array though). + { + bool has_writer = false, has_reader = false; + std::string pool_q = "SELECT hostgroup, ConnOK FROM stats_mysql_connection_pool " + "WHERE hostgroup IN (" + std::to_string(writer_hg) + "," + std::to_string(reader_hg) + ") " + "AND ConnOK > 0"; + if (mysql_query(admin, pool_q.c_str()) == 0) { + MYSQL_RES* res = mysql_store_result(admin); + if (res) { + MYSQL_ROW row; + while ((row = mysql_fetch_row(res))) { + int hg = atoi(row[0]); + diag(" Connection pool: HG%d has %s total connections (ConnOK)", hg, row[1]); + if (hg == writer_hg) has_writer = true; + if (hg == reader_hg) has_reader = true; + } + mysql_free_result(res); + } + } + if (!has_writer || !has_reader) { + diag("PRECONDITION FAILED: Connections not established to both hostgroups " + "(writer=%s, reader=%s). Test cannot reproduce #5556.", + has_writer ? "yes" : "NO", has_reader ? "yes" : "NO"); + goto cleanup; + } + diag("Confirmed connections to both writer HG%d and reader HG%d", writer_hg, reader_hg); + } + // ======================================================================== // Step 4: Send COM_BINLOG_DUMP via mariadb_rpl // ========================================================================