Merge pull request #5559 from sysown/v3.0-issue5556

Fix crash when COM_BINLOG_DUMP received with idle backends (#5556)
pull/5034/merge
René Cannaò 2 months ago committed by GitHub
commit 63007cfa39
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -5101,6 +5101,33 @@ int MySQL_Session::GPFC_Replication_SwitchToFastForward(PtrSize_t& pkt, unsigned
set_status(FAST_FORWARD); // we can set status to FAST_FORWARD
}
// Issue #5556: Clean up idle backends from other hostgroups.
// When transitioning to FAST_FORWARD, only the active backend (mybe) is
// needed. Any other backends from different hostgroups are now idle and
// will never be used again. If left alive, they can receive POLLIN events
// (e.g. from MySQL wait_timeout) which causes a crash in handler() because
// ASYNC_IDLE is not handled in the switch-case.
{
unsigned int i = 0;
while (i < mybes->len) {
MySQL_Backend* _mybe = (MySQL_Backend*)mybes->index(i);
if (_mybe != mybe) {
if (_mybe->server_myds && _mybe->server_myds->myconn) {
proxy_info(
"Cleaning up idle backend from hostgroup %d during FAST_FORWARD transition\n",
_mybe->hostgroup_id
);
_mybe->server_myds->destroy_MySQL_Connection_From_Pool(false);
}
mybes->remove_index_fast(i);
_mybe->reset();
delete _mybe;
} else {
i++;
}
}
}
return 0;
}

@ -252,6 +252,7 @@
"test_auth_methods-t" : [ "mysql84-g2","mysql-auto_increment_delay_multiplex=0-g2","mysql-multiplexing=false-g2","mysql-query_digests=0-g2","mysql-query_digests_keep_comment=1-g2" ],
"test_auto_increment_delay_multiplex-t" : [ "legacy-g2","mysql84-g2","mysql-auto_increment_delay_multiplex=0-g2","mysql-multiplexing=false-g2","mysql-query_digests=0-g2","mysql-query_digests_keep_comment=1-g2" ],
"test_backend_conn_ping-t" : [ "legacy-g2","mysql84-g2","mysql-auto_increment_delay_multiplex=0-g2","mysql-multiplexing=false-g2","mysql-query_digests=0-g2","mysql-query_digests_keep_comment=1-g2" ],
"test_binlog_dump_multi_backend_crash-t" : [ "legacy-binlog-g1" ],
"test_binlog_fast_forward-t" : [ "legacy-binlog-g1" ],
"test_binlog_reader-t" : [ "legacy-binlog-g1" ],
"test_binlog_reader_uses_previous_hostgroup-t" : [ "legacy-binlog-g1" ],

@ -0,0 +1,371 @@
/**
* @file test_binlog_dump_multi_backend_crash-t.cpp
* @brief Regression test for issue #5556.
* @details When a session has backend connections to multiple hostgroups and
* then receives COM_BINLOG_DUMP, ProxySQL switches the session to
* FAST_FORWARD mode but only manages the active backend. The idle backend
* from another hostgroup remains in the session. If that idle backend
* receives a POLLIN event (e.g. from MySQL wait_timeout closing the
* connection), MySQL_Connection::handler() is called with
* async_state_machine==ASYNC_IDLE, which has no case in the switch and
* hits assert(0).
*
* Reproduction steps:
* 1. Use the existing replication hostgroups (writer_hg / reader_hg).
* Add a query rule routing SELECTs to reader_hg with multiplex=0.
* 2. Disable set_query_lock_on_hostgroup so SET @user_var doesn't
* prevent cross-hostgroup routing.
* 3. Connect to ProxySQL and run SELECT (creates reader_hg backend)
* then DO 1 (creates writer_hg backend).
* 4. Send COM_BINLOG_DUMP via mariadb_rpl (uses writer_hg via
* previous_hostgroup).
* 5. Wait for MySQL wait_timeout to close the idle reader_hg backend.
* 6. Verify ProxySQL did not crash.
*/
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <unistd.h>
#include <string>
#include <vector>
#include "mysql.h"
#include "mariadb_rpl.h"
#include "tap.h"
#include "command_line.h"
#include "utils.h"
// Short wait_timeout to trigger idle backend closure quickly
#define SHORT_WAIT_TIMEOUT 5
// How long to sleep waiting for the idle backend to be closed by MySQL
#define SLEEP_SECONDS (SHORT_WAIT_TIMEOUT + 3)
int main(int argc, char** argv) {
CommandLine cl;
plan(1);
diag("Testing issue #5556: COM_BINLOG_DUMP crash with idle backends from other hostgroups");
if (cl.getEnv()) {
diag("Failed to get the required environmental variables.");
return exit_status();
}
// ========================================================================
// Step 0: Connect to ProxySQL admin
// ========================================================================
MYSQL* admin = mysql_init(NULL);
if (!admin || !mysql_real_connect(admin, cl.admin_host, cl.admin_username,
cl.admin_password, NULL, cl.admin_port, NULL, 0)) {
fprintf(stderr, "Admin connection failed: %s\n", admin ? mysql_error(admin) : "mysql_init failed");
return exit_status();
}
// ========================================================================
// Step 1: Connect to MySQL backend directly, check binlog, set wait_timeout
// ========================================================================
MYSQL* backend = mysql_init(NULL);
if (!backend || !mysql_real_connect(backend, cl.mysql_host, cl.mysql_username,
cl.mysql_password, NULL, cl.mysql_port, NULL, 0)) {
fprintf(stderr, "Backend connection failed: %s\n", backend ? mysql_error(backend) : "mysql_init failed");
mysql_close(admin);
return exit_status();
}
// Verify binlog is available
bool binlog_ok = false;
if (mysql_query(backend, "SHOW MASTER STATUS") == 0) {
MYSQL_RES* res = mysql_store_result(backend);
if (res) {
if (mysql_fetch_row(res)) binlog_ok = true;
mysql_free_result(res);
}
}
if (!binlog_ok) {
diag("Binary logging not available. Skipping.");
ok(1, "SKIP: Binary logging not available");
mysql_close(backend);
mysql_close(admin);
return exit_status();
}
// ========================================================================
// Step 2: Discover hostgroup topology
// ========================================================================
int writer_hg = -1;
int reader_hg = -1;
{
const char* q = "SELECT writer_hostgroup, reader_hostgroup "
"FROM mysql_replication_hostgroups LIMIT 1";
if (mysql_query(admin, q) == 0) {
MYSQL_RES* res = mysql_store_result(admin);
if (res) {
MYSQL_ROW row = mysql_fetch_row(res);
if (row && row[0] && row[1]) {
writer_hg = atoi(row[0]);
reader_hg = atoi(row[1]);
}
mysql_free_result(res);
}
}
}
if (writer_hg < 0 || reader_hg < 0) {
diag("Could not discover replication hostgroups. Skipping.");
ok(1, "SKIP: No replication hostgroups configured");
mysql_close(backend);
mysql_close(admin);
return exit_status();
}
diag("Discovered hostgroups: writer=%d, reader=%d", writer_hg, reader_hg);
// Save and lower wait_timeout (after skip checks so early returns don't
// leave it modified — addresses Copilot review feedback)
long orig_wait_timeout = 28800;
if (mysql_query(backend, "SELECT @@global.wait_timeout") == 0) {
MYSQL_RES* res = mysql_store_result(backend);
if (res) {
MYSQL_ROW row = mysql_fetch_row(res);
if (row && row[0]) orig_wait_timeout = strtol(row[0], NULL, 10);
mysql_free_result(res);
}
}
diag("Original wait_timeout: %ld, setting to %d", orig_wait_timeout, SHORT_WAIT_TIMEOUT);
std::string set_wt = "SET GLOBAL wait_timeout=" + std::to_string(SHORT_WAIT_TIMEOUT);
if (mysql_query(backend, set_wt.c_str()) != 0) {
diag("Failed to set wait_timeout: %s", mysql_error(backend));
mysql_close(backend);
mysql_close(admin);
return exit_status();
}
// Set up query rules: ^SELECT → reader_hg (multiplex=0), all else → writer_hg (default)
MYSQL_QUERY(admin, "DELETE FROM mysql_query_rules");
{
std::string q = "INSERT INTO mysql_query_rules "
"(rule_id, active, match_digest, destination_hostgroup, multiplex, apply) "
"VALUES (1, 1, '^SELECT', " + std::to_string(reader_hg) + ", 0, 1)";
MYSQL_QUERY(admin, q.c_str());
}
MYSQL_QUERY(admin, "LOAD MYSQL QUERY RULES TO RUNTIME");
// Disable hostgroup locking so SET @user_var queries don't prevent
// routing to different hostgroups within the same session
MYSQL_QUERY(admin, "SET mysql-set_query_lock_on_hostgroup=0");
MYSQL_QUERY(admin, "LOAD MYSQL VARIABLES TO RUNTIME");
diag("ProxySQL configured: SELECT→HG%d (reader), default→HG%d (writer), multiplex=0, lock_on_hg=0",
reader_hg, writer_hg);
// ========================================================================
// Step 3: Connect to ProxySQL and create backends in both hostgroups
// ========================================================================
MYSQL* proxy = mysql_init(NULL);
if (!proxy) {
diag("mysql_init failed for proxy connection");
goto cleanup;
}
// Force CLIENT_DEPRECATE_EOF to match backend capabilities for fast-forward
proxy->options.client_flag |= CLIENT_DEPRECATE_EOF;
if (!mysql_real_connect(proxy, cl.root_host, cl.root_username,
cl.root_password, NULL, cl.root_port, NULL, 0)) {
diag("Proxy connection failed: %s", mysql_error(proxy));
goto cleanup;
}
// SELECT → creates backend in reader_hg
diag("Sending SELECT to create backend in reader HG%d", reader_hg);
if (mysql_query(proxy, "SELECT 1") != 0) {
diag("SELECT failed: %s", mysql_error(proxy));
goto cleanup;
}
{
MYSQL_RES* res = mysql_store_result(proxy);
mysql_free_result(res);
}
// DO 1 → creates backend in writer_hg, also sets previous_hostgroup
diag("Sending DO to create backend in writer HG%d (and set previous_hostgroup)", writer_hg);
if (mysql_query(proxy, "DO 1") != 0) {
diag("DO 1 failed: %s", mysql_error(proxy));
goto cleanup;
}
// Verify connections were established to both hostgroups.
// We check ConnOK (total connections ever made) rather than ConnUsed,
// because the writer backend may already be returned to the pool after
// DO completes (it's still in the session's mybes array though).
{
bool has_writer = false, has_reader = false;
std::string pool_q = "SELECT hostgroup, ConnOK FROM stats_mysql_connection_pool "
"WHERE hostgroup IN (" + std::to_string(writer_hg) + "," + std::to_string(reader_hg) + ") "
"AND ConnOK > 0";
if (mysql_query(admin, pool_q.c_str()) == 0) {
MYSQL_RES* res = mysql_store_result(admin);
if (res) {
MYSQL_ROW row;
while ((row = mysql_fetch_row(res))) {
int hg = atoi(row[0]);
diag(" Connection pool: HG%d has %s total connections (ConnOK)", hg, row[1]);
if (hg == writer_hg) has_writer = true;
if (hg == reader_hg) has_reader = true;
}
mysql_free_result(res);
}
}
if (!has_writer || !has_reader) {
diag("PRECONDITION FAILED: Connections not established to both hostgroups "
"(writer=%s, reader=%s). Test cannot reproduce #5556.",
has_writer ? "yes" : "NO", has_reader ? "yes" : "NO");
goto cleanup;
}
diag("Confirmed connections to both writer HG%d and reader HG%d", writer_hg, reader_hg);
}
// ========================================================================
// Step 4: Send COM_BINLOG_DUMP via mariadb_rpl
// ========================================================================
{
// Set up replication parameters on the same connection
diag("Setting replication parameters");
if (mysql_query(proxy, "SET @master_binlog_checksum='NONE'") != 0) {
diag("SET checksum failed: %s", mysql_error(proxy));
goto cleanup;
}
if (mysql_query(proxy, "SET @master_heartbeat_period=1000000000") != 0) {
diag("SET heartbeat failed: %s", mysql_error(proxy));
goto cleanup;
}
MARIADB_RPL* rpl = mariadb_rpl_init(proxy);
if (!rpl) {
diag("mariadb_rpl_init failed");
goto cleanup;
}
rpl->server_id = 99999; // unique server ID for test
rpl->start_position = 4; // start from beginning of first binlog
rpl->flags = MARIADB_RPL_BINLOG_SEND_ANNOTATE_ROWS;
diag("Sending COM_BINLOG_DUMP: pos=4 server_id=%d", rpl->server_id);
if (mariadb_rpl_open(rpl)) {
diag("mariadb_rpl_open failed: %s", mysql_error(proxy));
mariadb_rpl_close(rpl);
goto cleanup;
}
// Fetch a couple of events to confirm replication is working
diag("Replication started, fetching initial events...");
MARIADB_RPL_EVENT* event = NULL;
int num_events = 0;
for (int i = 0; i < 5; i++) {
event = mariadb_rpl_fetch(rpl, event);
if (!event) break;
num_events++;
diag(" Event %d: type=%d", num_events, event->event_type);
}
if (event) mariadb_free_rpl_event(event);
diag("Received %d initial events", num_events);
// ====================================================================
// Step 5: Wait for the idle backend (reader_hg) to be closed by MySQL
// ====================================================================
diag("Sleeping %d seconds for wait_timeout to close idle reader HG%d backend...",
SLEEP_SECONDS, reader_hg);
sleep(SLEEP_SECONDS);
// Fetch one more event to force ProxySQL to process the POLLIN on the
// idle backend's fd. This is the point where the crash would occur.
diag("Fetching another event (this triggers processing of idle backend POLLIN)...");
event = mariadb_rpl_fetch(rpl, NULL);
if (event) {
diag(" Got event: type=%d", event->event_type);
mariadb_free_rpl_event(event);
}
mariadb_rpl_close(rpl);
}
// ========================================================================
// Step 6: Verify ProxySQL did not crash
// ========================================================================
{
// Try to query the admin interface. If ProxySQL crashed and restarted,
// our admin connection would be broken.
int admin_ok = 0;
// First check our existing admin connection
if (mysql_query(admin, "SELECT 1") == 0) {
MYSQL_RES* res = mysql_store_result(admin);
if (res) {
mysql_free_result(res);
admin_ok = 1;
}
}
// If the existing connection is broken, try a fresh one
if (!admin_ok) {
diag("Existing admin connection broken, trying fresh connection...");
MYSQL* admin2 = mysql_init(NULL);
if (admin2 && mysql_real_connect(admin2, cl.admin_host, cl.admin_username,
cl.admin_password, NULL, cl.admin_port, NULL, 0)) {
// ProxySQL restarted after crash - the bug is present
diag("ProxySQL appears to have restarted (crash detected)");
mysql_close(admin2);
admin_ok = 0;
} else {
diag("Cannot connect to ProxySQL admin at all");
admin_ok = 0;
}
}
ok(admin_ok, "ProxySQL should not crash when idle backend receives POLLIN during FAST_FORWARD (issue #5556)");
}
cleanup:
// Restore original wait_timeout
{
std::string restore_wt = "SET GLOBAL wait_timeout=" + std::to_string(orig_wait_timeout);
if (mysql_query(backend, restore_wt.c_str()) != 0) {
mysql_close(backend);
backend = mysql_init(NULL);
if (backend && mysql_real_connect(backend, cl.mysql_host, cl.mysql_username,
cl.mysql_password, NULL, cl.mysql_port, NULL, 0)) {
mysql_query(backend, restore_wt.c_str());
}
}
diag("Restored wait_timeout to %ld", orig_wait_timeout);
}
// Clean up query rules and ProxySQL variables
{
MYSQL* admin_cleanup = admin;
bool need_new = false;
if (mysql_query(admin, "SELECT 1") != 0) {
need_new = true;
admin_cleanup = mysql_init(NULL);
if (admin_cleanup) {
mysql_real_connect(admin_cleanup, cl.admin_host, cl.admin_username,
cl.admin_password, NULL, cl.admin_port, NULL, 0);
}
} else {
MYSQL_RES* res = mysql_store_result(admin);
mysql_free_result(res);
}
if (admin_cleanup) {
MYSQL_QUERY_err(admin_cleanup, "SET mysql-set_query_lock_on_hostgroup=1");
MYSQL_QUERY_err(admin_cleanup, "LOAD MYSQL VARIABLES TO RUNTIME");
MYSQL_QUERY_err(admin_cleanup, "DELETE FROM mysql_query_rules");
MYSQL_QUERY_err(admin_cleanup, "LOAD MYSQL QUERY RULES TO RUNTIME");
if (need_new) mysql_close(admin_cleanup);
}
}
mysql_close(backend);
mysql_close(admin);
return exit_status();
}
Loading…
Cancel
Save