From ca202ceba1ea08711c5b1803ddffdbfbb41fb9dd Mon Sep 17 00:00:00 2001 From: Rahim Kanji Date: Tue, 12 May 2026 13:26:01 +0500 Subject: [PATCH 01/12] Refactor: partition sessions by backend state instead of sorting Replace ProcessAllSessions_SortingSessions with ProcessAllSessions_Partition. The old function did an implicit one-pass swap-sort over mysql_sessions by max_connect_time; the new one explicitly partitions sessions into three contiguous blocks in pdata according to their backend state: [0, running_end) A: running a query against the backend [running_end, idle_begin) B: acquiring a backend (max_connect_time != 0) [idle_begin, len) C: idle (no backend, or holding a conn while parked in WAITING_CLIENT_DATA) --- include/Base_Thread.h | 2 +- lib/Base_Thread.cpp | 76 ++++++++++++++++++++++++++++--------------- lib/MySQL_Thread.cpp | 2 +- lib/PgSQL_Thread.cpp | 2 +- 4 files changed, 53 insertions(+), 29 deletions(-) diff --git a/include/Base_Thread.h b/include/Base_Thread.h index 485fec913..b4fb25c1a 100644 --- a/include/Base_Thread.h +++ b/include/Base_Thread.h @@ -65,7 +65,7 @@ private: template void check_for_invalid_fd(unsigned int n); template - void ProcessAllSessions_SortingSessions(); + void ProcessAllSessions_Partition(); template void ProcessAllMyDS_AfterPoll(); template diff --git a/lib/Base_Thread.cpp b/lib/Base_Thread.cpp index 263ac1004..25155dbf6 100644 --- a/lib/Base_Thread.cpp +++ b/lib/Base_Thread.cpp @@ -11,8 +11,8 @@ // Explicitly instantiate the required template class and member functions template MySQL_Session* Base_Thread::create_new_session_and_client_data_stream(int); template PgSQL_Session* Base_Thread::create_new_session_and_client_data_stream(int); -template void Base_Thread::ProcessAllSessions_SortingSessions(); -template void Base_Thread::ProcessAllSessions_SortingSessions(); +template void Base_Thread::ProcessAllSessions_Partition(); +template void Base_Thread::ProcessAllSessions_Partition(); template void Base_Thread::ProcessAllMyDS_AfterPoll(); template void Base_Thread::ProcessAllMyDS_AfterPoll(); template void Base_Thread::ProcessAllMyDS_BeforePoll(); @@ -230,34 +230,58 @@ void Base_Thread::check_for_invalid_fd(unsigned int n) { } } -// this function was inline in MySQL_Thread::process_all_sessions() + /** - * @brief Sort all sessions based on maximum connection time. - * - * This function iterates through all MySQL sessions and sorts them based on their maximum connection time. - * Sessions with a valid maximum connection time are compared, and if one session has a greater maximum connection - * time than another, their positions in the session list are swapped. The sorting is performed in-place. - * - * @note This function assumes that MySQL sessions and their associated data structures have been initialized - * and are accessible within the MySQL Thread. + * @brief Partition all sessions into three contiguous blocks + * + * Layout produced in mysql_sessions->pdata: + * [0, running_end) block A: running a query against the backend + * [running_end, idle_begin) block B: acquiring a backend (max_connect_time != 0) + * [idle_begin, len) block C: idle (no backend, or holds-conn-but-WAITING_CLIENT_DATA) + * + * Block A is "session is currently driving the backend" — these sessions have a chance + * of releasing the connection at end-of-query (depending on multiplexing eligibility, + * transaction state, etc.), giving block B sessions a fairness chance to acquire it. + * Sessions parked in WAITING_CLIENT_DATA (e.g., idle in a transaction after BEGIN) hold + * the conn but cannot release it until the client sends the next packet, so they live in C. + * + * Classification tests max_connect_time first: it must win over A even when myconn != NULL, + * to catch CHANGING_USER_SERVER on pooled connections and the post-error retry path where + * the old conn hasn't been destroyed yet. + * + * Single O(n) pass, in place. idx walks up, idle_begin walks down, they meet and terminate. */ template -void Base_Thread::ProcessAllSessions_SortingSessions() { - unsigned int a=0; - for (unsigned int n=0; nlen; n++) { - S *sess=(S *)mysql_sessions->index(n); - if (sess->mybe && sess->mybe->server_myds) { - if (sess->mybe->server_myds->max_connect_time) { - S *sess2=(S *)mysql_sessions->index(a); - if (sess2->mybe && sess2->mybe->server_myds && sess2->mybe->server_myds->max_connect_time && sess2->mybe->server_myds->max_connect_time <= sess->mybe->server_myds->max_connect_time) { - // do nothing - } else { - void *p=mysql_sessions->pdata[a]; - mysql_sessions->pdata[a]=mysql_sessions->pdata[n]; - mysql_sessions->pdata[n]=p; - a++; - } +void Base_Thread::ProcessAllSessions_Partition() { + size_t running_end = 0; + size_t idle_begin = mysql_sessions->len; + size_t idx = 0; + + while (idx < idle_begin) { + S* s = static_cast(mysql_sessions->index(idx)); + + const bool has_be = (s->mybe && s->mybe->server_myds); + const bool is_B = has_be && (s->mybe->server_myds->max_connect_time != 0); + const bool is_A = !is_B && has_be && (s->mybe->server_myds->myconn != nullptr) && (s->status != WAITING_CLIENT_DATA); + + if (is_A) { + if (idx != running_end) { + void* p = mysql_sessions->pdata[idx]; + mysql_sessions->pdata[idx] = mysql_sessions->pdata[running_end]; + mysql_sessions->pdata[running_end] = p; + } + ++running_end; + ++idx; + } else if (is_B) { + ++idx; + } else { + --idle_begin; + if (idx != idle_begin) { + void* p = mysql_sessions->pdata[idx]; + mysql_sessions->pdata[idx] = mysql_sessions->pdata[idle_begin]; + mysql_sessions->pdata[idle_begin] = p; } + // do NOT advance idx - re-examine the swapped-in element test } } } diff --git a/lib/MySQL_Thread.cpp b/lib/MySQL_Thread.cpp index 1e4847663..26abae76e 100644 --- a/lib/MySQL_Thread.cpp +++ b/lib/MySQL_Thread.cpp @@ -4453,7 +4453,7 @@ void MySQL_Thread::process_all_sessions() { } #endif // IDLE_THREADS if (sess_sort && mysql_sessions->len > 3) { - ProcessAllSessions_SortingSessions(); + ProcessAllSessions_Partition(); } for (n=0; nlen; n++) { MySQL_Session *sess=(MySQL_Session *)mysql_sessions->index(n); diff --git a/lib/PgSQL_Thread.cpp b/lib/PgSQL_Thread.cpp index e99e3f8c5..94874e39e 100644 --- a/lib/PgSQL_Thread.cpp +++ b/lib/PgSQL_Thread.cpp @@ -3833,7 +3833,7 @@ void PgSQL_Thread::process_all_sessions() { } #endif // IDLE_THREADS if (sess_sort && mysql_sessions->len > 3) { - ProcessAllSessions_SortingSessions(); + ProcessAllSessions_Partition(); } for (n = 0; n < mysql_sessions->len; n++) { PgSQL_Session* sess = (PgSQL_Session*)mysql_sessions->index(n); From 587950de65e84cb200fcdb85f0da96894f1fc5b1 Mon Sep 17 00:00:00 2001 From: Rahim Kanji Date: Wed, 13 May 2026 11:54:05 +0500 Subject: [PATCH 02/12] promote long-waiters in block B by wait-time threshold After the 3-way A/B/C partition, sweep block B once and promote any session whose wait_time exceeds K_WAIT * avg(wait_time) toward the front of the B band. This gives sessions that have been waiting longer than the block average a chance at the next released backend conn, reducing head-of-line starvation when many clients share few conns. - Loop 1 (partition pass) now also accumulates sum_wait and b_count over block B, where wait_time = curtime - CurrentQuery.start_time in microseconds. CurrentQuery.start_time is guaranteed non-zero for any session in B; the (st && curtime > st) guard is defensive. - Loop 2 (O(b_count)) is a one-sided in-place partition: any B session with wt > K_WAIT * avg(wt) is swapped toward running_end. The promoted group is moved together but not sorted internally; same on the non-promoted side. The next pass re-evaluates with fresh wt. --- lib/Base_Thread.cpp | 68 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/lib/Base_Thread.cpp b/lib/Base_Thread.cpp index 25155dbf6..f84dfa4d4 100644 --- a/lib/Base_Thread.cpp +++ b/lib/Base_Thread.cpp @@ -232,27 +232,41 @@ void Base_Thread::check_for_invalid_fd(unsigned int n) { /** - * @brief Partition all sessions into three contiguous blocks + * @brief Partition all sessions into three blocks, then FIFO-bump + * the long-waiters within block B. * - * Layout produced in mysql_sessions->pdata: - * [0, running_end) block A: running a query against the backend - * [running_end, idle_begin) block B: acquiring a backend (max_connect_time != 0) - * [idle_begin, len) block C: idle (no backend, or holds-conn-but-WAITING_CLIENT_DATA) + * Block layout produced in mysql_sessions->pdata: + * [0, running_end) block A - running a query against the backend + * (myconn != NULL, mct == 0, status != WAITING_CLIENT_DATA) + * [running_end, idle_begin) block B - acquiring/awaiting a backend + * (mct != 0) + * [idle_begin, len) block C - idle, or holds-conn-but-WAITING_CLIENT_DATA * - * Block A is "session is currently driving the backend" — these sessions have a chance - * of releasing the connection at end-of-query (depending on multiplexing eligibility, - * transaction state, etc.), giving block B sessions a fairness chance to acquire it. - * Sessions parked in WAITING_CLIENT_DATA (e.g., idle in a transaction after BEGIN) hold - * the conn but cannot release it until the client sends the next packet, so they live in C. + * Block A drives the backend and may release its conn at end-of-query, giving + * block B sessions a fairness chance to acquire it. Sessions parked in + * WAITING_CLIENT_DATA (idle in a transaction after BEGIN) hold the conn but + * cannot release it until the client sends the next packet, so they live in C. * - * Classification tests max_connect_time first: it must win over A even when myconn != NULL, - * to catch CHANGING_USER_SERVER on pooled connections and the post-error retry path where - * the old conn hasn't been destroyed yet. + * Classification tests max_connect_time first: it must win over A even when + * myconn != NULL, to catch CHANGING_USER_SERVER on pooled connections and the + * post-error retry path where the old conn hasn't been destroyed yet. + * + * Loop 1 (partition + stats): single O(n) pass, in place. idx walks up, + * idle_begin walks down, they meet and terminate. While partitioning, we + * also accumulate sum_wait and b_count over block B, where + * wt = curtime - CurrentQuery.start_time (microseconds) + * + * Loop 2 (O(b_count)): promote sessions in B with + * wt > K_WAIT * avg(wt) + * to the front of the B band, preserving their relative pdata order. This + * ensures the longer-waiting sessions in B get a chance at the next released + * backend conn before the newer arrivals do. * - * Single O(n) pass, in place. idx walks up, idle_begin walks down, they meet and terminate. */ template void Base_Thread::ProcessAllSessions_Partition() { + unsigned long long sum_wait = 0; + size_t b_count = 0; size_t running_end = 0; size_t idle_begin = mysql_sessions->len; size_t idx = 0; @@ -273,6 +287,10 @@ void Base_Thread::ProcessAllSessions_Partition() { ++running_end; ++idx; } else if (is_B) { + const unsigned long long st = s->CurrentQuery.start_time; + const unsigned long long wt = (st && curtime > st) ? (curtime - st) : 0; + sum_wait += wt; + ++b_count; ++idx; } else { --idle_begin; @@ -284,6 +302,28 @@ void Base_Thread::ProcessAllSessions_Partition() { // do NOT advance idx - re-examine the swapped-in element test } } + + // promote sessions with wait_time > K * avg(wait_time) + // to the front of B + constexpr double K_WAIT = 1.1; + if (b_count > 1) { + const unsigned long long avg_wait = sum_wait / b_count; + const unsigned long long threshold = static_cast(static_cast(avg_wait) * K_WAIT); + size_t a = running_end; + for (size_t n = running_end; n < idle_begin; ++n) { + S * sn = static_cast(mysql_sessions->pdata[n]); + const unsigned long long st = sn->CurrentQuery.start_time; + const unsigned long long wt = (st && curtime > st) ? (curtime - st) : 0; + if (wt > threshold) { + if (a != n) { + void* p = mysql_sessions->pdata[a]; + mysql_sessions->pdata[a] = mysql_sessions->pdata[n]; + mysql_sessions->pdata[n] = p; + } + ++a; + } + } + } } // this function was inline in MySQL_Thread::run() From 0e09ec59e946fc7602ed615479f24e86732a26b7 Mon Sep 17 00:00:00 2001 From: Rahim Kanji Date: Thu, 14 May 2026 17:18:31 +0500 Subject: [PATCH 03/12] replace K_WAIT wait-time promotion with single-pass mct sort In ProcessAllSessions_Partition the two-loop K_WAIT * avg(wait_time) promotion is replaced with a single Lomuto-style sweep over the B band using max_connect_time as the ordering key. This is the same algorithm as the pre-PR ProcessAllSessions_SortingSessions, just scoped to the B band that Pass 1 has already isolated. --- lib/Base_Thread.cpp | 61 +++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/lib/Base_Thread.cpp b/lib/Base_Thread.cpp index f84dfa4d4..f4583b9fe 100644 --- a/lib/Base_Thread.cpp +++ b/lib/Base_Thread.cpp @@ -232,8 +232,8 @@ void Base_Thread::check_for_invalid_fd(unsigned int n) { /** - * @brief Partition all sessions into three blocks, then FIFO-bump - * the long-waiters within block B. + * @brief Partition all sessions into three blocks and sort the B band by + * max_connect_time. * * Block layout produced in mysql_sessions->pdata: * [0, running_end) block A - running a query against the backend @@ -251,22 +251,16 @@ void Base_Thread::check_for_invalid_fd(unsigned int n) { * myconn != NULL, to catch CHANGING_USER_SERVER on pooled connections and the * post-error retry path where the old conn hasn't been destroyed yet. * - * Loop 1 (partition + stats): single O(n) pass, in place. idx walks up, - * idle_begin walks down, they meet and terminate. While partitioning, we - * also accumulate sum_wait and b_count over block B, where - * wt = curtime - CurrentQuery.start_time (microseconds) - * - * Loop 2 (O(b_count)): promote sessions in B with - * wt > K_WAIT * avg(wt) - * to the front of the B band, preserving their relative pdata order. This - * ensures the longer-waiting sessions in B get a chance at the next released - * backend conn before the newer arrivals do. + * Pass 1 (partition): single O(n) pass, in place. idx walks up, idle_begin + * walks down, they meet and terminate. * + * Pass 2 (sort): single Lomuto-style sweep over the B band [running_end, + * idle_begin) using max_connect_time as the ordering key. Same algorithm + * as the pre-PR ProcessAllSessions_SortingSessions, scoped to the band + * Pass 1 already isolated. */ template void Base_Thread::ProcessAllSessions_Partition() { - unsigned long long sum_wait = 0; - size_t b_count = 0; size_t running_end = 0; size_t idle_begin = mysql_sessions->len; size_t idx = 0; @@ -287,10 +281,6 @@ void Base_Thread::ProcessAllSessions_Partition() { ++running_end; ++idx; } else if (is_B) { - const unsigned long long st = s->CurrentQuery.start_time; - const unsigned long long wt = (st && curtime > st) ? (curtime - st) : 0; - sum_wait += wt; - ++b_count; ++idx; } else { --idle_begin; @@ -303,23 +293,28 @@ void Base_Thread::ProcessAllSessions_Partition() { } } - // promote sessions with wait_time > K * avg(wait_time) - // to the front of B - constexpr double K_WAIT = 1.1; - if (b_count > 1) { - const unsigned long long avg_wait = sum_wait / b_count; - const unsigned long long threshold = static_cast(static_cast(avg_wait) * K_WAIT); + // Single-pass sweep across the B band [running_end, idle_begin) ordering + // by max_connect_time. Same Lomuto-style sweep as the pre-PR + // ProcessAllSessions_SortingSessions, scoped to the B band Pass 1 just + // isolated. + // + // Invariant: every session in [running_end, idle_begin) is a B session, + // so s->mybe->server_myds->max_connect_time is non-zero and no nullptr + // guards are needed. + if (idle_begin > running_end + 1) { size_t a = running_end; for (size_t n = running_end; n < idle_begin; ++n) { - S * sn = static_cast(mysql_sessions->pdata[n]); - const unsigned long long st = sn->CurrentQuery.start_time; - const unsigned long long wt = (st && curtime > st) ? (curtime - st) : 0; - if (wt > threshold) { - if (a != n) { - void* p = mysql_sessions->pdata[a]; - mysql_sessions->pdata[a] = mysql_sessions->pdata[n]; - mysql_sessions->pdata[n] = p; - } + S* sess = static_cast(mysql_sessions->pdata[n]); + S* sess2 = static_cast(mysql_sessions->pdata[a]); + const unsigned long long mct_n = sess->mybe->server_myds->max_connect_time; + const unsigned long long mct_a = sess2->mybe->server_myds->max_connect_time; + if (mct_a <= mct_n) { + // session at 'a' already has earlier-or-equal mct - leave it + } + else { + void* p = mysql_sessions->pdata[a]; + mysql_sessions->pdata[a] = mysql_sessions->pdata[n]; + mysql_sessions->pdata[n] = p; ++a; } } From a282b746304720cf45e5128f4b72e53657fa948a Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Mon, 18 May 2026 08:29:34 +0000 Subject: [PATCH 04/12] build: enable -fno-omit-frame-pointer for usable perf call graphs Add -fno-omit-frame-pointer to the optimized build flags so perf --call-graph fp produces reliable call stacks across proxysql functions. Runtime cost well under 1% in our measurements; DWARF unwinding (the only alternative without FPs) is both slower and prone to truncation / unresolved frames in production traces. Enables clean perf profiles and reliable bpftrace ustack() output without any further build-time changes. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d24e1ae66..06f5fd4ef 100644 --- a/Makefile +++ b/Makefile @@ -132,7 +132,7 @@ export SOURCE_DATE_EPOCH ### rebuild SQLite with -USQLITE_ENABLE_MEMORY_MANAGEMENT in deps/Makefile O0 := -O0 -O2 := -O2 +O2 := -O2 -fno-omit-frame-pointer O1 := -O1 O3 := -O3 -mtune=native From 314280034d7d456ad4de04a8adcb5e01593cfe36 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Mon, 18 May 2026 08:29:34 +0000 Subject: [PATCH 05/12] fix(pgsql,mysql): correct ERR_clear_error placement in get_sslstatus (#5792) The previous code in get_sslstatus() called ERR_clear_error() unconditionally after every SSL operation: int err = SSL_get_error(ssl, n); ERR_clear_error(); // <- always This is wrong on two axes. (1) The OpenSSL idiom is to clear the error queue *before* an SSL op so it is empty *prior to* the call; clearing *after* SSL_get_error() loses any details for the current error path and does not protect subsequent ops. (2) SSL_ERROR_NONE, SSL_ERROR_WANT_READ, SSL_ERROR_WANT_WRITE do not place anything on the queue, so clearing on those paths is pure overhead -- visible in profiles under sustained TLS workloads. Remove the unconditional clear; drain the queue only on the actual error classifications (ZERO_RETURN / SYSCALL / SSL / default) so the next SSL op on this thread starts clean. Applied symmetrically in lib/PgSQL_Data_Stream.cpp and lib/mysql_data_stream.cpp. --- lib/PgSQL_Data_Stream.cpp | 10 +++++++++- lib/mysql_data_stream.cpp | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/lib/PgSQL_Data_Stream.cpp b/lib/PgSQL_Data_Stream.cpp index f5a9e0fc6..86b03eb89 100644 --- a/lib/PgSQL_Data_Stream.cpp +++ b/lib/PgSQL_Data_Stream.cpp @@ -108,8 +108,13 @@ static void __dump_pkt(const char* func, unsigned char* _ptr, unsigned int len) static enum pgsql_sslstatus get_sslstatus(SSL* ssl, int n) { + // See issue #5792. + // SSL_get_error() classifies based on the return code + SSL_want() state, not on the + // thread-local OpenSSL error queue. For SSL_ERROR_NONE / WANT_READ / WANT_WRITE no error is + // pushed onto the queue, so there is nothing to clear. Only the actual error classifications + // (ZERO_RETURN / SYSCALL / SSL / default) need the queue drained so the next SSL op on this + // thread starts clean. int err = SSL_get_error(ssl, n); - ERR_clear_error(); switch (err) { case SSL_ERROR_NONE: return PGSQL_SSLSTATUS_OK; @@ -119,6 +124,9 @@ static enum pgsql_sslstatus get_sslstatus(SSL* ssl, int n) case SSL_ERROR_ZERO_RETURN: case SSL_ERROR_SYSCALL: default: + // drain the queue; any consumer that wanted the details should have read them + // before returning to this point + while (ERR_get_error()) { /* discard */ } return PGSQL_SSLSTATUS_FAIL; } } diff --git a/lib/mysql_data_stream.cpp b/lib/mysql_data_stream.cpp index fcdbfed6b..cb64a0278 100644 --- a/lib/mysql_data_stream.cpp +++ b/lib/mysql_data_stream.cpp @@ -174,8 +174,13 @@ static void __dump_pkt(const char *func, unsigned char *_ptr, unsigned int len) static enum sslstatus get_sslstatus(SSL* ssl, int n) { + // See issue #5792. + // SSL_get_error() classifies based on the return code + SSL_want() state, not on the + // thread-local OpenSSL error queue. For SSL_ERROR_NONE / WANT_READ / WANT_WRITE no error is + // pushed onto the queue, so there is nothing to clear. Only the actual error classifications + // (ZERO_RETURN / SYSCALL / SSL / default) need the queue drained so the next SSL op on this + // thread starts clean. int err = SSL_get_error(ssl, n); - ERR_clear_error(); switch (err) { case SSL_ERROR_NONE: return SSLSTATUS_OK; @@ -185,6 +190,9 @@ static enum sslstatus get_sslstatus(SSL* ssl, int n) case SSL_ERROR_ZERO_RETURN: case SSL_ERROR_SYSCALL: default: + // drain the queue; any consumer that wanted the details should have read them + // before returning to this point + while (ERR_get_error()) { /* discard */ } return SSLSTATUS_FAIL; } } From e5beb0d65a7ec346db3ea9d62024cb0da323a091 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Mon, 18 May 2026 08:31:36 +0000 Subject: [PATCH 06/12] fix(pgsql,mysql): bounded local connection cache 1-in-N (#5791) The per-thread cached_connections cache in PgSQL_Thread/MySQL_Thread greedily kept every released backend conn local to the thread that released it. At high client count this self-reinforced into one thread hoarding all backend conns while sibling worker threads starved on the shared pool, manifesting as a throughput cliff at ~200 clients with 2 worker threads and a 50-conn backend pool. Change push_MyConn_local() to cache only 1 release in every N (N = mysql_threads / pgsql_threads); push the rest straight to the shared HGM pool. At N=1 (single worker) the behaviour is unchanged. At N>1 the cache still amortizes the bulk of the HGM wrlock cost while ensuring released conns flow to peer threads regularly enough that the pool is never drained on one side. See #5791 for the empirical analysis. Mechanism eliminates the 2-thread cliff at c=200 and recovers ~80% throughput at c=200, ~50% at c=1000 vs the always-cache baseline. --- include/MySQL_Thread.h | 1 + include/PgSQL_Thread.h | 1 + lib/MySQL_Thread.cpp | 17 +++++++++++++---- lib/PgSQL_Thread.cpp | 19 +++++++++++++------ 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/include/MySQL_Thread.h b/include/MySQL_Thread.h index 5b6b1a723..2a54cb61d 100644 --- a/include/MySQL_Thread.h +++ b/include/MySQL_Thread.h @@ -114,6 +114,7 @@ class __attribute__((aligned(64))) MySQL_Thread : public Base_Thread bool retrieve_gtids_required; // if any of the servers has gtid_port enabled, this needs to be turned on too PtrArray *cached_connections; + unsigned int push_local_counter; // round-robin counter for bounded local caching: cache 1-in-N where N = mysql_threads #ifdef IDLE_THREADS struct epoll_event events[MY_EPOLL_THREAD_MAXEVENTS]; diff --git a/include/PgSQL_Thread.h b/include/PgSQL_Thread.h index 56ce0dbe2..21b94621a 100644 --- a/include/PgSQL_Thread.h +++ b/include/PgSQL_Thread.h @@ -160,6 +160,7 @@ private: //bool maintenance_loop; PtrArray* cached_connections; + unsigned int push_local_counter; // round-robin counter for bounded local caching: cache 1-in-N where N = pgsql_threads #ifdef IDLE_THREADS struct epoll_event events[MY_EPOLL_THREAD_MAXEVENTS]; diff --git a/lib/MySQL_Thread.cpp b/lib/MySQL_Thread.cpp index 26abae76e..ee398508c 100644 --- a/lib/MySQL_Thread.cpp +++ b/lib/MySQL_Thread.cpp @@ -4784,6 +4784,7 @@ MySQL_Thread::MySQL_Thread() { pthread_mutex_init(&thread_mutex,NULL); my_idle_conns=NULL; cached_connections=NULL; + push_local_counter=0; mysql_sessions=NULL; mirror_queue_mysql_sessions=NULL; mirror_queue_mysql_sessions_cache=NULL; @@ -6466,14 +6467,22 @@ MySQL_Connection * MySQL_Thread::get_MyConn_local(unsigned int _hid, MySQL_Sessi * @param c Pointer to the MySQL_Connection object to be pushed to the local connection pool. */ void MySQL_Thread::push_MyConn_local(MySQL_Connection *c) { - MySrvC *mysrvc=NULL; - mysrvc=(MySrvC *)c->parent; + // Bounded local cache: cache 1-in-N releases (N = mysql_threads), push the + // rest to the shared HGM pool so peer workers can pick them up. + // At N=1 always cache (no sibling to share with). + // Rationale: avoids the connection-hoarding behavior that starved sibling + // workers at high client count, while preserving most of the lock-amortization + // benefit at lower client counts. + MySrvC *mysrvc=(MySrvC *)c->parent; // reset insert_id #1093 c->mysql->insert_id = 0; if (mysrvc->get_status() == MYSQL_SERVER_STATUS_ONLINE) { if (c->async_state_machine==ASYNC_IDLE) { - cached_connections->add(c); - return; // all went well + unsigned int n = (GloMTH && GloMTH->num_threads > 0) ? GloMTH->num_threads : 1; + if ((push_local_counter++ % n) == 0) { + cached_connections->add(c); + return; + } } } MyHGM->push_MyConn_to_pool(c); diff --git a/lib/PgSQL_Thread.cpp b/lib/PgSQL_Thread.cpp index 94874e39e..b0aea0aed 100644 --- a/lib/PgSQL_Thread.cpp +++ b/lib/PgSQL_Thread.cpp @@ -4206,6 +4206,7 @@ PgSQL_Thread::PgSQL_Thread() { pthread_mutex_init(&thread_mutex, NULL); my_idle_conns = NULL; cached_connections = NULL; + push_local_counter = 0; mysql_sessions = NULL; mirror_queue_mysql_sessions = NULL; mirror_queue_mysql_sessions_cache = NULL; @@ -5819,14 +5820,20 @@ PgSQL_Connection* PgSQL_Thread::get_MyConn_local(unsigned int _hid, PgSQL_Sessio } void PgSQL_Thread::push_MyConn_local(PgSQL_Connection * c) { - PgSQL_SrvC* mysrvc = NULL; - mysrvc = (PgSQL_SrvC*)c->parent; - // reset insert_id #1093 - //c->pgsql->insert_id = 0; + // Bounded local cache: cache 1-in-N releases (N = pgsql_threads), push the + // rest to the shared HGM pool so peer workers can pick them up. + // At N=1 always cache (no sibling to share with). + // Rationale: avoids the connection-hoarding behavior that starved sibling + // workers at high client count, while preserving most of the lock-amortization + // benefit at lower client counts. + PgSQL_SrvC* mysrvc = (PgSQL_SrvC*)c->parent; if (mysrvc->status == MYSQL_SERVER_STATUS_ONLINE) { if (c->async_state_machine == ASYNC_IDLE) { - cached_connections->add(c); - return; // all went well + unsigned int n = (GloPTH && GloPTH->num_threads > 0) ? GloPTH->num_threads : 1; + if ((push_local_counter++ % n) == 0) { + cached_connections->add(c); + return; + } } } PgHGM->push_MyConn_to_pool(c); From 25d9ea8c4cd9949e62908d669bab1da20ca839b1 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Mon, 18 May 2026 08:31:47 +0000 Subject: [PATCH 07/12] perf: remove Pass 2 sort from ProcessAllSessions_Partition The Lomuto-style sort of the B band (sessions waiting on a backend) by max_connect_time, added on this branch, was measured to be a net loss for throughput under sustained workloads: at 1 worker thread, 500 clients, 50-conn backend pool, 4 KB rows under SSL, removing the sort gained ~12% qps and ~20% average latency. The earlier Pass 1 (3-way partition into running / waiting / idle blocks) is kept and remains net beneficial on its own (block A is processed first in the outer loop, so end-of-query conn releases become available to block B waiters in the same iteration). If the sort is reintroduced later for tail-latency-sensitive workloads, it should be gated by an explicit starvation-age signal, not run unconditionally on every outer iteration. --- lib/Base_Thread.cpp | 43 +++++++------------------------------------ 1 file changed, 7 insertions(+), 36 deletions(-) diff --git a/lib/Base_Thread.cpp b/lib/Base_Thread.cpp index f4583b9fe..278e1ea11 100644 --- a/lib/Base_Thread.cpp +++ b/lib/Base_Thread.cpp @@ -232,8 +232,7 @@ void Base_Thread::check_for_invalid_fd(unsigned int n) { /** - * @brief Partition all sessions into three blocks and sort the B band by - * max_connect_time. + * @brief Partition all sessions into three blocks by backend state. * * Block layout produced in mysql_sessions->pdata: * [0, running_end) block A - running a query against the backend @@ -251,13 +250,12 @@ void Base_Thread::check_for_invalid_fd(unsigned int n) { * myconn != NULL, to catch CHANGING_USER_SERVER on pooled connections and the * post-error retry path where the old conn hasn't been destroyed yet. * - * Pass 1 (partition): single O(n) pass, in place. idx walks up, idle_begin - * walks down, they meet and terminate. - * - * Pass 2 (sort): single Lomuto-style sweep over the B band [running_end, - * idle_begin) using max_connect_time as the ordering key. Same algorithm - * as the pre-PR ProcessAllSessions_SortingSessions, scoped to the band - * Pass 1 already isolated. + * Single O(n) pass, in place. idx walks up, idle_begin walks down, they meet + * and terminate. A previous Lomuto-style sort of the B band by max_connect_time + * was removed: measurement showed it hurt throughput by ~12% at 500 clients / + * 50-conn pool under SSL, without a corresponding tail-latency benefit. If + * reintroduced, it should be gated on an explicit starvation-age signal rather + * than run unconditionally on every iteration. */ template void Base_Thread::ProcessAllSessions_Partition() { @@ -292,33 +290,6 @@ void Base_Thread::ProcessAllSessions_Partition() { // do NOT advance idx - re-examine the swapped-in element test } } - - // Single-pass sweep across the B band [running_end, idle_begin) ordering - // by max_connect_time. Same Lomuto-style sweep as the pre-PR - // ProcessAllSessions_SortingSessions, scoped to the B band Pass 1 just - // isolated. - // - // Invariant: every session in [running_end, idle_begin) is a B session, - // so s->mybe->server_myds->max_connect_time is non-zero and no nullptr - // guards are needed. - if (idle_begin > running_end + 1) { - size_t a = running_end; - for (size_t n = running_end; n < idle_begin; ++n) { - S* sess = static_cast(mysql_sessions->pdata[n]); - S* sess2 = static_cast(mysql_sessions->pdata[a]); - const unsigned long long mct_n = sess->mybe->server_myds->max_connect_time; - const unsigned long long mct_a = sess2->mybe->server_myds->max_connect_time; - if (mct_a <= mct_n) { - // session at 'a' already has earlier-or-equal mct - leave it - } - else { - void* p = mysql_sessions->pdata[a]; - mysql_sessions->pdata[a] = mysql_sessions->pdata[n]; - mysql_sessions->pdata[n] = p; - ++a; - } - } - } } // this function was inline in MySQL_Thread::run() From 5b5b9373e3606a43f5447408ac9ad305595cdc23 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Mon, 18 May 2026 08:33:08 +0000 Subject: [PATCH 08/12] feat(pgsql,mysql): gate ProcessAllSessions_Partition by pool-acquire NULL ratio ProcessAllSessions_Partition (Pass 1, the 3-way A/B/C session classification) is cheap and beneficial when block B (sessions waiting for a backend) is non-empty: it lets block A finish queries and release conns to block B waiters in the same outer iteration. When no sessions are contending for backends, partition is unnecessary work. Detect the contention state with a per-worker counter pair tracked at the get_MyConn_from_pool() call site: every call increments partition_pool_attempts; calls that returned NULL (couldn't acquire, session must wait) also increment partition_pool_nulls. At the top of each process_all_sessions outer iteration, the worker computes the NULL ratio and updates a hysteresis state machine: 3 consecutive iterations with ratio >= 5% flips partition_active to true; 3 consecutive iterations below the threshold flips it back. Mechanism: * Cheapest possible signal: 1 int increment per pool acquire, 1 branch per outer iteration. * Hysteresis (3 consecutive iterations either direction) filters transient bursts without needing min-sample / asymmetric-threshold tuning. * Single threshold 5% works in both directions by virtue of the streak filter -- the ratio has to be sustained near the boundary for flapping to occur, which doesn't match any realistic workload. Empirical verification at 1 worker thread, 500c clients, 4 KB rows, 50-conn backend pool, SSL on, 120 s: 20c (no contention): 1576 tps, 13 ms avg latency, gate stays OFF 100c (some): 1487 tps, 67 ms, gate stays ON 500c (heavy): 1487 tps, 336 ms, gate stays ON The gate transitions automatically; no admin variable to learn. Applied symmetrically to MySQL_Thread / MySQL_Session and PgSQL_Thread / PgSQL_Session. See #5791 for analysis. --- include/MySQL_Thread.h | 14 ++++++++++++++ include/PgSQL_Thread.h | 16 ++++++++++++++++ lib/MySQL_Session.cpp | 1 + lib/MySQL_Thread.cpp | 31 +++++++++++++++++++++++++++++-- lib/PgSQL_Session.cpp | 1 + lib/PgSQL_Thread.cpp | 28 +++++++++++++++++++++++++++- 6 files changed, 88 insertions(+), 3 deletions(-) diff --git a/include/MySQL_Thread.h b/include/MySQL_Thread.h index 2a54cb61d..73c460b8a 100644 --- a/include/MySQL_Thread.h +++ b/include/MySQL_Thread.h @@ -116,6 +116,12 @@ class __attribute__((aligned(64))) MySQL_Thread : public Base_Thread PtrArray *cached_connections; unsigned int push_local_counter; // round-robin counter for bounded local caching: cache 1-in-N where N = mysql_threads + // State for the ratio-based partition gate. See PgSQL_Thread for the equivalent. + unsigned int partition_pool_attempts; + unsigned int partition_pool_nulls; + unsigned int partition_streak; + bool partition_active; + #ifdef IDLE_THREADS struct epoll_event events[MY_EPOLL_THREAD_MAXEVENTS]; int efd; @@ -160,6 +166,14 @@ class __attribute__((aligned(64))) MySQL_Thread : public Base_Thread public: + // Recorded at the get_MyConn_from_pool() call site by sessions inside this + // worker. Consumed and reset at the top of each process_all_sessions outer + // iteration to compute the ratio-based partition gate. + inline void note_pool_attempt(bool was_null) { + ++partition_pool_attempts; + if (was_null) ++partition_pool_nulls; + } + void *gen_args; // this is a generic pointer to create any sort of structure ProxySQL_Poll mypolls; diff --git a/include/PgSQL_Thread.h b/include/PgSQL_Thread.h index 21b94621a..0faf2a824 100644 --- a/include/PgSQL_Thread.h +++ b/include/PgSQL_Thread.h @@ -162,6 +162,14 @@ private: PtrArray* cached_connections; unsigned int push_local_counter; // round-robin counter for bounded local caching: cache 1-in-N where N = pgsql_threads + // State for the ratio-based partition gate. Counters are incremented at the + // get_MyConn_from_pool() call site within process_all_sessions; consumed and + // reset at the top of the next outer iteration. + unsigned int partition_pool_attempts; + unsigned int partition_pool_nulls; + unsigned int partition_streak; + bool partition_active; + #ifdef IDLE_THREADS struct epoll_event events[MY_EPOLL_THREAD_MAXEVENTS]; int efd; @@ -214,6 +222,14 @@ protected: public: + // Recorded at the get_MyConn_from_pool() call site by sessions inside this + // worker. Consumed and reset at the top of each process_all_sessions outer + // iteration to compute the ratio-based partition gate. + inline void note_pool_attempt(bool was_null) { + ++partition_pool_attempts; + if (was_null) ++partition_pool_nulls; + } + void* gen_args; // this is a generic pointer to create any sort of structure ProxySQL_Poll mypolls; diff --git a/lib/MySQL_Session.cpp b/lib/MySQL_Session.cpp index 957c1c25a..9a6b866a5 100644 --- a/lib/MySQL_Session.cpp +++ b/lib/MySQL_Session.cpp @@ -7820,6 +7820,7 @@ void MySQL_Session::handler___client_DSS_QUERY_SENT___server_DSS_NOT_INITIALIZED } else { mc=MyHGM->get_MyConn_from_pool(mybe->hostgroup_id, this, (session_fast_forward || qpo->create_new_conn), NULL, 0, (int)qpo->max_lag_ms); } + thread->note_pool_attempt(mc == NULL); #ifdef STRESSTEST_POOL if (mc && (loops < NUM_SLOW_LOOPS - 1)) { if (mc->mysql) { diff --git a/lib/MySQL_Thread.cpp b/lib/MySQL_Thread.cpp index ee398508c..aa1d613b4 100644 --- a/lib/MySQL_Thread.cpp +++ b/lib/MySQL_Thread.cpp @@ -4453,7 +4453,29 @@ void MySQL_Thread::process_all_sessions() { } #endif // IDLE_THREADS if (sess_sort && mysql_sessions->len > 3) { - ProcessAllSessions_Partition(); + // Activate partition only when the pool acquire NULL-ratio + // (computed from get_MyConn_from_pool calls within the previous + // iteration) has been above HI_NUM/HI_DEN for STREAK consecutive + // iterations. Disable symmetrically once the ratio has been below + // HI_NUM/HI_DEN for STREAK consecutive iterations. + const unsigned int HI_NUM = 1, HI_DEN = 20; // 5% + const unsigned int STREAK = 3; + bool stressed = (partition_pool_attempts > 0) + && (partition_pool_nulls * HI_DEN >= partition_pool_attempts * HI_NUM); + if (stressed == partition_active) { + partition_streak = 0; + } else { + if (++partition_streak >= STREAK) { + partition_active = stressed; + partition_streak = 0; + } + } + partition_pool_attempts = 0; + partition_pool_nulls = 0; + + if (partition_active) { + ProcessAllSessions_Partition(); + } } for (n=0; nlen; n++) { MySQL_Session *sess=(MySQL_Session *)mysql_sessions->index(n); @@ -4785,6 +4807,10 @@ MySQL_Thread::MySQL_Thread() { my_idle_conns=NULL; cached_connections=NULL; push_local_counter=0; + partition_pool_attempts=0; + partition_pool_nulls=0; + partition_streak=0; + partition_active=false; mysql_sessions=NULL; mirror_queue_mysql_sessions=NULL; mirror_queue_mysql_sessions_cache=NULL; @@ -6473,7 +6499,8 @@ void MySQL_Thread::push_MyConn_local(MySQL_Connection *c) { // Rationale: avoids the connection-hoarding behavior that starved sibling // workers at high client count, while preserving most of the lock-amortization // benefit at lower client counts. - MySrvC *mysrvc=(MySrvC *)c->parent; + MySrvC *mysrvc=NULL; + mysrvc=(MySrvC *)c->parent; // reset insert_id #1093 c->mysql->insert_id = 0; if (mysrvc->get_status() == MYSQL_SERVER_STATUS_ONLINE) { diff --git a/lib/PgSQL_Session.cpp b/lib/PgSQL_Session.cpp index c63ac7c43..d0101d256 100644 --- a/lib/PgSQL_Session.cpp +++ b/lib/PgSQL_Session.cpp @@ -5383,6 +5383,7 @@ void PgSQL_Session::handler___client_DSS_QUERY_SENT___server_DSS_NOT_INITIALIZED else { mc = PgHGM->get_MyConn_from_pool(mybe->hostgroup_id, this, (session_fast_forward || qpo->create_new_conn), NULL, 0, (int)qpo->max_lag_ms); } + thread->note_pool_attempt(mc == NULL); #ifdef STRESSTEST_POOL if (mc && (loops < NUM_SLOW_LOOPS - 1)) { if (mc->pgsql) { diff --git a/lib/PgSQL_Thread.cpp b/lib/PgSQL_Thread.cpp index b0aea0aed..9b0b6c2fe 100644 --- a/lib/PgSQL_Thread.cpp +++ b/lib/PgSQL_Thread.cpp @@ -3833,7 +3833,29 @@ void PgSQL_Thread::process_all_sessions() { } #endif // IDLE_THREADS if (sess_sort && mysql_sessions->len > 3) { - ProcessAllSessions_Partition(); + // Activate partition only when the pool acquire NULL-ratio + // (computed from get_MyConn_from_pool calls within the previous + // iteration) has been above HI_NUM/HI_DEN for STREAK consecutive + // iterations. Disable symmetrically once the ratio has been below + // HI_NUM/HI_DEN for STREAK consecutive iterations. + const unsigned int HI_NUM = 1, HI_DEN = 20; // 5% + const unsigned int STREAK = 3; + bool stressed = (partition_pool_attempts > 0) + && (partition_pool_nulls * HI_DEN >= partition_pool_attempts * HI_NUM); + if (stressed == partition_active) { + partition_streak = 0; + } else { + if (++partition_streak >= STREAK) { + partition_active = stressed; + partition_streak = 0; + } + } + partition_pool_attempts = 0; + partition_pool_nulls = 0; + + if (partition_active) { + ProcessAllSessions_Partition(); + } } for (n = 0; n < mysql_sessions->len; n++) { PgSQL_Session* sess = (PgSQL_Session*)mysql_sessions->index(n); @@ -4207,6 +4229,10 @@ PgSQL_Thread::PgSQL_Thread() { my_idle_conns = NULL; cached_connections = NULL; push_local_counter = 0; + partition_pool_attempts = 0; + partition_pool_nulls = 0; + partition_streak = 0; + partition_active = false; mysql_sessions = NULL; mirror_queue_mysql_sessions = NULL; mirror_queue_mysql_sessions_cache = NULL; From dcd143d39ef454c54ea54b814335169bf5cf66ad Mon Sep 17 00:00:00 2001 From: Rahim Kanji Date: Wed, 20 May 2026 12:01:36 +0500 Subject: [PATCH 09/12] revert: drop unrelated push_MyConn_local debug-leftover split The two-line split of `MySrvC *mysrvc=(MySrvC *)c->parent;` into a NULL-init plus assignment landed in 5b5b937 as an unrelated debug- session artifact. Reverting to the pre-5b5b937 single-line form. --- lib/MySQL_Thread.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/MySQL_Thread.cpp b/lib/MySQL_Thread.cpp index aa1d613b4..6510a7155 100644 --- a/lib/MySQL_Thread.cpp +++ b/lib/MySQL_Thread.cpp @@ -6499,8 +6499,7 @@ void MySQL_Thread::push_MyConn_local(MySQL_Connection *c) { // Rationale: avoids the connection-hoarding behavior that starved sibling // workers at high client count, while preserving most of the lock-amortization // benefit at lower client counts. - MySrvC *mysrvc=NULL; - mysrvc=(MySrvC *)c->parent; + MySrvC *mysrvc=(MySrvC *)c->parent; // reset insert_id #1093 c->mysql->insert_id = 0; if (mysrvc->get_status() == MYSQL_SERVER_STATUS_ONLINE) { From 878ba3b1e637d8f592140ec1d7dc9b1a25a5697a Mon Sep 17 00:00:00 2001 From: Rahim Kanji Date: Wed, 20 May 2026 12:09:28 +0500 Subject: [PATCH 10/12] refactor(partition-gate): consolidate into Base_Thread; drain every iteration; widen ratio to uint64_t Three closely related changes: * Move the four partition_* members and note_pool_attempt() from MySQL_Thread / PgSQL_Thread into Base_Thread. Eliminates 22 lines of duplicated gate logic across the two protocol implementations. * Run update_partition_gate() once per process_all_sessions outer iteration, before the sess_sort / len > 3 / partition_active check. Per-tick counters can no longer accumulate stale across ticks where partition is skipped (sess_sort off, IDLE_THREADS maintenance thread, or session count below threshold), which in 5b5b937 could cause spurious gate flips after sess_sort toggled back on. * Widen the ratio comparison to uint64_t so the multiplications cannot overflow unsigned int (reachable in combination with the staleness issue above). Function-local HI_NUM/HI_DEN/STREAK locals are promoted to named static constexpr class members (PARTITION_GATE_NULL_RATIO_NUM/DEN/STREAK). No new behavior introduced -- purely correctness fixes plus a behavior-preserving refactor. --- include/Base_Thread.h | 31 ++++++++++++++++++++++++++++++- include/MySQL_Thread.h | 14 -------------- include/PgSQL_Thread.h | 16 ---------------- lib/Base_Thread.cpp | 18 ++++++++++++++++++ lib/MySQL_Thread.cpp | 31 +++---------------------------- lib/PgSQL_Thread.cpp | 31 +++---------------------------- 6 files changed, 54 insertions(+), 87 deletions(-) diff --git a/include/Base_Thread.h b/include/Base_Thread.h index b4fb25c1a..a26c0c26d 100644 --- a/include/Base_Thread.h +++ b/include/Base_Thread.h @@ -47,7 +47,36 @@ public: private: bool maintenance_loop; - public: + + // Partition-gate state. note_pool_attempt() bumps the counters from the + // get_MyConn_from_pool() call site; update_partition_gate() consumes them + // once per outer process_all_sessions iteration. Single-threaded per worker. + unsigned int partition_pool_attempts = 0; + unsigned int partition_pool_nulls = 0; + unsigned int partition_streak = 0; + bool partition_active = false; + +public: + // Gate thresholds: NULL-ratio (NUM/DEN) classifies a tick as "stressed"; + // STREAK is the number of consecutive disagreeing ticks required to flip + // the gate state (hysteresis filter against transient bursts). + static constexpr unsigned int PARTITION_GATE_NULL_RATIO_NUM = 1; + static constexpr unsigned int PARTITION_GATE_NULL_RATIO_DEN = 20; // 5% + static constexpr unsigned int PARTITION_GATE_STREAK = 3; + + // Called by sessions inside this worker at the get_MyConn_from_pool() + // call site to feed the gate. + inline void note_pool_attempt(bool was_null) { + ++partition_pool_attempts; + if (was_null) ++partition_pool_nulls; + } + + // Runs the hysteresis state machine from per-tick counters, resets them, + // and returns whether the partition pass should run. MUST be called every + // outer iteration of process_all_sessions (even when the partition path + // is otherwise skipped) so the counters don't accumulate stale. + bool update_partition_gate(); + unsigned long long curtime; unsigned long long last_move_to_idle_thread_time; bool epoll_thread; diff --git a/include/MySQL_Thread.h b/include/MySQL_Thread.h index 73c460b8a..2a54cb61d 100644 --- a/include/MySQL_Thread.h +++ b/include/MySQL_Thread.h @@ -116,12 +116,6 @@ class __attribute__((aligned(64))) MySQL_Thread : public Base_Thread PtrArray *cached_connections; unsigned int push_local_counter; // round-robin counter for bounded local caching: cache 1-in-N where N = mysql_threads - // State for the ratio-based partition gate. See PgSQL_Thread for the equivalent. - unsigned int partition_pool_attempts; - unsigned int partition_pool_nulls; - unsigned int partition_streak; - bool partition_active; - #ifdef IDLE_THREADS struct epoll_event events[MY_EPOLL_THREAD_MAXEVENTS]; int efd; @@ -166,14 +160,6 @@ class __attribute__((aligned(64))) MySQL_Thread : public Base_Thread public: - // Recorded at the get_MyConn_from_pool() call site by sessions inside this - // worker. Consumed and reset at the top of each process_all_sessions outer - // iteration to compute the ratio-based partition gate. - inline void note_pool_attempt(bool was_null) { - ++partition_pool_attempts; - if (was_null) ++partition_pool_nulls; - } - void *gen_args; // this is a generic pointer to create any sort of structure ProxySQL_Poll mypolls; diff --git a/include/PgSQL_Thread.h b/include/PgSQL_Thread.h index 0faf2a824..21b94621a 100644 --- a/include/PgSQL_Thread.h +++ b/include/PgSQL_Thread.h @@ -162,14 +162,6 @@ private: PtrArray* cached_connections; unsigned int push_local_counter; // round-robin counter for bounded local caching: cache 1-in-N where N = pgsql_threads - // State for the ratio-based partition gate. Counters are incremented at the - // get_MyConn_from_pool() call site within process_all_sessions; consumed and - // reset at the top of the next outer iteration. - unsigned int partition_pool_attempts; - unsigned int partition_pool_nulls; - unsigned int partition_streak; - bool partition_active; - #ifdef IDLE_THREADS struct epoll_event events[MY_EPOLL_THREAD_MAXEVENTS]; int efd; @@ -222,14 +214,6 @@ protected: public: - // Recorded at the get_MyConn_from_pool() call site by sessions inside this - // worker. Consumed and reset at the top of each process_all_sessions outer - // iteration to compute the ratio-based partition gate. - inline void note_pool_attempt(bool was_null) { - ++partition_pool_attempts; - if (was_null) ++partition_pool_nulls; - } - void* gen_args; // this is a generic pointer to create any sort of structure ProxySQL_Poll mypolls; diff --git a/lib/Base_Thread.cpp b/lib/Base_Thread.cpp index 278e1ea11..7757fe46c 100644 --- a/lib/Base_Thread.cpp +++ b/lib/Base_Thread.cpp @@ -34,6 +34,24 @@ Base_Thread::Base_Thread() : Base_Thread::~Base_Thread() { }; +bool Base_Thread::update_partition_gate() { + // 64-bit so the multiplications below cannot overflow unsigned int. + const uint64_t attempts = partition_pool_attempts; + const uint64_t nulls = partition_pool_nulls; + partition_pool_attempts = 0; + partition_pool_nulls = 0; + + const bool stressed = (nulls * PARTITION_GATE_NULL_RATIO_DEN + >= attempts * PARTITION_GATE_NULL_RATIO_NUM); + if (stressed == partition_active) { + partition_streak = 0; + } else if (++partition_streak >= PARTITION_GATE_STREAK) { + partition_active = stressed; + partition_streak = 0; + } + return partition_active; +} + template void Base_Thread::register_session(T thr, S _sess, bool up_start) { if (mysql_sessions==NULL) { diff --git a/lib/MySQL_Thread.cpp b/lib/MySQL_Thread.cpp index 6510a7155..8d471c8d1 100644 --- a/lib/MySQL_Thread.cpp +++ b/lib/MySQL_Thread.cpp @@ -4452,30 +4452,9 @@ void MySQL_Thread::process_all_sessions() { sess_sort=false; } #endif // IDLE_THREADS - if (sess_sort && mysql_sessions->len > 3) { - // Activate partition only when the pool acquire NULL-ratio - // (computed from get_MyConn_from_pool calls within the previous - // iteration) has been above HI_NUM/HI_DEN for STREAK consecutive - // iterations. Disable symmetrically once the ratio has been below - // HI_NUM/HI_DEN for STREAK consecutive iterations. - const unsigned int HI_NUM = 1, HI_DEN = 20; // 5% - const unsigned int STREAK = 3; - bool stressed = (partition_pool_attempts > 0) - && (partition_pool_nulls * HI_DEN >= partition_pool_attempts * HI_NUM); - if (stressed == partition_active) { - partition_streak = 0; - } else { - if (++partition_streak >= STREAK) { - partition_active = stressed; - partition_streak = 0; - } - } - partition_pool_attempts = 0; - partition_pool_nulls = 0; - - if (partition_active) { - ProcessAllSessions_Partition(); - } + const bool partition_wanted = update_partition_gate(); + if (sess_sort && mysql_sessions->len > 3 && partition_wanted) { + ProcessAllSessions_Partition(); } for (n=0; nlen; n++) { MySQL_Session *sess=(MySQL_Session *)mysql_sessions->index(n); @@ -4807,10 +4786,6 @@ MySQL_Thread::MySQL_Thread() { my_idle_conns=NULL; cached_connections=NULL; push_local_counter=0; - partition_pool_attempts=0; - partition_pool_nulls=0; - partition_streak=0; - partition_active=false; mysql_sessions=NULL; mirror_queue_mysql_sessions=NULL; mirror_queue_mysql_sessions_cache=NULL; diff --git a/lib/PgSQL_Thread.cpp b/lib/PgSQL_Thread.cpp index 9b0b6c2fe..d701c3d90 100644 --- a/lib/PgSQL_Thread.cpp +++ b/lib/PgSQL_Thread.cpp @@ -3832,30 +3832,9 @@ void PgSQL_Thread::process_all_sessions() { sess_sort = false; } #endif // IDLE_THREADS - if (sess_sort && mysql_sessions->len > 3) { - // Activate partition only when the pool acquire NULL-ratio - // (computed from get_MyConn_from_pool calls within the previous - // iteration) has been above HI_NUM/HI_DEN for STREAK consecutive - // iterations. Disable symmetrically once the ratio has been below - // HI_NUM/HI_DEN for STREAK consecutive iterations. - const unsigned int HI_NUM = 1, HI_DEN = 20; // 5% - const unsigned int STREAK = 3; - bool stressed = (partition_pool_attempts > 0) - && (partition_pool_nulls * HI_DEN >= partition_pool_attempts * HI_NUM); - if (stressed == partition_active) { - partition_streak = 0; - } else { - if (++partition_streak >= STREAK) { - partition_active = stressed; - partition_streak = 0; - } - } - partition_pool_attempts = 0; - partition_pool_nulls = 0; - - if (partition_active) { - ProcessAllSessions_Partition(); - } + const bool partition_wanted = update_partition_gate(); + if (sess_sort && mysql_sessions->len > 3 && partition_wanted) { + ProcessAllSessions_Partition(); } for (n = 0; n < mysql_sessions->len; n++) { PgSQL_Session* sess = (PgSQL_Session*)mysql_sessions->index(n); @@ -4229,10 +4208,6 @@ PgSQL_Thread::PgSQL_Thread() { my_idle_conns = NULL; cached_connections = NULL; push_local_counter = 0; - partition_pool_attempts = 0; - partition_pool_nulls = 0; - partition_streak = 0; - partition_active = false; mysql_sessions = NULL; mirror_queue_mysql_sessions = NULL; mirror_queue_mysql_sessions_cache = NULL; From 53eea80aaf2f637843bb3cde9510be5d1c7fe0a3 Mon Sep 17 00:00:00 2001 From: Rahim Kanji Date: Wed, 20 May 2026 12:14:35 +0500 Subject: [PATCH 11/12] feat(partition-gate): suppress low-volume ticks via min-attempts floor Add PARTITION_GATE_MIN_ATTEMPTS (=4). When a tick has fewer pool acquires than this, leave both gate state and streak counter untouched. Filters the "2/2 NULL = 100% stressed" noise case without drifting the gate OFF on transient idle ticks (which would happen if low- volume ticks were treated as not-stressed). Pairs with the existing mysql_sessions->len > 3 guard at the partition call site. --- include/Base_Thread.h | 3 +++ lib/Base_Thread.cpp | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/include/Base_Thread.h b/include/Base_Thread.h index a26c0c26d..255b19b18 100644 --- a/include/Base_Thread.h +++ b/include/Base_Thread.h @@ -63,6 +63,9 @@ public: static constexpr unsigned int PARTITION_GATE_NULL_RATIO_NUM = 1; static constexpr unsigned int PARTITION_GATE_NULL_RATIO_DEN = 20; // 5% static constexpr unsigned int PARTITION_GATE_STREAK = 3; + // Below this attempt count a tick carries no signal: gate state and + // streak are left untouched. Avoids "2/2 NULL = 100% stressed" noise. + static constexpr unsigned int PARTITION_GATE_MIN_ATTEMPTS = 4; // Called by sessions inside this worker at the get_MyConn_from_pool() // call site to feed the gate. diff --git a/lib/Base_Thread.cpp b/lib/Base_Thread.cpp index 7757fe46c..e5820eafc 100644 --- a/lib/Base_Thread.cpp +++ b/lib/Base_Thread.cpp @@ -41,6 +41,11 @@ bool Base_Thread::update_partition_gate() { partition_pool_attempts = 0; partition_pool_nulls = 0; + // Low-volume ticks carry no signal; leave gate and streak unchanged. + if (attempts < PARTITION_GATE_MIN_ATTEMPTS) { + return partition_active; + } + const bool stressed = (nulls * PARTITION_GATE_NULL_RATIO_DEN >= attempts * PARTITION_GATE_NULL_RATIO_NUM); if (stressed == partition_active) { From 9d9ede15329ef7c3b0cea64dead8d12fbb94413e Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Tue, 26 May 2026 05:30:37 +0000 Subject: [PATCH 12/12] =?UTF-8?q?fix(test):=20sort=20groups.json=20?= =?UTF-8?q?=E2=80=94=20pp1=5Funknown=5Fspoof=20before=20proclist=5Fuse=5Fa?= =?UTF-8?q?fter=5Ffree?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run-with --fix from test/tap/groups/lint_groups_json.py. Unblocks the 'lint' check on PR #5819. --- test/tap/groups/groups.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tap/groups/groups.json b/test/tap/groups/groups.json index 4aaf85e9d..99d41e652 100644 --- a/test/tap/groups/groups.json +++ b/test/tap/groups/groups.json @@ -277,8 +277,8 @@ "reg_test_mariadb_stmt_store_result_async-t" : [ "legacy-g7","mysql-auto_increment_delay_multiplex=0-g2","mysql-multiplexing=false-g2","mysql-query_digests=0-g2","mysql-query_digests_keep_comment=1-g2","mysql84-g2","mysql90-g2","mysql95-g2" ], "reg_test_mariadb_stmt_store_result_libmysql-t" : [ "legacy-g7","mysql-auto_increment_delay_multiplex=0-g2","mysql-multiplexing=false-g2","mysql-query_digests=0-g2","mysql-query_digests_keep_comment=1-g2","mysql84-g2","mysql90-g2","mysql95-g2" ], "reg_test_oversize_first_pkt-t" : [ "legacy-g7","mysql-auto_increment_delay_multiplex=0-g2","mysql-multiplexing=false-g2","mysql-query_digests=0-g2","mysql-query_digests_keep_comment=1-g2","mysql84-g7","mysql90-g2","mysql95-g2" ], - "reg_test_proclist_use_after_free-t" : [ "legacy-g6","mysql-auto_increment_delay_multiplex=0-g1","mysql-multiplexing=false-g1","mysql-query_digests=0-g1","mysql-query_digests_keep_comment=1-g1","mysql84-g6","mysql90-g1","mysql95-g1" ], "reg_test_pp1_unknown_spoof-t" : [ "legacy-g3","mysql-auto_increment_delay_multiplex=0-g3","mysql-multiplexing=false-g3","mysql-query_digests=0-g3","mysql-query_digests_keep_comment=1-g3","mysql84-g3","mysql90-g3","mysql95-g3" ], + "reg_test_proclist_use_after_free-t" : [ "legacy-g6","mysql-auto_increment_delay_multiplex=0-g1","mysql-multiplexing=false-g1","mysql-query_digests=0-g1","mysql-query_digests_keep_comment=1-g1","mysql84-g6","mysql90-g1","mysql95-g1" ], "reg_test_proxy_protocol_oversized_address-t" : [ "mysql84-g6","mysql95-g1" ], "reg_test_sql_calc_found_rows-t" : [ "legacy-g7","mysql-auto_increment_delay_multiplex=0-g2","mysql-multiplexing=false-g2","mysql-query_digests=0-g2","mysql-query_digests_keep_comment=1-g2","mysql84-g2","mysql90-g2","mysql95-g2" ], "reg_test_stmt_close_short_packet-t" : [ "mysql84-g6","mysql95-g1" ],