From 8bc60ed283bf0a9892809a5bf0ea57732b50b532 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Jaramago=20Fern=C3=A1ndez?= Date: Fri, 22 Nov 2024 17:36:24 +0100 Subject: [PATCH] Fix worker threads stalling race condition on RESUME command During a RESUME operation, if a 'MySQL_Thread' is bootstrapping listeners (in `MySQL_Thread::run_BootstrapListener`) and detect that `MySQL_Threads_Handler::bootstrapping_listeners` is 'false', the thread prematurely shuts down its own bootstrapping flag from 'mypolls' (`ProxySQL_Poll::bootstrapping_listeners`). Since this thread wont ever bootstrap its corresponding listening sockets, the other working threads will be stalled waiting on it, eventually triggering the watchdog and crashing the instance. Since the bootstrapping operation is sequential, it's expected that all the threads but the one starting their listening sockets are in an active wait. A sensible delay has been introduced to reduce the overhead of such wait. --- include/ProxySQL_Poll.h | 1 - lib/MySQL_Thread.cpp | 16 +++++----------- lib/PgSQL_Thread.cpp | 31 ++++++++++++------------------- lib/ProxySQL_Poll.cpp | 1 - 4 files changed, 17 insertions(+), 32 deletions(-) diff --git a/include/ProxySQL_Poll.h b/include/ProxySQL_Poll.h index 78520ac17..406c0e993 100644 --- a/include/ProxySQL_Poll.h +++ b/include/ProxySQL_Poll.h @@ -35,7 +35,6 @@ class ProxySQL_Poll { T **myds; unsigned long long *last_recv; unsigned long long *last_sent; - std::atomic bootstrapping_listeners; volatile int pending_listener_add; volatile int pending_listener_del; unsigned int poll_timeout; diff --git a/lib/MySQL_Thread.cpp b/lib/MySQL_Thread.cpp index 5fca5cf23..54c057571 100644 --- a/lib/MySQL_Thread.cpp +++ b/lib/MySQL_Thread.cpp @@ -3127,15 +3127,11 @@ void MySQL_Thread::run_BootstrapListener() { if (n) { poll_listener_add(n); assert(__sync_bool_compare_and_swap(&mypolls.pending_listener_add,n,0)); - } else { - if (GloMTH->bootstrapping_listeners == false) { - // we stop looping - mypolls.bootstrapping_listeners = false; - } } -#ifdef DEBUG - usleep(5+rand()%10); -#endif + // The delay for the active-wait is a fraction of 'poll_timeout'. Since other + // threads may be waiting on poll for further operations, checks are meaningless + // until that timeout expires (other workers make progress). + usleep(std::min(std::max(mysql_thread___poll_timeout/20, 10000), 40000) + (rand() % 2000)); } } @@ -3238,9 +3234,7 @@ __run_skip_1: #endif // IDLE_THREADS pthread_mutex_unlock(&thread_mutex); - if (unlikely(mypolls.bootstrapping_listeners == true)) { - run_BootstrapListener(); - } + run_BootstrapListener(); // flush mysql log file GloMyLogger->flush(); diff --git a/lib/PgSQL_Thread.cpp b/lib/PgSQL_Thread.cpp index a28fbf591..bedbba3dc 100644 --- a/lib/PgSQL_Thread.cpp +++ b/lib/PgSQL_Thread.cpp @@ -2998,26 +2998,19 @@ void PgSQL_Thread::run() { #endif // IDLE_THREADS pthread_mutex_unlock(&thread_mutex); - if (unlikely(mypolls.bootstrapping_listeners == true)) { - while ( // spin here if ... - (n = __sync_add_and_fetch(&mypolls.pending_listener_add, 0)) // there is a new listener to add - || - (GloPTH->bootstrapping_listeners == true) // PgSQL_Thread_Handlers has more listeners to configure - ) { - if (n) { - poll_listener_add(n); - assert(__sync_bool_compare_and_swap(&mypolls.pending_listener_add, n, 0)); - } - else { - if (GloPTH->bootstrapping_listeners == false) { - // we stop looping - mypolls.bootstrapping_listeners = false; - } - } -#ifdef DEBUG - usleep(5 + rand() % 10); -#endif + while ( // spin here if ... + (n = __sync_add_and_fetch(&mypolls.pending_listener_add, 0)) // there is a new listener to add + || + (GloPTH->bootstrapping_listeners == true) // PgSQL_Thread_Handlers has more listeners to configure + ) { + if (n) { + poll_listener_add(n); + assert(__sync_bool_compare_and_swap(&mypolls.pending_listener_add, n, 0)); } + // The delay for the active-wait is a fraction of 'poll_timeout'. Since other + // threads may be waiting on poll for further operations, checks are meaningless + // until that timeout expires (other workers make progress). + usleep(std::min(std::max(pgsql_thread___poll_timeout/20, 10000), 40000) + (rand() % 2000)); } proxy_debug(PROXY_DEBUG_NET, 7, "poll_timeout=%u\n", mypolls.poll_timeout); diff --git a/lib/ProxySQL_Poll.cpp b/lib/ProxySQL_Poll.cpp index 66135ad67..425bca51d 100644 --- a/lib/ProxySQL_Poll.cpp +++ b/lib/ProxySQL_Poll.cpp @@ -68,7 +68,6 @@ ProxySQL_Poll::ProxySQL_Poll() { len=0; pending_listener_add=0; pending_listener_del=0; - bootstrapping_listeners = true; size=MIN_POLL_LEN; fds=(struct pollfd *)malloc(size*sizeof(struct pollfd)); myds=(T**)malloc(size*sizeof(T *));