Fix worker threads stalling race condition on RESUME command

During a RESUME operation, if a 'MySQL_Thread' is bootstrapping
listeners (in `MySQL_Thread::run_BootstrapListener`) and detect that
`MySQL_Threads_Handler::bootstrapping_listeners` is 'false', the thread
prematurely shuts down its own bootstrapping flag from 'mypolls'
(`ProxySQL_Poll::bootstrapping_listeners`). Since this thread wont ever
bootstrap its corresponding listening sockets, the other working threads
will be stalled waiting on it, eventually triggering the watchdog and
crashing the instance.

Since the bootstrapping operation is sequential, it's expected that all
the threads but the one starting their listening sockets are in an
active wait. A sensible delay has been introduced to reduce the overhead
of such wait.
v3.0-fix_hang_on_resume
Javier Jaramago Fernández 1 year ago
parent 6a7c1cf2d0
commit 8bc60ed283

@ -35,7 +35,6 @@ class ProxySQL_Poll {
T **myds;
unsigned long long *last_recv;
unsigned long long *last_sent;
std::atomic<bool> bootstrapping_listeners;
volatile int pending_listener_add;
volatile int pending_listener_del;
unsigned int poll_timeout;

@ -3127,15 +3127,11 @@ void MySQL_Thread::run_BootstrapListener() {
if (n) {
poll_listener_add(n);
assert(__sync_bool_compare_and_swap(&mypolls.pending_listener_add,n,0));
} else {
if (GloMTH->bootstrapping_listeners == false) {
// we stop looping
mypolls.bootstrapping_listeners = false;
}
}
#ifdef DEBUG
usleep(5+rand()%10);
#endif
// The delay for the active-wait is a fraction of 'poll_timeout'. Since other
// threads may be waiting on poll for further operations, checks are meaningless
// until that timeout expires (other workers make progress).
usleep(std::min(std::max(mysql_thread___poll_timeout/20, 10000), 40000) + (rand() % 2000));
}
}
@ -3238,9 +3234,7 @@ __run_skip_1:
#endif // IDLE_THREADS
pthread_mutex_unlock(&thread_mutex);
if (unlikely(mypolls.bootstrapping_listeners == true)) {
run_BootstrapListener();
}
run_BootstrapListener();
// flush mysql log file
GloMyLogger->flush();

@ -2998,26 +2998,19 @@ void PgSQL_Thread::run() {
#endif // IDLE_THREADS
pthread_mutex_unlock(&thread_mutex);
if (unlikely(mypolls.bootstrapping_listeners == true)) {
while ( // spin here if ...
(n = __sync_add_and_fetch(&mypolls.pending_listener_add, 0)) // there is a new listener to add
||
(GloPTH->bootstrapping_listeners == true) // PgSQL_Thread_Handlers has more listeners to configure
) {
if (n) {
poll_listener_add(n);
assert(__sync_bool_compare_and_swap(&mypolls.pending_listener_add, n, 0));
}
else {
if (GloPTH->bootstrapping_listeners == false) {
// we stop looping
mypolls.bootstrapping_listeners = false;
}
}
#ifdef DEBUG
usleep(5 + rand() % 10);
#endif
while ( // spin here if ...
(n = __sync_add_and_fetch(&mypolls.pending_listener_add, 0)) // there is a new listener to add
||
(GloPTH->bootstrapping_listeners == true) // PgSQL_Thread_Handlers has more listeners to configure
) {
if (n) {
poll_listener_add(n);
assert(__sync_bool_compare_and_swap(&mypolls.pending_listener_add, n, 0));
}
// The delay for the active-wait is a fraction of 'poll_timeout'. Since other
// threads may be waiting on poll for further operations, checks are meaningless
// until that timeout expires (other workers make progress).
usleep(std::min(std::max(pgsql_thread___poll_timeout/20, 10000), 40000) + (rand() % 2000));
}
proxy_debug(PROXY_DEBUG_NET, 7, "poll_timeout=%u\n", mypolls.poll_timeout);

@ -68,7 +68,6 @@ ProxySQL_Poll<T>::ProxySQL_Poll() {
len=0;
pending_listener_add=0;
pending_listener_del=0;
bootstrapping_listeners = true;
size=MIN_POLL_LEN;
fds=(struct pollfd *)malloc(size*sizeof(struct pollfd));
myds=(T**)malloc(size*sizeof(T *));

Loading…
Cancel
Save