From 5444125bd2c2a9acf560fd31e7e698ab619dca78 Mon Sep 17 00:00:00 2001
From: Rene Cannao <rene.cannao@gmail.com>
Date: Sat, 11 Apr 2026 13:41:37 +0000
Subject: [PATCH] =?UTF-8?q?test(flush=5Flogs):=20harden=20timing=20race=20?=
 =?UTF-8?q?=E2=80=94=20poll=20instead=20of=20fixed-sleep?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Part of sysown/proxysql#5610 (flaky TAP tests revealed by cascade fix).
Addresses the CI-legacy-g3 'test_flush_logs-t' flake that failed on
some PR #5596 commits but passed on others despite identical code.

## Root cause

The test measures "does command X cause proxysql.log to gain one new
'[INFO] ProxySQL version' marker line". The old flow used fixed sleeps:

    BASELINE=$(fn_get_rotations)       # (includes sleep 1)
    fn_padmin "PROXYSQL FLUSH LOGS;"
    RES=$(fn_get_rotations)            # (sleep 1, then count)
    fn_check_res                       # strict equality check

For the SIGUSR1 subtest the flow was worse because the signal had to
travel through a "Scheduler Hack" (fn_signal inserts a scheduler row
that runs `kill -USR1 $(pidof proxysql)` every 2000 ms, waits 4 s,
then deletes the row). Total wall-clock budget from signal scheduling
to log read: 5 s including the inner fn_get_rotations sleep.

On a slow / contended CI runner any one of three asynchronous stages
can slip past that 5 s window:

  1. The scheduler's first interval fires slightly late.
  2. The SIGUSR1 handler in proxysql queues the log rotation on a
     worker thread; the actual rotation + re-open + banner write
     happens after the signal is delivered, not synchronously.
  3. The host-side read of proxysql.log via the docker volume
     bind-mount can lag the in-container write by up to a second
     on Azure-hosted runners under load.

When any of those raced past 5 s, fn_get_rotations returned the
pre-rotation count, fn_check_res saw `RES != BASELINE+1`, the test
flagged a failure, and CI went red. Same code ran fine on the next
retrigger because the race was environmental, not logical.

A second, subtler issue: the strict `-ne` equality check in
fn_check_res would ALSO fail if the scheduler fired twice before the
script got around to deleting its row (interval_ms was 2000, sleep
was 4 s - one extra fire was a coin flip on fast runners). One signal
is the contract; two rotations from one signal is still the correct
observable answer for "did this trigger at least one rotation".

## Fix

Two changes, both in the .sh (the -t file is a gitignored copy
regenerated by `make sh-test_flush_logs-t.sh`, which itself just
`cp`s the .sh — same rule applies to every other `*-t.sh` test):

1. Replace fixed sleeps with bounded polling:
   - fn_get_rotations no longer has an internal `sleep 1` — it is now
     a pure "read the current count" function.
   - New fn_wait_for_rotation_at_least <target> <max_seconds> polls
     fn_get_rotations every 500 ms until the count reaches the
     target or the timeout expires. On success it prints the observed
     count and returns 0; on timeout it prints whatever it last saw
     and returns 1, so fn_check_res still reports a useful
     "BASELINE X expected Y got Z" error message.
   - Max budget raised to 30 s (was 5 s). On a fast machine the poll
     still terminates within one 500 ms cycle, so total test time on
     the local dev workstation dropped from ~9 s to ~1 s.

2. Accept "at least BASELINE+1" instead of "exactly BASELINE+1":
   - fn_check_res uses `-lt` instead of `-ne`.
   - Test semantics: "did the command trigger at least one rotation".
     Two rotations from one signal is still a pass (happens only in
     the scheduler-hack corner, when the scheduler fires twice before
     fn_signal_cleanup tears it down; harmless to proxysql behavior).
   - Also drops the scheduler interval from 2000 ms to 500 ms so the
     first fire happens quickly under normal conditions.

3. Split fn_signal into fn_signal (install only) and fn_signal_cleanup
   (tear down the scheduler row). The caller now:
     fn_signal SIGUSR1
     RES=$(fn_wait_for_rotation_at_least $((BASELINE+1)) 30)
     fn_signal_cleanup
     fn_check_res ...
   which is race-free: the cleanup runs AFTER the rotation has been
   observed, so the scheduler row exists only for the bounded window
   between install and observation.

## Local verification

Reproduced the original PR #5596 failure context (legacy-g3 group
with only test_flush_logs-t enabled via TEST_PY_TAP_INCL), ran 10
back-to-back iterations against a single infra lifecycle following
the recipe in test/README.md §"Debugging a flaky test":

    for i in 1..10: ./test/infra/control/run-tests-isolated.bash

Results:
  - 10/10 pass
  - Per-attempt wall time dropped from ~9 s to ~1 s
  - Confirmed new code path active via the new "got N (>= BASELINE+1)"
    message format in the captured TAP output

## What's still TODO in #5610

The other three flaky tests from the issue are C++ and each needs
its own targeted investigation. They are NOT touched here:
  - pgsql-servers_ssl_params-t
  - pgsql-ssl_keylog-t
  - test_read_only_actions_offline_hard_servers-t

Deliberately keeping this PR small - one test per PR so each fix
can be reviewed and validated independently.
---
 test/tap/tests/test_flush_logs-t.sh | 80 ++++++++++++++++++++++++-----
 1 file changed, 68 insertions(+), 12 deletions(-)
diff --git a/test/tap/tests/test_flush_logs-t.sh b/test/tap/tests/test_flush_logs-t.sh
index 1720e5460..e3aebfeea 100755
--- a/test/tap/tests/test_flush_logs-t.sh
+++ b/test/tap/tests/test_flush_logs-t.sh
@@ -74,18 +74,34 @@ fn_signal () {
 		fi
 	fi
 
-	# The Scheduler Hack: If we can't use docker, we ask ProxySQL to kill itself
-	# ProxySQL scheduler waits for the first interval before execution.
-	# We use /bin/sh -c to find the correct PID (worker process) and signal it.
+	# The Scheduler Hack: If we can't use docker (e.g. we're running
+	# inside the test-runner container, which has no docker CLI), we ask
+	# ProxySQL to kill itself. We install a scheduler row with interval
+	# 500ms so it fires very quickly. We use /bin/sh -c to find the
+	# correct PID (worker process) and signal it.
+	#
+	# NOTE: the caller is responsible for tearing the scheduler row down
+	# via fn_signal_cleanup() after it has observed the effect it was
+	# waiting for. We intentionally do NOT delete the row here - that
+	# would race against the scheduler actually firing.
 	echo "msg: # Using Scheduler Hack to send ${sig} to ProxySQL..."
-	fn_padmin "INSERT OR REPLACE INTO scheduler (id, active, interval_ms, filename, arg1, arg2) VALUES (9999, 1, 2000, '/bin/sh', '-c', 'kill -${sig#SIG} \$(pidof proxysql)');"
+	fn_padmin "INSERT OR REPLACE INTO scheduler (id, active, interval_ms, filename, arg1, arg2) VALUES (9999, 1, 500, '/bin/sh', '-c', 'kill -${sig#SIG} \$(pidof proxysql)');"
 	fn_padmin "LOAD SCHEDULER TO RUNTIME;"
-	sleep 4
-	fn_padmin "DELETE FROM scheduler WHERE id=9999; LOAD SCHEDULER TO RUNTIME;"
+}
+
+fn_signal_cleanup () {
+	# Tear down any scheduler row installed by fn_signal. Safe to call
+	# unconditionally - if fn_signal used the docker path instead of the
+	# scheduler hack, this is a no-op on a non-existent row.
+	fn_padmin "DELETE FROM scheduler WHERE id=9999;" >/dev/null 2>&1 || true
+	fn_padmin "LOAD SCHEDULER TO RUNTIME;" >/dev/null 2>&1 || true
 }
 
 fn_get_rotations () {
-	sleep 1
+	# Raw count of rotation markers in the current proxysql.log. No
+	# sleep() here - callers that are racing against an in-progress
+	# rotation should use fn_wait_for_rotation_at_least instead, which
+	# polls in a bounded loop rather than relying on a fixed sleep.
 	# Try local mount first (most efficient in CI)
 	if [ -f "$PROXYSQL_LOGS/proxysql.log" ]; then
 		cat "$PROXYSQL_LOGS/proxysql.log" | grep '\[INFO\] ProxySQL version' | wc -l
@@ -99,14 +115,53 @@ fn_get_rotations () {
 	fi
 }
 
+fn_wait_for_rotation_at_least () {
+	# Poll fn_get_rotations every 500 ms until the observed count is
+	# >= $1, or until $2 seconds have elapsed. Prints the final observed
+	# count to stdout so the caller can compare against its expected
+	# value (even on timeout).
+	#
+	# This replaces the `fn_padmin COMMAND ; sleep N ; count` pattern
+	# that used to race on slow CI runners: the sleep window was fixed
+	# at 1-5 s but the actual rotation latency varies with runner load,
+	# and any sleep just shy of the latency silently produced a false
+	# failure. By polling we convert a timing race into a max-latency
+	# bound - on a fast machine the test is as quick as before, on a
+	# slow one it waits up to max_seconds instead of failing.
+	local target=${1}
+	local max_seconds=${2:-30}
+	local i=0
+	local count=0
+	local max_iterations=$(( max_seconds * 2 ))
+	while [ $i -lt $max_iterations ]; do
+		count=$(fn_get_rotations)
+		if [ "$count" -ge "$target" ]; then
+			echo $count
+			return 0
+		fi
+		sleep 0.5
+		i=$(( i + 1 ))
+	done
+	# Timeout. Emit whatever we last observed so fn_check_res will
+	# report a useful "BASELINE X expected Y got Z" message rather than
+	# an empty RES.
+	echo $count
+	return 1
+}
+
 fn_check_res () {
 	DONE=$(( $DONE + 1 ))
 	PLAN=$([[ $PLAN -lt $DONE ]] && echo $DONE || echo $PLAN)
-	if [[ $RES -ne $(( $BASELINE + 1)) ]]; then
-		echo "msg: not ok $DONE - command '$1' - initial BASELINE: $BASELINE - expected BASELINE + 1 : got $RES"
+	# Accept any count >= BASELINE+1. The test asks "did this command
+	# trigger at least one rotation" - two rotations (e.g. if the
+	# scheduler hack fired twice before fn_signal_cleanup tore it down)
+	# is still a pass signal. It used to be `-ne` which was stricter
+	# than the actual test semantics and contributed to flakiness.
+	if [[ $RES -lt $(( $BASELINE + 1)) ]]; then
+		echo "msg: not ok $DONE - command '$1' - BASELINE: $BASELINE - expected at least $(( $BASELINE + 1)) : got $RES"
 		FAIL=$(( $FAIL + 1 ))
 	else
-		echo "msg: ok $DONE - command '$1' - initial BASELINE: $BASELINE - expected BASELINE + 1 : got $RES"
+		echo "msg: ok $DONE - command '$1' - BASELINE: $BASELINE - got $RES (>= BASELINE + 1)"
 	fi
 }
 
@@ -118,13 +173,14 @@ fn_plan
 # test PROXYSQL FLUSH LOGS
 BASELINE=$(fn_get_rotations)
 fn_padmin "PROXYSQL FLUSH LOGS;"
-RES=$(fn_get_rotations)
+RES=$(fn_wait_for_rotation_at_least $(( $BASELINE + 1 )) 30)
 fn_check_res "PROXYSQL FLUSH LOGS;"
 
 # test SIGUSR1 signal
 BASELINE=$(fn_get_rotations)
 fn_signal "SIGUSR1"
-RES=$(fn_get_rotations)
+RES=$(fn_wait_for_rotation_at_least $(( $BASELINE + 1 )) 30)
+fn_signal_cleanup
 fn_check_res "kill -s SIGUSR1 \$PID"