Merge pull request #5611 from sysown/v3.0-fix-flake-test-flush-logs

test(flush_logs): harden timing race — poll instead of fixed sleep
1 month ago · caa2129eee
parent 04466b35a5 5444125bd2
commit caa2129eee
1 changed files with 68 additions and 12 deletions
--- a/test/tap/tests/test_flush_logs-t.sh
+++ b/test/tap/tests/test_flush_logs-t.sh
@ -74,18 +74,34 @@ fn_signal () {
 		fi
 	fi

-	# The Scheduler Hack: If we can't use docker, we ask ProxySQL to kill itself
-	# ProxySQL scheduler waits for the first interval before execution.
-	# We use /bin/sh -c to find the correct PID (worker process) and signal it.
+	# The Scheduler Hack: If we can't use docker (e.g. we're running
+	# inside the test-runner container, which has no docker CLI), we ask
+	# ProxySQL to kill itself. We install a scheduler row with interval
+	# 500ms so it fires very quickly. We use /bin/sh -c to find the
+	# correct PID (worker process) and signal it.
+	#
+	# NOTE: the caller is responsible for tearing the scheduler row down
+	# via fn_signal_cleanup() after it has observed the effect it was
+	# waiting for. We intentionally do NOT delete the row here - that
+	# would race against the scheduler actually firing.
 	echo "msg: # Using Scheduler Hack to send ${sig} to ProxySQL..."
-	fn_padmin "INSERT OR REPLACE INTO scheduler (id, active, interval_ms, filename, arg1, arg2) VALUES (9999, 1, 2000, '/bin/sh', '-c', 'kill -${sig#SIG} \$(pidof proxysql)');"
+	fn_padmin "INSERT OR REPLACE INTO scheduler (id, active, interval_ms, filename, arg1, arg2) VALUES (9999, 1, 500, '/bin/sh', '-c', 'kill -${sig#SIG} \$(pidof proxysql)');"
 	fn_padmin "LOAD SCHEDULER TO RUNTIME;"
-	sleep 4
-	fn_padmin "DELETE FROM scheduler WHERE id=9999; LOAD SCHEDULER TO RUNTIME;"
+}
+
+fn_signal_cleanup () {
+	# Tear down any scheduler row installed by fn_signal. Safe to call
+	# unconditionally - if fn_signal used the docker path instead of the
+	# scheduler hack, this is a no-op on a non-existent row.
+	fn_padmin "DELETE FROM scheduler WHERE id=9999;" >/dev/null 2>&1 || true
+	fn_padmin "LOAD SCHEDULER TO RUNTIME;" >/dev/null 2>&1 || true
 }

 fn_get_rotations () {
-	sleep 1
+	# Raw count of rotation markers in the current proxysql.log. No
+	# sleep() here - callers that are racing against an in-progress
+	# rotation should use fn_wait_for_rotation_at_least instead, which
+	# polls in a bounded loop rather than relying on a fixed sleep.
 	# Try local mount first (most efficient in CI)
 	if [ -f "$PROXYSQL_LOGS/proxysql.log" ]; then
 		cat "$PROXYSQL_LOGS/proxysql.log" | grep '\[INFO\] ProxySQL version' | wc -l
@ -99,14 +115,53 @@ fn_get_rotations () {
 	fi
 }

+fn_wait_for_rotation_at_least () {
+	# Poll fn_get_rotations every 500 ms until the observed count is
+	# >= $1, or until $2 seconds have elapsed. Prints the final observed
+	# count to stdout so the caller can compare against its expected
+	# value (even on timeout).
+	#
+	# This replaces the `fn_padmin COMMAND ; sleep N ; count` pattern
+	# that used to race on slow CI runners: the sleep window was fixed
+	# at 1-5 s but the actual rotation latency varies with runner load,
+	# and any sleep just shy of the latency silently produced a false
+	# failure. By polling we convert a timing race into a max-latency
+	# bound - on a fast machine the test is as quick as before, on a
+	# slow one it waits up to max_seconds instead of failing.
+	local target=${1}
+	local max_seconds=${2:-30}
+	local i=0
+	local count=0
+	local max_iterations=$(( max_seconds * 2 ))
+	while [ $i -lt $max_iterations ]; do
+		count=$(fn_get_rotations)
+		if [ "$count" -ge "$target" ]; then
+			echo $count
+			return 0
+		fi
+		sleep 0.5
+		i=$(( i + 1 ))
+	done
+	# Timeout. Emit whatever we last observed so fn_check_res will
+	# report a useful "BASELINE X expected Y got Z" message rather than
+	# an empty RES.
+	echo $count
+	return 1
+}
+
 fn_check_res () {
 	DONE=$(( $DONE + 1 ))
 	PLAN=$([[ $PLAN -lt $DONE ]] && echo $DONE || echo $PLAN)
-	if [[ $RES -ne $(( $BASELINE + 1)) ]]; then
-		echo "msg: not ok $DONE - command '$1' - initial BASELINE: $BASELINE - expected BASELINE + 1 : got $RES"
+	# Accept any count >= BASELINE+1. The test asks "did this command
+	# trigger at least one rotation" - two rotations (e.g. if the
+	# scheduler hack fired twice before fn_signal_cleanup tore it down)
+	# is still a pass signal. It used to be `-ne` which was stricter
+	# than the actual test semantics and contributed to flakiness.
+	if [[ $RES -lt $(( $BASELINE + 1)) ]]; then
+		echo "msg: not ok $DONE - command '$1' - BASELINE: $BASELINE - expected at least $(( $BASELINE + 1)) : got $RES"
 		FAIL=$(( $FAIL + 1 ))
 	else
-		echo "msg: ok $DONE - command '$1' - initial BASELINE: $BASELINE - expected BASELINE + 1 : got $RES"
+		echo "msg: ok $DONE - command '$1' - BASELINE: $BASELINE - got $RES (>= BASELINE + 1)"
 	fi
 }

@ -118,13 +173,14 @@ fn_plan
 # test PROXYSQL FLUSH LOGS
 BASELINE=$(fn_get_rotations)
 fn_padmin "PROXYSQL FLUSH LOGS;"
-RES=$(fn_get_rotations)
+RES=$(fn_wait_for_rotation_at_least $(( $BASELINE + 1 )) 30)
 fn_check_res "PROXYSQL FLUSH LOGS;"

 # test SIGUSR1 signal
 BASELINE=$(fn_get_rotations)
 fn_signal "SIGUSR1"
-RES=$(fn_get_rotations)
+RES=$(fn_wait_for_rotation_at_least $(( $BASELINE + 1 )) 30)
+fn_signal_cleanup
 fn_check_res "kill -s SIGUSR1 \$PID"