Merge pull request #5611 from sysown/v3.0-fix-flake-test-flush-logs

test(flush_logs): harden timing race — poll instead of fixed sleep
pull/5618/head
René Cannaò 1 month ago committed by GitHub
commit caa2129eee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -74,18 +74,34 @@ fn_signal () {
fi
fi
# The Scheduler Hack: If we can't use docker, we ask ProxySQL to kill itself
# ProxySQL scheduler waits for the first interval before execution.
# We use /bin/sh -c to find the correct PID (worker process) and signal it.
# The Scheduler Hack: If we can't use docker (e.g. we're running
# inside the test-runner container, which has no docker CLI), we ask
# ProxySQL to kill itself. We install a scheduler row with interval
# 500ms so it fires very quickly. We use /bin/sh -c to find the
# correct PID (worker process) and signal it.
#
# NOTE: the caller is responsible for tearing the scheduler row down
# via fn_signal_cleanup() after it has observed the effect it was
# waiting for. We intentionally do NOT delete the row here - that
# would race against the scheduler actually firing.
echo "msg: # Using Scheduler Hack to send ${sig} to ProxySQL..."
fn_padmin "INSERT OR REPLACE INTO scheduler (id, active, interval_ms, filename, arg1, arg2) VALUES (9999, 1, 2000, '/bin/sh', '-c', 'kill -${sig#SIG} \$(pidof proxysql)');"
fn_padmin "INSERT OR REPLACE INTO scheduler (id, active, interval_ms, filename, arg1, arg2) VALUES (9999, 1, 500, '/bin/sh', '-c', 'kill -${sig#SIG} \$(pidof proxysql)');"
fn_padmin "LOAD SCHEDULER TO RUNTIME;"
sleep 4
fn_padmin "DELETE FROM scheduler WHERE id=9999; LOAD SCHEDULER TO RUNTIME;"
}
fn_signal_cleanup () {
# Tear down any scheduler row installed by fn_signal. Safe to call
# unconditionally - if fn_signal used the docker path instead of the
# scheduler hack, this is a no-op on a non-existent row.
fn_padmin "DELETE FROM scheduler WHERE id=9999;" >/dev/null 2>&1 || true
fn_padmin "LOAD SCHEDULER TO RUNTIME;" >/dev/null 2>&1 || true
}
fn_get_rotations () {
sleep 1
# Raw count of rotation markers in the current proxysql.log. No
# sleep() here - callers that are racing against an in-progress
# rotation should use fn_wait_for_rotation_at_least instead, which
# polls in a bounded loop rather than relying on a fixed sleep.
# Try local mount first (most efficient in CI)
if [ -f "$PROXYSQL_LOGS/proxysql.log" ]; then
cat "$PROXYSQL_LOGS/proxysql.log" | grep '\[INFO\] ProxySQL version' | wc -l
@ -99,14 +115,53 @@ fn_get_rotations () {
fi
}
fn_wait_for_rotation_at_least () {
# Poll fn_get_rotations every 500 ms until the observed count is
# >= $1, or until $2 seconds have elapsed. Prints the final observed
# count to stdout so the caller can compare against its expected
# value (even on timeout).
#
# This replaces the `fn_padmin COMMAND ; sleep N ; count` pattern
# that used to race on slow CI runners: the sleep window was fixed
# at 1-5 s but the actual rotation latency varies with runner load,
# and any sleep just shy of the latency silently produced a false
# failure. By polling we convert a timing race into a max-latency
# bound - on a fast machine the test is as quick as before, on a
# slow one it waits up to max_seconds instead of failing.
local target=${1}
local max_seconds=${2:-30}
local i=0
local count=0
local max_iterations=$(( max_seconds * 2 ))
while [ $i -lt $max_iterations ]; do
count=$(fn_get_rotations)
if [ "$count" -ge "$target" ]; then
echo $count
return 0
fi
sleep 0.5
i=$(( i + 1 ))
done
# Timeout. Emit whatever we last observed so fn_check_res will
# report a useful "BASELINE X expected Y got Z" message rather than
# an empty RES.
echo $count
return 1
}
fn_check_res () {
DONE=$(( $DONE + 1 ))
PLAN=$([[ $PLAN -lt $DONE ]] && echo $DONE || echo $PLAN)
if [[ $RES -ne $(( $BASELINE + 1)) ]]; then
echo "msg: not ok $DONE - command '$1' - initial BASELINE: $BASELINE - expected BASELINE + 1 : got $RES"
# Accept any count >= BASELINE+1. The test asks "did this command
# trigger at least one rotation" - two rotations (e.g. if the
# scheduler hack fired twice before fn_signal_cleanup tore it down)
# is still a pass signal. It used to be `-ne` which was stricter
# than the actual test semantics and contributed to flakiness.
if [[ $RES -lt $(( $BASELINE + 1)) ]]; then
echo "msg: not ok $DONE - command '$1' - BASELINE: $BASELINE - expected at least $(( $BASELINE + 1)) : got $RES"
FAIL=$(( $FAIL + 1 ))
else
echo "msg: ok $DONE - command '$1' - initial BASELINE: $BASELINE - expected BASELINE + 1 : got $RES"
echo "msg: ok $DONE - command '$1' - BASELINE: $BASELINE - got $RES (>= BASELINE + 1)"
fi
}
@ -118,13 +173,14 @@ fn_plan
# test PROXYSQL FLUSH LOGS
BASELINE=$(fn_get_rotations)
fn_padmin "PROXYSQL FLUSH LOGS;"
RES=$(fn_get_rotations)
RES=$(fn_wait_for_rotation_at_least $(( $BASELINE + 1 )) 30)
fn_check_res "PROXYSQL FLUSH LOGS;"
# test SIGUSR1 signal
BASELINE=$(fn_get_rotations)
fn_signal "SIGUSR1"
RES=$(fn_get_rotations)
RES=$(fn_wait_for_rotation_at_least $(( $BASELINE + 1 )) 30)
fn_signal_cleanup
fn_check_res "kill -s SIGUSR1 \$PID"

Loading…
Cancel
Save