diff --git a/test/tap/tests/test_flush_logs-t.sh b/test/tap/tests/test_flush_logs-t.sh index 1720e5460..e3aebfeea 100755 --- a/test/tap/tests/test_flush_logs-t.sh +++ b/test/tap/tests/test_flush_logs-t.sh @@ -74,18 +74,34 @@ fn_signal () { fi fi - # The Scheduler Hack: If we can't use docker, we ask ProxySQL to kill itself - # ProxySQL scheduler waits for the first interval before execution. - # We use /bin/sh -c to find the correct PID (worker process) and signal it. + # The Scheduler Hack: If we can't use docker (e.g. we're running + # inside the test-runner container, which has no docker CLI), we ask + # ProxySQL to kill itself. We install a scheduler row with interval + # 500ms so it fires very quickly. We use /bin/sh -c to find the + # correct PID (worker process) and signal it. + # + # NOTE: the caller is responsible for tearing the scheduler row down + # via fn_signal_cleanup() after it has observed the effect it was + # waiting for. We intentionally do NOT delete the row here - that + # would race against the scheduler actually firing. echo "msg: # Using Scheduler Hack to send ${sig} to ProxySQL..." - fn_padmin "INSERT OR REPLACE INTO scheduler (id, active, interval_ms, filename, arg1, arg2) VALUES (9999, 1, 2000, '/bin/sh', '-c', 'kill -${sig#SIG} \$(pidof proxysql)');" + fn_padmin "INSERT OR REPLACE INTO scheduler (id, active, interval_ms, filename, arg1, arg2) VALUES (9999, 1, 500, '/bin/sh', '-c', 'kill -${sig#SIG} \$(pidof proxysql)');" fn_padmin "LOAD SCHEDULER TO RUNTIME;" - sleep 4 - fn_padmin "DELETE FROM scheduler WHERE id=9999; LOAD SCHEDULER TO RUNTIME;" +} + +fn_signal_cleanup () { + # Tear down any scheduler row installed by fn_signal. Safe to call + # unconditionally - if fn_signal used the docker path instead of the + # scheduler hack, this is a no-op on a non-existent row. + fn_padmin "DELETE FROM scheduler WHERE id=9999;" >/dev/null 2>&1 || true + fn_padmin "LOAD SCHEDULER TO RUNTIME;" >/dev/null 2>&1 || true } fn_get_rotations () { - sleep 1 + # Raw count of rotation markers in the current proxysql.log. No + # sleep() here - callers that are racing against an in-progress + # rotation should use fn_wait_for_rotation_at_least instead, which + # polls in a bounded loop rather than relying on a fixed sleep. # Try local mount first (most efficient in CI) if [ -f "$PROXYSQL_LOGS/proxysql.log" ]; then cat "$PROXYSQL_LOGS/proxysql.log" | grep '\[INFO\] ProxySQL version' | wc -l @@ -99,14 +115,53 @@ fn_get_rotations () { fi } +fn_wait_for_rotation_at_least () { + # Poll fn_get_rotations every 500 ms until the observed count is + # >= $1, or until $2 seconds have elapsed. Prints the final observed + # count to stdout so the caller can compare against its expected + # value (even on timeout). + # + # This replaces the `fn_padmin COMMAND ; sleep N ; count` pattern + # that used to race on slow CI runners: the sleep window was fixed + # at 1-5 s but the actual rotation latency varies with runner load, + # and any sleep just shy of the latency silently produced a false + # failure. By polling we convert a timing race into a max-latency + # bound - on a fast machine the test is as quick as before, on a + # slow one it waits up to max_seconds instead of failing. + local target=${1} + local max_seconds=${2:-30} + local i=0 + local count=0 + local max_iterations=$(( max_seconds * 2 )) + while [ $i -lt $max_iterations ]; do + count=$(fn_get_rotations) + if [ "$count" -ge "$target" ]; then + echo $count + return 0 + fi + sleep 0.5 + i=$(( i + 1 )) + done + # Timeout. Emit whatever we last observed so fn_check_res will + # report a useful "BASELINE X expected Y got Z" message rather than + # an empty RES. + echo $count + return 1 +} + fn_check_res () { DONE=$(( $DONE + 1 )) PLAN=$([[ $PLAN -lt $DONE ]] && echo $DONE || echo $PLAN) - if [[ $RES -ne $(( $BASELINE + 1)) ]]; then - echo "msg: not ok $DONE - command '$1' - initial BASELINE: $BASELINE - expected BASELINE + 1 : got $RES" + # Accept any count >= BASELINE+1. The test asks "did this command + # trigger at least one rotation" - two rotations (e.g. if the + # scheduler hack fired twice before fn_signal_cleanup tore it down) + # is still a pass signal. It used to be `-ne` which was stricter + # than the actual test semantics and contributed to flakiness. + if [[ $RES -lt $(( $BASELINE + 1)) ]]; then + echo "msg: not ok $DONE - command '$1' - BASELINE: $BASELINE - expected at least $(( $BASELINE + 1)) : got $RES" FAIL=$(( $FAIL + 1 )) else - echo "msg: ok $DONE - command '$1' - initial BASELINE: $BASELINE - expected BASELINE + 1 : got $RES" + echo "msg: ok $DONE - command '$1' - BASELINE: $BASELINE - got $RES (>= BASELINE + 1)" fi } @@ -118,13 +173,14 @@ fn_plan # test PROXYSQL FLUSH LOGS BASELINE=$(fn_get_rotations) fn_padmin "PROXYSQL FLUSH LOGS;" -RES=$(fn_get_rotations) +RES=$(fn_wait_for_rotation_at_least $(( $BASELINE + 1 )) 30) fn_check_res "PROXYSQL FLUSH LOGS;" # test SIGUSR1 signal BASELINE=$(fn_get_rotations) fn_signal "SIGUSR1" -RES=$(fn_get_rotations) +RES=$(fn_wait_for_rotation_at_least $(( $BASELINE + 1 )) 30) +fn_signal_cleanup fn_check_res "kill -s SIGUSR1 \$PID"