From ba7f45474ea76d47b324f16abc12866eb9c9ced4 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Fri, 20 Mar 2026 14:30:10 +0000 Subject: [PATCH] Fix run-multi-group.bash bugs and improve robustness Fixes: - Fix array parsing issue with bash arithmetic expansion on group names Changed from 'read -ra GROUPS' to simple for-loop iteration Group names like 'legacy-g1' were being evaluated as arithmetic (1004-114-1=889) - Fix exit code capture from timeout command Changed from 'if ! timeout ...; then exit_code=$?' to proper '|| cmd_exit_code=$?' The old code was capturing the if-test exit code (0) instead of the command's - Change PARALLEL_JOBS default from 0 (unlimited) to 2 for resource safety Improvements: - Add random 0-14 second startup delay per group to stagger infrastructure initialization and prevent Docker/resource contention when running multiple groups in parallel - Update header documentation with new default values Tested with RUN_ID="abd123" TAP_GROUPS="legacy-g1 legacy-g2 ai-g1 mysql84-g1" which ran successfully with staggered startup. --- test/infra/control/run-multi-group.bash | 50 ++++++++++++++++--------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/test/infra/control/run-multi-group.bash b/test/infra/control/run-multi-group.bash index 2a303c91e..56012d8a0 100755 --- a/test/infra/control/run-multi-group.bash +++ b/test/infra/control/run-multi-group.bash @@ -22,7 +22,7 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." 
&& pwd)" export WORKSPACE="${WORKSPACE:-${REPO_ROOT}}" RUN_ID="${RUN_ID:-$(date +%s)}" TAP_GROUPS="${TAP_GROUPS:-}" -PARALLEL_JOBS="${PARALLEL_JOBS:-0}" # 0 = unlimited +PARALLEL_JOBS="${PARALLEL_JOBS:-2}" # Default: 2 parallel groups TIMEOUT_MINUTES="${TIMEOUT_MINUTES:-60}" EXIT_ON_FIRST_FAIL="${EXIT_ON_FIRST_FAIL:-0}" AUTO_CLEANUP="${AUTO_CLEANUP:-0}" @@ -35,9 +35,11 @@ if [ -z "${TAP_GROUPS}" ]; then exit 1 fi -# Convert TAP_GROUPS to array -read -ra GROUPS <<< "${TAP_GROUPS}" -TOTAL_GROUPS=${#GROUPS[@]} +# Count groups +TOTAL_GROUPS=0 +for _ in ${TAP_GROUPS}; do + TOTAL_GROUPS=$((TOTAL_GROUPS + 1)) +done if [ "${TOTAL_GROUPS}" -eq 0 ]; then echo "ERROR: No TAP groups specified." @@ -90,6 +92,14 @@ run_single_group() { local log_file="${RESULTS_DIR}/${group}.log" local start_time end_time duration + # Add random delay (0-15 seconds) to stagger infrastructure startup + # This prevents resource contention when running multiple groups in parallel + local delay=$((RANDOM % 15)) + if [ "${delay}" -gt 0 ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] ${group}: Waiting ${delay}s to stagger startup..." | tee -a "${log_file}" + sleep "${delay}" + fi + start_time=$(date +%s) echo "[$(date '+%Y-%m-%d %H:%M:%S')] STARTING: ${group} (INFRA_ID: ${infra_id})" | tee -a "${log_file}" @@ -101,7 +111,9 @@ run_single_group() { # Note: We don't run cleanup here - let the user decide when to destroy local exit_code=0 - if ! timeout "${TIMEOUT_MINUTES}m" bash -c " + # Run tests with timeout - capture exit code properly + local cmd_exit_code=0 + timeout "${TIMEOUT_MINUTES}m" bash -c " set -euo pipefail export INFRA_ID='${infra_id}' export TAP_GROUP='${group}' @@ -121,16 +133,18 @@ run_single_group() { fi echo '[$(date '+%Y-%m-%d %H:%M:%S')] Tests completed successfully' | tee -a '${log_file}' - "; then - exit_code=$? 
- if [ "${exit_code}" -eq 124 ]; then - echo "[$(date '+%Y-%m-%d %H:%M:%S')] TIMEOUT: ${group} after ${TIMEOUT_MINUTES} minutes" | tee -a "${log_file}" - exit_code=124 - else - echo "[$(date '+%Y-%m-%d %H:%M:%S')] FAILED: ${group} (exit code: ${exit_code})" | tee -a "${log_file}" - fi - else + " || cmd_exit_code=$? + + # Process exit code + if [ "${cmd_exit_code}" -eq 0 ]; then echo "[$(date '+%Y-%m-%d %H:%M:%S')] SUCCESS: ${group}" | tee -a "${log_file}" + exit_code=0 + elif [ "${cmd_exit_code}" -eq 124 ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] TIMEOUT: ${group} after ${TIMEOUT_MINUTES} minutes" | tee -a "${log_file}" + exit_code=124 + else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] FAILED: ${group} (exit code: ${cmd_exit_code})" | tee -a "${log_file}" + exit_code="${cmd_exit_code}" fi end_time=$(date +%s) @@ -166,7 +180,7 @@ OVERALL_FAILED=0 JOBS_RUNNING=0 # Launch jobs -for group in "${GROUPS[@]}"; do +for group in ${TAP_GROUPS}; do # Check if we should stop due to previous failure if [ "${EXIT_ON_FIRST_FAIL}" -eq 1 ] && [ "${OVERALL_FAILED}" -ne 0 ]; then echo ">>> Skipping ${group} due to previous failure" @@ -213,7 +227,7 @@ echo "==========================================" printf "%-25s %10s %12s\n" "Group" "Duration" "Status" echo "------------------------------------------" -for group in "${GROUPS[@]}"; do +for group in ${TAP_GROUPS}; do exit_code="${EXIT_CODES[${group}]:-1}" result_file="${RESULTS_DIR}/${group}.result" @@ -246,7 +260,7 @@ PASS_COUNT=0 FAIL_COUNT=0 TIMEOUT_COUNT=0 -for group in "${GROUPS[@]}"; do +for group in ${TAP_GROUPS}; do exit_code="${EXIT_CODES[${group}]:-1}" if [ "${exit_code}" -eq 0 ]; then PASS_COUNT=$((PASS_COUNT + 1)) @@ -268,7 +282,7 @@ echo "" # Print log locations for failed groups if [ "${FAIL_COUNT}" -gt 0 ] || [ "${TIMEOUT_COUNT}" -gt 0 ]; then echo "Failed/Timed out group logs:" - for group in "${GROUPS[@]}"; do + for group in ${TAP_GROUPS}; do exit_code="${EXIT_CODES[${group}]:-1}" if [ "${exit_code}" -ne 0 ]; then 
echo " ${group}: ${RESULTS_DIR}/${group}.log"