diff --git a/test/infra/control/destroy-multi-group.bash b/test/infra/control/destroy-multi-group.bash
new file mode 100755
index 000000000..2e49e3916
--- /dev/null
+++ b/test/infra/control/destroy-multi-group.bash
@@ -0,0 +1,163 @@
+#!/bin/bash
+set -euo pipefail
+#
+# Destroy Multiple TAP Groups for a Specific RUN_ID
+# Usage:
+#   RUN_ID="abc123" ./destroy-multi-group.bash
+#
+# Optional:
+#   TAP_GROUPS="group1 group2"   # Only destroy specific groups for this RUN_ID
+#   FORCE=1                      # Skip confirmation
+#
+# Without TAP_GROUPS, groups are discovered from the directories named
+# <group>-<RUN_ID> under ${WORKSPACE}/ci_infra_logs.
+#
+# Exit status: 1 when RUN_ID is missing, 0 otherwise. Cleanup is best-effort,
+# so individual destroy failures are reported but do not fail the script.
+#
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+
+# Configuration
+export WORKSPACE="${WORKSPACE:-${REPO_ROOT}}"
+RUN_ID="${RUN_ID:-}"
+TAP_GROUPS="${TAP_GROUPS:-}"
+FORCE="${FORCE:-0}"
+
+# Validate required variables
+if [ -z "${RUN_ID}" ]; then
+    echo "ERROR: RUN_ID is not set."
+    echo "Usage: RUN_ID=<run_id> [TAP_GROUPS='group1 group2'] ${0}"
+    echo ""
+    echo "This will destroy all infrastructures matching '*-<run_id>'"
+    exit 1
+fi
+
+# Results directory of the multi-group run with this RUN_ID
+MULTI_GROUP_DIR="${WORKSPACE}/ci_infra_logs/multi-group-${RUN_ID}"
+
+# The list is deliberately not called GROUPS: that is a special bash variable
+# (the group IDs of the current user) and assignments to it have no effect.
+GROUP_LIST=()
+
+# Determine which groups to destroy
+if [ -n "${TAP_GROUPS}" ]; then
+    # Use specified groups
+    read -ra GROUP_LIST <<< "${TAP_GROUPS}"
+    MODE="specific"
+else
+    # Auto-discover groups by finding matching directories
+    MODE="auto"
+
+    # Look for log directories matching *-${RUN_ID}
+    LOGS_PATH="${WORKSPACE}/ci_infra_logs"
+    if [ -d "${LOGS_PATH}" ]; then
+        for dir in "${LOGS_PATH}"/*; do
+            [ -d "${dir}" ] || continue
+            dir_name="$(basename "${dir}")"
+            [[ "${dir_name}" == *"-${RUN_ID}" ]] || continue
+            # Extract group name from INFRA_ID. RUN_ID is quoted so that it
+            # is stripped literally instead of being used as a glob pattern.
+            group_name="${dir_name%-"${RUN_ID}"}"
+            # multi-group-<RUN_ID> is the results directory, not an
+            # infrastructure; it is removed separately further down.
+            if [ -z "${group_name}" ] || [ "${group_name}" = "multi-group" ]; then
+                continue
+            fi
+            GROUP_LIST+=("${group_name}")
+        done
+    fi
+fi
+
+TOTAL_GROUPS=${#GROUP_LIST[@]}
+
+# Nothing to do only if there is neither a group nor a results directory
+if [ "${TOTAL_GROUPS}" -eq 0 ] && [ ! -d "${MULTI_GROUP_DIR}" ]; then
+    echo "No groups found to destroy for RUN_ID: ${RUN_ID}"
+    echo "Usage: RUN_ID=<run_id> ${0}"
+    exit 0
+fi
+
+echo "=========================================="
+echo "Destroy Multiple TAP Groups"
+echo "=========================================="
+echo "RUN_ID: ${RUN_ID}"
+echo "MODE: ${MODE}"
+echo "GROUPS: ${GROUP_LIST[*]:-}"
+echo "=========================================="
+
+# Confirmation prompt (unless FORCE=1)
+if [ "${FORCE}" -eq 0 ]; then
+    echo ""
+    echo "This will destroy the following ${TOTAL_GROUPS} infrastructure(s):"
+    # The ${arr[@]+...} form keeps `set -u` quiet for an empty array on older bash
+    for group in ${GROUP_LIST[@]+"${GROUP_LIST[@]}"}; do
+        echo " - ${group}-${RUN_ID}"
+    done
+    if [ -d "${MULTI_GROUP_DIR}" ]; then
+        echo "and remove the results directory: ${MULTI_GROUP_DIR}"
+    fi
+    echo ""
+    # read fails at EOF (e.g. stdin is not a terminal). Treat that as "no"
+    # instead of letting `set -e` end the script without any message.
+    read -p "Are you sure? [y/N] " -n 1 -r || REPLY=""
+    echo ""
+    if [[ ! ${REPLY} =~ ^[Yy]$ ]]; then
+        echo "Aborted."
+        exit 0
+    fi
+fi
+
+# Track results
+SUCCESS_COUNT=0
+FAIL_COUNT=0
+
+# Destroy each group's infrastructure
+for group in ${GROUP_LIST[@]+"${GROUP_LIST[@]}"}; do
+    infra_id="${group}-${RUN_ID}"
+    echo ""
+    echo ">>> Destroying: ${group} (INFRA_ID: ${infra_id})"
+
+    export INFRA_ID="${infra_id}"
+    export TAP_GROUP="${group}"
+
+    # First stop ProxySQL; a failure here is deliberately not an error
+    if "${SCRIPT_DIR}/stop-proxysql-isolated.bash" >/dev/null 2>&1; then
+        echo " Stopped ProxySQL"
+    fi
+
+    # Then destroy backends
+    if "${SCRIPT_DIR}/destroy-infras.bash" >/dev/null 2>&1; then
+        echo " ✓ ${group} destroyed"
+        SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
+    else
+        echo " ✗ ${group} failed to destroy (may already be cleaned up)"
+        FAIL_COUNT=$((FAIL_COUNT + 1))
+    fi
+done
+
+# Also clean up the multi-group results directory if it exists
+if [ -d "${MULTI_GROUP_DIR}" ]; then
+    echo ""
+    echo ">>> Removing multi-group results directory: ${MULTI_GROUP_DIR}"
+    rm -rf -- "${MULTI_GROUP_DIR}"
+    echo " ✓ Results directory removed"
+fi
+
+# Summary
+echo ""
+echo "=========================================="
+echo " DESTROY SUMMARY "
+echo "=========================================="
+echo "TOTAL: ${TOTAL_GROUPS}"
+echo "SUCCESS: ${SUCCESS_COUNT}"
+echo "FAILED: ${FAIL_COUNT}"
+echo "=========================================="
+
+if [ "${FAIL_COUNT}" -eq 0 ]; then
+    echo ">>> All groups destroyed successfully"
+else
+    echo ">>> Some groups failed to destroy (may already be cleaned up)"
+fi
+exit 0 # Always 0 since cleanup is best-effort
diff --git a/test/infra/control/run-multi-group.bash b/test/infra/control/run-multi-group.bash
new file mode 100755
index 000000000..2a303c91e
--- /dev/null
+++ b/test/infra/control/run-multi-group.bash
@@ -0,0 +1,353 @@
+#!/bin/bash
+set -euo pipefail
+#
+# Run Multiple TAP Groups in Parallel
+# Usage:
+#   RUN_ID="abc123" \
+#   TAP_GROUPS="legacy-g1 legacy-g2 ai-g1 mysql84-g1" \
+#   ./run-multi-group.bash
+#
+# Optional environment variables:
+#   PARALLEL_JOBS=4        # Max parallel groups (default: 0 = unlimited)
+#   TIMEOUT_MINUTES=60     # Hard timeout per group (default: 60)
+#   EXIT_ON_FIRST_FAIL=0   # Stop launching groups after a failure (default: 0).
+#                          # A failure is only noticed while waiting for a free
+#                          # slot, so this needs PARALLEL_JOBS > 0 to matter.
+#   AUTO_CLEANUP=0         # Auto cleanup successful groups (default: 0)
+#   SKIP_CLUSTER_START=1   # Skip ProxySQL cluster initialization (default: 0)
+#
+# Per-group logs (<group>.log) and results (<group>.result) are written to
+# ${WORKSPACE}/ci_infra_logs/multi-group-${RUN_ID}.
+# Exit status: 0 if all groups passed, 1 otherwise, 130 when interrupted.
+#
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+
+# Configuration
+export WORKSPACE="${WORKSPACE:-${REPO_ROOT}}"
+RUN_ID="${RUN_ID:-$(date +%s)}"
+TAP_GROUPS="${TAP_GROUPS:-}"
+PARALLEL_JOBS="${PARALLEL_JOBS:-0}" # 0 = unlimited
+TIMEOUT_MINUTES="${TIMEOUT_MINUTES:-60}"
+EXIT_ON_FIRST_FAIL="${EXIT_ON_FIRST_FAIL:-0}"
+AUTO_CLEANUP="${AUTO_CLEANUP:-0}"
+SKIP_CLUSTER_START="${SKIP_CLUSTER_START:-0}"
+
+# Validate required variables
+if [ -z "${TAP_GROUPS}" ]; then
+    echo "ERROR: TAP_GROUPS is not set."
+    echo "Usage: RUN_ID=<run_id> TAP_GROUPS='group1 group2' ${0}"
+    exit 1
+fi
+
+# Convert TAP_GROUPS to array. The array is deliberately not called GROUPS:
+# that is a special bash variable (the group IDs of the current user) and
+# assignments to it have no effect.
+GROUP_LIST=()
+read -ra GROUP_LIST <<< "${TAP_GROUPS}"
+TOTAL_GROUPS=${#GROUP_LIST[@]}
+
+if [ "${TOTAL_GROUPS}" -eq 0 ]; then
+    echo "ERROR: No TAP groups specified."
+    exit 1
+fi
+
+echo "=========================================="
+echo "Parallel TAP Group Execution"
+echo "=========================================="
+echo "RUN_ID: ${RUN_ID}"
+echo "TAP_GROUPS: ${TAP_GROUPS}"
+echo "PARALLEL_JOBS: ${PARALLEL_JOBS}"
+echo "TIMEOUT_MINUTES: ${TIMEOUT_MINUTES}"
+echo "EXIT_ON_FIRST_FAIL: ${EXIT_ON_FIRST_FAIL}"
+echo "AUTO_CLEANUP: ${AUTO_CLEANUP}"
+echo "SKIP_CLUSTER_START: ${SKIP_CLUSTER_START}"
+echo "=========================================="
+
+# Create results directory
+RESULTS_DIR="${WORKSPACE}/ci_infra_logs/multi-group-${RUN_ID}"
+mkdir -p "${RESULTS_DIR}"
+
+# Job bookkeeping
+declare -a JOB_PIDS=()       # PIDs of the launched jobs, in launch order
+declare -A GROUP_FOR_PID=()  # PID -> group name
+declare -A EXIT_CODES=()     # group name -> exit code of its job
+declare -A SKIPPED_GROUPS=() # groups never launched (EXIT_ON_FIRST_FAIL)
+
+# Print the current local time as "YYYY-MM-DD HH:MM:SS"
+timestamp() {
+    date '+%Y-%m-%d %H:%M:%S'
+}
+
+# Print the value of a KEY=VALUE line of a result file; prints nothing when
+# the file or the key is missing.
+#   $1 - result file
+#   $2 - key
+result_field() {
+    local file="${1}"
+    local key="${2}"
+    [ -f "${file}" ] || return 0
+    awk -v key="${key}" \
+        'index($0, key "=") == 1 { print substr($0, length(key) + 2); exit }' \
+        "${file}"
+}
+
+# Succeeds when no further groups must be launched because a job already
+# failed and EXIT_ON_FIRST_FAIL is enabled
+stop_launching() {
+    [ "${EXIT_ON_FIRST_FAIL}" -eq 1 ] && [ "${OVERALL_FAILED}" -ne 0 ]
+}
+
+# Cleanup function for interrupted runs: TERM, then KILL, every launched job
+cleanup_on_interrupt() {
+    local pid
+    echo ""
+    echo ">>> INTERRUPT received - cleaning up running jobs..."
+    # The ${arr[@]+...} form keeps `set -u` quiet for an empty array on older bash
+    for pid in ${JOB_PIDS[@]+"${JOB_PIDS[@]}"}; do
+        kill -TERM "${pid}" 2>/dev/null || true
+    done
+    sleep 2
+    for pid in ${JOB_PIDS[@]+"${JOB_PIDS[@]}"}; do
+        kill -KILL "${pid}" 2>/dev/null || true
+    done
+    exit 130
+}
+trap cleanup_on_interrupt INT TERM
+
+# Run one TAP group: set up its infrastructure, run its tests and record the
+# outcome. Meant to be started as a background job.
+#   $1 - TAP group name
+# Globals (read): RUN_ID RESULTS_DIR SCRIPT_DIR TIMEOUT_MINUTES AUTO_CLEANUP
+#                 SKIP_CLUSTER_START
+# Outputs: progress on stdout and in <group>.log; <group>.result (KEY=VALUE)
+# Returns: 0 on success, 124 on timeout, otherwise the failing exit code
+run_single_group() {
+    local group="${1}"
+    local infra_id="${group}-${RUN_ID}"
+    local log_file="${RESULTS_DIR}/${group}.log"
+    local result_file="${RESULTS_DIR}/${group}.result"
+    local start_time end_time duration
+    local exit_code=0
+
+    # Drop the result of an earlier run with the same RUN_ID so that it
+    # cannot be mistaken for the outcome of this run
+    rm -f -- "${result_file}"
+
+    start_time=$(date +%s)
+    echo "[$(timestamp)] STARTING: ${group} (INFRA_ID: ${infra_id})" | tee -a "${log_file}"
+
+    # Export variables for the child process
+    export INFRA_ID="${infra_id}"
+    export TAP_GROUP="${group}"
+
+    # Run ensure-infras + run-tests-isolated with timeout
+    # Note: We don't run cleanup here - let the user decide when to destroy
+    #
+    # The inner script is single-quoted and gets its inputs as arguments, so
+    # nothing is spliced into its source and its timestamps are taken when
+    # each message is printed. The status is captured with `||`: inside
+    # `if ! cmd; then`, $? is the status of the negation (always 0), which
+    # would hide both failures and timeouts.
+    # shellcheck disable=SC2016
+    SKIP_CLUSTER_START="${SKIP_CLUSTER_START}" \
+        timeout "${TIMEOUT_MINUTES}m" bash -c '
+            set -euo pipefail
+            log_file="${1}"
+            script_dir="${2}"
+            ts() { date "+%Y-%m-%d %H:%M:%S"; }
+
+            echo "[$(ts)] Setting up infrastructure..." | tee -a "${log_file}"
+            if ! "${script_dir}/ensure-infras.bash" >> "${log_file}" 2>&1; then
+                echo "[$(ts)] ERROR: Failed to set up infrastructure" | tee -a "${log_file}"
+                exit 1
+            fi
+
+            echo "[$(ts)] Running tests..." | tee -a "${log_file}"
+            if ! "${script_dir}/run-tests-isolated.bash" >> "${log_file}" 2>&1; then
+                echo "[$(ts)] ERROR: Tests failed" | tee -a "${log_file}"
+                exit 1
+            fi
+
+            echo "[$(ts)] Tests completed successfully" | tee -a "${log_file}"
+        ' run-multi-group-job "${log_file}" "${SCRIPT_DIR}" || exit_code=$?
+
+    case "${exit_code}" in
+        0)
+            echo "[$(timestamp)] SUCCESS: ${group}" | tee -a "${log_file}"
+            ;;
+        124)
+            echo "[$(timestamp)] TIMEOUT: ${group} after ${TIMEOUT_MINUTES} minutes" | tee -a "${log_file}"
+            ;;
+        *)
+            echo "[$(timestamp)] FAILED: ${group} (exit code: ${exit_code})" | tee -a "${log_file}"
+            ;;
+    esac
+
+    end_time=$(date +%s)
+    duration=$((end_time - start_time))
+
+    # Write result file
+    cat > "${result_file}" << EOF
+GROUP=${group}
+INFRA_ID=${infra_id}
+EXIT_CODE=${exit_code}
+DURATION=${duration}
+START_TIME=${start_time}
+END_TIME=${end_time}
+LOG_FILE=${log_file}
+EOF
+
+    # Auto-cleanup successful runs if enabled
+    if [ "${exit_code}" -eq 0 ] && [ "${AUTO_CLEANUP}" -eq 1 ]; then
+        echo "[$(timestamp)] Auto-cleanup: ${group}" | tee -a "${log_file}"
+        INFRA_ID="${infra_id}" TAP_GROUP="${group}" "${SCRIPT_DIR}/stop-proxysql-isolated.bash" >> "${log_file}" 2>&1 || true
+        INFRA_ID="${infra_id}" TAP_GROUP="${group}" "${SCRIPT_DIR}/destroy-infras.bash" >> "${log_file}" 2>&1 || true
+    fi
+
+    return "${exit_code}"
+}
+
+# Main execution
+echo ">>> Starting parallel execution of ${TOTAL_GROUPS} groups..."
+START_TIME=$(date +%s)
+
+# Track overall status
+OVERALL_FAILED=0
+JOBS_RUNNING=0
+
+# Launch jobs
+for group in "${GROUP_LIST[@]}"; do
+    # If PARALLEL_JOBS is set and we're at the limit, wait for a job to finish
+    if ! stop_launching && [ "${PARALLEL_JOBS}" -gt 0 ] && [ "${JOBS_RUNNING}" -ge "${PARALLEL_JOBS}" ]; then
+        echo ">>> Waiting for a job to finish (max ${PARALLEL_JOBS} parallel)..."
+        finished_status=0
+        wait -n || finished_status=$?
+        JOBS_RUNNING=$((JOBS_RUNNING - 1))
+        # Remember the failure so that EXIT_ON_FIRST_FAIL can act on it.
+        # 127 means there was no job left to wait for, not a failed group.
+        if [ "${finished_status}" -ne 0 ] && [ "${finished_status}" -ne 127 ]; then
+            OVERALL_FAILED=1
+        fi
+    fi
+
+    # Check if we should stop due to previous failure
+    if stop_launching; then
+        echo ">>> Skipping ${group} due to previous failure"
+        SKIPPED_GROUPS["${group}"]=1
+        continue
+    fi
+
+    # Start the job
+    echo ">>> Launching: ${group}"
+    run_single_group "${group}" &
+    job_pid=$!
+    JOB_PIDS+=("${job_pid}")
+    GROUP_FOR_PID["${job_pid}"]="${group}"
+    JOBS_RUNNING=$((JOBS_RUNNING + 1))
+done
+
+# Wait for all jobs to complete
+echo ">>> Waiting for all jobs to complete..."
+for pid in "${JOB_PIDS[@]}"; do
+    group="${GROUP_FOR_PID[${pid}]}"
+    wait_status=0
+    wait "${pid}" || wait_status=$?
+    # Prefer the exit code the job recorded itself: a job already collected
+    # by `wait -n` above may no longer report its real status here.
+    recorded_code="$(result_field "${RESULTS_DIR}/${group}.result" EXIT_CODE)"
+    if [[ "${recorded_code}" =~ ^[0-9]+$ ]]; then
+        EXIT_CODES["${group}"]="${recorded_code}"
+    else
+        EXIT_CODES["${group}"]="${wait_status}"
+    fi
+    if [ "${EXIT_CODES[${group}]}" -ne 0 ]; then
+        OVERALL_FAILED=1
+    fi
+done
+
+END_TIME=$(date +%s)
+TOTAL_DURATION=$((END_TIME - START_TIME))
+
+# Generate summary report
+echo ""
+echo "=========================================="
+echo " EXECUTION SUMMARY "
+echo "=========================================="
+printf "%-25s %10s %12s\n" "Group" "Duration" "Status"
+echo "------------------------------------------"
+
+PASS_COUNT=0
+FAIL_COUNT=0
+TIMEOUT_COUNT=0
+SKIP_COUNT=0
+declare -a FAILED_GROUPS=()
+
+for group in "${GROUP_LIST[@]}"; do
+    duration_str="N/A"
+
+    if [ -n "${SKIPPED_GROUPS[${group}]:-}" ]; then
+        status="- SKIPPED"
+        SKIP_COUNT=$((SKIP_COUNT + 1))
+    else
+        duration="$(result_field "${RESULTS_DIR}/${group}.result" DURATION)"
+        if [[ "${duration}" =~ ^[0-9]+$ ]]; then
+            duration_str="$((duration / 60))m$((duration % 60))s"
+        fi
+
+        exit_code="${EXIT_CODES[${group}]:-1}"
+        case "${exit_code}" in
+            0)
+                status="✓ PASS"
+                PASS_COUNT=$((PASS_COUNT + 1))
+                ;;
+            124)
+                status="✗ TIMEOUT"
+                TIMEOUT_COUNT=$((TIMEOUT_COUNT + 1))
+                FAILED_GROUPS+=("${group}")
+                ;;
+            *)
+                status="✗ FAIL"
+                FAIL_COUNT=$((FAIL_COUNT + 1))
+                FAILED_GROUPS+=("${group}")
+                ;;
+        esac
+    fi
+
+    printf "%-25s %10s %12s\n" "${group}" "${duration_str}" "${status}"
+done
+
+echo "------------------------------------------"
+echo "TOTAL TIME: $((TOTAL_DURATION / 60))m$((TOTAL_DURATION % 60))s"
+echo "=========================================="
+
+# Summary of results
+echo ""
+echo "PASSED: ${PASS_COUNT}/${TOTAL_GROUPS}"
+echo "FAILED: ${FAIL_COUNT}/${TOTAL_GROUPS}"
+echo "TIMEOUT: ${TIMEOUT_COUNT}/${TOTAL_GROUPS}"
+if [ "${SKIP_COUNT}" -gt 0 ]; then
+    echo "SKIPPED: ${SKIP_COUNT}/${TOTAL_GROUPS}"
+fi
+echo ""
+echo "Results directory: ${RESULTS_DIR}"
+echo ""
+
+# Print log locations for failed groups
+if [ "${#FAILED_GROUPS[@]}" -gt 0 ]; then
+    echo "Failed/Timed out group logs:"
+    for group in "${FAILED_GROUPS[@]}"; do
+        echo " ${group}: ${RESULTS_DIR}/${group}.log"
+    done
+    echo ""
+fi
+
+# Exit with appropriate code
+if [ "${OVERALL_FAILED}" -eq 0 ]; then
+    echo ">>> All groups passed!"
+    exit 0
+else
+    echo ">>> Some groups failed. Check logs above."
+    exit 1
+fi