You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
proxysql/test/scripts/mysqlx/stress.py

234 lines
7.9 KiB

#!/usr/bin/env python3
"""mysqlx X-Protocol stress harness.
Opens N concurrent X-Protocol clients running query loops against a
running ProxySQL with the mysqlx plugin loaded. Drives connection
churn periodically. Captures throughput, error rate, RSS, fd count,
thread count, and stats_mysqlx_routes over time.
Pass criteria for issue #5681: 60-minute run with no crash, no
monotonic memory/fd growth, error rate < 0.1%, throughput stable.
Requires `mysql-connector-python`. Not a TAP test; runs against live
infrastructure. See test/scripts/mysqlx/README.md for setup.
"""
import argparse
import csv
import os
import subprocess
import sys
import threading
import time
from typing import List, Optional
try:
import mysqlx
import mysql.connector
except ImportError:
sys.stderr.write(
"ERROR: mysql-connector-python not installed. "
"Run: pip install mysql-connector-python\n"
)
sys.exit(1)
def parse_args():
    """Build the command-line parser for the harness and parse ``sys.argv``.

    Returns the ``argparse.Namespace`` holding the X-Protocol endpoint,
    the admin endpoint and credentials, the client credentials
    (``--user`` and ``--password`` are mandatory), and the concurrency,
    duration, churn and metrics settings.
    """
    # (flag, add_argument keyword arguments), listed in --help order.
    option_table = (
        ("--proxysql-host", {"default": "127.0.0.1"}),
        ("--proxysql-port", {"type": int, "default": 6603}),
        ("--admin-host", {"default": "127.0.0.1"}),
        ("--admin-port", {"type": int, "default": 6032}),
        ("--admin-user", {"default": "admin"}),
        ("--admin-pass", {"default": "admin"}),
        ("--user", {"required": True}),
        ("--password", {"required": True}),
        ("--concurrent", {
            "type": int,
            "default": 100,
            "help": "Concurrent client count (target 1000)",
        }),
        ("--duration", {
            "default": "60m",
            "help": "Run duration: 60s / 30m / 24h",
        }),
        ("--churn-interval", {
            "default": "30s",
            "help": "How often to recycle a fraction of clients",
        }),
        ("--churn-fraction", {
            "type": float,
            "default": 0.1,
            "help": "Fraction of clients to recycle per churn tick",
        }),
        ("--metrics-out", {"default": "/tmp/mysqlx_stress_metrics.csv"}),
        ("--metrics-interval-sec", {"type": int, "default": 10}),
    )

    parser = argparse.ArgumentParser(description=__doc__)
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
def parse_duration(s: str) -> float:
    """Convert a duration string such as ``60s``, ``30m`` or ``24h`` to seconds.

    The value is a number optionally followed by a one-letter unit:
    ``s`` (seconds), ``m`` (minutes) or ``h`` (hours). A bare number is
    taken as seconds. Surrounding whitespace is ignored and the unit is
    case-insensitive.

    Args:
        s: The duration text, e.g. the value of ``--duration``.

    Returns:
        The duration in seconds as a float.

    Raises:
        ValueError: If ``s`` is empty or its numeric part cannot be
            parsed by ``float``.
    """
    seconds_per_unit = {"s": 1, "m": 60, "h": 3600}

    text = s.strip().lower()
    multiplier = 1
    if text and text[-1] in seconds_per_unit:
        multiplier = seconds_per_unit[text[-1]]
        text = text[:-1]

    try:
        return float(text) * multiplier
    except ValueError:
        # Re-raise with the full original input so a CLI typo is easy to spot.
        raise ValueError(
            f"invalid duration {s!r}: expected a number optionally "
            f"followed by s, m or h (e.g. 60s, 30m, 24h)"
        ) from None
class ClientWorker(threading.Thread):
    """One simulated X-Protocol client running a ``SELECT 1`` loop.

    The worker opens a session, issues ``SELECT 1`` repeatedly until the
    shared stop event is set, and reconnects after any failure. It runs
    as a daemon thread and exposes simple counters that the main thread
    reads while the worker is running.

    Attributes set in ``__init__``:
        args: Parsed CLI namespace; ``proxysql_host``, ``proxysql_port``,
            ``user`` and ``password`` are read when connecting.
        idx: Index of this worker, as given by the caller.
        stop_event: Event that tells the worker to finish.
        session_factory: Optional callable taking the connection
            settings dict and returning a session object; when ``None``
            (the default) ``mysqlx.get_session`` is used.
        queries: Number of queries completed successfully.
        errors: Number of exceptions caught (connect or query).
        last_error: ``"<ExceptionType>: <message>"`` of the most recent
            failure, or ``None`` if there has been none.
    """

    def __init__(self, args, idx: int, stop_event: threading.Event,
                 session_factory=None):
        super().__init__(daemon=True)
        self.args = args
        self.idx = idx
        self.stop_event = stop_event
        self.session_factory = session_factory
        self.queries = 0
        self.errors = 0
        self.last_error: Optional[str] = None

    def _open_session(self):
        """Open a new session to the configured ProxySQL endpoint."""
        settings = {
            "host": self.args.proxysql_host,
            "port": self.args.proxysql_port,
            "user": self.args.user,
            "password": self.args.password,
            "ssl-mode": "DISABLED",
        }
        if self.session_factory is not None:
            return self.session_factory(settings)
        return mysqlx.get_session(settings)

    def run(self):
        """Query in a loop until stopped, reconnecting after each failure."""
        while not self.stop_event.is_set():
            try:
                sess = self._open_session()
                try:
                    while not self.stop_event.is_set():
                        sess.sql("SELECT 1").execute().fetch_all()
                        self.queries += 1
                except Exception:
                    # The session is already failing: release it so it is
                    # not leaked, but ignore close errors so that the
                    # original failure is the one that gets recorded.
                    try:
                        sess.close()
                    except Exception:
                        pass
                    raise
                # Clean stop: a failing close here is counted as an error.
                sess.close()
            except Exception as e:
                self.errors += 1
                self.last_error = f"{type(e).__name__}: {e}"
                # Back off briefly so a backend outage doesn't pin CPU.
                # Waiting on the event (rather than sleeping) lets a stop
                # request end the back-off immediately.
                self.stop_event.wait(0.5)
def find_proxysql_pid() -> Optional[int]:
    """Look up a running ``proxysql`` process with ``pidof``.

    Returns the first PID printed by ``pidof proxysql``, or ``None``
    when the command prints nothing, fails, or cannot be run at all.
    """
    try:
        raw = subprocess.check_output(["pidof", "proxysql"])
        pid_list = raw.decode().strip()
        if not pid_list:
            return None
        first_pid = pid_list.split()[0]
        return int(first_pid)
    except Exception:
        # Best effort: any failure simply means "pid unknown".
        return None
def proc_status(pid: int, proc_root: str = "/proc") -> dict:
    """Sample memory, thread and file-descriptor usage of a process.

    Reads ``<proc_root>/<pid>/status`` for the ``VmRSS:`` and
    ``Threads:`` lines and counts the entries of ``<proc_root>/<pid>/fd``.
    Each probe is best effort: if the file or directory cannot be read
    (process gone, permission denied, no procfs, ...) the corresponding
    values stay at 0 instead of aborting the run.

    Args:
        pid: Process id to inspect.
        proc_root: Root of the proc filesystem; defaults to ``/proc``.

    Returns:
        A dict with keys ``rss_kib`` (VmRSS value, KiB), ``threads`` and
        ``fds``.
    """
    snapshot = {"rss_kib": 0, "threads": 0, "fds": 0}
    proc_dir = os.path.join(proc_root, str(pid))

    try:
        with open(os.path.join(proc_dir, "status")) as fh:
            for line in fh:
                if line.startswith("VmRSS:"):
                    snapshot["rss_kib"] = int(line.split()[1])  # KiB
                elif line.startswith("Threads:"):
                    snapshot["threads"] = int(line.split()[1])
    except OSError:
        # Covers FileNotFoundError as well as PermissionError,
        # ProcessLookupError, etc.; the monitored process vanishing must
        # not take the harness down with it.
        pass

    try:
        snapshot["fds"] = len(os.listdir(os.path.join(proc_dir, "fd")))
    except OSError:
        pass

    return snapshot
def fetch_route_stats(args) -> List[dict]:
    """Fetch all rows of ``stats_mysqlx_routes`` from the admin interface.

    Connects with ``mysql.connector`` using ``args.admin_host``,
    ``args.admin_port``, ``args.admin_user`` and ``args.admin_pass``
    (SSL disabled) and runs ``SELECT * FROM stats_mysqlx_routes`` with a
    dictionary cursor.

    This is best effort: on any failure a warning is written to stderr
    and an empty list is returned, so a broken admin connection never
    changes the outcome of the run.

    Returns:
        The rows returned by the dictionary cursor, or ``[]`` on failure.
    """
    adm = None
    try:
        adm = mysql.connector.connect(
            host=args.admin_host, port=args.admin_port,
            user=args.admin_user, password=args.admin_pass,
            ssl_disabled=True,
        )
        cur = adm.cursor(dictionary=True)
        cur.execute("SELECT * FROM stats_mysqlx_routes")
        return cur.fetchall()
    except Exception as e:
        # Say why the stats are missing instead of silently printing
        # an empty section.
        sys.stderr.write(
            f"WARNING: could not fetch stats_mysqlx_routes: "
            f"{type(e).__name__}: {e}\n"
        )
        return []
    finally:
        # Always release the admin connection, including when the query
        # itself failed; a failing close must not mask the result.
        if adm is not None:
            try:
                adm.close()
            except Exception:
                pass
def main():
    """Run the stress test end to end and return a process exit code.

    Starts ``--concurrent`` ClientWorker threads, then samples their
    counters together with the proxysql process statistics every
    ``--metrics-interval-sec`` seconds until ``--duration`` has elapsed
    or the user interrupts. Each sample is appended to the CSV file
    given by ``--metrics-out`` and echoed to stdout. Afterwards the
    workers are stopped, the totals and ``stats_mysqlx_routes`` are
    printed, and the verdict is computed.

    Returns:
        0 when at least one query completed and the error rate is below
        0.1%, 1 otherwise.
    """
    args = parse_args()
    duration = parse_duration(args.duration)
    churn_interval = parse_duration(args.churn_interval)
    stop = threading.Event()
    workers: List[ClientWorker] = []

    # Open the metrics file before spawning clients so that a bad path
    # fails fast. newline="" is what the csv module requires for files
    # handed to csv.writer.
    with open(args.metrics_out, "w", newline="") as metrics_fh:
        writer = csv.writer(metrics_fh)
        writer.writerow(["t_sec", "rss_kib", "threads", "fds", "total_queries",
                         "total_errors", "queries_per_sec"])
        try:
            for i in range(args.concurrent):
                w = ClientWorker(args, i, stop)
                w.start()
                workers.append(w)
            pid = find_proxysql_pid()
            print(f"Spawned {args.concurrent} workers; proxysql pid={pid}; "
                  f"duration={duration:.0f}s; metrics → {args.metrics_out}")

            # Monotonic clock: elapsed time and rates must not be skewed
            # by wall-clock adjustments during a long run.
            start = time.monotonic()
            last_snapshot_queries = 0
            last_snapshot_t = start
            last_churn_t = start
            while True:
                remaining = duration - (time.monotonic() - start)
                if remaining <= 0:
                    break
                # Never sleep past the requested end of the run.
                time.sleep(min(args.metrics_interval_sec, remaining))
                now = time.monotonic()
                elapsed = now - start
                total_q = sum(w.queries for w in workers)
                total_e = sum(w.errors for w in workers)
                ps = proc_status(pid) if pid else {"rss_kib": 0, "threads": 0,
                                                   "fds": 0}
                qps = ((total_q - last_snapshot_queries) /
                       (now - last_snapshot_t)) if now > last_snapshot_t else 0
                last_snapshot_queries = total_q
                last_snapshot_t = now
                writer.writerow([f"{elapsed:.1f}", ps["rss_kib"],
                                 ps["threads"], ps["fds"], total_q, total_e,
                                 f"{qps:.0f}"])
                metrics_fh.flush()
                print(f"[t={elapsed:6.0f}s] queries={total_q} errors={total_e} "
                      f"qps={qps:.0f} rss={ps['rss_kib']}KiB "
                      f"threads={ps['threads']} fds={ps['fds']}")
                if now - last_churn_t >= churn_interval:
                    last_churn_t = now
                    n_churn = max(1, int(args.concurrent * args.churn_fraction))
                    # NOTE: no worker is actually recycled here. Restarting
                    # an inflight thread cleanly is awkward, so this tick
                    # only logs the churn intent. A ClientWorker opens a
                    # new session only after an error, so a healthy run
                    # sees no connection churn from this harness.
                    print(f" (churn tick: {n_churn} workers cycling)")
        except KeyboardInterrupt:
            print("interrupted")
        finally:
            # Stop the clients on every exit path, not only after a clean
            # run or Ctrl-C.
            print("Stopping workers...")
            stop.set()
            for w in workers:
                w.join(timeout=5)

    total_q = sum(w.queries for w in workers)
    total_e = sum(w.errors for w in workers)
    error_rate = total_e / max(1, total_q)
    print(f"\nFinal: total_queries={total_q} total_errors={total_e} "
          f"error_rate={error_rate:.4%}")
    print("\nFinal stats_mysqlx_routes:")
    for row in fetch_route_stats(args):
        print(f" {row}")
    if total_q == 0:
        # Zero queries and zero errors would otherwise compute a 0% error
        # rate and report PASS for a run that exercised nothing.
        print("FAIL: no queries completed")
        return 1
    if error_rate < 0.001:
        print("PASS: error rate under 0.1%")
        return 0
    print(f"FAIL: error rate {error_rate:.4%} >= 0.1%")
    return 1
if __name__ == "__main__":
    # Use main()'s return value (0 = PASS, 1 = FAIL) as the exit status.
    raise SystemExit(main())