mirror of https://github.com/Nezreka/SoulSync.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
137 lines
5.1 KiB
137 lines
5.1 KiB
"""On-demand memory-growth diagnostic (issue #802: ~0.7 MiB/s RSS growth).

Wraps ``tracemalloc`` so a user seeing runaway memory can capture WHERE the
allocations come from instead of us guessing:

  1. start_tracking()  — begins tracing + stores a baseline snapshot
  2. ...reproduce the growth for a few minutes...
  3. report()          — top allocation sites, with the DELTA since baseline
                         (the delta is the leak; absolute sizes are mostly
                         startup noise)
  4. stop_tracking()   — ends tracing, frees trace memory

Opt-in by design: tracemalloc costs CPU and memory while active (it shadows
every allocation), so it must never run by default. The Flask endpoints that
expose this live in web_server (GET /api/debug/memory/...) so a user can drive
the whole flow from a browser.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import time
|
|
import tracemalloc
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from utils.logging_config import get_logger
|
|
|
|
# Number of stack frames tracemalloc keeps per allocation: deep enough to give
# useful "who called it" context without pathological overhead.
_TRACE_FRAMES = 15

logger = get_logger("diagnostics.memory")

# Module state: written by start_tracking(), cleared by stop_tracking().
_started_at: Optional[float] = None  # time.time() when tracing began, else None
_baseline: Optional[tracemalloc.Snapshot] = None  # snapshot taken at start, else None
|
|
|
|
|
|
def is_tracking() -> bool:
    """Return True while ``tracemalloc`` is currently tracing allocations."""
    tracing_now = tracemalloc.is_tracing()
    return tracing_now
|
|
|
|
|
|
def start_tracking() -> Dict[str, Any]:
    """Start ``tracemalloc`` and remember a baseline snapshot.

    Calling this while tracing is already active changes nothing: no new
    baseline is taken and the result carries ``already_running: True``.

    Returns:
        A dict with ``tracking`` (always True), ``already_running`` and
        ``started_at`` (the module's stored ``time.time()`` start value).
    """
    global _baseline, _started_at
    already_running = tracemalloc.is_tracing()
    if not already_running:
        tracemalloc.start(_TRACE_FRAMES)
        # Snapshot immediately so later reports can diff against this point.
        _baseline = tracemalloc.take_snapshot()
        _started_at = time.time()
        logger.info("Memory tracking started (tracemalloc, %d frames)", _TRACE_FRAMES)
    return {
        "tracking": True,
        "already_running": already_running,
        "started_at": _started_at,
    }
|
|
|
|
|
|
def stop_tracking() -> Dict[str, Any]:
    """Stop ``tracemalloc`` if it is running and discard the stored baseline.

    Returns:
        ``{"tracking": False, "was_tracking": <bool>}`` where ``was_tracking``
        says whether tracing was active when this function was called.
    """
    global _baseline, _started_at
    previously_tracing = tracemalloc.is_tracing()
    if previously_tracing:
        tracemalloc.stop()
        logger.info("Memory tracking stopped")
    # Reset module state whether or not tracing was active.
    _baseline, _started_at = None, None
    return {"tracking": False, "was_tracking": previously_tracing}
|
|
|
|
|
|
def _rss_mb() -> Optional[float]:
|
|
"""Process RSS in MiB, best-effort (psutil, then /proc fallback)."""
|
|
try:
|
|
import psutil
|
|
return round(psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024), 1)
|
|
except Exception: # noqa: S110 — RSS is optional context; fall through to /proc
|
|
pass
|
|
try:
|
|
with open("/proc/self/status", encoding="utf-8") as fh:
|
|
for line in fh:
|
|
if line.startswith("VmRSS:"):
|
|
return round(int(line.split()[1]) / 1024, 1)
|
|
except Exception: # noqa: S110 — no /proc on this platform; RSS stays None
|
|
pass
|
|
return None
|
|
|
|
|
|
def format_stat(stat: Any) -> Dict[str, Any]:
    """Convert a tracemalloc ``Statistic``/``StatisticDiff`` into a plain dict.

    Only attribute lookups with fallbacks are used (``traceback``, ``size``,
    ``size_diff``, ``count``, ``count_diff``), so any stand-in object exposing
    some of those attributes works — which keeps this unit-testable without
    real snapshots.

    Returns:
        A dict with ``location`` (the last kept frame, or ``"?"`` when there
        is no traceback), ``trace`` (at most three ``file:line`` strings),
        ``size_mb`` / ``size_diff_mb`` in MiB rounded to three decimals, and
        the raw ``count`` / ``count_diff`` values.
    """
    bytes_per_mib = 1024 * 1024
    traceback = getattr(stat, "traceback", None)
    # Keep only the final three frames, in the order the traceback stores them.
    trace: List[str] = (
        [f"{fr.filename}:{fr.lineno}" for fr in list(traceback)[-3:]]
        if traceback
        else []
    )
    location = trace[-1] if trace else "?"
    size_mb = round(getattr(stat, "size", 0) / bytes_per_mib, 3)
    size_diff_mb = round(getattr(stat, "size_diff", 0) / bytes_per_mib, 3)
    return {
        "location": location,
        "trace": trace,
        "size_mb": size_mb,
        "size_diff_mb": size_diff_mb,
        "count": getattr(stat, "count", 0),
        "count_diff": getattr(stat, "count_diff", 0),
    }
|
|
|
|
|
|
def report(top: int = 25) -> Dict[str, Any]:
    """Summarise allocation growth since ``start_tracking()``.

    Takes a fresh snapshot, drops traces attributed to ``tracemalloc`` itself
    and to ``<frozen importlib._bootstrap>``, then ranks allocation sites.
    With a stored baseline the ranking is by ``size_diff``, largest growth
    first; without one, the order is whatever
    ``Snapshot.statistics("traceback")`` returns.

    Args:
        top: Maximum number of allocation sites to include. Negative values
            are treated as 0: a raw negative slice bound would mean
            "everything except the last N" and dump almost the whole table.
            ``None`` keeps its slice meaning of "no limit".

    Returns:
        When tracing is off: ``tracking: False``, the process RSS and a hint
        on how to start. Otherwise: ``started_at``, ``elapsed_seconds``,
        traced current/peak memory in MiB, the process RSS, and
        ``top_growth`` — a list of ``format_stat`` dicts.
    """
    if not tracemalloc.is_tracing():
        return {
            "tracking": False,
            "rss_mb": _rss_mb(),
            "hint": "Start with /api/debug/memory/start, reproduce the growth "
                    "for a few minutes, then call this again.",
        }

    # Clamp so a negative request yields an empty list instead of a huge one.
    limit = None if top is None else max(0, top)

    snapshot = tracemalloc.take_snapshot()
    # Filter the tracer's own bookkeeping out of the picture.
    snapshot = snapshot.filter_traces((
        tracemalloc.Filter(False, tracemalloc.__file__),
        tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
    ))
    current, peak = tracemalloc.get_traced_memory()

    if _baseline is not None:
        stats = snapshot.compare_to(_baseline, "traceback")
        # Re-sort by signed growth so shrinking sites sink to the bottom.
        stats.sort(key=lambda s: s.size_diff, reverse=True)
    else:
        stats = snapshot.statistics("traceback")

    elapsed = (
        round(time.time() - _started_at, 1) if _started_at is not None else None
    )
    return {
        "tracking": True,
        "started_at": _started_at,
        "elapsed_seconds": elapsed,
        "traced_current_mb": round(current / (1024 * 1024), 1),
        "traced_peak_mb": round(peak / (1024 * 1024), 1),
        "rss_mb": _rss_mb(),
        "top_growth": [format_stat(s) for s in stats[:limit]],
    }
|