From aa2806180e00a09e205f41d088b8f73ea9fe6ccb Mon Sep 17 00:00:00 2001 From: BoulderBadgeDad Date: Fri, 29 May 2026 19:19:28 -0700 Subject: [PATCH] Fix: Soulseek album poll hangs on a stalled peer; failed batches never cleared MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related bugs from the Slipknot album never finishing. 1) _poll_album_bundle_downloads hung when the peer stalled. The finish check needs every transfer terminal (completed/failed); the #715 grace only covers 'slskd says Completed but file not on disk'. A transfer stuck InProgress / Queued, or dropped by slskd, is none of those — so it blocked both the finish AND the grace exit, and the poll spun to the full ~6h timeout. Add a bundle-level stall guard: track a progress marker (#terminal transfers, total bytes across pending). If NOTHING advances for _stall_grace (180s) — no terminal transition AND no pending byte movement — the peer has stalled; mark the stuck transfers failed so the existing finish/all-failed checks resolve the bundle with whatever completed (missing tracks then fall back to the per-track matcher). Conservative: only trips when EVERYTHING is frozen, so a slow-but-progressing or still-queued transfer is unaffected. 2) Failed batches lingered in the UI forever ('No tracks loaded'). The auto-cleanup gate removed only complete/error/cancelled phases — 'failed' (e.g. an album-bundle hard failure) was missing, so it never aged out. Add 'failed' to the terminal set so it's removed after 5 minutes like the others. Tests (tests/test_soulseek_album_poll_stall.py): stalled peer → gives up with the completed subset (not the deadline); progressing bundle not falsely stalled; all-stalled → empty; dropped transfers stall out; clean finish unaffected. 124 download/soulseek tests pass; ruff clean. --- core/soulseek_client.py | 50 ++++++++ tests/test_soulseek_album_poll_stall.py | 146 ++++++++++++++++++++++++ web_server.py | 6 +- 3 files changed, 200 insertions(+), 2 deletions(-) create mode 100644 tests/test_soulseek_album_poll_stall.py diff --git a/core/soulseek_client.py b/core/soulseek_client.py index 561e4867..e484468f 100644 --- a/core/soulseek_client.py +++ b/core/soulseek_client.py @@ -1700,6 +1700,22 @@ class SoulseekClient(DownloadSourcePlugin): # Seconds an "slskd Completed but locally unresolved" key # has to stay stuck before we give up on it. _unresolved_grace = 45.0 + # Bundle-level stall guard. The #715 grace above only covers + # "slskd says Completed but the file isn't on disk yet". It does NOT + # cover a transfer the peer stalls on — stuck InProgress / Queued, or + # dropped by slskd entirely — which is never failed, never completed, + # and never marked unresolved, so it blocks BOTH the all-terminal + # finish check AND the grace exit, and the poll spun to the full + # ``get_poll_timeout()`` deadline (the Slipknot hang). If NOTHING + # progresses — no transfer completes/fails and no pending transfer's + # byte count moves — for this long, the folder has stalled: mark the + # stuck transfers failed so the bundle resolves with whatever + # completed (the per-track matcher then handles the missing tracks). + # Conservative on purpose: only trips when EVERYTHING is frozen, so a + # slow-but-progressing or still-queued-then-starting transfer is safe. + _stall_grace = 180.0 + _last_progress_marker = None + _last_progress_at = time.monotonic() while time.monotonic() < deadline: try: downloads = run_async(self.get_all_downloads()) @@ -1755,6 +1771,40 @@ class SoulseekClient(DownloadSourcePlugin): count=len(completed_paths), failed=len(failed_states), ) + + # Bundle-level stall detection (see ``_stall_grace`` above). Advance + # the progress marker on ANY forward motion — a transfer reaching a + # terminal state, or a still-pending transfer downloading more bytes. + # If the marker is frozen for ``_stall_grace``, the peer has stalled; + # mark the stuck transfers failed so the finish/all-failed checks + # below resolve the bundle instead of spinning to the deadline. + now = time.monotonic() + stall_pending = [ + k for k in transfer_keys + if k not in completed_paths and k not in failed_states + ] + pending_bytes = 0 + for k in stall_pending: + dl = by_key.get(k) or by_key.get(( + k[0], os.path.basename((k[1] or '').replace('\\', '/')), + )) + pending_bytes += (getattr(dl, 'transferred', 0) or 0) if dl else 0 + marker = (len(completed_paths) + len(failed_states), pending_bytes) + if marker != _last_progress_marker: + _last_progress_marker = marker + _last_progress_at = now + elif stall_pending and (now - _last_progress_at) >= _stall_grace: + logger.warning( + "[Soulseek album] No progress for %.0fs — peer stalled on %d " + "transfer(s) (stuck / queued / dropped). Marking them failed and " + "resolving with what completed; missing tracks fall back to " + "per-track. Stalled: %s", + _stall_grace, len(stall_pending), + [transfer_keys[k].filename for k in stall_pending[:5]], + ) + for k in stall_pending: + failed_states[k] = 'Stalled' + if completed_paths and len(completed_paths) + len(failed_states) == len(transfer_keys): logger.warning( "[Soulseek album] Selected folder finished with %d completed and %d failed transfer(s)", diff --git a/tests/test_soulseek_album_poll_stall.py b/tests/test_soulseek_album_poll_stall.py new file mode 100644 index 00000000..41ea071a --- /dev/null +++ b/tests/test_soulseek_album_poll_stall.py @@ -0,0 +1,146 @@ +"""Regression for the Soulseek album-bundle poll hanging when the peer stalls. + +`_poll_album_bundle_downloads` waits for every transfer in the selected folder +to reach a terminal state (completed or failed). A transfer the peer stalls on — +stuck InProgress/Queued, or dropped by slskd — is never failed, never completed, +and never marked "completed-but-unresolved", so it used to block both the +all-terminal finish check AND the #715 grace exit, and the poll spun to the full +~6h timeout (the Slipknot hang). + +The fix adds a bundle-level stall guard: if NOTHING progresses (no transfer +reaches terminal AND no pending transfer's byte count moves) for `_stall_grace` +seconds, the stuck transfers are marked failed so the bundle resolves with what +completed. These tests drive the real poll with a fake clock + scripted slskd +states. +""" + +from __future__ import annotations + +import os +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import patch + +from core.soulseek_client import SoulseekClient + + +class _Clock: + def __init__(self): + self.now = 0.0 + + def monotonic(self): + return self.now + + def sleep(self, seconds): + self.now += seconds + + +def _dl(username, filename, state, *, size=100, transferred=100): + return SimpleNamespace( + username=username, filename=filename, state=state, + size=size, transferred=transferred, + ) + + +class _StubClient: + """Minimal stand-in for SoulseekClient with just what the poll touches.""" + + def __init__(self, states, resolvable): + self._states = states # list of download-lists, one per poll; last repeats + self._call = 0 + self._resolvable = set(resolvable) + + def get_all_downloads(self): + i = min(self._call, len(self._states) - 1) + self._call += 1 + return self._states[i] + + def _resolve_downloaded_album_file(self, filename): + base = os.path.basename((filename or "").replace("\\", "/")) + if base in self._resolvable or filename in self._resolvable: + return Path(f"/staged/{base}") + return None + + +def _run_poll(stub, transfer_keys, *, timeout=7200.0, interval=2.0): + clock = _Clock() + emits = [] + with patch("core.soulseek_client.time", clock), \ + patch("core.soulseek_client.run_async", lambda x: x), \ + patch("core.soulseek_client.get_poll_timeout", lambda: timeout), \ + patch("core.soulseek_client.get_poll_interval", lambda: interval): + result = SoulseekClient._poll_album_bundle_downloads( + stub, transfer_keys, lambda phase, **kw: emits.append((phase, kw)) + ) + return result, clock, emits + + +def _keys(*names, user="peer"): + """Build {(user, name): TrackResult-ish} preserving order.""" + return {(user, n): SimpleNamespace(filename=n) for n in names} + + +def test_stalled_peer_gives_up_and_returns_completed_subset(): + tk = _keys("01.flac", "02.flac", "03.flac") + # 01 completes (resolvable); 02/03 stuck InProgress with FROZEN byte counts. + frozen = [ + _dl("peer", "01.flac", "Completed", transferred=100, size=100), + _dl("peer", "02.flac", "InProgress", transferred=50, size=100), + _dl("peer", "03.flac", "InProgress", transferred=30, size=100), + ] + stub = _StubClient([frozen], resolvable={"01.flac"}) + result, clock, _ = _run_poll(stub, tk, timeout=7200.0, interval=2.0) + + # Resolved with the one completed track instead of hanging to the deadline. + assert result == [Path("/staged/01.flac")] + # Gave up around the stall window (~180s), nowhere near the 7200s timeout. + assert clock.now < 600.0 + + +def test_progressing_bundle_is_not_falsely_stalled(): + tk = _keys("01.flac", "02.flac") + # 02 keeps downloading more bytes each poll, then completes — must NOT trip + # the stall guard even though it takes a while. + states = [ + [_dl("peer", "01.flac", "Completed"), _dl("peer", "02.flac", "InProgress", transferred=10, size=100)], + [_dl("peer", "01.flac", "Completed"), _dl("peer", "02.flac", "InProgress", transferred=40, size=100)], + [_dl("peer", "01.flac", "Completed"), _dl("peer", "02.flac", "InProgress", transferred=80, size=100)], + [_dl("peer", "01.flac", "Completed"), _dl("peer", "02.flac", "Completed", transferred=100, size=100)], + ] + stub = _StubClient(states, resolvable={"01.flac", "02.flac"}) + result, _clock, _ = _run_poll(stub, tk, timeout=7200.0, interval=2.0) + + assert set(result) == {Path("/staged/01.flac"), Path("/staged/02.flac")} + + +def test_all_transfers_stalled_returns_empty(): + tk = _keys("01.flac", "02.flac") + frozen = [ + _dl("peer", "01.flac", "InProgress", transferred=10, size=100), + _dl("peer", "02.flac", "Queued", transferred=0, size=100), + ] + stub = _StubClient([frozen], resolvable=set()) + result, clock, _ = _run_poll(stub, tk, timeout=7200.0, interval=2.0) + + assert result == [] # nothing completed → empty (caller falls back) + assert clock.now < 600.0 # didn't spin to the deadline + + +def test_dropped_transfers_also_stall_out(): + """slskd dropping the transfers entirely (dl=None) must also trip the guard, + not hang — there's no byte progress and nothing terminal.""" + tk = _keys("01.flac", "02.flac") + stub = _StubClient([[]], resolvable=set()) # get_all_downloads returns nothing + result, clock, _ = _run_poll(stub, tk, timeout=7200.0, interval=2.0) + + assert result == [] + assert clock.now < 600.0 + + +def test_clean_finish_unaffected(): + tk = _keys("01.flac", "02.flac") + done = [_dl("peer", "01.flac", "Completed"), _dl("peer", "02.flac", "Succeeded")] + stub = _StubClient([done], resolvable={"01.flac", "02.flac"}) + result, clock, _ = _run_poll(stub, tk) + assert set(result) == {Path("/staged/01.flac"), Path("/staged/02.flac")} + assert clock.now < 10.0 # resolves on the first couple polls diff --git a/web_server.py b/web_server.py index 0da4c37e..f767e7e6 100644 --- a/web_server.py +++ b/web_server.py @@ -1418,8 +1418,10 @@ def validate_and_heal_batch_states(): queue = batch_data.get('queue', []) phase = batch_data.get('phase', 'unknown') - # AUTO-CLEANUP: Remove completed batches after 5 minutes to prevent stale state - if phase in ['complete', 'error', 'cancelled']: + # AUTO-CLEANUP: Remove terminal batches after 5 minutes to prevent stale state. + # 'failed' (e.g. an album-bundle hard failure) was missing here, so a failed + # batch lingered in the UI forever ("No tracks loaded") and never cleared. + if phase in ['complete', 'error', 'cancelled', 'failed']: # Check if batch has a completion timestamp completion_time = batch_data.get('completion_time') if not completion_time: