Fix: Soulseek album poll hangs on a stalled peer; failed batches never cleared

Two related bugs from the Slipknot album never finishing.

1) _poll_album_bundle_downloads hung when the peer stalled. The finish check
   needs every transfer terminal (completed/failed); the #715 grace only covers
   'slskd says Completed but file not on disk'. A transfer stuck InProgress /
   Queued, or dropped by slskd, is none of those — so it blocked both the finish
   AND the grace exit, and the poll spun to the full ~6h timeout.

   Add a bundle-level stall guard: track a progress marker (#terminal transfers,
   total bytes across pending). If NOTHING advances for _stall_grace (180s) —
   no terminal transition AND no pending byte movement — the peer has stalled;
   mark the stuck transfers failed so the existing finish/all-failed checks
   resolve the bundle with whatever completed (missing tracks then fall back to
   the per-track matcher). Conservative: only trips when EVERYTHING is frozen,
   so a slow-but-progressing or still-queued transfer is unaffected.

2) Failed batches lingered in the UI forever ('No tracks loaded'). The
   auto-cleanup gate removed only complete/error/cancelled phases — 'failed'
   (e.g. an album-bundle hard failure) was missing, so it never aged out. Add
   'failed' to the terminal set so it's removed after 5 minutes like the others.

Tests (tests/test_soulseek_album_poll_stall.py): stalled peer → gives up with the
completed subset (not the deadline); progressing bundle not falsely stalled;
all-stalled → empty; dropped transfers stall out; clean finish unaffected.
124 download/soulseek tests pass; ruff clean.
pull/747/head
BoulderBadgeDad 3 weeks ago
parent 8ec6d2ae83
commit aa2806180e

@ -1700,6 +1700,22 @@ class SoulseekClient(DownloadSourcePlugin):
# Seconds an "slskd Completed but locally unresolved" key
# has to stay stuck before we give up on it.
_unresolved_grace = 45.0
# Bundle-level stall guard. The #715 grace above only covers
# "slskd says Completed but the file isn't on disk yet". It does NOT
# cover a transfer the peer stalls on — stuck InProgress / Queued, or
# dropped by slskd entirely — which is never failed, never completed,
# and never marked unresolved, so it blocks BOTH the all-terminal
# finish check AND the grace exit, and the poll spun to the full
# ``get_poll_timeout()`` deadline (the Slipknot hang). If NOTHING
# progresses — no transfer completes/fails and no pending transfer's
# byte count moves — for this long, the folder has stalled: mark the
# stuck transfers failed so the bundle resolves with whatever
# completed (the per-track matcher then handles the missing tracks).
# Conservative on purpose: only trips when EVERYTHING is frozen, so a
# slow-but-progressing or still-queued-then-starting transfer is safe.
_stall_grace = 180.0
_last_progress_marker = None
_last_progress_at = time.monotonic()
while time.monotonic() < deadline:
try:
downloads = run_async(self.get_all_downloads())
@ -1755,6 +1771,40 @@ class SoulseekClient(DownloadSourcePlugin):
count=len(completed_paths),
failed=len(failed_states),
)
# Bundle-level stall detection (see ``_stall_grace`` above). Advance
# the progress marker on ANY forward motion — a transfer reaching a
# terminal state, or a still-pending transfer downloading more bytes.
# If the marker is frozen for ``_stall_grace``, the peer has stalled;
# mark the stuck transfers failed so the finish/all-failed checks
# below resolve the bundle instead of spinning to the deadline.
now = time.monotonic()
stall_pending = [
k for k in transfer_keys
if k not in completed_paths and k not in failed_states
]
pending_bytes = 0
for k in stall_pending:
dl = by_key.get(k) or by_key.get((
k[0], os.path.basename((k[1] or '').replace('\\', '/')),
))
pending_bytes += (getattr(dl, 'transferred', 0) or 0) if dl else 0
marker = (len(completed_paths) + len(failed_states), pending_bytes)
if marker != _last_progress_marker:
_last_progress_marker = marker
_last_progress_at = now
elif stall_pending and (now - _last_progress_at) >= _stall_grace:
logger.warning(
"[Soulseek album] No progress for %.0fs — peer stalled on %d "
"transfer(s) (stuck / queued / dropped). Marking them failed and "
"resolving with what completed; missing tracks fall back to "
"per-track. Stalled: %s",
_stall_grace, len(stall_pending),
[transfer_keys[k].filename for k in stall_pending[:5]],
)
for k in stall_pending:
failed_states[k] = 'Stalled'
if completed_paths and len(completed_paths) + len(failed_states) == len(transfer_keys):
logger.warning(
"[Soulseek album] Selected folder finished with %d completed and %d failed transfer(s)",

@ -0,0 +1,146 @@
"""Regression for the Soulseek album-bundle poll hanging when the peer stalls.
`_poll_album_bundle_downloads` waits for every transfer in the selected folder
to reach a terminal state (completed or failed). A transfer the peer stalls on
stuck InProgress/Queued, or dropped by slskd is never failed, never completed,
and never marked "completed-but-unresolved", so it used to block both the
all-terminal finish check AND the #715 grace exit, and the poll spun to the full
~6h timeout (the Slipknot hang).
The fix adds a bundle-level stall guard: if NOTHING progresses (no transfer
reaches terminal AND no pending transfer's byte count moves) for `_stall_grace`
seconds, the stuck transfers are marked failed so the bundle resolves with what
completed. These tests drive the real poll with a fake clock + scripted slskd
states.
"""
from __future__ import annotations
import os
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import patch
from core.soulseek_client import SoulseekClient
class _Clock:
def __init__(self):
self.now = 0.0
def monotonic(self):
return self.now
def sleep(self, seconds):
self.now += seconds
def _dl(username, filename, state, *, size=100, transferred=100):
return SimpleNamespace(
username=username, filename=filename, state=state,
size=size, transferred=transferred,
)
class _StubClient:
"""Minimal stand-in for SoulseekClient with just what the poll touches."""
def __init__(self, states, resolvable):
self._states = states # list of download-lists, one per poll; last repeats
self._call = 0
self._resolvable = set(resolvable)
def get_all_downloads(self):
i = min(self._call, len(self._states) - 1)
self._call += 1
return self._states[i]
def _resolve_downloaded_album_file(self, filename):
base = os.path.basename((filename or "").replace("\\", "/"))
if base in self._resolvable or filename in self._resolvable:
return Path(f"/staged/{base}")
return None
def _run_poll(stub, transfer_keys, *, timeout=7200.0, interval=2.0):
clock = _Clock()
emits = []
with patch("core.soulseek_client.time", clock), \
patch("core.soulseek_client.run_async", lambda x: x), \
patch("core.soulseek_client.get_poll_timeout", lambda: timeout), \
patch("core.soulseek_client.get_poll_interval", lambda: interval):
result = SoulseekClient._poll_album_bundle_downloads(
stub, transfer_keys, lambda phase, **kw: emits.append((phase, kw))
)
return result, clock, emits
def _keys(*names, user="peer"):
"""Build {(user, name): TrackResult-ish} preserving order."""
return {(user, n): SimpleNamespace(filename=n) for n in names}
def test_stalled_peer_gives_up_and_returns_completed_subset():
tk = _keys("01.flac", "02.flac", "03.flac")
# 01 completes (resolvable); 02/03 stuck InProgress with FROZEN byte counts.
frozen = [
_dl("peer", "01.flac", "Completed", transferred=100, size=100),
_dl("peer", "02.flac", "InProgress", transferred=50, size=100),
_dl("peer", "03.flac", "InProgress", transferred=30, size=100),
]
stub = _StubClient([frozen], resolvable={"01.flac"})
result, clock, _ = _run_poll(stub, tk, timeout=7200.0, interval=2.0)
# Resolved with the one completed track instead of hanging to the deadline.
assert result == [Path("/staged/01.flac")]
# Gave up around the stall window (~180s), nowhere near the 7200s timeout.
assert clock.now < 600.0
def test_progressing_bundle_is_not_falsely_stalled():
tk = _keys("01.flac", "02.flac")
# 02 keeps downloading more bytes each poll, then completes — must NOT trip
# the stall guard even though it takes a while.
states = [
[_dl("peer", "01.flac", "Completed"), _dl("peer", "02.flac", "InProgress", transferred=10, size=100)],
[_dl("peer", "01.flac", "Completed"), _dl("peer", "02.flac", "InProgress", transferred=40, size=100)],
[_dl("peer", "01.flac", "Completed"), _dl("peer", "02.flac", "InProgress", transferred=80, size=100)],
[_dl("peer", "01.flac", "Completed"), _dl("peer", "02.flac", "Completed", transferred=100, size=100)],
]
stub = _StubClient(states, resolvable={"01.flac", "02.flac"})
result, _clock, _ = _run_poll(stub, tk, timeout=7200.0, interval=2.0)
assert set(result) == {Path("/staged/01.flac"), Path("/staged/02.flac")}
def test_all_transfers_stalled_returns_empty():
tk = _keys("01.flac", "02.flac")
frozen = [
_dl("peer", "01.flac", "InProgress", transferred=10, size=100),
_dl("peer", "02.flac", "Queued", transferred=0, size=100),
]
stub = _StubClient([frozen], resolvable=set())
result, clock, _ = _run_poll(stub, tk, timeout=7200.0, interval=2.0)
assert result == [] # nothing completed → empty (caller falls back)
assert clock.now < 600.0 # didn't spin to the deadline
def test_dropped_transfers_also_stall_out():
"""slskd dropping the transfers entirely (dl=None) must also trip the guard,
not hang there's no byte progress and nothing terminal."""
tk = _keys("01.flac", "02.flac")
stub = _StubClient([[]], resolvable=set()) # get_all_downloads returns nothing
result, clock, _ = _run_poll(stub, tk, timeout=7200.0, interval=2.0)
assert result == []
assert clock.now < 600.0
def test_clean_finish_unaffected():
tk = _keys("01.flac", "02.flac")
done = [_dl("peer", "01.flac", "Completed"), _dl("peer", "02.flac", "Succeeded")]
stub = _StubClient([done], resolvable={"01.flac", "02.flac"})
result, clock, _ = _run_poll(stub, tk)
assert set(result) == {Path("/staged/01.flac"), Path("/staged/02.flac")}
assert clock.now < 10.0 # resolves on the first couple polls

@ -1418,8 +1418,10 @@ def validate_and_heal_batch_states():
queue = batch_data.get('queue', [])
phase = batch_data.get('phase', 'unknown')
# AUTO-CLEANUP: Remove completed batches after 5 minutes to prevent stale state
if phase in ['complete', 'error', 'cancelled']:
# AUTO-CLEANUP: Remove terminal batches after 5 minutes to prevent stale state.
# 'failed' (e.g. an album-bundle hard failure) was missing here, so a failed
# batch lingered in the UI forever ("No tracks loaded") and never cleared.
if phase in ['complete', 'error', 'cancelled', 'failed']:
# Check if batch has a completion timestamp
completion_time = batch_data.get('completion_time')
if not completion_time:

Loading…
Cancel
Save