From 56bdcee4344399ca533127ef2cd7def9008a3e5b Mon Sep 17 00:00:00 2001 From: BoulderBadgeDad Date: Wed, 10 Jun 2026 18:16:22 -0700 Subject: [PATCH] Fix: rejected slskd download hung the task at 'downloading' forever (#836) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A wishlist track (or tracks in an album) that slskd accepted then REJECTED would sit at "DOWNLOADING... 0%" indefinitely, spam an ERROR every status poll ("…Completed, Rejected - letting monitor handle retry"), and — for albums — block the whole batch from ever completing. Root defect: the status formatter's non-manual error branch keeps the task 'downloading' and trusts the retry monitor to resolve it, with NO backstop. When the monitor can't make progress (a rejected transfer with no other source), the task is stuck forever. Backstop: measure how long the ERROR state has persisted (keyed off the task's last status transition, so a slow-but-healthy transfer is never failed, and each monitor-retry episode gets a fresh window). Once it exceeds the monitor's retry window (60s, vs the monitor's ~15s) with no resolution, mark the task failed and fire the worker-freeing completion callback so the batch can finish. Also log the error ONCE per episode instead of every 2s poll. The healthy path is untouched — a working retry transitions the task before the grace, so this never fires. Manual picks still fail immediately (unchanged). Tests: rejected-within-grace stays downloading; rejected-beyond-grace fails + schedules completion; manual pick fails immediately. 45 status tests pass. --- core/downloads/status.py | 71 ++++++++++++++++++++---- tests/downloads/test_downloads_status.py | 63 +++++++++++++++++++++ 2 files changed, 124 insertions(+), 10 deletions(-) diff --git a/core/downloads/status.py b/core/downloads/status.py index 010581b9..f49a202b 100644 --- a/core/downloads/status.py +++ b/core/downloads/status.py @@ -36,6 +36,15 @@ from utils.logging_config import get_logger # Project logger factory so these lines reach app.log (soulsync.* namespace). logger = get_logger("downloads.status") +# #836 backstop: how long an slskd error state (Rejected/Failed/Errored/TimedOut) +# may persist on a non-manual task before the status formatter gives up on the +# retry monitor and marks it failed. The monitor's own retry window is ~15s +# (3 × 5s); this is well beyond it so a healthy retry always wins, and it only +# fires when the monitor genuinely can't make progress (e.g. a rejected transfer +# with no other source) — which otherwise hangs the task at 'downloading 0%' +# forever and blocks the whole batch from completing. +ERROR_STATE_TERMINAL_GRACE_SECONDS = 60 + def _schedule_completion_callback(deps, batch_id: str, task_id: str, success: bool) -> None: """Fire ``deps.on_download_completed`` on a one-shot daemon thread so @@ -394,17 +403,59 @@ def build_batch_status_data(batch_id: str, batch: dict, live_transfers_lookup: d # release the lock. _schedule_completion_callback(deps, batch_id, task_id, False) else: - # UNIFIED ERROR HANDLING: Let monitor handle errors for consistency - # Monitor will detect errored state and trigger retry within 5 seconds - logger.error(f"Task {task_id} API shows error state: {state_str} - letting monitor handle retry") - - # Keep task in current status (downloading/queued) so monitor can detect error - # Don't mark as failed here - let the unified retry system handle it - if task['status'] in ['searching', 'downloading', 'queued']: - task_status['status'] = task['status'] # Keep current status for monitor + # Normally the retry monitor picks up an errored state and + # retries within ~15s. But if it can't make progress — e.g. an + # slskd transfer rejected with no other source — the task would + # otherwise sit at 'downloading 0%' forever, spam an ERROR every + # poll, AND block its batch from ever completing (#836: a rejected + # wishlist track, or rejected tracks in an album download). + # + # Backstop: measure how long the ERROR state has persisted (not + # how long the task has downloaded, so a slow-but-healthy transfer + # isn't failed). Once it exceeds the monitor's retry window with no + # resolution, mark the task failed so the worker frees and the + # batch can finish. A working retry transitions the task out of the + # error state first, clearing the timer below — so the healthy path + # never hits this. + # A monitor retry transitions the task (newer + # status_change_time), which restarts the window so each + # error EPISODE gets a fresh grace. If the monitor never + # transitions it (the stuck case), the window keeps growing. + err_since = task.get('_error_state_since') + if err_since is None or task.get('status_change_time', 0) > err_since: + task['_error_state_since'] = err_since = current_time + task.pop('_error_state_logged', None) + error_age = current_time - err_since + + if error_age > ERROR_STATE_TERMINAL_GRACE_SECONDS: + err_msg = live_info.get('errorMessage') or live_info.get('error') or '' + task['status'] = 'failed' + task['error_message'] = ( + str(err_msg) if err_msg + else f'Download failed (state: {state_str})' + ) + task_status['status'] = 'failed' + task_status['error_message'] = task['error_message'] + logger.warning( + f"Task {task_id} stuck in error state '{state_str}' for " + f"{error_age:.0f}s with no retry progress — marking failed (#836)" + ) + _schedule_completion_callback(deps, batch_id, task_id, False) else: - task_status['status'] = 'downloading' # Default to downloading for error detection - task['status'] = 'downloading' + # Within the retry window — keep current status so the monitor + # can act. Log once per episode, not every poll, to stop the + # 2-second ERROR spam the reporter saw. + if not task.get('_error_state_logged'): + logger.warning( + f"Task {task_id} API shows error state: {state_str} " + f"- letting monitor handle retry" + ) + task['_error_state_logged'] = True + if task['status'] in ['searching', 'downloading', 'queued']: + task_status['status'] = task['status'] # Keep current status for monitor + else: + task_status['status'] = 'downloading' # Default to downloading for error detection + task['status'] = 'downloading' elif 'Completed' in state_str or 'Succeeded' in state_str: # Verify bytes actually transferred before trusting state string expected_size = live_info.get('size', 0) diff --git a/tests/downloads/test_downloads_status.py b/tests/downloads/test_downloads_status.py index 522137e5..7050e3e6 100644 --- a/tests/downloads/test_downloads_status.py +++ b/tests/downloads/test_downloads_status.py @@ -761,3 +761,66 @@ def test_unified_response_caps_persistent_history_tail(): assert requested_limits == [50] assert len(out['downloads']) == 50 assert out['total'] == 50 + + +# --------------------------------------------------------------------------- +# #836 — a rejected slskd transfer must not hang the task at 'downloading' +# forever. The monitor normally retries; when it can't make progress, a backstop +# in the status formatter marks the task failed after a grace window so the +# worker frees and the batch can complete. +# --------------------------------------------------------------------------- + +def test_rejected_within_grace_keeps_downloading_for_monitor(): + import time + deps, _ = _build_deps() + now = time.time() + download_tasks['t1'] = { + 'track_index': 0, 'status': 'downloading', 'track_info': {}, + 'filename': 'song.flac', 'username': 'u1', + 'status_change_time': now - 5, + } + live = {'u1::song.flac': {'state': 'Completed, Rejected', 'percentComplete': 0}} + batch = {'phase': 'downloading', 'queue': ['t1']} + out = st.build_batch_status_data('b1', batch, live, deps) + # inside the grace window — give the monitor its chance, don't fail yet + assert out['tasks'][0]['status'] == 'downloading' + assert download_tasks['t1']['status'] == 'downloading' + + +def test_rejected_beyond_grace_marks_failed_and_frees_worker(): + import time + completed = [] + deps, _ = _build_deps() + deps.on_download_completed = lambda b, t, s: completed.append((b, t, s)) + now = time.time() + download_tasks['t1'] = { + 'track_index': 0, 'status': 'downloading', 'track_info': {}, + 'filename': 'song.flac', 'username': 'u1', + # error persisted past the grace, no monitor transition since + 'status_change_time': now - 130, + '_error_state_since': now - (st.ERROR_STATE_TERMINAL_GRACE_SECONDS + 30), + } + live = {'u1::song.flac': {'state': 'Completed, Rejected', 'errorMessage': 'peer rejected'}} + batch = {'phase': 'downloading', 'queue': ['t1']} + out = st.build_batch_status_data('b1', batch, live, deps) + assert out['tasks'][0]['status'] == 'failed' + assert download_tasks['t1']['status'] == 'failed' + assert download_tasks['t1']['error_message'] == 'peer rejected' + time.sleep(0.05) # completion callback runs on a daemon thread + assert completed == [('b1', 't1', False)] # batch can now finish + + +def test_manual_pick_rejected_fails_immediately_without_grace(): + import time + deps, _ = _build_deps() + now = time.time() + download_tasks['t1'] = { + 'track_index': 0, 'status': 'downloading', 'track_info': {}, + 'filename': 'song.flac', 'username': 'u1', + '_user_manual_pick': True, 'status_change_time': now, + } + live = {'u1::song.flac': {'state': 'Completed, Rejected'}} + batch = {'phase': 'downloading', 'queue': ['t1']} + out = st.build_batch_status_data('b1', batch, live, deps) + assert out['tasks'][0]['status'] == 'failed' # immediate, no 60s wait + assert download_tasks['t1']['status'] == 'failed'