diff --git a/core/downloads/status.py b/core/downloads/status.py index 010581b9..f49a202b 100644 --- a/core/downloads/status.py +++ b/core/downloads/status.py @@ -36,6 +36,15 @@ from utils.logging_config import get_logger # Project logger factory so these lines reach app.log (soulsync.* namespace). logger = get_logger("downloads.status") +# #836 backstop: how long an slskd error state (Rejected/Failed/Errored/TimedOut) +# may persist on a non-manual task before the status formatter gives up on the +# retry monitor and marks it failed. The monitor's own retry window is ~15s +# (3 × 5s); this is well beyond it so a healthy retry always wins, and it only +# fires when the monitor genuinely can't make progress (e.g. a rejected transfer +# with no other source) — which otherwise hangs the task at 'downloading 0%' +# forever and blocks the whole batch from completing. +ERROR_STATE_TERMINAL_GRACE_SECONDS = 60 + def _schedule_completion_callback(deps, batch_id: str, task_id: str, success: bool) -> None: """Fire ``deps.on_download_completed`` on a one-shot daemon thread so @@ -394,17 +403,59 @@ def build_batch_status_data(batch_id: str, batch: dict, live_transfers_lookup: d # release the lock. _schedule_completion_callback(deps, batch_id, task_id, False) else: - # UNIFIED ERROR HANDLING: Let monitor handle errors for consistency - # Monitor will detect errored state and trigger retry within 5 seconds - logger.error(f"Task {task_id} API shows error state: {state_str} - letting monitor handle retry") - - # Keep task in current status (downloading/queued) so monitor can detect error - # Don't mark as failed here - let the unified retry system handle it - if task['status'] in ['searching', 'downloading', 'queued']: - task_status['status'] = task['status'] # Keep current status for monitor + # Normally the retry monitor picks up an errored state and + # retries within ~15s. But if it can't make progress — e.g. an + # slskd transfer rejected with no other source — the task would + # otherwise sit at 'downloading 0%' forever, spam an ERROR every + # poll, AND block its batch from ever completing (#836: a rejected + # wishlist track, or rejected tracks in an album download). + # + # Backstop: measure how long the ERROR state has persisted (not + # how long the task has downloaded, so a slow-but-healthy transfer + # isn't failed). Once it exceeds the monitor's retry window with no + # resolution, mark the task failed so the worker frees and the + # batch can finish. A working retry transitions the task out of the + # error state first, clearing the timer below — so the healthy path + # never hits this. + # A monitor retry transitions the task (newer + # status_change_time), which restarts the window so each + # error EPISODE gets a fresh grace. If the monitor never + # transitions it (the stuck case), the window keeps growing. + err_since = task.get('_error_state_since') + if err_since is None or task.get('status_change_time', 0) > err_since: + task['_error_state_since'] = err_since = current_time + task.pop('_error_state_logged', None) + error_age = current_time - err_since + + if error_age > ERROR_STATE_TERMINAL_GRACE_SECONDS: + err_msg = live_info.get('errorMessage') or live_info.get('error') or '' + task['status'] = 'failed' + task['error_message'] = ( + str(err_msg) if err_msg + else f'Download failed (state: {state_str})' + ) + task_status['status'] = 'failed' + task_status['error_message'] = task['error_message'] + logger.warning( + f"Task {task_id} stuck in error state '{state_str}' for " + f"{error_age:.0f}s with no retry progress — marking failed (#836)" + ) + _schedule_completion_callback(deps, batch_id, task_id, False) else: - task_status['status'] = 'downloading' # Default to downloading for error detection - task['status'] = 'downloading' + # Within the retry window — keep current status so the monitor + # can act. Log once per episode, not every poll, to stop the + # 2-second ERROR spam the reporter saw. + if not task.get('_error_state_logged'): + logger.warning( + f"Task {task_id} API shows error state: {state_str} " + f"- letting monitor handle retry" + ) + task['_error_state_logged'] = True + if task['status'] in ['searching', 'downloading', 'queued']: + task_status['status'] = task['status'] # Keep current status for monitor + else: + task_status['status'] = 'downloading' # Default to downloading for error detection + task['status'] = 'downloading' elif 'Completed' in state_str or 'Succeeded' in state_str: # Verify bytes actually transferred before trusting state string expected_size = live_info.get('size', 0) diff --git a/tests/downloads/test_downloads_status.py b/tests/downloads/test_downloads_status.py index 522137e5..7050e3e6 100644 --- a/tests/downloads/test_downloads_status.py +++ b/tests/downloads/test_downloads_status.py @@ -761,3 +761,66 @@ def test_unified_response_caps_persistent_history_tail(): assert requested_limits == [50] assert len(out['downloads']) == 50 assert out['total'] == 50 + + +# --------------------------------------------------------------------------- +# #836 — a rejected slskd transfer must not hang the task at 'downloading' +# forever. The monitor normally retries; when it can't make progress, a backstop +# in the status formatter marks the task failed after a grace window so the +# worker frees and the batch can complete. +# --------------------------------------------------------------------------- + +def test_rejected_within_grace_keeps_downloading_for_monitor(): + import time + deps, _ = _build_deps() + now = time.time() + download_tasks['t1'] = { + 'track_index': 0, 'status': 'downloading', 'track_info': {}, + 'filename': 'song.flac', 'username': 'u1', + 'status_change_time': now - 5, + } + live = {'u1::song.flac': {'state': 'Completed, Rejected', 'percentComplete': 0}} + batch = {'phase': 'downloading', 'queue': ['t1']} + out = st.build_batch_status_data('b1', batch, live, deps) + # inside the grace window — give the monitor its chance, don't fail yet + assert out['tasks'][0]['status'] == 'downloading' + assert download_tasks['t1']['status'] == 'downloading' + + +def test_rejected_beyond_grace_marks_failed_and_frees_worker(): + import time + completed = [] + deps, _ = _build_deps() + deps.on_download_completed = lambda b, t, s: completed.append((b, t, s)) + now = time.time() + download_tasks['t1'] = { + 'track_index': 0, 'status': 'downloading', 'track_info': {}, + 'filename': 'song.flac', 'username': 'u1', + # error persisted past the grace, no monitor transition since + 'status_change_time': now - 130, + '_error_state_since': now - (st.ERROR_STATE_TERMINAL_GRACE_SECONDS + 30), + } + live = {'u1::song.flac': {'state': 'Completed, Rejected', 'errorMessage': 'peer rejected'}} + batch = {'phase': 'downloading', 'queue': ['t1']} + out = st.build_batch_status_data('b1', batch, live, deps) + assert out['tasks'][0]['status'] == 'failed' + assert download_tasks['t1']['status'] == 'failed' + assert download_tasks['t1']['error_message'] == 'peer rejected' + time.sleep(0.05) # completion callback runs on a daemon thread + assert completed == [('b1', 't1', False)] # batch can now finish + + +def test_manual_pick_rejected_fails_immediately_without_grace(): + import time + deps, _ = _build_deps() + now = time.time() + download_tasks['t1'] = { + 'track_index': 0, 'status': 'downloading', 'track_info': {}, + 'filename': 'song.flac', 'username': 'u1', + '_user_manual_pick': True, 'status_change_time': now, + } + live = {'u1::song.flac': {'state': 'Completed, Rejected'}} + batch = {'phase': 'downloading', 'queue': ['t1']} + out = st.build_batch_status_data('b1', batch, live, deps) + assert out['tasks'][0]['status'] == 'failed' # immediate, no 60s wait + assert download_tasks['t1']['status'] == 'failed'