You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/core/downloads/monitor.py

884 lines
48 KiB

"""WebUIDownloadMonitor — lifted from web_server.py.
The class body is byte-identical to the original. Module-level globals
(injected via ``init()`` from web_server) include the worker / completion
helpers and orchestrator handles. ``IS_SHUTTING_DOWN`` is a module-level
flag mirrored from web_server's own flag in ``_shutdown_runtime_components``.
"""
import threading
import time
from config.settings import config_manager
from core.runtime_state import (
download_batches,
download_tasks,
matched_context_lock,
matched_downloads_context,
tasks_lock,
)
from utils.async_helpers import run_async
from utils.logging_config import get_logger
# Project logger factory so these lines reach app.log (soulsync.* namespace).
logger = get_logger("downloads.monitor")
# Mirrored from web_server.IS_SHUTTING_DOWN via _shutdown_runtime_components.
IS_SHUTTING_DOWN = False
# Injected at runtime via init() — these are defined later in web_server.py
# than the class is instantiated, so we late-bind them.
_make_context_key = None
_on_download_completed = None
_download_track_worker = None
_run_post_processing_worker = None
_start_next_batch_of_downloads = None
_orphaned_download_keys = None
missing_download_executor = None
download_orchestrator = None
_RELEASE_SOURCE_NAMES = frozenset(('torrent', 'usenet'))
def _download_id_key(download_id):
return f"download_id::{download_id}" if download_id else None
def _is_release_task(task):
ti = task.get('track_info') if isinstance(task.get('track_info'), dict) else {}
username = task.get('username') or ti.get('username')
return username in _RELEASE_SOURCE_NAMES
def _lookup_live_info(task, live_transfers_lookup):
ti = task.get('track_info') if isinstance(task.get('track_info'), dict) else {}
download_id = task.get('download_id')
if _is_release_task(task):
by_id = live_transfers_lookup.get(_download_id_key(download_id))
if by_id:
return by_id
task_filename = task.get('filename') or ti.get('filename')
task_username = task.get('username') or ti.get('username')
if not task_filename or not task_username:
return None
return live_transfers_lookup.get(_make_context_key(task_username, task_filename))
def init(
make_context_key,
on_download_completed,
download_track_worker,
run_post_processing_worker,
start_next_batch_of_downloads,
orphaned_download_keys,
missing_download_executor_obj,
download_orchestrator_obj,
):
"""Bind web_server-side helpers/globals so the class body can resolve them."""
global _make_context_key, _on_download_completed, _download_track_worker
global _run_post_processing_worker, _start_next_batch_of_downloads
global _orphaned_download_keys, missing_download_executor, download_orchestrator
_make_context_key = make_context_key
_on_download_completed = on_download_completed
_download_track_worker = download_track_worker
_run_post_processing_worker = run_post_processing_worker
_start_next_batch_of_downloads = start_next_batch_of_downloads
_orphaned_download_keys = orphaned_download_keys
missing_download_executor = missing_download_executor_obj
download_orchestrator = download_orchestrator_obj
class WebUIDownloadMonitor:
"""
Background monitor for download progress and retry logic, matching GUI's SyncStatusProcessingWorker.
Implements identical timeout detection and automatic retry functionality.
"""
def __init__(self):
self.monitoring = False
self.monitor_thread = None
self.monitored_batches = set()
self._lock = threading.Lock()
def start_monitoring(self, batch_id):
"""Start monitoring a download batch"""
with self._lock:
self.monitored_batches.add(batch_id)
if not self.monitoring:
self.monitoring = True
self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
self.monitor_thread.start()
logger.info(f"Started download monitor for batch {batch_id}")
def stop_monitoring(self, batch_id):
"""Stop monitoring a specific batch"""
with self._lock:
self.monitored_batches.discard(batch_id)
if not self.monitored_batches:
self.monitoring = False
logger.debug("Stopped download monitor (no active batches)")
def shutdown(self):
"""Stop the monitor loop and clear active batch tracking."""
with self._lock:
self.monitoring = False
self.monitored_batches.clear()
self.monitor_thread = None
logger.info("Download monitor shutdown requested")
def _monitor_loop(self):
"""Main monitoring loop - checks downloads every 1 second for responsive web UX"""
while self.monitoring and self.monitored_batches:
try:
if globals().get('IS_SHUTTING_DOWN', False):
self.monitoring = False
break
self._check_all_downloads()
time.sleep(1) # 1-second polling for fast web UI updates
except Exception as e:
# If we get shutdown errors, stop monitoring gracefully
if "interpreter shutdown" in str(e) or "cannot schedule new futures" in str(e):
logger.info("Monitor detected shutdown, stopping gracefully")
self.monitoring = False
break
logger.error(f"Download monitor error: {e}")
logger.info("Download monitor loop ended")
def _check_all_downloads(self):
"""Check all active downloads for timeouts and failures"""
current_time = time.time()
# Get live transfer data from slskd
live_transfers_lookup = self._get_live_transfers()
# Track tasks with exhausted retries to handle after releasing lock
exhausted_tasks = [] # List of (batch_id, task_id) tuples
# Track completed downloads to handle after releasing lock (prevents deadlock)
completed_tasks = [] # List of (batch_id, task_id) tuples
# Track deferred operations (network calls, nested locks) to run after releasing tasks_lock
deferred_ops = []
with tasks_lock:
# Check all monitored batches for timeouts and errors
for batch_id in list(self.monitored_batches):
if batch_id not in download_batches:
self.monitored_batches.discard(batch_id)
continue
for task_id in download_batches[batch_id].get('queue', []):
task = download_tasks.get(task_id)
if not task:
continue
release_recoverable = (
_is_release_task(task)
and task.get('download_id')
and task.get('status') in ['failed', 'not_found']
)
if task['status'] not in ['downloading', 'queued'] and not release_recoverable:
continue
# Check for timeouts and errors - retries handled directly in _should_retry_task
# If _should_retry_task returns True, it means retries were exhausted
retry_exhausted = False
if not release_recoverable:
retry_exhausted = self._should_retry_task(task_id, task, live_transfers_lookup, current_time, deferred_ops)
# Collect exhausted tasks to handle outside lock (prevents deadlock)
if retry_exhausted:
exhausted_tasks.append((batch_id, task_id))
# ENHANCED: Check for successful completions (especially YouTube).
# Release-style sources can report a completed audio file
# name that differs from the original indexer URL/title
# stored on the task, so prefer the stable download_id.
live_info = _lookup_live_info(task, live_transfers_lookup)
if live_info:
state = live_info.get('state', '')
# Trigger post-processing if download is completed successfully
# slskd uses compound states like 'Completed, Succeeded' - use substring matching
# Must exclude error states first (matching _build_batch_status_data's prioritized checking)
has_error = ('Errored' in state or 'Failed' in state or 'Rejected' in state or 'TimedOut' in state)
has_completion = ('Completed' in state or 'Succeeded' in state)
# Verify bytes actually transferred before trusting state string.
# slskd can report "Completed" before the full file is flushed to disk,
# or on connection drops that leave a partial file.
if has_completion and not has_error:
expected_size = live_info.get('size', 0)
transferred = live_info.get('bytesTransferred', 0)
if expected_size > 0 and transferred < expected_size:
if not task.get('_incomplete_warned'):
logger.debug("Monitor: %s state=%s but bytes incomplete (%s/%s) - waiting", task_id, state, transferred, expected_size)
task['_incomplete_warned'] = True
continue
if has_completion and not has_error and (
task['status'] == 'downloading' or release_recoverable
):
task.pop('_incomplete_warned', None)
# CRITICAL FIX: Transition to 'post_processing' HERE so downloads
# don't depend on browser polling to trigger post-processing.
# Previously, post-processing was only submitted by _build_batch_status_data
# (called from browser-polled endpoints), meaning closing the browser
# left tasks stuck in 'downloading' forever.
task['status'] = 'post_processing'
task['status_change_time'] = current_time
logger.info(f"Monitor detected completed download for {task_id} ({state}) - submitting post-processing")
# Collect for handling outside the lock to prevent deadlock.
# _on_download_completed acquires tasks_lock which is non-reentrant.
completed_tasks.append((batch_id, task_id))
# ---- All work below runs WITHOUT tasks_lock held ----
if globals().get('IS_SHUTTING_DOWN', False) or not self.monitoring:
return
# Execute deferred operations from _should_retry_task (network calls, nested locks)
for op in deferred_ops:
try:
if op[0] == 'cancel_download':
# Issue #648 diagnostic — `op` now carries a trigger
# label (4-tuple, was 3-tuple) so the next log dump
# tells us WHICH path in `_should_retry_task` is
# firing for users seeing "Tidal downloads failed to
# start" mass-cancels. Label format pinned in commit
# message for grep-ability.
if len(op) >= 4:
_, download_id, username, trigger = op[0], op[1], op[2], op[3]
else:
_, download_id, username = op
trigger = 'unlabeled'
logger.info(
f"[CancelTrigger:monitor.{trigger}] download_id={download_id} "
f"username={username}"
)
run_async(download_orchestrator.cancel_download(download_id, username, remove=True))
logger.debug(f"[Deferred] Successfully cancelled download {download_id}")
elif op[0] == 'cleanup_orphan':
_, context_key = op
with matched_context_lock:
matched_downloads_context.pop(context_key, None)
logger.debug(f"[Deferred] Cleaned up orphaned download context: {context_key}")
elif op[0] == 'restart_worker':
_, task_id, batch_id = op
logger.debug(f"[Deferred] Restarting worker for task {task_id}")
missing_download_executor.submit(_download_track_worker, task_id, batch_id)
logger.debug(f"[Deferred] Successfully restarted worker for task {task_id}")
except Exception as e:
logger.error(f"[Deferred] Error executing deferred operation {op[0]}: {e}")
# Handle completed transfers outside the lock. The transfer engine's
# "complete" state only means the remote download finished; the
# post-processing worker still has to find, verify, tag, and move the
# file before it can report real batch success or failure.
for batch_id, task_id in completed_tasks:
try:
# Submit post-processing worker (file move, tagging, AcoustID verification)
# This makes batch downloads fully independent of browser polling.
logger.info(f"[Monitor] Submitting post-processing worker for task {task_id}")
missing_download_executor.submit(_run_post_processing_worker, task_id, batch_id)
except Exception as e:
logger.error(f"[Monitor] Error handling completed task {task_id}: {e}")
with tasks_lock:
if task_id in download_tasks:
download_tasks[task_id]['status'] = 'failed'
download_tasks[task_id]['error_message'] = f'Post-processing could not be scheduled: {e}'
try:
_on_download_completed(batch_id, task_id, success=False)
except Exception as completion_error:
logger.error(
f"[Monitor] Error marking failed post-processing submit for task {task_id}: {completion_error}"
)
# Handle exhausted retry tasks outside the lock to prevent deadlock
for batch_id, task_id in exhausted_tasks:
try:
logger.info(f"[Monitor] Calling completion callback for exhausted task {task_id}")
_on_download_completed(batch_id, task_id, success=False)
except Exception as e:
logger.error(f"[Monitor] Error handling exhausted task {task_id}: {e}")
# ENHANCED: Add worker count validation to detect ghost workers
self._validate_worker_counts()
def _get_live_transfers(self):
"""Get current transfer status from slskd API and YouTube client"""
try:
# Check if we should stop due to shutdown
if not self.monitoring:
return {}
live_transfers = {}
# Only hit slskd API if soulseek is actually configured and active
dl_mode = config_manager.get('download_source.mode', 'hybrid')
hybrid_order = config_manager.get('download_source.hybrid_order', ['hifi', 'youtube', 'soulseek'])
soulseek_active = (dl_mode == 'soulseek' or
(dl_mode == 'hybrid' and 'soulseek' in hybrid_order))
# Get Soulseek downloads from API
transfers_data = None
_slsk = download_orchestrator.client('soulseek') if download_orchestrator and hasattr(download_orchestrator, 'client') else None
if soulseek_active and _slsk and _slsk.base_url:
transfers_data = run_async(download_orchestrator._make_request('GET', 'transfers/downloads'))
if transfers_data:
for user_data in transfers_data:
username = user_data.get('username', 'Unknown')
if 'directories' in user_data:
for directory in user_data['directories']:
if 'files' in directory:
for file_info in directory['files']:
key = _make_context_key(username, file_info.get('filename', ''))
live_transfers[key] = file_info
# Also get non-Soulseek downloads via the engine — single
# cross-source aggregation, no per-source iteration.
try:
all_downloads = []
if download_orchestrator and hasattr(download_orchestrator, 'engine'):
try:
# Exclude soulseek — slskd transfers were already
# pulled via the transfers/downloads endpoint above.
# Without the exclude both fetch paths run, doubling
# the per-tick slskd API hit.
all_downloads = run_async(
download_orchestrator.engine.get_all_downloads(exclude=('soulseek',))
)
except Exception as e:
logger.debug("get_all_downloads failed: %s", e)
for download in all_downloads:
key = _make_context_key(download.username, download.filename)
# Convert DownloadStatus to transfer dict format for monitor compatibility
transfer_row = {
'id': download.id,
'filename': download.filename,
'username': download.username,
'state': download.state,
'percentComplete': download.progress,
'size': download.size,
'bytesTransferred': download.transferred,
'averageSpeed': download.speed,
}
live_transfers[key] = transfer_row
id_key = _download_id_key(download.id)
if id_key:
live_transfers[id_key] = transfer_row
except Exception as yt_error:
logger.error(f"Monitor: Could not fetch streaming source downloads: {yt_error}")
return live_transfers
except Exception as e:
# If we get shutdown-related errors, stop monitoring immediately
if ("interpreter shutdown" in str(e) or
"cannot schedule new futures" in str(e) or
"Event loop is closed" in str(e)):
logger.info("Monitor detected shutdown, stopping immediately")
self.monitoring = False
return {}
else:
logger.error(f"Monitor: Could not fetch live transfers: {e}")
return {}
def _should_retry_task(self, task_id, task, live_transfers_lookup, current_time, deferred_ops):
"""
Determine if a task should be retried due to timeout (matches GUI logic).
IMPORTANT: This runs while tasks_lock is held. All network calls (slskd API)
and nested lock acquisitions (matched_context_lock) are collected into deferred_ops
to be executed AFTER releasing tasks_lock. This prevents deadlocks and long lock holds.
Returns True if retries are exhausted and _on_download_completed should be called outside the lock.
"""
ti = task.get('track_info') if isinstance(task.get('track_info'), dict) else {}
task_filename = task.get('filename') or ti.get('filename')
task_username = task.get('username') or ti.get('username')
if not task_filename or not task_username:
return False
lookup_key = _make_context_key(task_username, task_filename)
live_info = _lookup_live_info(task, live_transfers_lookup)
if not live_info:
# User-initiated manual pick — skip auto-retry. The status
# engine fallback owns the terminal transition for non-Soulseek
# manual downloads. Yanking the task back to 'searching' here
# would defeat the user's explicit selection.
if task.get('_user_manual_pick'):
return False
# Task not in live transfers but status is downloading/queued - likely stuck
if current_time - task.get('status_change_time', current_time) > 90:
retry_count = task.get('stuck_retry_count', 0)
last_retry = task.get('last_retry_time', 0)
if retry_count < 3 and (current_time - last_retry) > 30:
logger.warning(f"Task not in live transfers for >90s - retry {retry_count + 1}/3")
task['stuck_retry_count'] = retry_count + 1
task['last_retry_time'] = current_time
download_id = task.get('download_id')
# Defer slskd cancel to outside the lock
if task_username and download_id:
deferred_ops.append(('cancel_download', download_id, task_username,
'not_in_live_transfers_90s'))
# Mark current source as used (full filename to match worker format)
if task_username and task_filename:
used_sources = task.get('used_sources', set())
source_key = f"{task_username}_{task_filename}"
used_sources.add(source_key)
task['used_sources'] = used_sources
logger.warning(f"Marked missing-transfer source as used: {source_key}")
# Defer orphan cleanup
if task_username and task_filename:
_orphaned_download_keys.add(lookup_key)
deferred_ops.append(('cleanup_orphan', lookup_key))
# Clear download info and reset for retry
task.pop('download_id', None)
task.pop('username', None)
task.pop('filename', None)
task['status'] = 'searching'
task.pop('queued_start_time', None)
task.pop('downloading_start_time', None)
task['status_change_time'] = current_time
logger.warning(f"Task {task.get('track_info', {}).get('name', 'Unknown')} reset for missing-transfer retry")
batch_id = task.get('batch_id')
if task_id and batch_id:
deferred_ops.append(('restart_worker', task_id, batch_id))
return False
elif retry_count < 3:
return False
else:
track_label = task.get('track_info', {}).get('name', 'Unknown')
tried_sources = task.get('used_sources', set())
sources_str = f' (tried {len(tried_sources)} source{"s" if len(tried_sources) != 1 else ""})' if tried_sources else ''
logger.error("Task failed after 3 retry attempts (not in live transfers)")
task['status'] = 'failed'
task['error_message'] = f'Download disappeared from transfer list 3 times for "{track_label}"{sources_str} — source may be unavailable'
batch_id = task.get('batch_id')
if batch_id:
return True
return False
return False
state_str = live_info.get('state', '')
progress = live_info.get('percentComplete', 0)
# IMMEDIATE ERROR RETRY: Check for errored/rejected/timed-out downloads first (no timeout needed)
if 'Errored' in state_str or 'Failed' in state_str or 'Rejected' in state_str or 'TimedOut' in state_str:
# Same manual-pick guard as the not-in-live-transfers path —
# user explicitly selected this candidate, surface the failure.
if task.get('_user_manual_pick'):
return False
retry_count = task.get('error_retry_count', 0)
last_retry = task.get('last_error_retry_time', 0)
# Don't retry too frequently (wait at least 5 seconds between error retries)
if retry_count < 3 and (current_time - last_retry) > 5: # Max 3 error retry attempts
logger.error(f"Task errored (state: {state_str}) - immediate retry {retry_count + 1}/3")
task['error_retry_count'] = retry_count + 1
task['last_error_retry_time'] = current_time
_ti = task.get('track_info') if isinstance(task.get('track_info'), dict) else {}
username = task.get('username') or _ti.get('username')
filename = task.get('filename') or _ti.get('filename')
download_id = task.get('download_id')
# Defer slskd cancel to outside the lock
if username and download_id:
deferred_ops.append(('cancel_download', download_id, username,
'errored_state_retry'))
# Mark current source as used to prevent retry loops
# CRITICAL: Use full filename (not basename) to match worker's source_key format
if username and filename:
used_sources = task.get('used_sources', set())
source_key = f"{username}_{filename}"
used_sources.add(source_key)
task['used_sources'] = used_sources
logger.error(f"Marked errored source as used: {source_key}")
# Defer orphan cleanup to outside the lock (needs matched_context_lock)
if username and filename:
old_context_key = _make_context_key(username, filename)
_orphaned_download_keys.add(old_context_key)
deferred_ops.append(('cleanup_orphan', old_context_key))
# Clear download info since we cancelled it
task.pop('download_id', None)
task.pop('username', None)
task.pop('filename', None)
# Reset task state for immediate retry
task['status'] = 'searching'
task.pop('queued_start_time', None)
task.pop('downloading_start_time', None)
task['status_change_time'] = current_time
logger.error(f"Task {task.get('track_info', {}).get('name', 'Unknown')} reset for error retry")
# Defer worker restart to outside the lock
batch_id = task.get('batch_id')
if task_id and batch_id:
deferred_ops.append(('restart_worker', task_id, batch_id))
return False
elif retry_count < 3:
# Wait a bit before next error retry
return False
else:
# Too many error retries, mark as failed
track_label = task.get('track_info', {}).get('name', 'Unknown')
tried_sources = task.get('used_sources', set())
sources_str = f' (tried {len(tried_sources)} source{"s" if len(tried_sources) != 1 else ""})' if tried_sources else ''
logger.error("Task failed after 3 error retry attempts")
task['status'] = 'failed'
# Tidal-specific error: check if this was a quality issue.
# task['username'] is popped on error-retry (line ~2866) so we can't rely on it;
# used_sources keys are formatted as "{username}_{filename}", so startswith is exact.
is_tidal = any(s.startswith('tidal_') for s in tried_sources)
if is_tidal:
tidal_quality = config_manager.get('tidal_download.quality', 'lossless')
allow_fb = config_manager.get('tidal_download.allow_fallback', True)
if tidal_quality == 'hires' and not allow_fb:
task['error_message'] = (
f'Tidal download failed for "{track_label}" — HiRes quality is unavailable for this track '
f'on your account or in your region. Enable "Quality Fallback" in Tidal settings to fall back to Lossless.'
)
else:
task['error_message'] = (
f'Tidal download failed for "{track_label}"{sources_str}'
f'check Tidal authentication and quality settings.'
)
else:
task['error_message'] = f'Soulseek transfer errored 3 times for "{track_label}"{sources_str} — all sources failed or became unavailable'
# CRITICAL: Notify batch manager so track is added to permanently_failed_tracks
batch_id = task.get('batch_id')
if batch_id:
logger.error(f"[Retry Exhausted] Notifying batch manager of permanent failure for task {task_id}")
return True # Signal that we need to call completion outside the lock
return False
# Check for queued timeout (90 seconds like GUI)
elif 'Queued' in state_str or task['status'] == 'queued':
if 'queued_start_time' not in task:
task['queued_start_time'] = current_time
return False
else:
queue_time = current_time - task['queued_start_time']
# Use context-aware timeouts like GUI:
# - 15 seconds for artist album downloads (streaming context)
# - 90 seconds for background playlist downloads
is_streaming_context = task.get('track_info', {}).get('is_album_download', False)
timeout_threshold = 15.0 if is_streaming_context else 90.0
if queue_time > timeout_threshold:
# Track retry attempts to prevent rapid loops
retry_count = task.get('stuck_retry_count', 0)
last_retry = task.get('last_retry_time', 0)
# Don't retry too frequently (wait at least 30 seconds between retries)
if retry_count < 3 and (current_time - last_retry) > 30: # Max 3 retry attempts
logger.warning(f"Task stuck in queue for {queue_time:.1f}s - immediate retry {retry_count + 1}/3")
task['stuck_retry_count'] = retry_count + 1
task['last_retry_time'] = current_time
_ti = task.get('track_info') if isinstance(task.get('track_info'), dict) else {}
username = task.get('username') or _ti.get('username')
filename = task.get('filename') or _ti.get('filename')
download_id = task.get('download_id')
# Defer slskd cancel to outside the lock
if username and download_id:
deferred_ops.append(('cancel_download', download_id, username,
'queued_state_timeout'))
# UNIFIED RETRY LOGIC: Handle timeout retry exactly like error retry
# Mark current source as used to prevent retry loops
# CRITICAL: Use full filename (not basename) to match worker's source_key format
if username and filename:
used_sources = task.get('used_sources', set())
source_key = f"{username}_{filename}"
used_sources.add(source_key)
task['used_sources'] = used_sources
logger.error(f"Marked timeout source as used: {source_key}")
# Defer orphan cleanup to outside the lock (needs matched_context_lock)
if username and filename:
old_context_key = _make_context_key(username, filename)
_orphaned_download_keys.add(old_context_key)
deferred_ops.append(('cleanup_orphan', old_context_key))
# Clear download info since we cancelled it
task.pop('download_id', None)
task.pop('username', None)
task.pop('filename', None)
# Reset task state for immediate retry (like error retry)
task['status'] = 'searching'
task.pop('queued_start_time', None)
task.pop('downloading_start_time', None)
task['status_change_time'] = current_time
logger.error(f"Task {task.get('track_info', {}).get('name', 'Unknown')} reset for timeout retry")
# Defer worker restart to outside the lock
batch_id = task.get('batch_id')
if task_id and batch_id:
deferred_ops.append(('restart_worker', task_id, batch_id))
return False
elif retry_count < 3:
# Wait longer before next retry
return False
else:
# Too many retries, mark as failed
track_label = task.get('track_info', {}).get('name', 'Unknown')
tried_sources = task.get('used_sources', set())
sources_str = f' (tried {len(tried_sources)} source{"s" if len(tried_sources) != 1 else ""})' if tried_sources else ''
logger.error("Task failed after 3 retry attempts (queue timeout)")
task['status'] = 'failed'
task['error_message'] = f'Download stayed queued too long 3 times for "{track_label}"{sources_str} — peers may be offline or have full queues'
# Clear timers to prevent further retry loops
task.pop('queued_start_time', None)
task.pop('downloading_start_time', None)
# CRITICAL: Notify batch manager so track is added to permanently_failed_tracks
batch_id = task.get('batch_id')
if batch_id:
logger.error(f"[Retry Exhausted] Notifying batch manager of permanent failure for task {task_id}")
return True # Signal that we need to call completion outside the lock
return False
# Check for downloading at 0% timeout (90 seconds like GUI)
elif 'InProgress' in state_str and progress < 1:
if 'downloading_start_time' not in task:
task['downloading_start_time'] = current_time
return False
else:
download_time = current_time - task['downloading_start_time']
# Use context-aware timeouts like GUI:
# - 15 seconds for artist album downloads (streaming context)
# - 90 seconds for background playlist downloads
is_streaming_context = task.get('track_info', {}).get('is_album_download', False)
timeout_threshold = 15.0 if is_streaming_context else 90.0
if download_time > timeout_threshold:
retry_count = task.get('stuck_retry_count', 0)
last_retry = task.get('last_retry_time', 0)
# Don't retry too frequently (wait at least 30 seconds between retries)
if retry_count < 3 and (current_time - last_retry) > 30: # Max 3 retry attempts
logger.warning(f"Task stuck at 0% for {download_time:.1f}s - immediate retry {retry_count + 1}/3")
task['stuck_retry_count'] = retry_count + 1
task['last_retry_time'] = current_time
_ti = task.get('track_info') if isinstance(task.get('track_info'), dict) else {}
username = task.get('username') or _ti.get('username')
filename = task.get('filename') or _ti.get('filename')
download_id = task.get('download_id')
# Defer slskd cancel to outside the lock
if username and download_id:
deferred_ops.append(('cancel_download', download_id, username,
'stuck_at_0pct_timeout'))
# UNIFIED RETRY LOGIC: Handle 0% timeout retry exactly like error retry
# Mark current source as used to prevent retry loops
# CRITICAL: Use full filename (not basename) to match worker's source_key format
if username and filename:
used_sources = task.get('used_sources', set())
source_key = f"{username}_{filename}"
used_sources.add(source_key)
task['used_sources'] = used_sources
logger.info(f"Marked 0% progress source as used: {source_key}")
# Defer orphan cleanup to outside the lock (needs matched_context_lock)
if username and filename:
old_context_key = _make_context_key(username, filename)
_orphaned_download_keys.add(old_context_key)
deferred_ops.append(('cleanup_orphan', old_context_key))
# Clear download info since we cancelled it
task.pop('download_id', None)
task.pop('username', None)
task.pop('filename', None)
# Reset task state for immediate retry (like error retry)
task['status'] = 'searching'
task.pop('queued_start_time', None)
task.pop('downloading_start_time', None)
task['status_change_time'] = current_time
logger.warning(f"Task {task.get('track_info', {}).get('name', 'Unknown')} reset for 0% retry")
# Defer worker restart to outside the lock
batch_id = task.get('batch_id')
if task_id and batch_id:
deferred_ops.append(('restart_worker', task_id, batch_id))
return False
elif retry_count < 3:
# Wait longer before next retry
return False
else:
track_label = task.get('track_info', {}).get('name', 'Unknown')
tried_sources = task.get('used_sources', set())
sources_str = f' (tried {len(tried_sources)} source{"s" if len(tried_sources) != 1 else ""})' if tried_sources else ''
logger.error("Task failed after 3 retry attempts (0% progress timeout)")
task['status'] = 'failed'
task['error_message'] = f'Download stuck at 0% three times for "{track_label}"{sources_str} — peers may have connection issues'
# Clear timers to prevent further retry loops
task.pop('queued_start_time', None)
task.pop('downloading_start_time', None)
# CRITICAL: Notify batch manager so track is added to permanently_failed_tracks
batch_id = task.get('batch_id')
if batch_id:
logger.error(f"[Retry Exhausted] Notifying batch manager of permanent failure for task {task_id}")
return True # Signal that we need to call completion outside the lock
return False
else:
# Only reset timers if actual byte progress is being made
bytes_transferred = live_info.get('bytesTransferred', 0)
if progress >= 1 or bytes_transferred > 0:
# Real progress happening, reset timers and retry counts
task.pop('queued_start_time', None)
task.pop('downloading_start_time', None)
task.pop('stuck_retry_count', None)
else:
# Unknown state with no progress (e.g., "Requested", "Initializing")
# Treat like 0% stuck — start/keep the downloading timer running
if 'downloading_start_time' not in task:
task['downloading_start_time'] = current_time
download_time = current_time - task['downloading_start_time']
# Use context-aware timeouts
is_streaming_context = task.get('track_info', {}).get('is_album_download', False)
timeout_threshold = 15.0 if is_streaming_context else 90.0
if download_time > timeout_threshold:
retry_count = task.get('stuck_retry_count', 0)
last_retry = task.get('last_retry_time', 0)
if retry_count < 3 and (current_time - last_retry) > 30:
logger.warning(f"Task stuck in unknown state '{state_str}' with 0 progress for {download_time:.1f}s - retry {retry_count + 1}/3")
task['stuck_retry_count'] = retry_count + 1
task['last_retry_time'] = current_time
_ti = task.get('track_info') if isinstance(task.get('track_info'), dict) else {}
username = task.get('username') or _ti.get('username')
filename = task.get('filename') or _ti.get('filename')
download_id = task.get('download_id')
if username and download_id:
deferred_ops.append(('cancel_download', download_id, username,
'unknown_state_no_progress_timeout'))
if username and filename:
used_sources = task.get('used_sources', set())
source_key = f"{username}_{filename}"
used_sources.add(source_key)
task['used_sources'] = used_sources
logger.info(f"Marked unknown-state source as used: {source_key}")
if username and filename:
old_context_key = _make_context_key(username, filename)
_orphaned_download_keys.add(old_context_key)
deferred_ops.append(('cleanup_orphan', old_context_key))
task.pop('download_id', None)
task.pop('username', None)
task.pop('filename', None)
task['status'] = 'searching'
task.pop('queued_start_time', None)
task.pop('downloading_start_time', None)
task['status_change_time'] = current_time
batch_id = task.get('batch_id')
if task_id and batch_id:
deferred_ops.append(('restart_worker', task_id, batch_id))
return False
elif retry_count >= 3:
track_label = task.get('track_info', {}).get('name', 'Unknown')
tried_sources = task.get('used_sources', set())
sources_str = f' (tried {len(tried_sources)} source{"s" if len(tried_sources) != 1 else ""})' if tried_sources else ''
logger.error(f"Task failed after 3 retry attempts (unknown state '{state_str}')")
task['status'] = 'failed'
task['error_message'] = f'Download stuck in "{state_str}" state 3 times for "{track_label}"{sources_str}'
task.pop('queued_start_time', None)
task.pop('downloading_start_time', None)
batch_id = task.get('batch_id')
if batch_id:
return True
return False
return False
def _validate_worker_counts(self):
"""
Validate worker counts to detect and fix ghost workers or orphaned tasks.
This prevents the modal from showing wrong worker counts permanently.
"""
try:
batches_needing_workers = []
with tasks_lock:
for batch_id in list(self.monitored_batches):
if batch_id not in download_batches:
continue
batch = download_batches[batch_id]
reported_active = batch['active_count']
max_concurrent = batch['max_concurrent']
queue = batch.get('queue', [])
queue_index = batch.get('queue_index', 0)
# Count actually active tasks based on status
actually_active = 0
orphaned_tasks = []
# Tasks already processed by _on_download_completed should NOT be counted
# as active, even if their status hasn't been updated yet (race condition
# between stream processor calling _on_download_completed and
# _run_post_processing_worker setting status to 'completed')
completed_task_ids = batch.get('_completed_task_ids', set())
for task_id in queue:
if task_id in download_tasks:
task_status = download_tasks[task_id]['status']
if task_status in ['searching', 'downloading', 'queued', 'post_processing']:
if task_id not in completed_task_ids:
actually_active += 1
elif task_status in ['failed', 'completed', 'cancelled', 'not_found'] and task_id in queue[queue_index:]:
# These are orphaned tasks - they're done but still in active queue
orphaned_tasks.append(task_id)
# Check for discrepancies
if reported_active != actually_active or orphaned_tasks:
logger.warning(f"[Worker Validation] Batch {batch_id}: reported={reported_active}, actual={actually_active}, orphaned={len(orphaned_tasks)}")
if orphaned_tasks:
logger.warning(f"[Worker Validation] Found {len(orphaned_tasks)} orphaned tasks to cleanup")
# Fix the active count if it's wrong
if reported_active != actually_active:
old_count = batch['active_count']
batch['active_count'] = actually_active
logger.info(f"[Worker Validation] Fixed active count: {old_count}{actually_active}")
# Defer starting workers to outside the lock
if actually_active < max_concurrent and queue_index < len(queue):
batches_needing_workers.append(batch_id)
# Start replacement workers outside the lock
for batch_id in batches_needing_workers:
try:
logger.info(f"[Worker Validation] Starting replacement workers for {batch_id}")
_start_next_batch_of_downloads(batch_id)
except Exception as e:
logger.error(f"[Worker Validation] Error starting workers for {batch_id}: {e}")
except Exception as validation_error:
logger.error(f"Error in worker count validation: {validation_error}")