diff --git a/core/downloads/task_worker.py b/core/downloads/task_worker.py index 0cbb569c..ebe5f716 100644 --- a/core/downloads/task_worker.py +++ b/core/downloads/task_worker.py @@ -260,7 +260,13 @@ def download_track_worker(task_id: str, batch_id: Optional[str], deps: TaskWorke search_diagnostics.append(f'"{query}": {result_count} results, {len(candidates)} passed filters but download failed to start') else: search_diagnostics.append(f'"{query}": {result_count} results but none passed quality/artist filters') - all_raw_results.extend(tracks_result[:20]) # Keep top results for review + # Strip SoundCloud preview snippets before caching for the + # review modal — the user can't pick something useful from + # a 30s preview clip, and clicking one bypasses validation + # and downloads it anyway. + from core.downloads.validation import filter_soundcloud_previews + _filtered_raw = filter_soundcloud_previews(tracks_result[:20], track) + all_raw_results.extend(_filtered_raw) else: search_diagnostics.append(f'"{query}": no results found') @@ -281,15 +287,14 @@ def download_track_worker(task_id: str, batch_id: Optional[str], deps: TaskWorke secondary = getattr(orch, 'hybrid_secondary', '') hybrid_order = [primary, secondary] if secondary and secondary != primary else [primary] + # Resolve via the orchestrator's generic accessor — the + # legacy per-source attrs were dropped in the registry + # refactor, so getattr(orch, 'soulseek', None) etc. all + # silently returned None and the fallback never fired. source_clients = { - 'soulseek': getattr(orch, 'soulseek', None), - 'youtube': getattr(orch, 'youtube', None), - 'tidal': getattr(orch, 'tidal', None), - 'qobuz': getattr(orch, 'qobuz', None), - 'hifi': getattr(orch, 'hifi', None), - 'deezer_dl': getattr(orch, 'deezer_dl', None), - 'lidarr': getattr(orch, 'lidarr', None), - 'soundcloud': getattr(orch, 'soundcloud', None), + name: orch.client(name) + for name in ('soulseek', 'youtube', 'tidal', 'qobuz', + 'hifi', 'deezer_dl', 'lidarr', 'soundcloud') } # The orchestrator tried sources in order but stopped at the first with results. diff --git a/core/downloads/validation.py b/core/downloads/validation.py index e2e14c57..7ca346ec 100644 --- a/core/downloads/validation.py +++ b/core/downloads/validation.py @@ -24,6 +24,43 @@ def init(matching_engine_obj, download_orchestrator_obj): download_orchestrator = download_orchestrator_obj +def filter_soundcloud_previews(results, expected_track): + """Drop SoundCloud preview snippets so they never reach the cache, + the modal, or the auto-download attempt. + + SoundCloud serves a ~30s preview clip for tracks gated behind Go+ / + login. yt-dlp accepts the preview as the download payload, the + integrity check catches the truncated file, but the user just sees + "all candidates failed" with previews still listed in the modal + (and clickable for manual retry, which downloads another preview). + + Filter at every spot raw search results enter the task: validation + scoring, modal-cache fallback when validation drops everything, + and the not-found raw-results cache. Keep candidates that genuinely + are short (intros, sound effects) when the expected track is also + short. + """ + if not results or not expected_track: + return results + expected_ms = getattr(expected_track, 'duration_ms', 0) or 0 + if expected_ms <= 0: + return results + expected_secs = expected_ms / 1000.0 + if expected_secs <= 60: + return results + + def _is_preview(r): + if getattr(r, 'username', None) != 'soundcloud': + return False + cand_ms = getattr(r, 'duration', None) or 0 + if cand_ms <= 0: + return False + cand_secs = cand_ms / 1000.0 + return cand_secs < 35 or cand_secs < expected_secs * 0.5 + + return [r for r in results if not _is_preview(r)] + + def get_valid_candidates(results, spotify_track, query): """ This function is a direct port from sync.py. It scores and filters @@ -33,6 +70,13 @@ def get_valid_candidates(results, spotify_track, query): if not results: return [] + # Pre-filter: drop SoundCloud preview snippets when expected + # duration is non-trivially long. Same helper is also applied at + # the modal-cache fallback path so previews never reach the UI. + results = filter_soundcloud_previews(results, spotify_track) + if not results: + return [] + # Streaming sources (YouTube, Tidal, Qobuz, HiFi, Deezer, SoundCloud) return structured API results # with proper artist/title metadata — score using the same matching engine as Soulseek _streaming_sources = ("youtube", "tidal", "qobuz", "hifi", "deezer_dl", "soundcloud") @@ -50,29 +94,6 @@ def get_valid_candidates(results, spotify_track, query): scored = [] for r in results: - # SoundCloud-specific: drop preview snippets BEFORE scoring. - # Anonymous SoundCloud serves a ~30s preview clip for tracks - # gated behind Go+ / login. yt-dlp accepts these as the - # download payload, the integrity check catches them - # post-download, but the user just sees "all candidates - # failed". Filter at search time when we know expected - # duration so the matcher never picks a 30s clip for a - # 5-minute track. Keep candidates that genuinely are short - # (intros, sound effects) when the expected track is also - # short. - if r.username == 'soundcloud' and expected_duration and r.duration: - cand_secs = r.duration / 1000.0 - expected_secs = expected_duration / 1000.0 - # Drop if expected is non-trivially long AND candidate is - # near the SoundCloud 30s preview boundary OR less than - # half the expected duration. - if expected_secs > 60 and (cand_secs < 35 or cand_secs < expected_secs * 0.5): - logger.info( - f"[SoundCloud] Dropping preview/short candidate " - f"'{r.title}' ({cand_secs:.1f}s vs expected {expected_secs:.1f}s)" - ) - continue - # Score using matching engine's generic scorer (same weights as Soulseek) confidence, match_type = matching_engine.score_track_match( source_title=expected_title, diff --git a/tests/downloads/test_downloads_task_worker.py b/tests/downloads/test_downloads_task_worker.py index 3449820a..11ccdfca 100644 --- a/tests/downloads/test_downloads_task_worker.py +++ b/tests/downloads/test_downloads_task_worker.py @@ -31,13 +31,28 @@ class _Recorder: class _FakeClient: - """Stub soulseek client. `mode` defaults to non-hybrid.""" + """Stub download orchestrator. `mode` defaults to non-hybrid. + Per-source stubs passed via ``subclients`` are surfaced through + ``client(name)`` (the registry-backed accessor on the real + orchestrator); plain attrs (hybrid_order, hybrid_primary, etc.) + are set as attributes so getattr() lookups still resolve them.""" + _CLIENT_NAMES = {'soulseek', 'youtube', 'tidal', 'qobuz', 'hifi', + 'deezer_dl', 'lidarr', 'soundcloud'} + def __init__(self, results=None, mode='soulseek', subclients=None): self._results = results if results is not None else [] self.mode = mode self.search_calls = [] + self._client_map = {} for k, v in (subclients or {}).items(): - setattr(self, k, v) + if k in self._CLIENT_NAMES: + self._client_map[k] = v + else: + # config-style attrs (hybrid_order, hybrid_primary, etc.) + setattr(self, k, v) + + def client(self, name): + return self._client_map.get(name) async def search(self, query, timeout=30): self.search_calls.append((query, timeout)) diff --git a/tests/downloads/test_downloads_validation.py b/tests/downloads/test_downloads_validation.py new file mode 100644 index 00000000..62798523 --- /dev/null +++ b/tests/downloads/test_downloads_validation.py @@ -0,0 +1,92 @@ +"""Tests for core/downloads/validation.py — SoundCloud preview filter. + +The SoundCloud anonymous tier serves a ~30s preview clip for tracks +gated behind Go+ / login. ``filter_soundcloud_previews`` drops these +candidates before they reach the matcher, the modal cache, or the +manual-pick download path. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional + +from core.downloads.validation import filter_soundcloud_previews + + +@dataclass +class _Track: + duration_ms: int + + +@dataclass +class _Candidate: + username: str + duration: Optional[int] # milliseconds + title: str = '' + + +def test_drops_soundcloud_30s_preview_when_expected_long(): + """A 30s SC candidate against a 5-minute expected track is the + canonical preview-snippet case — must be dropped.""" + expected = _Track(duration_ms=338_000) # ~5:38 + cands = [ + _Candidate(username='soundcloud', duration=30_000, title='Preview'), + _Candidate(username='soundcloud', duration=338_000, title='Real'), + ] + result = filter_soundcloud_previews(cands, expected) + assert len(result) == 1 + assert result[0].title == 'Real' + + +def test_drops_under_half_expected_duration(): + """SC candidate at 100s against 300s expected = clearly truncated / + wrong content. Must be dropped even if not at the 30s boundary.""" + expected = _Track(duration_ms=300_000) + cand = _Candidate(username='soundcloud', duration=100_000) + assert filter_soundcloud_previews([cand], expected) == [] + + +def test_keeps_soundcloud_when_expected_track_is_short(): + """Genuinely short SC tracks (intros, sound effects, sub-minute + songs) must pass through when the expected track is also short. + Filter only kicks in when expected > 60s.""" + expected = _Track(duration_ms=45_000) # 45s expected + cand = _Candidate(username='soundcloud', duration=30_000) + result = filter_soundcloud_previews([cand], expected) + assert result == [cand] + + +def test_does_not_filter_non_soundcloud_sources(): + """A 30s candidate from another streaming source isn't a SoundCloud + preview — leave it for the generic matching engine to score.""" + expected = _Track(duration_ms=338_000) + yt = _Candidate(username='youtube', duration=30_000) + tidal = _Candidate(username='tidal', duration=30_000) + assert filter_soundcloud_previews([yt, tidal], expected) == [yt, tidal] + + +def test_returns_input_unchanged_without_expected_duration(): + """Without a Spotify-track / expected duration we can't reason + about previews — pass everything through.""" + cands = [ + _Candidate(username='soundcloud', duration=30_000), + _Candidate(username='soundcloud', duration=300_000), + ] + assert filter_soundcloud_previews(cands, None) == cands + assert filter_soundcloud_previews(cands, _Track(duration_ms=0)) == cands + + +def test_empty_input_returns_empty_list(): + assert filter_soundcloud_previews([], _Track(duration_ms=200_000)) == [] + + +def test_keeps_soundcloud_candidate_at_threshold(): + """Boundary check: 35s candidate against 200s expected — exactly + at the 35s preview boundary, but 35s is also above + expected*0.5 (100s) check (35 < 100, so still drops). Use a + higher value to confirm the just-above threshold passes.""" + expected = _Track(duration_ms=200_000) # 200s + # 110s passes both checks: > 35s AND > 100s (half of 200s) + cand = _Candidate(username='soundcloud', duration=110_000) + assert filter_soundcloud_previews([cand], expected) == [cand] diff --git a/webui/static/helper.js b/webui/static/helper.js index bcee2ca0..f057d8e8 100644 --- a/webui/static/helper.js +++ b/webui/static/helper.js @@ -3432,7 +3432,7 @@ const WHATS_NEW = { '2.4.2': [ // --- post-2.4.1 dev work — entries hidden by _getLatestWhatsNewVersion until the build version bumps --- { date: 'Unreleased — 2.4.2 dev cycle' }, - { title: 'Drop SoundCloud Preview Snippets at Search Time', desc: 'soundcloud serves a ~30s preview clip for tracks that are gated behind go+ / login (very common for major-label uploads — official content basically doesn\'t exist on soundcloud, so what shows up is bootlegs, fan uploads, type beats, and these 30s previews). yt-dlp accepts the preview as the download payload, the post-download integrity check catches the duration mismatch and quarantines the file, but the user just sees "all candidates failed" with no explanation. now the search-time filter drops candidates whose duration is below half the expected length OR within ~5s of the 30s preview boundary, but only when the expected track is non-trivially long (>60s) so genuinely short tracks (intros, sound effects, sub-minute songs) still pass through.', page: 'downloads' }, + { title: 'Drop SoundCloud Preview Snippets at Search Time', desc: 'soundcloud serves a ~30s preview clip for tracks that are gated behind go+ / login (very common for major-label uploads — official content basically doesn\'t exist on soundcloud, so what shows up is bootlegs, fan uploads, type beats, and these 30s previews). yt-dlp accepts the preview as the download payload, the post-download integrity check catches the duration mismatch and quarantines the file, but the user just sees "all candidates failed" with no explanation. previews also showed up in the candidate-review modal where clicking one bypassed validation and downloaded the same broken file. now `filter_soundcloud_previews` drops these candidates at every entry point — auto-search scoring, modal-cache fallback, AND the not-found raw-results path — so previews never reach the matcher OR the user. drops candidates < 35s or below half the expected duration, gated on expected being non-trivially long (>60s) so genuine short tracks still pass. also fixed a silent regression in the hybrid-fallback path where the per-source attribute removal left `getattr(orch, \'youtube\', None)` returning None for every source — fallback never fired. now resolves through the orchestrator\'s `client(name)` accessor.', page: 'downloads' }, { title: 'Internal: Move Shared Download Dataclasses + Singleton Boot Path', desc: 'internal — two architectural cleanups on top of the download engine refactor. (1) `TrackResult`, `AlbumResult`, `DownloadStatus`, `SearchResult` lived in `core/soulseek_client.py` for historical reasons (they grew up there as the soulseek-only types and got exported when other download sources were added). every plugin imported these from the soulseek module just to satisfy the contract — coupling 8 clients to a sibling source for type imports only. moved them to `core/download_plugins/types.py` (the neutral plugin package) and updated all 14 import sites across deezer/hifi/lidarr/qobuz/soundcloud/tidal/youtube clients + the engine + matching engine + redownload + tests. clean break, no backward-compat re-export. (2) `web_server.py` now boots the orchestrator via `set_download_orchestrator(DownloadOrchestrator())` so the singleton factory + boot path share state — `get_download_orchestrator()` returns the same instance the global handle points at instead of lazily building a separate one. matches cin\'s `get_metadata_engine()` pattern.' }, { title: 'Internal: Rename `soulseek_client` Global → `download_orchestrator`', desc: 'internal — followup cleanup. the global handle in web_server.py was named `soulseek_client` for historical reasons (the orchestrator was originally just the soulseek client and grew downstream sources around it), but the type has long been `DownloadOrchestrator` not `SoulseekClient`. renamed the global + every parameter/attribute that carried the legacy name across web_server.py, api/, core/downloads/*, core/search/*, core/streaming/*, services/sync_service.py, and the test fixtures (`MasterDeps.soulseek_client` → `download_orchestrator`, `init(soulseek_client_obj)` → `init(download_orchestrator_obj)`, etc). module path `core.soulseek_client` and class `SoulseekClient` (the actual soulseek-only client) are unchanged — only the orchestrator handle renamed. ~250 references touched, suite green.' }, { title: 'Internal: Drop Backward-Compat Per-Source Attrs', desc: 'internal — followup to cin\'s download engine review. removed the `orchestrator.soulseek` / `.youtube` / `.tidal` / `.qobuz` / `.hifi` / `.deezer_dl` / `.lidarr` / `.soundcloud` attribute aliases that were preserved for backward compat. external callers (core/downloads/, core/search/, web_server.py) all migrated to the generic `orchestrator.client(\'\')` accessor — alias-aware (legacy `deezer_dl` resolves to canonical `deezer`), single source of truth via the registry. the orchestrator\'s own internal `self.soulseek` / `self.deezer_dl` reaches also routed through `client()` so the only place that knows about per-source identity is the registry. test fakes updated to expose `client(name)` instead of stuffing attributes; conformance test pinned to the new accessor contract. zero behavior change — just cleaner shape.' },