diff --git a/core/metadata/discography_filters.py b/core/metadata/discography_filters.py new file mode 100644 index 00000000..c49eeca4 --- /dev/null +++ b/core/metadata/discography_filters.py @@ -0,0 +1,137 @@ +"""Track-level filters for the user-facing Download Discography flow. + +GitHub issue #559 (trackhacs): clicking "Download Discography" on an +artist also pulled in tracks where the artist's name appeared in the +title of someone else's song. Two failure modes underneath: + +1. **Cross-artist tracks.** Spotify's `artist_albums` endpoint returns + compilation / appears_on / various-artists albums where the requested + artist is featured on one or two tracks. The endpoint then added + *every* track from those albums to the wishlist, including tracks by + unrelated artists that just happened to mention the requested artist + in the title. + +2. **Remix / live / acoustic / instrumental versions.** The watchlist + scanner has user-toggleable filters for these (default: exclude), + stored at `watchlist.global_include_*`. The discography backfill + repair job already honors them. The user-facing Download Discography + endpoint did not — those filters never fired for one-off discography + downloads, so users got remix-ladder bloat. + +These helpers live alongside the existing `core.metadata.discography` +because they belong to the same conceptual layer (discography fetch +results, pre-wishlist) and are independently testable. The watchlist +content-type detectors (``is_remix_version`` etc.) are reused from +``core.watchlist_scanner`` rather than re-implemented — same patterns, +single source of truth. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Optional + +from core.watchlist_scanner import ( + is_acoustic_version, + is_instrumental_version, + is_live_version, + is_remix_version, +) + + +def track_artist_matches(track_artists: Any, requested_artist_name: str) -> bool: + """Return True if the requested artist appears in the track's + artists list (case-insensitive exact-name membership). + + `track_artists` can be the list-of-strings shape produced by + ``core.metadata.album_tracks._normalize_track_artists`` (which is + what the discography fetch returns), or the list-of-dicts shape + that some upstreams pass directly. Both are accepted. + + Returns True for primary-artist tracks AND feature appearances — + the requested artist need only be one of the listed artists. Only + drops tracks where the requested artist isn't named at all (the + cross-artist compilation case from #559). + """ + if not requested_artist_name: + # No artist to compare against — don't filter; let the caller + # decide. Defensive: avoids dropping every track when the + # caller forgot to pass the artist name. + return True + + target = requested_artist_name.strip().lower() + if not target: + return True + + if not track_artists: + return False + + for entry in track_artists: + if isinstance(entry, dict): + name = entry.get('name', '') or '' + else: + name = str(entry or '') + if name.strip().lower() == target: + return True + + return False + + +def content_type_skip_reason( + track_name: str, + album_name: str, + settings: Dict[str, Any], +) -> Optional[str]: + """Return a short skip-reason string if the track is a content type + the user has chosen to exclude, else None. + + `settings` is a dict keyed by the same names as the watchlist + globals (``include_live`` / ``include_remixes`` / ``include_acoustic`` + / ``include_instrumentals``). All default to False — i.e. exclude + by default — matching the watchlist scanner's default contract. + """ + if not settings.get('include_live', False) and is_live_version(track_name, album_name): + return 'live' + if not settings.get('include_remixes', False) and is_remix_version(track_name, album_name): + return 'remix' + if not settings.get('include_acoustic', False) and is_acoustic_version(track_name, album_name): + return 'acoustic' + if not settings.get('include_instrumentals', False) and is_instrumental_version(track_name, album_name): + return 'instrumental' + return None + + +def load_global_content_filter_settings(config_manager: Any) -> Dict[str, Any]: + """Read the four watchlist content-type globals from config. + + Centralises the key names so the endpoint and the helper agree on + where the settings live. All four default to False (exclude) — same + contract as the watchlist scanner. + """ + if config_manager is None: + return { + 'include_live': False, + 'include_remixes': False, + 'include_acoustic': False, + 'include_instrumentals': False, + } + try: + return { + 'include_live': bool(config_manager.get('watchlist.global_include_live', False)), + 'include_remixes': bool(config_manager.get('watchlist.global_include_remixes', False)), + 'include_acoustic': bool(config_manager.get('watchlist.global_include_acoustic', False)), + 'include_instrumentals': bool(config_manager.get('watchlist.global_include_instrumentals', False)), + } + except Exception: + return { + 'include_live': False, + 'include_remixes': False, + 'include_acoustic': False, + 'include_instrumentals': False, + } + + +__all__ = [ + 'track_artist_matches', + 'content_type_skip_reason', + 'load_global_content_filter_settings', +] diff --git a/tests/metadata/test_discography_filters.py b/tests/metadata/test_discography_filters.py new file mode 100644 index 00000000..572e8cbb --- /dev/null +++ b/tests/metadata/test_discography_filters.py @@ -0,0 +1,217 @@ +"""Pin track-level filters used by the Download Discography endpoint. + +GitHub issue #559 (trackhacs): "Download Discography" on an artist +pulled in tracks where the artist's name appeared in the title of +someone else's song. Two failure modes: + +1. Cross-artist tracks — compilations / appears_on albums brought in + tracks by unrelated artists. Fixed by `track_artist_matches`. +2. Remix / live / acoustic / instrumental versions never honored the + watchlist content-type filters for one-off discography downloads. + Fixed by `content_type_skip_reason`. + +These helpers live in ``core.metadata.discography_filters``. Tests pin +behavior at the function boundary so the wiring inside +``web_server.download_discography`` doesn't need an endpoint test to +catch a filter regression. +""" + +from __future__ import annotations + +from types import SimpleNamespace + +import pytest + +from core.metadata.discography_filters import ( + content_type_skip_reason, + load_global_content_filter_settings, + track_artist_matches, +) + + +# --------------------------------------------------------------------------- +# track_artist_matches +# --------------------------------------------------------------------------- + + +class TestTrackArtistMatches: + def test_primary_artist_matches(self): + """When the requested artist is the track's primary artist, + match. This is the most common case — non-feature tracks on an + artist's own album.""" + assert track_artist_matches(['Drake', 'Future'], 'Drake') is True + + def test_featured_artist_matches(self): + """When the requested artist appears as a feature (anywhere in + the list, not just position 0), still match. Keeping feature + appearances is intentional — they're legit discography entries.""" + assert track_artist_matches(['Lil Wayne', 'Drake', 'Kanye West'], 'Drake') is True + + def test_unrelated_artist_drops(self): + """The bug case: a compilation track by an unrelated artist + that just mentions the requested artist in the title. The + artists list contains only the actual performer(s); filter + drops it.""" + assert track_artist_matches(['Random Artist'], 'Drake') is False + + def test_match_is_case_insensitive(self): + """Source data can be cased inconsistently across providers.""" + assert track_artist_matches(['drake'], 'Drake') is True + assert track_artist_matches(['DRAKE'], 'Drake') is True + assert track_artist_matches(['Drake'], 'drake') is True + + def test_match_handles_whitespace_padding(self): + """Trailing whitespace in either side mustn't break the match.""" + assert track_artist_matches([' Drake '], 'Drake') is True + assert track_artist_matches(['Drake'], ' Drake ') is True + + def test_empty_artists_list_drops(self): + """No artists on the track → can't be by anyone → drop.""" + assert track_artist_matches([], 'Drake') is False + assert track_artist_matches(None, 'Drake') is False + + def test_empty_requested_artist_keeps(self): + """Defensive: if the caller forgot to pass the requested artist, + don't drop every track — let the caller's other filters decide. + Better to keep too much than to silently drop everything.""" + assert track_artist_matches(['Drake'], '') is True + assert track_artist_matches(['Drake'], ' ') is True + assert track_artist_matches(['Drake'], None) is True + + def test_accepts_list_of_dicts_shape(self): + """Some upstreams pass `[{'name': 'Drake', 'id': '...'}]` + directly instead of the normalized list-of-strings. Helper + must handle both — easier than forcing a normalization step + at the call site.""" + assert track_artist_matches([{'name': 'Drake'}], 'Drake') is True + assert track_artist_matches([{'name': 'Random'}], 'Drake') is False + + def test_substring_does_not_match(self): + """A song by "Drake & Future" should not match "Drake" via + substring — that's exactly the false-positive case the bug + report describes. Exact full-name match only.""" + assert track_artist_matches(['Drake & Future'], 'Drake') is False + assert track_artist_matches(['Drakeo the Ruler'], 'Drake') is False + + +# --------------------------------------------------------------------------- +# content_type_skip_reason +# --------------------------------------------------------------------------- + + +_ALL_OFF = { + 'include_live': False, + 'include_remixes': False, + 'include_acoustic': False, + 'include_instrumentals': False, +} + +_ALL_ON = { + 'include_live': True, + 'include_remixes': True, + 'include_acoustic': True, + 'include_instrumentals': True, +} + + +class TestContentTypeSkipReason: + def test_returns_none_for_plain_track(self): + """Default settings exclude all four content types, but a plain + original studio track shouldn't trigger any of them.""" + assert content_type_skip_reason('Hotline Bling', 'Views', _ALL_OFF) is None + + def test_remix_skipped_when_excluded(self): + """Default: remixes off → "(Remix)" track gets skipped with + reason 'remix'.""" + assert content_type_skip_reason('Hotline Bling (Remix)', 'Views', _ALL_OFF) == 'remix' + + def test_remix_kept_when_included(self): + """When the user opts in via include_remixes, the same track + passes through.""" + assert content_type_skip_reason('Hotline Bling (Remix)', 'Views', _ALL_ON) is None + + def test_live_skipped_when_excluded(self): + assert content_type_skip_reason('Hotline Bling (Live)', 'Views', _ALL_OFF) == 'live' + + def test_acoustic_skipped_when_excluded(self): + assert content_type_skip_reason('Hotline Bling (Acoustic)', 'Views', _ALL_OFF) == 'acoustic' + + def test_instrumental_skipped_when_excluded(self): + assert content_type_skip_reason('Hotline Bling (Instrumental)', 'Views', _ALL_OFF) == 'instrumental' + + def test_first_match_wins(self): + """If a track somehow matches multiple categories (e.g. a live + remix), it's reported under the first one checked. Order is + live → remix → acoustic → instrumental. Stable for telemetry + and for the user-facing skip-counter aggregation.""" + # "Live Remix" — both live and remix patterns fire. Live first. + reason = content_type_skip_reason('Hotline Bling (Live Remix)', 'Views', _ALL_OFF) + assert reason == 'live' + + def test_settings_missing_keys_default_to_exclude(self): + """Defensive: caller passes an empty dict / partial dict. + Missing keys treated as False (exclude) — same as the watchlist + scanner contract. A remix passed with `{}` still gets skipped.""" + assert content_type_skip_reason('Track (Remix)', 'Album', {}) == 'remix' + + +# --------------------------------------------------------------------------- +# load_global_content_filter_settings +# --------------------------------------------------------------------------- + + +class TestLoadGlobalSettings: + def test_reads_all_four_settings(self): + cfg = SimpleNamespace() + cfg.get = lambda key, default=None: { + 'watchlist.global_include_live': True, + 'watchlist.global_include_remixes': False, + 'watchlist.global_include_acoustic': True, + 'watchlist.global_include_instrumentals': False, + }.get(key, default) + result = load_global_content_filter_settings(cfg) + assert result == { + 'include_live': True, + 'include_remixes': False, + 'include_acoustic': True, + 'include_instrumentals': False, + } + + def test_defaults_all_false_when_config_manager_missing(self): + """No config_manager → all four default to False (exclude). + Same defaults the watchlist scanner uses for unconfigured artists.""" + result = load_global_content_filter_settings(None) + assert result == { + 'include_live': False, + 'include_remixes': False, + 'include_acoustic': False, + 'include_instrumentals': False, + } + + def test_config_get_raising_falls_back_to_defaults(self): + """Defensive: if `config_manager.get` raises (corrupted config, + backend offline, etc.), helper returns all-False defaults + rather than crashing the discography fetch.""" + cfg = SimpleNamespace() + def _boom(*_a, **_k): + raise RuntimeError('config backend exploded') + cfg.get = _boom + result = load_global_content_filter_settings(cfg) + assert result['include_live'] is False + assert result['include_remixes'] is False + + def test_setting_values_coerced_to_bool(self): + """Config can store as int / string — coerce defensively so + downstream callers can rely on the bool contract.""" + cfg = SimpleNamespace() + cfg.get = lambda key, default=None: { + 'watchlist.global_include_live': 1, # int truthy + 'watchlist.global_include_remixes': '', # empty string falsy + 'watchlist.global_include_acoustic': 'on', # string truthy + 'watchlist.global_include_instrumentals': 0, + }.get(key, default) + result = load_global_content_filter_settings(cfg) + assert result['include_live'] is True + assert result['include_remixes'] is False + assert result['include_acoustic'] is True + assert result['include_instrumentals'] is False diff --git a/web_server.py b/web_server.py index 65545a2d..9aefc252 100644 --- a/web_server.py +++ b/web_server.py @@ -9270,14 +9270,26 @@ def download_discography(artist_id): from database.music_database import MusicDatabase from core.metadata.album_tracks import get_artist_album_tracks + from core.metadata.discography_filters import ( + content_type_skip_reason, + load_global_content_filter_settings, + track_artist_matches, + ) db = MusicDatabase() profile_id = get_current_profile_id() + # Honor the same content-type filters the watchlist scanner uses + # (issue #559). One read at the top — settings don't change + # mid-stream and the four bool reads aren't worth re-running per + # track. + content_settings = load_global_content_filter_settings(config_manager) total_added = 0 total_skipped = 0 + total_skipped_artist = 0 + total_skipped_filter = 0 def generate_ndjson(): - nonlocal total_added, total_skipped + nonlocal total_added, total_skipped, total_skipped_artist, total_skipped_filter for entry in album_entries: album_id = entry['id'] @@ -9325,6 +9337,8 @@ def download_discography(artist_id): added = 0 skipped = 0 + skipped_artist = 0 + skipped_filter = 0 for track in tracks: track_name = track.get('name', '') @@ -9333,6 +9347,23 @@ def download_discography(artist_id): track_artists = track.get('artists', []) or album_artists track_id = track.get('id', '') + # Issue #559: drop tracks where the requested + # artist isn't in the track's artists list + # (cross-artist compilation / appears_on + # contamination). Keeps features. + if not track_artist_matches(track_artists, hint_artist): + skipped_artist += 1 + continue + + # Issue #559: honor watchlist global content-type + # filters (live / remix / acoustic / instrumental) + # for one-off discography downloads too — same + # contract as the discography backfill repair job. + skip_reason = content_type_skip_reason(track_name, album_name, content_settings) + if skip_reason: + skipped_filter += 1 + continue + spotify_track_data = { 'id': track_id, 'name': track_name, @@ -9379,8 +9410,12 @@ def download_discography(artist_id): total_added += added total_skipped += skipped + total_skipped_artist += skipped_artist + total_skipped_filter += skipped_filter logger.warning( - f"[Discography] {album_name} ({resolved_source}): {added} added, {skipped} skipped" + f"[Discography] {album_name} ({resolved_source}): {added} added, " + f"{skipped} skipped (wishlist), {skipped_artist} skipped (artist mismatch), " + f"{skipped_filter} skipped (content filter)" ) yield json.dumps({ "album_id": album_id, @@ -9388,6 +9423,8 @@ def download_discography(artist_id): "status": "done", "tracks_added": added, "tracks_skipped": skipped, + "tracks_skipped_artist": skipped_artist, + "tracks_skipped_filter": skipped_filter, "tracks_total": len(tracks), "source": resolved_source, }) + '\n' @@ -9402,12 +9439,15 @@ def download_discography(artist_id): logger.warning( f"[Discography] Complete for {artist_name}: {total_added} tracks added, " - f"{total_skipped} skipped across {len(album_entries)} albums" + f"{total_skipped} skipped (wishlist), {total_skipped_artist} skipped (artist mismatch), " + f"{total_skipped_filter} skipped (content filter) across {len(album_entries)} albums" ) yield json.dumps({ "status": "complete", "total_added": total_added, "total_skipped": total_skipped, + "total_skipped_artist": total_skipped_artist, + "total_skipped_filter": total_skipped_filter, "total_albums": len(album_entries), }) + '\n' diff --git a/webui/static/helper.js b/webui/static/helper.js index 93a4d3f6..f9bc4b2a 100644 --- a/webui/static/helper.js +++ b/webui/static/helper.js @@ -3416,6 +3416,7 @@ const WHATS_NEW = { '2.5.1': [ // --- post-release patch work on the 2.5.1 line — entries hidden by _getLatestWhatsNewVersion until the build version bumps --- { date: 'Unreleased — 2.5.1 patch work' }, + { title: 'Download Discography: No More Cross-Artist Tracks Or Unwanted Remixes', desc: 'issue #559: download discography pulled in tracks from compilations / appears-on albums where the artist was only featured on one or two tracks — every other track on those albums got added too. also ignored your watchlist "include remixes / live / acoustic / instrumental" settings, so one-off discography downloads kept stuffing your wishlist with remix ladders. fix: per-track filter at the endpoint. drops tracks where the requested artist isn\'t named in the track\'s artists list (keeps features, drops unrelated compilation entries). honors `watchlist.global_include_*` settings the same way the discography backfill repair job already does. per-album response carries new skip counts so the ui can show how much got filtered. 21 new tests at the helper boundary.', page: 'discover' }, { title: 'Album Completeness: "Could Not Determine Album Folder" Error Now Tells You What To Fix', desc: 'github issue #558 (gabistek, navidrome on docker / arch host): clicking auto-fill or fix selected on the album completeness findings page returned a flat "could not determine album folder from existing tracks" error with no diagnostic. trace: the path resolver in `core/library/path_resolver.py` probes transfer + download + `library.music_paths` config + plex api library locations to map db-recorded paths to actual files on disk. for plex users the api auto-discovers the mount paths (per #476). navidrome\'s subsonic api doesn\'t expose filesystem paths at all (only folder names via `getMusicFolders`), and navidrome\'s native rest api on top of that doesn\'t expose them either — there is no api signal we can probe. so for navidrome users in docker, if the path navidrome reports (`/music/artist/album/track.flac`) doesn\'t exist as-is in the soulsync container view AND the user hasn\'t manually configured settings → library → music paths, the resolver returns none and the fix workflow bailed silently. fix: lifted the resolver into a diagnostic-aware variant (`resolve_library_file_path_with_diagnostic` returning a `(resolved, ResolveAttempt)` tuple) that records what was tried — raw-path-existed, base-dirs-probed, whether config_manager / plex_client were wired up. repair_worker uses the diagnostic to render a multi-part error: names the active media server, shows one sample db-recorded path the album\'s tracks have, lists every base directory the resolver actually probed, and points at settings → library → music paths as the actionable fix. user can now read the error and know exactly what to mount or configure. no auto-probing of common docker conventions — too speculative, could resolve to wrong dirs on the suffix-walk if conventional paths happen to contain a partial collision. backwards compatible: legacy `resolve_library_file_path` kept as a thin wrapper that drops the attempt, every existing call site unchanged. 12 new tests pin: tuple shape, raw-path short-circuit attempt fields, base-dirs listed even on walk failure, had-flags reflect caller inputs, error renders active server name + sample path + base dirs, distinguishes empty-base-dirs vs tried-and-failed cases, settings hint always present, defensive against none attempt + missing sample + missing config_manager.', page: 'tools' }, { title: 'Import History: Clear History Button Now Clears Stuck "Processing" Rows', desc: 'noticed on the import page: clear history left zombie rows behind that all showed "⧗ processing" status from 2-9 days ago. trace: `_record_in_progress` inserts a `status=\'processing\'` row up-front so the ui can render the in-flight import while it runs, then `_finalize_result` updates it to `completed`/`failed` when the import finishes. when the server is restarted mid-import (or the worker crashes), the row never gets finalized — stays at `processing` forever. the clear-history endpoint\'s sql `DELETE ... WHERE status IN (\'completed\', \'approved\', \'failed\', \'needs_identification\', \'rejected\')` didn\'t include `processing`, so those zombies survived every click. fix: add `processing` to the delete list, but guard against nuking actually-live imports by intersecting against `_snapshot_active()` — any folder hash currently registered in the worker\'s in-memory `_active_imports` map is excluded from the delete. `pending_review` deliberately left out so user still has to approve/reject those explicitly. one endpoint touched (`/api/auto-import/clear-completed` in web_server.py). no worker changes. zombie-row pile gets swept on next click, new imports still record + update normally.', page: 'import' }, { title: 'Auto-Import: Falls Through To Other Metadata Sources When Primary Has No Match', desc: 'discord report (mushy): 16 bandcamp indie albums sat in staging because auto-import couldn\'t identify them. manual search at the bottom of the import music tab found the same albums fine — they just weren\'t on the user\'s primary metadata source (spotify) but existed on tidal/deezer. trace: `_search_metadata_source` in `core/auto_import_worker.py` only queried `get_primary_source()` — single source, no fallback. meanwhile `search_import_albums` (the manual search bar at the bottom of the tab) already iterated the full `get_source_priority(get_primary_source())` chain and broke on first source with results. asymmetric behavior — manual search worked, auto-import didn\'t, same album. fix: lift auto-import to use the same source-chain pattern. try primary first; if it returns nothing OR scores below the 0.4 threshold, fall through to next source in priority order. first source that produces a strong-enough match wins. result dict carries the `source` that actually matched (not the primary name), so downstream `_match_tracks` calls the right client to fetch the album\'s tracklist. defensive per-source try/except so a rate-limited or auth-failed source doesn\'t abort the chain. unconfigured sources (client=None) silently skipped. scoring math lifted to pure helper `_score_album_search_result` so weight tweaks (album 50% / artist 20% / track-count 30%) are pinned at the function boundary independent of the orchestrator. weight constants exposed at module level (`_ALBUM_NAME_WEIGHT`, `_ARTIST_NAME_WEIGHT`, `_TRACK_COUNT_WEIGHT`) — greppable, bumpable in one place. 9 integration tests + 18 scoring-helper tests. integration tests pin: primary-success path unchanged (no fallback fires, only primary client called), primary-empty falls through to next source, primary-weak-score falls through, first fallback success stops the chain (no wasted api calls on remaining sources), all-sources-fail returns None, per-source exception contained, unconfigured-source skipped gracefully, result `source` field reflects winning source, `identification_confidence` from winning source. backwards compatible — single-source users see no change (chain just has one entry).', page: 'import' },