NOTE(review): line breaks and indentation in this patch were reconstructed
from a whitespace-collapsed copy. Context-line indentation is inferred from
Python syntax and comment wrap widths — TODO confirm against the target tree
(or apply with `git apply --ignore-whitespace`). The blob hashes on the
`index` lines predate the review edits; the `+start,count` sides of the hunk
headers were recomputed for the edited content.

Review edits relative to the original patch:
  * `_NON_STUDIO_SECONDARY_TYPES` is a frozenset (immutable class constant).
  * New `_is_studio_release_group` is the single studio predicate; the
    duplicated set literal inside `search_albums` is gone.
  * `is_studio` (which was 0 for studio, 1 otherwise) renamed `non_studio`.
  * The live-only fallback test now pins names and order, not just length.

diff --git a/core/musicbrainz_search.py b/core/musicbrainz_search.py
index 7ff95e7a..8af8bd38 100644
--- a/core/musicbrainz_search.py
+++ b/core/musicbrainz_search.py
@@ -219,6 +219,49 @@ class MusicBrainzSearchClient:
         self._artist_mbid_cache[key] = top
         return top
 
+    # Secondary-type tags on MB release-groups that indicate NOT a studio
+    # release. Used by both the album browse (filter out) and the track
+    # browse (prefer studio release for album context).
+    _NON_STUDIO_SECONDARY_TYPES = frozenset({
+        'Live', 'Compilation', 'Soundtrack', 'Remix', 'Demo',
+        'Mixtape/Street', 'Interview', 'Audiobook', 'Audio drama',
+    })
+
+    def _is_studio_release_group(self, rg: Dict[str, Any]) -> bool:
+        """True when the release-group has no non-studio secondary type.
+
+        Shared by the album browse filter and the per-release helpers so
+        the studio test lives in exactly one place.
+        """
+        secs = rg.get('secondary-types') or []
+        return self._NON_STUDIO_SECONDARY_TYPES.isdisjoint(secs)
+
+    def _release_preference_key(self, rel: Dict[str, Any]):
+        """Sort key: studio releases first, then by date ASC.
+
+        Recordings in MB often have 10+ releases (studio album, live, best-of,
+        reissues, anniversary editions). The first one in the API response is
+        arbitrary — it's often a recent live bootleg because MB users add new
+        live recordings all the time. Re-sorting before `_recording_to_track`
+        reads the first release means tracks show their canonical studio
+        album, not a random live compilation.
+        """
+        rg = rel.get('release-group') or {}
+        # 0 sorts ahead of 1, so studio releases come first.
+        non_studio = 0 if self._is_studio_release_group(rg) else 1
+        date = (rel.get('date') or '')[:4]
+        # Undated releases get 9999 so they sort last within their group.
+        year = int(date) if date.isdigit() else 9999
+        return (non_studio, year)
+
+    def _has_studio_release(self, recording: Dict[str, Any]) -> bool:
+        """True when at least one of the recording's releases is on a
+        release-group with no non-studio secondary type."""
+        return any(
+            self._is_studio_release_group(rel.get('release-group') or {})
+            for rel in (recording.get('releases') or [])
+        )
+
     def _release_group_to_album(self, rg: Dict[str, Any], artist_name: str) -> Album:
         """Project a MusicBrainz release-group into our Album dataclass."""
         rg_mbid = rg.get('id', '')
@@ -270,17 +313,37 @@ class MusicBrainzSearchClient:
                     tname = top.get('name', '') or query
                     rgs = self._client.browse_artist_release_groups(
                         mbid,
-                        release_types=['album', 'ep', 'single', 'compilation'],
+                        # 'compilation' is a SECONDARY type, not a primary type
+                        # — including it in the OR filter causes MB to return
+                        # only 82 matches instead of the actual 1076 because
+                        # the filter silently breaks. Actual compilations
+                        # (primary-type=Album with secondary-types=[Compilation])
+                        # are handled by the studio-preference filter below.
+                        release_types=['album', 'ep', 'single'],
                         limit=100,
                     )
-                    # Sort by first-release-date desc (newest first), then by
-                    # primary-type priority (album > ep > single > compilation)
-                    # so the top of the list is a credible "what to explore."
+
+                    # Prefer studio releases — MusicBrainz tags live bootlegs
+                    # and best-of compilations with secondary-types. For mega-
+                    # artists like Metallica, 83 of 100 browse results are live
+                    # broadcast bootlegs; the 12 studio albums are buried. A
+                    # release-group with no secondary-types (or an explicit
+                    # studio-only type) is the "original studio" shape users
+                    # expect to see first.
+                    studio = [rg for rg in rgs if self._is_studio_release_group(rg)]
+                    # If filtering leaves us empty (niche live-only artist),
+                    # fall back to the unfiltered list — better than no results.
+                    rgs = studio or rgs
+
+                    # Sort by primary-type priority first (album > ep > single >
+                    # compilation), then chronologically ASC — the standard way
+                    # discographies are listed ("their debut was X, then Y, then Z").
                     type_priority = {'album': 0, 'ep': 1, 'single': 2, 'compilation': 3}
                     def _sort_key(rg):
                         pt = (rg.get('primary-type') or '').lower()
                         date = rg.get('first-release-date') or ''
-                        return (type_priority.get(pt, 9), -int(date[:4]) if date[:4].isdigit() else 0)
+                        year = int(date[:4]) if date[:4].isdigit() else 9999
+                        return (type_priority.get(pt, 9), year)
                     rgs.sort(key=_sort_key)
                     albums = [self._release_group_to_album(rg, tname) for rg in rgs[:limit]]
                     return albums
@@ -438,10 +501,29 @@ class MusicBrainzSearchClient:
                     # so we use the fielded Lucene search arid: instead —
                     # that returns recordings with release context inline.
                     recs = self._client.search_recordings_by_artist_mbid(mbid, limit=100)
-                    # Browse returns recordings unsorted. Dedupe by normalized
-                    # title (MB has many live/compilation variants of the same
-                    # song), then sort by release date desc so "newest" tracks
-                    # surface first — matches how the other source tabs look.
+
+                    # Re-order each recording's releases to prefer studio over
+                    # live/compilation. Without this, the first release (which
+                    # the adapter uses for album info + date) is often a random
+                    # live bootleg — Metallica has 10+ live versions of "One"
+                    # ranked ahead of the studio release. Mutates in place so
+                    # `_recording_to_track` sees the preferred release first.
+                    for r in recs:
+                        rels = r.get('releases') or []
+                        if not rels:
+                            continue
+                        rels.sort(key=self._release_preference_key)
+                        r['releases'] = rels
+
+                    # Prefer recordings that have at least one studio release.
+                    # Falls back to the full set if the artist is live-only.
+                    studio = [r for r in recs if self._has_studio_release(r)]
+                    recs = studio or recs
+
+                    # Dedupe by normalized title (MB has many versions of the
+                    # same song — live, remaster, re-recording, etc.). Because
+                    # we sorted releases above, `_recording_to_track` will pick
+                    # the studio release for album info on the first keeper.
                     seen = set()
                     deduped = []
                     for r in recs:
@@ -451,10 +533,17 @@ class MusicBrainzSearchClient:
                         seen.add(key)
                         deduped.append(r)
 
+                    # Sort by studio-release year ASC so classic tracks surface
+                    # first. For a user typing "metallica", this means "Seek
+                    # and Destroy" (1983) before "Atlas, Rise!" (2016) — which
+                    # matches how most discography views order by release.
                     def _track_sort_key(r):
-                        rel = (r.get('releases') or [{}])[0]
-                        date = (rel.get('date') or '')[:4]
-                        return -int(date) if date.isdigit() else 0
+                        rels = r.get('releases') or []
+                        for rel in rels:
+                            date = (rel.get('date') or '')[:4]
+                            if date.isdigit():
+                                return int(date)
+                        return 9999
                     deduped.sort(key=_track_sort_key)
 
                     tracks = []
diff --git a/tests/test_musicbrainz_search.py b/tests/test_musicbrainz_search.py
index db75b306..88afe15f 100644
--- a/tests/test_musicbrainz_search.py
+++ b/tests/test_musicbrainz_search.py
@@ -205,8 +205,9 @@ def test_search_albums_bare_query_uses_browse_path():
     client._client.browse_artist_release_groups.assert_called_once()
     # Text-search path must NOT be taken.
     client._client.search_release.assert_not_called()
-    # Browse results come back, newest first.
-    assert [a.name for a in albums] == ['Master of Puppets', 'Ride the Lightning']
+    # Chronological ASC — earliest first, so the album list reads like a
+    # standard discography (Wikipedia-style: earliest release on top).
+    assert [a.name for a in albums] == ['Ride the Lightning', 'Master of Puppets']
     assert all(a.artists == ['Metallica'] for a in albums)
 
 
@@ -246,6 +247,113 @@ def test_search_albums_falls_back_to_text_when_no_artist_match():
     client._client.browse_artist_release_groups.assert_not_called()
 
 
+def test_search_albums_filters_live_and_compilation_secondary_types():
+    """Mega-artists' browse results are dominated by live bootlegs and
+    best-of compilations — they should be filtered out so the studio
+    discography surfaces."""
+    client = MusicBrainzSearchClient()
+    client._client = MagicMock()
+    client._client.search_artist.return_value = [_mk_artist('Metallica', 'mb-1', score=100)]
+    client._client.browse_artist_release_groups.return_value = [
+        {'id': 'rg-live-1', 'title': 'Live Bootleg 2019', 'primary-type': 'Album',
+         'first-release-date': '2019-01-01', 'secondary-types': ['Live']},
+        {'id': 'rg-studio-1', 'title': 'Kill Em All', 'primary-type': 'Album',
+         'first-release-date': '1983-07-25', 'secondary-types': []},
+        {'id': 'rg-comp-1', 'title': 'Greatest Hits', 'primary-type': 'Album',
+         'first-release-date': '2010-01-01', 'secondary-types': ['Compilation']},
+        {'id': 'rg-studio-2', 'title': 'Master of Puppets', 'primary-type': 'Album',
+         'first-release-date': '1986-03-03', 'secondary-types': []},
+    ]
+
+    albums = client.search_albums('metallica', limit=10)
+
+    titles = [a.name for a in albums]
+    assert titles == ['Kill Em All', 'Master of Puppets']
+    assert 'Live Bootleg 2019' not in titles
+    assert 'Greatest Hits' not in titles
+
+
+def test_search_albums_falls_back_to_all_when_no_studio():
+    """Niche live-only artist: if no studio releases exist, show live ones
+    rather than returning empty."""
+    client = MusicBrainzSearchClient()
+    client._client = MagicMock()
+    client._client.search_artist.return_value = [_mk_artist('LiveBand', 'mb-1', score=100)]
+    client._client.browse_artist_release_groups.return_value = [
+        {'id': 'rg-live-1', 'title': 'Live at X', 'primary-type': 'Album',
+         'first-release-date': '2019-01-01', 'secondary-types': ['Live']},
+        {'id': 'rg-live-2', 'title': 'Live at Y', 'primary-type': 'Album',
+         'first-release-date': '2020-01-01', 'secondary-types': ['Live']},
+    ]
+
+    albums = client.search_albums('liveband', limit=10)
+
+    assert [a.name for a in albums] == ['Live at X', 'Live at Y']
+
+
+def test_search_tracks_prefers_studio_release_in_album_field():
+    """When a recording has both a studio release and a live release, the
+    Track.album should reflect the studio release (canonical album),
+    regardless of the order MB returned them in."""
+    client = MusicBrainzSearchClient()
+    client._client = MagicMock()
+    client._client.search_artist.return_value = [_mk_artist('Metallica', 'mb-1', score=100)]
+    client._client.search_recordings_by_artist_mbid.return_value = [
+        {
+            'id': 'rec-master',
+            'title': 'Master of Puppets',
+            'length': 516000,
+            'artist-credit': [{'name': 'Metallica'}],
+            # Live release first (what MB often returns), studio second.
+            'releases': [
+                {'id': 'rel-live', 'title': 'Live Bootleg', 'date': '2023-01-01',
+                 'release-group': {'id': 'rg-live', 'primary-type': 'Album',
+                                   'secondary-types': ['Live']}},
+                {'id': 'rel-studio', 'title': 'Master of Puppets', 'date': '1986-03-03',
+                 'release-group': {'id': 'rg-studio', 'primary-type': 'Album',
+                                   'secondary-types': []}},
+            ],
+        },
+    ]
+
+    tracks = client.search_tracks('metallica', limit=10)
+
+    assert len(tracks) == 1
+    # Album must be the studio release, not the live bootleg.
+    assert tracks[0].album == 'Master of Puppets'
+    assert tracks[0].release_date == '1986-03-03'
+
+
+def test_search_tracks_filters_recordings_without_studio_releases():
+    """A recording that only exists on live/compilation releases should be
+    dropped when we have studio alternatives."""
+    client = MusicBrainzSearchClient()
+    client._client = MagicMock()
+    client._client.search_artist.return_value = [_mk_artist('Metallica', 'mb-1', score=100)]
+    client._client.search_recordings_by_artist_mbid.return_value = [
+        {'id': 'rec-studio', 'title': 'Seek and Destroy', 'length': 390000,
+         'artist-credit': [{'name': 'Metallica'}],
+         'releases': [
+             {'id': 'rel-studio', 'title': 'Kill Em All', 'date': '1983-07-25',
+              'release-group': {'id': 'rg-studio', 'primary-type': 'Album',
+                                'secondary-types': []}},
+         ]},
+        {'id': 'rec-live-only', 'title': 'Fight Fire With Fire', 'length': 450000,
+         'artist-credit': [{'name': 'Metallica'}],
+         'releases': [
+             {'id': 'rel-live', 'title': 'Live Shit', 'date': '1993-01-01',
+              'release-group': {'id': 'rg-live', 'primary-type': 'Album',
+                                'secondary-types': ['Live']}},
+         ]},
+    ]
+
+    tracks = client.search_tracks('metallica', limit=10)
+
+    titles = [t.name for t in tracks]
+    assert 'Seek and Destroy' in titles
+    assert 'Fight Fire With Fire' not in titles
+
+
 def test_search_albums_text_path_filters_by_score():
     client = MusicBrainzSearchClient()
     client._client = MagicMock()