Fix: MusicBrainz artist discography capped at 25 releases

Artist-detail discography from MusicBrainz fetched releases via the
artist lookup (`/artist/<mbid>?inc=release-groups`), which MusicBrainz
hard-caps at 25 embedded release-groups and which ignores the `limit`
param entirely. Prolific artists had ~85% of their catalogue silently
dropped — Kendrick Lamar has 167 release-groups on the site but only the
first 25 ever reached SoulSync. Reported by Sokhi: "a lot of albums are
missing when searching vs what's showing on the site."

Switch `get_artist_albums` to walk the paginated browse endpoint
(`/release-group?artist=<mbid>`, offset loop) — the same pattern the
basic-search path already uses — fetching the full catalogue up to the
caller's limit. No type filter and no studio-only filter here: the
artist-detail page wants every primary/secondary type so its tabs mirror
musicbrainz.org. Verified live: now returns all 167 for Kendrick.

Adds 7 tests covering pagination past the cap, offset advance,
short-page stop, limit cap, cross-page dedup, type->bucket mapping, and
a regression pin asserting the capped inc=release-groups lookup is no
longer the discography source.
pull/731/head
BoulderBadgeDad 2 weeks ago
parent 1b5056ef2a
commit b14d504cc1

@ -1131,30 +1131,59 @@ class MusicBrainzSearchClient:
}
def get_artist_albums(self, artist_mbid: str, album_type: str = 'album,single', limit: int = 200) -> List:
"""Get artist's releases for discography view."""
"""Get an artist's full discography for the artist-detail view.
Walks the paginated browse endpoint (`/release-group?artist=<mbid>`)
instead of the artist lookup's embedded release-groups. The lookup
(`/artist/<mbid>?inc=release-groups`) is hard-capped at 25 release-
groups by MusicBrainz and ignores the `limit` param entirely so
prolific artists had ~85% of their catalogue silently dropped. That
was the bug Sokhi reported: "a lot of albums are missing vs what's
showing on the site" (e.g. Kendrick Lamar — 167 release-groups on
MB, only the first 25 ever reached SoulSync).
Unlike the search path there is NO `type` filter and NO studio-only
filter here: the artist-detail page wants the *whole* catalogue
albums, EPs, singles, compilations, soundtracks, live so its tabs
mirror musicbrainz.org. `_release_group_to_album` maps each release-
group's primary/secondary types into the right bucket.
"""
try:
artist = self._client.get_artist(artist_mbid, includes=['release-groups'])
if not artist or 'release-groups' not in artist:
return []
albums = []
for rg in artist.get('release-groups', []):
primary_type = rg.get('primary-type', '') or ''
rg_type = _map_release_type(primary_type, rg.get('secondary-types', []))
rg_mbid = rg.get('id', '')
image_url = self._cached_art(rg_mbid, rg_mbid)
albums.append(Album(
id=rg_mbid,
name=rg.get('title', ''),
artists=[artist.get('name', 'Unknown Artist')],
release_date=rg.get('first-release-date', '') or '',
total_tracks=0,
album_type=rg_type,
image_url=image_url,
external_urls={'musicbrainz': f'https://musicbrainz.org/release-group/{rg_mbid}'},
))
# Lightweight lookup purely for the canonical artist name — the
# browse endpoint doesn't carry it. Non-fatal: on failure each
# Album falls back to 'Unknown Artist' via _release_group_to_album.
artist = self._client.get_artist(artist_mbid)
artist_name = (artist or {}).get('name', '') or ''
page_size = 100 # MusicBrainz browse hard cap per page
albums: List[Album] = []
seen: set = set()
offset = 0
while len(albums) < limit:
page = self._client.browse_artist_release_groups(
artist_mbid,
# No type filter — fetch every primary type. Secondary
# types (Compilation/Soundtrack/Live) ride along on their
# Album/Single/EP parent and are bucketed downstream.
# (Filtering by type=compilation is intentionally avoided:
# it's a secondary type and silently breaks MB's filter —
# see browse_artist_release_groups docs.)
release_types=None,
limit=page_size,
offset=offset,
)
if not page:
break
for rg in page:
rg_mbid = rg.get('id', '')
if rg_mbid and rg_mbid in seen:
continue
if rg_mbid:
seen.add(rg_mbid)
albums.append(self._release_group_to_album(rg, artist_name))
if len(page) < page_size:
break # last page reached
offset += page_size
return albums[:limit]
except Exception as e:
logger.warning(f"MusicBrainz artist albums failed: {e}")

@ -1179,3 +1179,150 @@ def test_search_tracks_with_artist_swallows_client_errors():
client._client.search_recording.side_effect = RuntimeError('network down')
assert client.search_tracks_with_artist('Track', 'Artist', limit=10) == []
# ---------------------------------------------------------------------------
# get_artist_albums — full-discography browse pagination
#
# Regression for Sokhi's report ("a lot of albums are missing vs the site").
# The old impl read the artist *lookup*'s embedded release-groups
# (`/artist/<mbid>?inc=release-groups`), which MusicBrainz hard-caps at 25
# and which ignores the `limit` param — so ~85% of a prolific artist's
# catalogue (Kendrick Lamar: 167 release-groups) was silently dropped.
# The fix walks the paginated browse endpoint instead.
# ---------------------------------------------------------------------------
def _mk_rg(rg_id, title, primary='Album', secondary=None, date='2000-01-01'):
return {
'id': rg_id,
'title': title,
'primary-type': primary,
'secondary-types': secondary or [],
'first-release-date': date,
}
def test_get_artist_albums_does_not_use_capped_lookup_release_groups():
"""The capped `inc=release-groups` lookup must NOT be the source of the
discography. We still do a lightweight name lookup, but never request
the embedded (25-capped) release-groups."""
client = MusicBrainzSearchClient()
client._client = MagicMock()
client._client.get_artist.return_value = {'name': 'Kendrick Lamar'}
client._client.browse_artist_release_groups.return_value = [
_mk_rg('rg-1', 'DAMN.'),
]
client.get_artist_albums('mbid-kdot')
# browse is the discography source.
assert client._client.browse_artist_release_groups.called
# The name lookup must not pull the capped embedded release-groups.
for call in client._client.get_artist.call_args_list:
assert 'release-groups' not in (call.kwargs.get('includes') or [])
assert all('release-groups' not in (a or []) for a in call.args[1:])
def test_get_artist_albums_paginates_past_25_cap():
"""Walks multiple browse pages until a short page, returning the FULL
catalogue the whole point of the fix. A single full page (100) must
trigger a follow-up fetch."""
client = MusicBrainzSearchClient()
client._client = MagicMock()
client._client.get_artist.return_value = {'name': 'Prolific Artist'}
page1 = [_mk_rg(f'rg-{i}', f'Album {i}') for i in range(100)]
page2 = [_mk_rg(f'rg-{i}', f'Album {i}') for i in range(100, 164)]
client._client.browse_artist_release_groups.side_effect = [page1, page2, []]
albums = client.get_artist_albums('mbid-x', limit=200)
assert len(albums) == 164 # not truncated to 25
# Second page fetched at offset=100.
offsets = [c.kwargs.get('offset') for c in client._client.browse_artist_release_groups.call_args_list]
assert 0 in offsets and 100 in offsets
# No `type` filter — the detail page wants the whole catalogue.
for c in client._client.browse_artist_release_groups.call_args_list:
assert c.kwargs.get('release_types') is None
def test_get_artist_albums_stops_on_short_page():
"""A page shorter than the page size is the last page — don't fetch
a spurious extra page."""
client = MusicBrainzSearchClient()
client._client = MagicMock()
client._client.get_artist.return_value = {'name': 'Small Artist'}
client._client.browse_artist_release_groups.return_value = [
_mk_rg('rg-1', 'Only Album'),
]
albums = client.get_artist_albums('mbid-small', limit=200)
assert len(albums) == 1
client._client.browse_artist_release_groups.assert_called_once()
def test_get_artist_albums_respects_limit():
"""`limit` caps the returned list even when more release-groups exist."""
client = MusicBrainzSearchClient()
client._client = MagicMock()
client._client.get_artist.return_value = {'name': 'Prolific Artist'}
client._client.browse_artist_release_groups.return_value = [
_mk_rg(f'rg-{i}', f'Album {i}') for i in range(100)
]
albums = client.get_artist_albums('mbid-x', limit=50)
assert len(albums) == 50
def test_get_artist_albums_dedupes_release_group_ids():
"""A release-group id repeated across pages is collapsed to one card.
First page is full (100) so a second page is fetched; 'dup' appears on
both and must surface only once."""
client = MusicBrainzSearchClient()
client._client = MagicMock()
client._client.get_artist.return_value = {'name': 'Artist'}
page1 = [_mk_rg('dup', 'A')] + [_mk_rg(f'rg-{i}', f'B{i}') for i in range(99)]
page2 = [_mk_rg('dup', 'A again'), _mk_rg('rg-last', 'C')]
client._client.browse_artist_release_groups.side_effect = [page1, page2, []]
albums = client.get_artist_albums('mbid-x', limit=200)
ids = [a.id for a in albums]
assert ids.count('dup') == 1
assert 'rg-last' in ids
assert len(ids) == len(set(ids))
def test_get_artist_albums_maps_types_into_buckets():
"""Primary/secondary types map to the album_type the discography binning
expects: EPep, Singlesingle, Album+Compilationcompilation, plain
Albumalbum."""
client = MusicBrainzSearchClient()
client._client = MagicMock()
client._client.get_artist.return_value = {'name': 'Artist'}
client._client.browse_artist_release_groups.return_value = [
_mk_rg('rg-lp', 'The LP', primary='Album'),
_mk_rg('rg-ep', 'The EP', primary='EP'),
_mk_rg('rg-single', 'The Single', primary='Single'),
_mk_rg('rg-comp', 'Greatest Hits', primary='Album', secondary=['Compilation']),
]
albums = {a.id: a for a in client.get_artist_albums('mbid-x')}
assert albums['rg-lp'].album_type == 'album'
assert albums['rg-ep'].album_type == 'ep'
assert albums['rg-single'].album_type == 'single'
assert albums['rg-comp'].album_type == 'compilation'
def test_get_artist_albums_swallows_browse_errors():
"""Browse raising must not crash the discography endpoint — return []
so the source-priority cascade falls through to the next provider."""
client = MusicBrainzSearchClient()
client._client = MagicMock()
client._client.get_artist.return_value = {'name': 'Artist'}
client._client.browse_artist_release_groups.side_effect = RuntimeError('mb down')
assert client.get_artist_albums('mbid-x') == []

Loading…
Cancel
Save