diff --git a/core/discogs_client.py b/core/discogs_client.py index 4147277d..445b7c68 100644 --- a/core/discogs_client.py +++ b/core/discogs_client.py @@ -61,6 +61,24 @@ def rate_limited(func): return wrapper +# Discogs disambiguates duplicate artist names two ways: numeric `(N)` +# suffix on the older convention (e.g. "Bullet (2)") and a trailing `*` +# on the newer convention (e.g. "John Smith*"). Both are presentation- +# only — the underlying canonical name is what users expect to see in +# search results, on cards, and especially in import paths (otherwise +# library folders end up named `Foo*` on disk). Strip both at every +# point a Discogs payload becomes a name string. +_DISCOGS_DISAMBIG_RE = re.compile(r'(?:\s*\(\d+\))?\s*\*+\s*$|\s*\(\d+\)\s*$') + + +def _clean_discogs_artist_name(name: Optional[str]) -> str: + """Strip Discogs disambiguation suffixes — both `(N)` and trailing + `*` — from an artist name. Returns '' for None / empty input.""" + if not name: + return '' + return _DISCOGS_DISAMBIG_RE.sub('', name).strip() + + # --- Shared dataclasses (same shape as iTunes/Deezer/Spotify) --- @dataclass @@ -117,9 +135,10 @@ class Track: # Artists from track-level or release-level track_artists = [] if track_data.get('artists'): - track_artists = [a.get('name', '') for a in track_data['artists'] if a.get('name')] + track_artists = [_clean_discogs_artist_name(a.get('name', '')) for a in track_data['artists'] if a.get('name')] if not track_artists and release.get('artists'): - track_artists = [a.get('name', '') for a in release['artists'] if a.get('name')] + track_artists = [_clean_discogs_artist_name(a.get('name', '')) for a in release['artists'] if a.get('name')] + track_artists = [a for a in track_artists if a] if not track_artists: track_artists = ['Unknown Artist'] @@ -190,7 +209,9 @@ class Artist: return cls( id=str(artist_data.get('id', '')), - name=artist_data.get('name', artist_data.get('title', '')), + name=_clean_discogs_artist_name( + artist_data.get('name', artist_data.get('title', '')) + ), popularity=0, genres=[], followers=0, @@ -216,14 +237,15 @@ class Album: artists = [] title = release_data.get('title', '') if release_data.get('artists'): - artists = [a.get('name', '') for a in release_data['artists'] if a.get('name')] + artists = [_clean_discogs_artist_name(a.get('name', '')) for a in release_data['artists'] if a.get('name')] elif release_data.get('artist'): - artists = [release_data['artist']] + artists = [_clean_discogs_artist_name(release_data['artist'])] elif ' - ' in title: # Search results: "Radiohead - OK Computer" → artist="Radiohead", title="OK Computer" parts = title.split(' - ', 1) - artists = [parts[0].strip()] + artists = [_clean_discogs_artist_name(parts[0])] title = parts[1].strip() + artists = [a for a in artists if a] if not artists: artists = ['Unknown Artist'] @@ -444,8 +466,8 @@ class DiscogsClient: artist_name = '' if artists and isinstance(artists[0], dict): artist_name = (artists[0].get('name') or '').strip() - # Strip trailing "(N)" disambiguation suffix Discogs adds. - artist_name = re.sub(r'\s*\(\d+\)$', '', artist_name) + # Strip Discogs disambiguation suffixes — both `(N)` and `*`. + artist_name = _clean_discogs_artist_name(artist_name) if not title or not artist_name: continue @@ -658,9 +680,13 @@ class DiscogsClient: def get_artist_albums(self, artist_id: str, album_type: str = 'album,single', limit: int = 50) -> List[Album]: """Get releases by an artist. Prefers master releases, filters features.""" - # First get the artist name for feature filtering + # First get the artist name for feature filtering. Strip Discogs + # disambiguation suffix so feature-vs-primary matching below + # compares against the canonical name, not "Beyoncé*". artist_data = self._api_get(f'/artists/{artist_id}') - artist_name = artist_data.get('name', '').lower() if artist_data else '' + artist_name = _clean_discogs_artist_name( + artist_data.get('name', '') if artist_data else '' + ).lower() data = self._api_get(f'/artists/{artist_id}/releases', { 'sort': 'year', 'sort_order': 'desc', 'per_page': min(limit * 3, 200), @@ -765,7 +791,11 @@ class DiscogsClient: # Get artists artists_list = [] if data.get('artists'): - artists_list = [{'name': a.get('name', '')} for a in data['artists'] if a.get('name')] + artists_list = [ + {'name': _clean_discogs_artist_name(a.get('name', ''))} + for a in data['artists'] + if a.get('name') and _clean_discogs_artist_name(a.get('name', '')) + ] if not artists_list: artists_list = [{'name': 'Unknown Artist'}] @@ -794,7 +824,11 @@ class DiscogsClient: # Per-track artists or fall back to release artists track_artists = artists_list if t.get('artists'): - track_artists = [{'name': a.get('name', '')} for a in t['artists'] if a.get('name')] + track_artists = [ + {'name': _clean_discogs_artist_name(a.get('name', ''))} + for a in t['artists'] + if a.get('name') and _clean_discogs_artist_name(a.get('name', '')) + ] tracks.append({ 'id': f"{release_id}_t{track_num}", diff --git a/tests/test_discogs_collection_source.py b/tests/test_discogs_collection_source.py index c037cd23..66523fc2 100644 --- a/tests/test_discogs_collection_source.py +++ b/tests/test_discogs_collection_source.py @@ -22,7 +22,34 @@ from unittest.mock import patch import pytest -from core.discogs_client import DiscogsClient +from core.discogs_client import DiscogsClient, _clean_discogs_artist_name + + +# --------------------------------------------------------------------------- +# _clean_discogs_artist_name — disambiguation suffix helper +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize('raw,expected', [ + ('Madonna', 'Madonna'), + ('Madonna (3)', 'Madonna'), # legacy (N) suffix + ('Madonna*', 'Madonna'), # newer asterisk suffix + ('John Smith*', 'John Smith'), + ('Bullet (2)', 'Bullet'), + ('Foo (12)', 'Foo'), # double-digit (N) + ('Baz**', 'Baz'), # repeated asterisks + ('Qux* ', 'Qux'), # trailing whitespace after * + ('Artist (3) *', 'Artist'), # both suffixes, space-separated + ('Foo (3)*', 'Foo'), # both suffixes, no space + ('No Suffix Here', 'No Suffix Here'), + ('', ''), + (None, ''), + (' ', ''), +]) +def test_clean_discogs_artist_name(raw, expected): + """Helper strips both `(N)` and trailing `*` disambiguation suffixes + from Discogs artist names. Closes #634.""" + assert _clean_discogs_artist_name(raw) == expected # --------------------------------------------------------------------------- @@ -105,6 +132,28 @@ def test_get_user_collection_strips_discogs_disambiguation_suffix(authed_client) assert result[0]['artist_name'] == 'Madonna' +def test_get_user_collection_strips_discogs_asterisk_disambiguation(authed_client): + """Discogs also appends '*' (asterisk) as a disambiguation suffix + for the newer naming convention (e.g. 'John Smith*'). Without + stripping, library folders would land on disk named 'John Smith*' + and downstream matchers wouldn't equate it to the canonical name + other providers return. Closes #634.""" + fake_response = { + 'pagination': {'pages': 1, 'page': 1}, + 'releases': [ + {'id': 1, 'basic_information': { + 'title': 'X', 'artists': [{'name': 'John Smith*'}], + 'cover_image': '', 'year': 2020, + }}, + ], + } + with patch.object(authed_client, '_api_get', + side_effect=lambda e, p=None: ({'username': 'u'} if e == '/oauth/identity' else fake_response)): + result = authed_client.get_user_collection() + + assert result[0]['artist_name'] == 'John Smith' + + def test_get_user_collection_handles_missing_year(authed_client): """Year 0 / missing → empty release_date string (NOT '0').""" fake_response = { diff --git a/webui/static/helper.js b/webui/static/helper.js index 6ec49872..339c7ca4 100644 --- a/webui/static/helper.js +++ b/webui/static/helper.js @@ -3415,6 +3415,7 @@ function closeHelperSearch() { const WHATS_NEW = { '2.6.3': [ { unreleased: true }, + { title: 'Discogs: strip artist disambiguation suffixes everywhere', desc: 'Discogs marks duplicate-named artists with either `(N)` numeric suffixes ("Bullet (2)") or trailing asterisks ("John Smith*"). Only the `(N)` form was stripped, and only on the collection-import path — so artist + album search results in the UI surfaced raw "Foo*" or "Bar (2)" rows, and worse, import path inherited those characters into folder names on disk. centralized the cleanup into a single helper applied at every site a Discogs payload becomes a name string (artist search, album search, track lookups, get_artist_albums feature filtering). closes #634.' }, { title: 'Library: Enhanced / Standard view toggle now sticks per browser', desc: 'the artist detail page used to revert to Standard view every time you clicked into a new artist — flipping to Enhanced was a per-click thing with no memory. now the choice is persisted in localStorage and reapplied automatically on every artist open. survives page reloads too. non-admins (who can\'t toggle Enhanced anyway) are unaffected; admins who never touched the Enhanced view stay on Standard since the saved value defaults to it.' }, { title: 'Fix popup: manual matches survive Playlist Pipeline runs', desc: 'manually mapping a mirrored-playlist track via the Fix popup (search or MBID) saved correctly and the download went through, but the next Playlist Pipeline run kept reverting the match — the track flipped back to "Provider Changed" and you had to map it again. three things were ganging up on it: the manual-fix save always stamped `provider: \'spotify\'` even when the match came from MusicBrainz / iTunes / Deezer, so prepare-discovery saw a provider mismatch on the next run; the discovery worker also re-queued any matched_data that lacked `track_number` / `album.id` / `release_date` for re-discovery, and Fix-popup saves never carry those fields by design (search results don\'t include track_number, MBID lookups don\'t include album.id) — so manual fixes always looked "incomplete" and got overwritten; and the prepare-discovery provider check didn\'t honour the `manual_match` flag at all. all three patched — manual matches now stamp the actual source, the worker short-circuits on `manual_match` before the incomplete-data branch, and prepare-discovery treats manual fixes as cached regardless of provider drift.' }, { title: 'Fix popup: artist + track fields no longer surface unrelated covers', desc: 'separate artist / track inputs in the Fix popup used to dump both into a bare MusicBrainz keyword query — and MB\'s scorer heavily favours title matches, so searching "Coffee Break" + "Zeds Dead" surfaced random "Coffee Break" tracks by other artists ahead of the canonical Zeds Dead one. fields-mode now uses a field-scoped Lucene query that actually anchors the artist, with the old bare query kept as a fallback for diacritic mismatches like "Bjork" vs "Björk" where strict phrase match misses. results also stable-prefer entries with known track length so the canonical 3:04 sibling sits above the 0:00 duplicate.' },