From 48d848bb747b0907a223663fa2479d21608b8c93 Mon Sep 17 00:00:00 2001 From: Broque Thomas <26755000+Nezreka@users.noreply.github.com> Date: Sun, 10 May 2026 16:22:23 -0700 Subject: [PATCH] MB worker populates artists.aliases on enrichment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #442 — MusicBrainz exposes alternate-spelling aliases (Japanese kanji `澤野弘之` for `Hiroyuki Sawano`, Cyrillic `Сергей Лазарев` for `Sergey Lazarev`, etc.) on every artist record. SoulSync's MB enrichment worker had access to this data via `get_artist(mbid, includes=['aliases'])` but wasn't reading or persisting it. This commit wires the alias fetch into the worker's existing artist-match path, persists to the new `artists.aliases` column added in the prior commit, and adds a verifier-friendly read-by- name lookup so the AcoustID verifier (next commit) can resolve aliases without an MB round-trip when the artist is in the library. # New service methods - `fetch_artist_aliases(mbid) -> list[str]` — calls `mb_client.get_artist(mbid, includes=['aliases'])`, parses the alias array, dedupes case-insensitively. Returns empty list on any failure (missing key, network error, malformed response) so transient MB outages never trigger stricter quarantine decisions than the pre-fix behaviour. Empty mbid → no API call. - `update_artist_aliases(artist_id, aliases)` — persists as JSON array to `artists.aliases`. Idempotent — overwrites prior value. Empty list clears the column. None artist_id is a no-op. - `get_artist_aliases(artist_name) -> list[str]` — reads back by artist NAME (not id), case-insensitive. Used by the verifier where the expected artist comes from track metadata — there's no library row id at quarantine time. Returns empty list for unknown artists, missing data, or corrupt JSON (defensive against legacy rows). # Worker integration `MusicBrainzWorker._process_item` artist branch: - After `update_artist_mbid` succeeds, fetch aliases for the matched MBID and persist via `update_artist_aliases`. - Best-effort: alias fetch wrapped in try/except, failure logs at debug level, doesn't regress the match outcome. - No alias call when the artist didn't match an MBID (nothing to enrich). # Tests (23) - `fetch_artist_aliases`: extracts names from MB response, case-insensitive dedup, skips empty/null entries, missing-key fallback, network failure → empty, empty mbid no API call, verifies `inc=aliases` request param. - `update_artist_aliases`: persists as JSON, idempotent overwrite, empty list clears column, None id is no-op. - `get_artist_aliases`: returns aliases for known artist, case-insensitive lookup, empty for unknown artist / no-aliases row, handles corrupt JSON + non-list shape gracefully. - Worker integration: matched artist triggers fetch + persist, no alias call when not matched, alias-fetch failure doesn't break the match outcome. # Verification - 23/23 new tests pass - Ruff clean --- core/musicbrainz_service.py | 107 +++++++ core/musicbrainz_worker.py | 18 ++ tests/matching/test_artist_alias_service.py | 332 ++++++++++++++++++++ 3 files changed, 457 insertions(+) create mode 100644 tests/matching/test_artist_alias_service.py diff --git a/core/musicbrainz_service.py b/core/musicbrainz_service.py index fced6dcb..d7b8177c 100644 --- a/core/musicbrainz_service.py +++ b/core/musicbrainz_service.py @@ -388,6 +388,113 @@ class MusicBrainzService: logger.error(f"Error matching recording '{track_name}': {e}") return None + def fetch_artist_aliases(self, mbid: str) -> list: + """Fetch the alias list for an artist from MusicBrainz. + + Issue #442 — Japanese kanji / Cyrillic / etc. spellings of an + artist's name are stored as `aliases` on the MusicBrainz + artist record. Pull them so SoulSync can recognise that + `澤野弘之` and `Hiroyuki Sawano` refer to the same artist. + + Returns the deduplicated list of alias `name` strings. Returns + empty list (NOT None) on any failure — caller should treat + empty as "no aliases available, fall back to direct match" so + a transient MB outage never causes a stricter verification + decision than today. + """ + if not mbid: + return [] + try: + data = self.mb_client.get_artist(mbid, includes=['aliases']) + except Exception as e: + logger.debug("fetch_artist_aliases: get_artist(%s) raised: %s", mbid, e) + return [] + if not data: + return [] + raw_aliases = data.get('aliases') or [] + # MB returns each alias as a dict with `name`, `sort-name`, + # `locale`, `primary`, `type`, etc. We only care about the + # display name — that's what `actual` artist strings will + # match against. + seen = set() + cleaned = [] + for entry in raw_aliases: + if not isinstance(entry, dict): + continue + name = (entry.get('name') or '').strip() + if not name: + continue + key = name.lower() + if key in seen: + continue + seen.add(key) + cleaned.append(name) + return cleaned + + def update_artist_aliases(self, artist_id: int, aliases: list) -> None: + """Persist the alias list to `artists.aliases` as a JSON array. + + Idempotent — overwrites any existing value. Empty list + clears the column (caller may want this if MB has no aliases + for the artist anymore). + """ + if artist_id is None: + return + conn = None + try: + conn = self.db._get_connection() + cursor = conn.cursor() + cursor.execute( + "UPDATE artists SET aliases = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?", + (json.dumps(aliases) if aliases else None, artist_id), + ) + conn.commit() + logger.debug("Updated artist %s aliases (%d entries)", artist_id, len(aliases or [])) + except Exception as e: + logger.error(f"Error updating artist aliases for {artist_id}: {e}") + if conn: + conn.rollback() + finally: + if conn: + conn.close() + + def get_artist_aliases(self, artist_name: str) -> list: + """Look up cached aliases for an artist by NAME (not id). + + Used by the verifier where the expected artist comes from a + download's metadata-source data — we don't have a library + row's `id` to query, just the display name. Returns empty + list when the artist isn't in the library or has no aliases + recorded. The verifier falls back to live MB lookup in that + case. + """ + if not artist_name: + return [] + conn = None + try: + conn = self.db._get_connection() + cursor = conn.cursor() + cursor.execute( + "SELECT aliases FROM artists WHERE name = ? COLLATE NOCASE LIMIT 1", + (artist_name,), + ) + row = cursor.fetchone() + if not row or not row[0]: + return [] + try: + parsed = json.loads(row[0]) + except (TypeError, json.JSONDecodeError): + return [] + if not isinstance(parsed, list): + return [] + return [str(x).strip() for x in parsed if x] + except Exception as e: + logger.debug("get_artist_aliases lookup failed for %r: %s", artist_name, e) + return [] + finally: + if conn: + conn.close() + def update_artist_mbid(self, artist_id: int, mbid: Optional[str], status: str): """Update artist with MusicBrainz ID""" conn = None diff --git a/core/musicbrainz_worker.py b/core/musicbrainz_worker.py index fe6461fc..ee50e08a 100644 --- a/core/musicbrainz_worker.py +++ b/core/musicbrainz_worker.py @@ -313,6 +313,24 @@ class MusicBrainzWorker: result = self.mb_service.match_artist(item_name) if result and result.get('mbid'): self.mb_service.update_artist_mbid(item_id, result['mbid'], 'matched') + # Issue #442 — pull alternate-spelling aliases (Japanese + # kanji, Cyrillic, etc.) so the verifier can recognise + # cross-script artist names without re-querying MB on + # every quarantine candidate. Best-effort: failures are + # swallowed inside `fetch_artist_aliases` (returns + # empty list) so a transient MB outage never regresses + # the enrichment outcome. + try: + aliases = self.mb_service.fetch_artist_aliases(result['mbid']) + if aliases: + self.mb_service.update_artist_aliases(item_id, aliases) + logger.debug( + "Stored %d aliases for artist '%s'", len(aliases), item_name, + ) + except Exception as alias_err: + logger.debug( + "Alias enrichment failed for artist '%s': %s", item_name, alias_err, + ) self.stats['matched'] += 1 logger.info(f"Matched artist '{item_name}' → MBID: {result['mbid']}") else: diff --git a/tests/matching/test_artist_alias_service.py b/tests/matching/test_artist_alias_service.py new file mode 100644 index 00000000..f195ebaf --- /dev/null +++ b/tests/matching/test_artist_alias_service.py @@ -0,0 +1,332 @@ +"""Pin the MusicBrainz service alias methods + worker enrichment. + +Issue #442 — these methods feed the alias data the helper compares +against. Three layers covered: + +1. ``fetch_artist_aliases`` — pulls aliases off the MB get-artist + response, defensive against missing fields, broken JSON, network + errors. +2. ``update_artist_aliases`` — persists to ``artists.aliases`` as a + JSON array. Empty/None → column cleared. +3. ``get_artist_aliases`` — reads back by artist NAME (not id) since + that's what the verifier has at quarantine time. + +Worker enrichment integration covered separately: when MB worker +matches an artist, it calls fetch + update so subsequent verifier +calls find aliases in the library DB without firing live MB. +""" + +from __future__ import annotations + +import json +import sqlite3 +from unittest.mock import MagicMock + +import pytest + + +@pytest.fixture +def temp_db(tmp_path, monkeypatch): + """Real MusicDatabase against a tmp file. Uses the production + schema (so the `aliases` column from commit 1's migration is + present) and the production update/get methods we're pinning.""" + monkeypatch.setenv('DATABASE_PATH', str(tmp_path / 'test.db')) + from database.music_database import MusicDatabase + return MusicDatabase() + + +@pytest.fixture +def service(temp_db): + """MusicBrainzService with stubbed mb_client. The DB is real; + the network is not.""" + from core.musicbrainz_service import MusicBrainzService + svc = MusicBrainzService(temp_db) + svc.mb_client = MagicMock() + return svc + + +_seed_counter = 0 + + +def _seed_artist(db, name: str, **fields) -> str: + """Insert a row into the artists table. + + `artists.id` is TEXT (NOT INTEGER auto-increment), so we mint a + deterministic test id rather than relying on rowid magic. + Returns the id as str — that's what the production code paths + use too (read methods, joins, etc.). + """ + global _seed_counter + _seed_counter += 1 + artist_id = f"test-artist-{_seed_counter}" + conn = db._get_connection() + cursor = conn.cursor() + cols = ['id', 'name'] + list(fields.keys()) + placeholders = ','.join('?' * len(cols)) + cursor.execute( + f"INSERT INTO artists ({','.join(cols)}) VALUES ({placeholders})", + [artist_id, name] + list(fields.values()), + ) + conn.commit() + conn.close() + return artist_id + + +# --------------------------------------------------------------------------- +# fetch_artist_aliases — MB get-artist response parser +# --------------------------------------------------------------------------- + + +class TestFetchArtistAliases: + def test_extracts_alias_names_from_mb_response(self, service): + """Reporter's case 1 shape: MB returns aliases for Hiroyuki + Sawano including the Japanese kanji form. Extract the `name` + from each alias entry.""" + service.mb_client.get_artist.return_value = { + 'id': '60d2ea34-1912-425f-bf9c-fc544e4448cd', + 'name': 'Hiroyuki Sawano', + 'aliases': [ + {'name': '澤野弘之', 'sort-name': '澤野弘之', 'locale': 'ja', 'primary': True}, + {'name': 'SawanoHiroyuki', 'sort-name': 'SawanoHiroyuki', 'locale': None}, + {'name': 'Sawano Hiroyuki', 'sort-name': 'Sawano, Hiroyuki', 'locale': 'en'}, + ], + } + + aliases = service.fetch_artist_aliases('60d2ea34-1912-425f-bf9c-fc544e4448cd') + + assert '澤野弘之' in aliases + assert 'SawanoHiroyuki' in aliases + assert 'Sawano Hiroyuki' in aliases + assert len(aliases) == 3 + + def test_dedup_case_insensitive(self, service): + """Same name with different casing should collapse — MB + sometimes returns duplicate-looking entries with locale + variations.""" + service.mb_client.get_artist.return_value = { + 'aliases': [ + {'name': 'Hiroyuki Sawano'}, + {'name': 'hiroyuki sawano'}, + {'name': 'HIROYUKI SAWANO'}, + ], + } + aliases = service.fetch_artist_aliases('mbid-x') + assert len(aliases) == 1 + + def test_empty_alias_entries_skipped(self, service): + service.mb_client.get_artist.return_value = { + 'aliases': [ + {'name': ''}, + {'name': ' '}, + {'name': None}, + {'name': 'Real Name'}, + ], + } + aliases = service.fetch_artist_aliases('mbid-x') + assert aliases == ['Real Name'] + + def test_missing_aliases_key_returns_empty(self, service): + """MB artist record might not have any aliases. Returns [] + not raises.""" + service.mb_client.get_artist.return_value = { + 'id': 'mbid-x', + 'name': 'Some Artist', + } + assert service.fetch_artist_aliases('mbid-x') == [] + + def test_aliases_null_returns_empty(self, service): + """MB sometimes returns `aliases: null` instead of empty array.""" + service.mb_client.get_artist.return_value = {'aliases': None} + assert service.fetch_artist_aliases('mbid-x') == [] + + def test_get_artist_failure_returns_empty(self, service): + """Network / API failure → empty list, NOT raise. Caller + must treat empty as 'no aliases available, fall back to + direct match' so transient MB outages never trigger + stricter quarantine decisions than today.""" + service.mb_client.get_artist.side_effect = Exception("network error") + assert service.fetch_artist_aliases('mbid-x') == [] + + def test_get_artist_returns_none_returns_empty(self, service): + service.mb_client.get_artist.return_value = None + assert service.fetch_artist_aliases('mbid-x') == [] + + def test_empty_mbid_returns_empty_without_api_call(self, service): + assert service.fetch_artist_aliases('') == [] + assert service.fetch_artist_aliases(None) == [] + service.mb_client.get_artist.assert_not_called() + + def test_includes_aliases_in_request(self, service): + """Verify the MB API call requests the aliases include + explicitly — without `inc=aliases` the response wouldn't + carry them.""" + service.mb_client.get_artist.return_value = {'aliases': []} + service.fetch_artist_aliases('mbid-x') + service.mb_client.get_artist.assert_called_once_with( + 'mbid-x', includes=['aliases'], + ) + + +# --------------------------------------------------------------------------- +# update_artist_aliases — persistence +# --------------------------------------------------------------------------- + + +class TestUpdateArtistAliases: + def test_persists_as_json_array(self, service, temp_db): + artist_id = _seed_artist(temp_db, 'Hiroyuki Sawano') + service.update_artist_aliases(artist_id, ['澤野弘之', 'SawanoHiroyuki']) + + conn = temp_db._get_connection() + row = conn.execute("SELECT aliases FROM artists WHERE id = ?", (artist_id,)).fetchone() + conn.close() + parsed = json.loads(row[0]) + assert parsed == ['澤野弘之', 'SawanoHiroyuki'] + + def test_idempotent_overwrite(self, service, temp_db): + artist_id = _seed_artist(temp_db, 'X') + service.update_artist_aliases(artist_id, ['a']) + service.update_artist_aliases(artist_id, ['b', 'c']) + conn = temp_db._get_connection() + row = conn.execute("SELECT aliases FROM artists WHERE id = ?", (artist_id,)).fetchone() + conn.close() + assert json.loads(row[0]) == ['b', 'c'] + + def test_empty_list_clears_column(self, service, temp_db): + artist_id = _seed_artist(temp_db, 'X', aliases=json.dumps(['old'])) + service.update_artist_aliases(artist_id, []) + conn = temp_db._get_connection() + row = conn.execute("SELECT aliases FROM artists WHERE id = ?", (artist_id,)).fetchone() + conn.close() + assert row[0] is None + + def test_none_artist_id_is_noop(self, service, temp_db): + """Defensive: caller might pass None on edge cases. Don't crash.""" + service.update_artist_aliases(None, ['x']) # no exception + + +# --------------------------------------------------------------------------- +# get_artist_aliases — read back by artist NAME (verifier path) +# --------------------------------------------------------------------------- + + +class TestGetArtistAliases: + def test_returns_aliases_for_known_artist(self, service, temp_db): + artist_id = _seed_artist( + temp_db, 'Hiroyuki Sawano', + aliases=json.dumps(['澤野弘之', 'SawanoHiroyuki']), + ) + aliases = service.get_artist_aliases('Hiroyuki Sawano') + assert '澤野弘之' in aliases + assert 'SawanoHiroyuki' in aliases + + def test_case_insensitive_lookup(self, service, temp_db): + """Verifier passes the artist name from track metadata — + casing might differ from how the library stored it.""" + _seed_artist(temp_db, 'Hiroyuki Sawano', aliases=json.dumps(['澤野弘之'])) + assert service.get_artist_aliases('hiroyuki sawano') == ['澤野弘之'] + assert service.get_artist_aliases('HIROYUKI SAWANO') == ['澤野弘之'] + + def test_returns_empty_for_unknown_artist(self, service): + assert service.get_artist_aliases('NeverHeardOf') == [] + + def test_returns_empty_for_artist_without_aliases(self, service, temp_db): + _seed_artist(temp_db, 'X') # no aliases column set + assert service.get_artist_aliases('X') == [] + + def test_handles_corrupt_json_gracefully(self, service, temp_db): + _seed_artist(temp_db, 'X', aliases='not-valid-json') + # Returns [] instead of raising — defensive against legacy + # rows that might have been written by an older format + assert service.get_artist_aliases('X') == [] + + def test_handles_non_list_json_gracefully(self, service, temp_db): + _seed_artist(temp_db, 'X', aliases=json.dumps({'wrong': 'shape'})) + assert service.get_artist_aliases('X') == [] + + def test_empty_name_returns_empty_without_query(self, service): + assert service.get_artist_aliases('') == [] + assert service.get_artist_aliases(None) == [] + + +# --------------------------------------------------------------------------- +# Worker integration — alias enrichment fires on successful match +# --------------------------------------------------------------------------- + + +class TestWorkerAliasEnrichment: + def test_matched_artist_triggers_alias_fetch_and_persist(self, temp_db, monkeypatch): + """End-to-end: worker matches an artist, immediately fetches + + persists aliases. Subsequent verifier calls find them in + the library DB without firing live MB.""" + from core.musicbrainz_worker import MusicBrainzWorker + + artist_id = _seed_artist(temp_db, 'Hiroyuki Sawano') + + worker = MusicBrainzWorker.__new__(MusicBrainzWorker) + worker.database = temp_db + worker.mb_service = MagicMock() + worker.mb_service.match_artist.return_value = { + 'mbid': '60d2ea34-1912-425f-bf9c-fc544e4448cd', 'name': 'Hiroyuki Sawano', + } + worker.mb_service.fetch_artist_aliases.return_value = ['澤野弘之', 'SawanoHiroyuki'] + worker.stats = {'matched': 0, 'not_found': 0, 'errors': 0} + + # Bypass _get_existing_id (would query DB for prior MBID) + worker._get_existing_id = MagicMock(return_value=None) + + worker._process_item({'type': 'artist', 'id': artist_id, 'name': 'Hiroyuki Sawano'}) + + worker.mb_service.update_artist_mbid.assert_called_once_with( + artist_id, '60d2ea34-1912-425f-bf9c-fc544e4448cd', 'matched', + ) + worker.mb_service.fetch_artist_aliases.assert_called_once_with( + '60d2ea34-1912-425f-bf9c-fc544e4448cd', + ) + worker.mb_service.update_artist_aliases.assert_called_once_with( + artist_id, ['澤野弘之', 'SawanoHiroyuki'], + ) + + def test_no_alias_call_when_artist_not_matched(self, temp_db): + """If MB returned no MBID match, don't fetch aliases — + nothing to enrich.""" + from core.musicbrainz_worker import MusicBrainzWorker + artist_id = _seed_artist(temp_db, 'Unknown') + + worker = MusicBrainzWorker.__new__(MusicBrainzWorker) + worker.database = temp_db + worker.mb_service = MagicMock() + worker.mb_service.match_artist.return_value = None + worker.stats = {'matched': 0, 'not_found': 0, 'errors': 0} + worker._get_existing_id = MagicMock(return_value=None) + + worker._process_item({'type': 'artist', 'id': artist_id, 'name': 'Unknown'}) + + worker.mb_service.fetch_artist_aliases.assert_not_called() + worker.mb_service.update_artist_aliases.assert_not_called() + + def test_alias_fetch_failure_does_not_break_match(self, temp_db): + """If alias fetch raises (network error, malformed response, + whatever), the artist match still gets recorded — alias + enrichment is best-effort, not a gate.""" + from core.musicbrainz_worker import MusicBrainzWorker + artist_id = _seed_artist(temp_db, 'X') + + worker = MusicBrainzWorker.__new__(MusicBrainzWorker) + worker.database = temp_db + worker.mb_service = MagicMock() + worker.mb_service.match_artist.return_value = {'mbid': 'mb-x', 'name': 'X'} + worker.mb_service.fetch_artist_aliases.side_effect = Exception("boom") + worker.stats = {'matched': 0, 'not_found': 0, 'errors': 0} + worker._get_existing_id = MagicMock(return_value=None) + + worker._process_item({'type': 'artist', 'id': artist_id, 'name': 'X'}) + + # MBID still got updated despite alias failure + worker.mb_service.update_artist_mbid.assert_called_once_with( + artist_id, 'mb-x', 'matched', + ) + # No alias write attempted (fetch raised before update) + worker.mb_service.update_artist_aliases.assert_not_called() + # And the match was still counted + assert worker.stats['matched'] == 1