From 15244f24cf72c4d965d43d6579fa3c71bd9c8791 Mon Sep 17 00:00:00 2001 From: Broque Thomas <26755000+Nezreka@users.noreply.github.com> Date: Sun, 10 May 2026 16:25:30 -0700 Subject: [PATCH] Live MB lookup for un-enriched artists with cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit only populated `artists.aliases` for artists the MB worker had enriched. But the AcoustID verifier (next commit) needs aliases for ANY expected artist — including: - Artists not yet in the user's library (first download) - Artists in the library where MB enrichment hasn't run yet - Artists where MB enrichment ran but found no MBID (NULL aliases) This commit adds a multi-tier resolution helper that fills those gaps without thrashing the MB API. # Multi-tier resolution `lookup_artist_aliases(artist_name) -> list[str]`: 1. **Library DB** (fast path): existing `get_artist_aliases` lookup by name. No network. Most common path once the worker has enriched everything. 2. **Cache** (existing `musicbrainz_cache` table, entity_type=`artist_aliases`): a prior live lookup for this name. An empty cache hit is respected (don't re-query when MB previously had nothing). 3. **Live MB**: search artist by name → pick highest-confidence match (combined name-similarity + MB relevance) → fetch aliases for that MBID → cache the result. Always returns a list (possibly empty), never raises. An empty final result means "no alternate spellings found, fall back to direct match" — identical to the pre-fix behaviour. # Threshold gate The live lookup only trusts the MB search result when the combined score (0.7 × name similarity + 0.3 × MB relevance) is >= 0.6. Below that, we'd be guessing at the wrong artist — searching `John Smith` returns multiple John Smiths, and pulling aliases for one of them could attach the wrong artist's aliases. The empty result is cached so we don't keep re-searching the same low-confidence name. 
# Performance contract Critical for the verifier path: 100 quarantine candidates with the same expected artist must NOT trigger 100 MB API calls. Cache hit on second + subsequent calls per unique artist name. Verified by test pinning the call counts. # Tests added (8) - Tier 1 library DB hit — no MB API call fired - Tier 3 live MB lookup → search → fetch → returns aliases - Tier 2 cache hit on second call — no re-query - Empty input → empty return + no API call - Network failure on search → empty + cached so we don't retry - No search results → empty + cached - Low-confidence match (sim < 0.6) skipped — defends against picking the wrong artist - Library row exists but aliases NULL → falls through to live lookup (defends against the half-enriched state) # Verification - 31/31 service tests pass (8 new + 23 prior) - Ruff clean --- core/musicbrainz_service.py | 78 ++++++++++++++ tests/matching/test_artist_alias_service.py | 111 ++++++++++++++++++++ 2 files changed, 189 insertions(+) diff --git a/core/musicbrainz_service.py b/core/musicbrainz_service.py index d7b8177c..1ca39463 100644 --- a/core/musicbrainz_service.py +++ b/core/musicbrainz_service.py @@ -388,6 +388,84 @@ class MusicBrainzService: logger.error(f"Error matching recording '{track_name}': {e}") return None + def lookup_artist_aliases(self, artist_name: str) -> list: + """Find alternate-spelling aliases for an artist by NAME. + + Multi-tier resolution: + 1. Library DB row (`artists.aliases` populated by the MB + worker when the artist was enriched). Fast path — no + network. + 2. Existing musicbrainz_cache entry (entity_type='artist_aliases') + — caches a prior live MB lookup for this name. + 3. Live MB lookup: search artist → fetch aliases for the best + MBID → cache the result. + + Always returns a list (possibly empty) — never raises. Empty + result on any tier means "no alternate spellings found, fall + back to direct match" which is identical to pre-fix behaviour. 
+ + Used by the AcoustID verifier when an artist comparison fails + the direct similarity check. Caching means each unique artist + name only hits MB once per cache TTL even if 100 download + candidates fail verification with that artist. + """ + if not artist_name: + return [] + + # Tier 1: library DB + library = self.get_artist_aliases(artist_name) + if library: + return library + + # Tier 2: cached live lookup (re-uses musicbrainz_cache table) + cached = self._check_cache('artist_aliases', artist_name) + if cached: + metadata = cached.get('metadata') or {} + aliases = metadata.get('aliases') if isinstance(metadata, dict) else None + if isinstance(aliases, list): + return [str(x).strip() for x in aliases if x] + # Cache hit with empty result — respect it (don't re-query) + return [] + + # Tier 3: live MB lookup. Search → fetch by MBID → cache. + try: + results = self.mb_client.search_artist(artist_name, limit=3) + except Exception as e: + logger.debug("lookup_artist_aliases: search_artist(%r) raised: %s", artist_name, e) + self._save_to_cache('artist_aliases', artist_name, None, None, {'aliases': []}, 0) + return [] + + if not results: + self._save_to_cache('artist_aliases', artist_name, None, None, {'aliases': []}, 0) + return [] + + # Pick the best match — highest combined score of MB's relevance + # and our name-similarity check (mirrors `match_artist`). + best_mbid = None + best_score = 0 + for result in results: + mb_name = result.get('name', '') + mb_score = result.get('score', 0) + sim = self._calculate_similarity(artist_name, mb_name) + combined = (sim * 0.7) + (mb_score / 100 * 0.3) + if combined > best_score: + best_score = combined + best_mbid = result.get('id') + + # Threshold: only trust the lookup when name + MB-relevance + # combined is reasonably high. Otherwise we're guessing, + # which could pull in aliases for the wrong artist. 
+ if not best_mbid or best_score < 0.6: + self._save_to_cache('artist_aliases', artist_name, None, None, {'aliases': []}, 0) + return [] + + aliases = self.fetch_artist_aliases(best_mbid) + self._save_to_cache( + 'artist_aliases', artist_name, None, best_mbid, + {'aliases': aliases}, int(best_score * 100), + ) + return aliases + def fetch_artist_aliases(self, mbid: str) -> list: """Fetch the alias list for an artist from MusicBrainz. diff --git a/tests/matching/test_artist_alias_service.py b/tests/matching/test_artist_alias_service.py index f195ebaf..5c474309 100644 --- a/tests/matching/test_artist_alias_service.py +++ b/tests/matching/test_artist_alias_service.py @@ -330,3 +330,114 @@ class TestWorkerAliasEnrichment: worker.mb_service.update_artist_aliases.assert_not_called() # And the match was still counted assert worker.stats['matched'] == 1 + + +# --------------------------------------------------------------------------- +# lookup_artist_aliases — multi-tier resolution (library → cache → live) +# --------------------------------------------------------------------------- + + +class TestLookupArtistAliasesMultiTier: + def test_tier1_library_db_hit(self, service, temp_db): + """Fast path: artist already enriched in library DB. 
+ No MB API call fired.""" + _seed_artist(temp_db, 'Hiroyuki Sawano', + aliases=json.dumps(['澤野弘之', 'SawanoHiroyuki'])) + + aliases = service.lookup_artist_aliases('Hiroyuki Sawano') + + assert '澤野弘之' in aliases + service.mb_client.search_artist.assert_not_called() + service.mb_client.get_artist.assert_not_called() + + def test_tier3_live_mb_lookup_when_not_in_library(self, service, temp_db): + """Cache miss + library miss → MB search → fetch by MBID → + cache the result.""" + service.mb_client.search_artist.return_value = [ + {'id': 'mb-sawano', 'name': 'Hiroyuki Sawano', 'score': 100}, + ] + service.mb_client.get_artist.return_value = { + 'aliases': [{'name': '澤野弘之'}, {'name': 'SawanoHiroyuki'}], + } + + aliases = service.lookup_artist_aliases('Hiroyuki Sawano') + + assert '澤野弘之' in aliases + service.mb_client.search_artist.assert_called_once() + service.mb_client.get_artist.assert_called_once_with( + 'mb-sawano', includes=['aliases'], + ) + + def test_tier2_cache_hit_skips_live_lookup(self, service, temp_db): + """Second call for same artist hits the cache, doesn't + re-query MB. 
Critical for the verifier path — 100 quarantine + candidates with the same artist must NOT trigger 100 MB + calls.""" + service.mb_client.search_artist.return_value = [ + {'id': 'mb-x', 'name': 'X', 'score': 100}, + ] + service.mb_client.get_artist.return_value = { + 'aliases': [{'name': 'X-alias'}], + } + + # First call — populates cache + first = service.lookup_artist_aliases('X') + # Second call — should be cached + second = service.lookup_artist_aliases('X') + + assert first == second == ['X-alias'] + # Only ONE round-trip to MB despite two calls + assert service.mb_client.search_artist.call_count == 1 + assert service.mb_client.get_artist.call_count == 1 + + def test_empty_name_returns_empty_no_api_call(self, service): + assert service.lookup_artist_aliases('') == [] + assert service.lookup_artist_aliases(None) == [] + service.mb_client.search_artist.assert_not_called() + + def test_search_failure_returns_empty(self, service): + """Network outage on search — return empty, cache the empty + result so we don't keep retrying.""" + service.mb_client.search_artist.side_effect = Exception("network down") + aliases = service.lookup_artist_aliases('Anyone') + assert aliases == [] + + def test_no_search_results_returns_empty(self, service): + """Artist not found on MB — empty return, cached so we + don't re-search the same name forever.""" + service.mb_client.search_artist.return_value = [] + aliases = service.lookup_artist_aliases('NeverHeardOf') + assert aliases == [] + # Second call should hit cache, not re-search + service.lookup_artist_aliases('NeverHeardOf') + assert service.mb_client.search_artist.call_count == 1 + + def test_low_confidence_match_skipped(self, service): + """Search returned something but the name similarity is too + low — don't trust it. Could pull in aliases for the wrong + artist (e.g. searching 'John Smith' returns a different + John Smith). 
Empty return + cached.""" + service.mb_client.search_artist.return_value = [ + {'id': 'mb-different', 'name': 'Completely Different Artist', 'score': 30}, + ] + aliases = service.lookup_artist_aliases('Hiroyuki Sawano') + assert aliases == [] + # Didn't even try fetching aliases for the bad match + service.mb_client.get_artist.assert_not_called() + + def test_library_with_empty_aliases_falls_through_to_live(self, service, temp_db): + """Edge case: library has the artist but `aliases` column is + NULL (worker hasn't enriched yet). Don't get stuck — fall + through to live MB lookup.""" + _seed_artist(temp_db, 'Hiroyuki Sawano') # no aliases + service.mb_client.search_artist.return_value = [ + {'id': 'mb-sawano', 'name': 'Hiroyuki Sawano', 'score': 100}, + ] + service.mb_client.get_artist.return_value = { + 'aliases': [{'name': '澤野弘之'}], + } + + aliases = service.lookup_artist_aliases('Hiroyuki Sawano') + + assert '澤野弘之' in aliases + service.mb_client.search_artist.assert_called_once()