Live MB lookup for un-enriched artists with cache

Previous commit only populated `artists.aliases` for artists the MB
worker had enriched. But the AcoustID verifier (next commit) needs
aliases for ANY expected artist — including:

- Artists not yet in the user's library (first download)
- Artists in the library where MB enrichment hasn't run yet
- Artists where MB enrichment ran but found no MBID (NULL aliases)

This commit adds a multi-tier resolution helper that fills those
gaps without thrashing the MB API.

# Multi-tier resolution

`lookup_artist_aliases(artist_name) -> list[str]`:

1. **Library DB** (fast path): existing `get_artist_aliases` lookup
   by name. No network. Most common path once the worker has
   enriched everything.
2. **Cache** (existing `musicbrainz_cache` table, entity_type=
   `artist_aliases`): a prior live lookup for this name. Empty
   cache hit is respected (don't re-query when MB previously had
   nothing).
3. **Live MB**: search artist by name → pick highest-confidence
   match (combined name-similarity + MB relevance) → fetch aliases
   for that MBID → cache the result.

Always returns a list (possibly empty), never raises. Empty result
on any tier means "no alternate spellings found, fall back to
direct match" — identical to the pre-fix behaviour.

# Threshold gate

The live lookup trusts an MB search result only when its combined
score (0.7 x name similarity + 0.3 x MB relevance) is >= 0.6. Below
that we would be guessing at the artist: searching `John Smith`
returns several different John Smiths, and pulling aliases for the
wrong one would produce false matches. The empty result is cached
so we don't keep re-searching the same low-confidence name.

# Performance contract

Critical for the verifier path: 100 quarantine candidates with the
same expected artist must NOT trigger 100 MB API calls. The second
and every subsequent call for a given artist name is served from
the cache. This is verified by a test that pins the call counts.

# Tests added (8)

- Tier 1 library DB hit — no MB API call fired
- Tier 3 live MB lookup → search → fetch → returns aliases
- Tier 2 cache hit on second call — no re-query
- Empty input → empty return + no API call
- Network failure on search → empty + cached so we don't retry
- No search results → empty + cached
- Low-confidence match (sim < 0.6) skipped — defends against
  picking the wrong artist
- Library row exists but aliases NULL → falls through to live
  lookup (defends against the half-enriched state)

# Verification

- 31/31 service tests pass (8 new + 23 prior)
- Ruff clean
pull/541/head
Broque Thomas 1 week ago
parent 48d848bb74
commit 15244f24cf

@ -388,6 +388,84 @@ class MusicBrainzService:
logger.error(f"Error matching recording '{track_name}': {e}")
return None
def lookup_artist_aliases(self, artist_name: str) -> list:
    """Find alternate-spelling aliases for an artist by NAME.

    Multi-tier resolution:

    1. Library DB row, via ``get_artist_aliases`` (fast path; this
       method makes no network call when it hits).
    2. Existing cache entry (entity_type ``'artist_aliases'``) holding
       a prior live MB lookup for this name. A hit that carries no
       alias list is respected, so the name is not re-queried.
    3. Live MB lookup: search by name, pick the best-scoring candidate,
       fetch aliases for its MBID, then cache the result.

    An empty result on any tier means "no alternate spellings found,
    fall back to direct match", which is identical to the behaviour
    before alias lookup existed.

    Used by the AcoustID verifier when an artist comparison fails the
    direct similarity check. Because every tier-3 outcome is cached,
    repeated calls for the same name are served from tier 2 instead of
    hitting MB again.

    Args:
        artist_name: Artist name to resolve. Falsy values (``''``,
            ``None``) short-circuit to an empty list.

    Returns:
        A list of alias strings, possibly empty. Search errors and
        malformed search candidates are absorbed here; the other
        helpers called (``get_artist_aliases``, ``_check_cache``,
        ``_save_to_cache``, ``fetch_artist_aliases``) are not guarded
        by this method.
    """
    # Weights mirror `match_artist`: our own name similarity dominates,
    # MB's relevance score (0-100) is a tie-breaker.
    similarity_weight = 0.7
    relevance_weight = 0.3
    # Below this combined score we would be guessing at the artist.
    min_confidence = 0.6

    if not artist_name:
        return []

    # Tier 1: library DB
    library = self.get_artist_aliases(artist_name)
    if library:
        return library

    # Tier 2: cached live lookup (re-uses musicbrainz_cache table)
    cached = self._check_cache('artist_aliases', artist_name)
    if cached:
        metadata = cached.get('metadata') or {}
        aliases = metadata.get('aliases') if isinstance(metadata, dict) else None
        if isinstance(aliases, list):
            return [str(x).strip() for x in aliases if x]
        # Cache hit with empty result: respect it (don't re-query)
        return []

    # Tier 3: live MB lookup. Search -> fetch by MBID -> cache.
    try:
        results = self.mb_client.search_artist(artist_name, limit=3)
    except Exception as e:
        # NOTE(review): a failed search is cached exactly like a genuine
        # miss. That is deliberate (avoids hammering MB during an outage)
        # but means the name stays alias-less until the cache entry goes.
        logger.debug("lookup_artist_aliases: search_artist(%r) raised: %s", artist_name, e)
        self._save_to_cache('artist_aliases', artist_name, None, None, {'aliases': []}, 0)
        return []

    if not results:
        self._save_to_cache('artist_aliases', artist_name, None, None, {'aliases': []}, 0)
        return []

    # Pick the best match: highest combined score of MB's relevance and
    # our name-similarity check.
    best_mbid = None
    best_score = 0.0
    for result in results:
        if not isinstance(result, dict):
            continue
        mbid = result.get('id')
        if not mbid:
            # Without an MBID there is nothing to fetch; don't let this
            # candidate shadow a usable one further down the list.
            continue
        # The relevance score may arrive as a string or be null; treat
        # anything unparseable as zero instead of raising.
        try:
            mb_score = float(result.get('score') or 0)
        except (TypeError, ValueError):
            mb_score = 0.0
        mb_name = result.get('name') or ''
        sim = self._calculate_similarity(artist_name, mb_name)
        combined = (sim * similarity_weight) + (mb_score / 100 * relevance_weight)
        if combined > best_score:
            best_score = combined
            best_mbid = mbid

    # Threshold: only trust the lookup when name + MB-relevance combined
    # is reasonably high. Otherwise we could pull in aliases for the
    # wrong artist.
    if not best_mbid or best_score < min_confidence:
        self._save_to_cache('artist_aliases', artist_name, None, None, {'aliases': []}, 0)
        return []

    # Normalise a None/empty fetch result so callers always get a list.
    aliases = list(self.fetch_artist_aliases(best_mbid) or [])
    self._save_to_cache(
        'artist_aliases', artist_name, None, best_mbid,
        {'aliases': aliases}, int(best_score * 100),
    )
    return aliases
def fetch_artist_aliases(self, mbid: str) -> list:
"""Fetch the alias list for an artist from MusicBrainz.

@ -330,3 +330,114 @@ class TestWorkerAliasEnrichment:
worker.mb_service.update_artist_aliases.assert_not_called()
# And the match was still counted
assert worker.stats['matched'] == 1
# ---------------------------------------------------------------------------
# lookup_artist_aliases — multi-tier resolution (library → cache → live)
# ---------------------------------------------------------------------------
class TestLookupArtistAliasesMultiTier:
    """Pin the library -> cache -> live-MB resolution order of
    `lookup_artist_aliases`, including the call counts that keep the
    verifier path from hammering the MB API."""

    def test_tier1_library_db_hit(self, service, temp_db):
        """Fast path: artist already enriched in library DB.
        No MB API call fired."""
        _seed_artist(temp_db, 'Hiroyuki Sawano',
                     aliases=json.dumps(['澤野弘之', 'SawanoHiroyuki']))

        aliases = service.lookup_artist_aliases('Hiroyuki Sawano')

        assert '澤野弘之' in aliases
        service.mb_client.search_artist.assert_not_called()
        service.mb_client.get_artist.assert_not_called()

    def test_tier3_live_mb_lookup_when_not_in_library(self, service, temp_db):
        """Cache miss + library miss -> MB search -> fetch by MBID ->
        cache the result."""
        service.mb_client.search_artist.return_value = [
            {'id': 'mb-sawano', 'name': 'Hiroyuki Sawano', 'score': 100},
        ]
        service.mb_client.get_artist.return_value = {
            'aliases': [{'name': '澤野弘之'}, {'name': 'SawanoHiroyuki'}],
        }

        aliases = service.lookup_artist_aliases('Hiroyuki Sawano')

        assert '澤野弘之' in aliases
        service.mb_client.search_artist.assert_called_once()
        service.mb_client.get_artist.assert_called_once_with(
            'mb-sawano', includes=['aliases'],
        )

    def test_tier2_cache_hit_skips_live_lookup(self, service, temp_db):
        """Second call for same artist hits the cache, doesn't
        re-query MB. Critical for the verifier path: 100 quarantine
        candidates with the same artist must NOT trigger 100 MB
        calls."""
        service.mb_client.search_artist.return_value = [
            {'id': 'mb-x', 'name': 'X', 'score': 100},
        ]
        service.mb_client.get_artist.return_value = {
            'aliases': [{'name': 'X-alias'}],
        }

        # First call populates the cache
        first = service.lookup_artist_aliases('X')
        # Second call should be served from it
        second = service.lookup_artist_aliases('X')

        assert first == second == ['X-alias']
        # Only ONE round-trip to MB despite two calls
        assert service.mb_client.search_artist.call_count == 1
        assert service.mb_client.get_artist.call_count == 1

    def test_empty_name_returns_empty_no_api_call(self, service):
        """Falsy names short-circuit before any tier is consulted."""
        assert service.lookup_artist_aliases('') == []
        assert service.lookup_artist_aliases(None) == []
        service.mb_client.search_artist.assert_not_called()

    def test_search_failure_returns_empty(self, service):
        """Network outage on search: return empty, cache the empty
        result so we don't keep retrying."""
        service.mb_client.search_artist.side_effect = Exception("network down")

        aliases = service.lookup_artist_aliases('Anyone')

        assert aliases == []
        # Second call should hit the cached empty result, not re-search
        assert service.lookup_artist_aliases('Anyone') == []
        assert service.mb_client.search_artist.call_count == 1

    def test_no_search_results_returns_empty(self, service):
        """Artist not found on MB: empty return, cached so we
        don't re-search the same name forever."""
        service.mb_client.search_artist.return_value = []

        aliases = service.lookup_artist_aliases('NeverHeardOf')

        assert aliases == []
        # Second call should hit cache, not re-search
        service.lookup_artist_aliases('NeverHeardOf')
        assert service.mb_client.search_artist.call_count == 1

    def test_low_confidence_match_skipped(self, service):
        """Search returned something but the name similarity is too
        low, so don't trust it. Could pull in aliases for the wrong
        artist (e.g. searching 'John Smith' returns a different
        John Smith). Empty return + cached."""
        service.mb_client.search_artist.return_value = [
            {'id': 'mb-different', 'name': 'Completely Different Artist', 'score': 30},
        ]

        aliases = service.lookup_artist_aliases('Hiroyuki Sawano')

        assert aliases == []
        # Didn't even try fetching aliases for the bad match
        service.mb_client.get_artist.assert_not_called()
        # The rejection itself is cached: a repeat call must not re-search
        assert service.lookup_artist_aliases('Hiroyuki Sawano') == []
        assert service.mb_client.search_artist.call_count == 1
        service.mb_client.get_artist.assert_not_called()

    def test_library_with_empty_aliases_falls_through_to_live(self, service, temp_db):
        """Edge case: library has the artist but `aliases` column is
        NULL (worker hasn't enriched yet). Don't get stuck; fall
        through to live MB lookup."""
        _seed_artist(temp_db, 'Hiroyuki Sawano')  # no aliases
        service.mb_client.search_artist.return_value = [
            {'id': 'mb-sawano', 'name': 'Hiroyuki Sawano', 'score': 100},
        ]
        service.mb_client.get_artist.return_value = {
            'aliases': [{'name': '澤野弘之'}],
        }

        aliases = service.lookup_artist_aliases('Hiroyuki Sawano')

        assert '澤野弘之' in aliases
        service.mb_client.search_artist.assert_called_once()

Loading…
Cancel
Save