diff --git a/core/acoustid_verification.py b/core/acoustid_verification.py index 9e4d2d7a..1ad779e0 100644 --- a/core/acoustid_verification.py +++ b/core/acoustid_verification.py @@ -90,7 +90,7 @@ def _similarity(a: str, b: str) -> float: def _alias_aware_artist_sim( expected_artist: str, actual_artist: str, - aliases: Optional[List[str]] = None, + aliases: Optional[Any] = None, ) -> float: """Best artist-similarity across (expected, *aliases) vs actual. @@ -104,12 +104,39 @@ def _alias_aware_artist_sim( threshold checks (>= ARTIST_MATCH_THRESHOLD) keep their semantics. When `aliases` is None or empty, behaves identically to the prior raw `_similarity(expected, actual)` call. + + `aliases` accepts two shapes: + + - **Iterable** (list/tuple/set of strings): used directly. Used + by tests that already know the aliases. + - **Callable**: invoked LAZILY only when direct similarity + falls below the threshold. Lets the verifier pass a memoizing + thunk that resolves aliases (DB / cache / live MB) only when + needed. Verifications where the direct match already passes + never trigger the lookup chain — no wasted DB query for the + happy path. """ from core.matching.artist_aliases import artist_names_match + + direct = _similarity(expected_artist, actual_artist) + # Fast path — direct match already passes the threshold OR caller + # supplied no aliases handle. Avoids any lookup work. + if aliases is None: + return direct + if direct >= ARTIST_MATCH_THRESHOLD: + return direct + + # Resolve the iterable. Callable provider invoked NOW (lazily — + # the caller can memoize the result across multiple invocations + # within one verify_audio_file call). + resolved = aliases() if callable(aliases) else aliases + if not resolved: + return direct + _matched, score = artist_names_match( expected_artist, actual_artist, - aliases=aliases, + aliases=resolved, threshold=ARTIST_MATCH_THRESHOLD, similarity=_similarity, ) @@ -120,17 +147,22 @@ def _find_best_title_artist_match( recordings: List[Dict[str, Any]], expected_title: str, expected_artist: str, - expected_artist_aliases: Optional[List[str]] = None, + expected_artist_aliases: Optional[Any] = None, ) -> Tuple[Optional[Dict], float, float]: """ Find the AcoustID recording that best matches expected title/artist. Issue #442 — `expected_artist_aliases` (when supplied) is the list of alternate spellings for `expected_artist` (Japanese - kanji, Cyrillic, etc.). Each recording's artist is scored - against (expected, *aliases) and the best score wins. When the - list is empty or omitted, behavior is identical to the prior - raw similarity comparison. + kanji, Cyrillic, etc.). Accepts either: + + - An iterable of alias strings (used eagerly), or + - A callable returning the list (resolved lazily — only fires + when at least one recording fails direct artist similarity). + + Each recording's artist is scored against (expected, *aliases) + and the best score wins. When the list is empty/omitted/None, + behavior is identical to the prior raw similarity comparison. Returns: (best_recording, title_similarity, artist_similarity) @@ -371,23 +403,33 @@ class AcoustIDVerification: # Enrich recordings that are missing title/artist via MusicBrainz lookup recordings = _enrich_recordings_from_musicbrainz(recordings) - # Issue #442 — resolve alternate-spelling aliases for the - # expected artist ONCE. Multi-tier resolution (library DB - # → cache → live MB), cached per artist name so 100 - # quarantine candidates with the same artist don't trigger - # 100 MB API calls. Empty list on any failure → verifier - # falls back to prior direct-similarity behaviour. - expected_artist_aliases = _resolve_expected_artist_aliases(expected_artist_name) - if expected_artist_aliases: - logger.debug( - "Resolved %d aliases for expected artist '%s'", - len(expected_artist_aliases), expected_artist_name, - ) + # Issue #442 — alias resolution is LAZY. We pass a memoising + # thunk to the artist-comparison sites; it only fires the + # multi-tier lookup (library DB → cache → live MB) when + # direct artist similarity falls below threshold. Verifications + # where the direct match already passes (the common case for + # same-script artist names) never trigger any lookup work, + # so the fix doesn't add a per-verification DB query for the + # happy path. When the thunk DOES fire, the result is cached + # in the closure so the 3 comparison sites within one + # verification share a single resolution pass. + _alias_cache: Dict[str, Any] = {} + + def _aliases_provider() -> List[str]: + if 'value' not in _alias_cache: + resolved = _resolve_expected_artist_aliases(expected_artist_name) + _alias_cache['value'] = resolved + if resolved: + logger.debug( + "Resolved %d aliases for expected artist '%s'", + len(resolved), expected_artist_name, + ) + return _alias_cache['value'] # Step 4: Find best title/artist match among AcoustID results best_rec, title_sim, artist_sim = _find_best_title_artist_match( recordings, expected_track_name, expected_artist_name, - expected_artist_aliases=expected_artist_aliases, + expected_artist_aliases=_aliases_provider, ) if not best_rec: @@ -449,7 +491,7 @@ class AcoustIDVerification: for rec in recordings: rec_artist = rec.get('artist', '') if _alias_aware_artist_sim( - expected_artist_name, rec_artist, expected_artist_aliases, + expected_artist_name, rec_artist, _aliases_provider, ) >= ARTIST_MATCH_THRESHOLD: msg = ( f"Audio verified: found '{expected_track_name}' by '{expected_artist_name}' " @@ -499,7 +541,7 @@ class AcoustIDVerification: continue if (_similarity(expected_track_name, t) >= TITLE_MATCH_THRESHOLD and _alias_aware_artist_sim( - expected_artist_name, a, expected_artist_aliases, + expected_artist_name, a, _aliases_provider, ) >= ARTIST_MATCH_THRESHOLD): msg = ( f"Audio verified: found '{t}' by '{a}' in AcoustID results " diff --git a/core/musicbrainz_worker.py b/core/musicbrainz_worker.py index ee50e08a..bc494433 100644 --- a/core/musicbrainz_worker.py +++ b/core/musicbrainz_worker.py @@ -283,6 +283,31 @@ class MusicBrainzWorker: if conn: conn.close() + def _artist_aliases_empty(self, artist_id: Any) -> bool: + """Check if `artists.aliases` for this row is NULL or empty. + + Used by the existing-MBID backfill path to skip the MB call + when aliases are already populated (re-scan cycles after + backfill complete should be no-ops). Defensive: returns True + on any error so the backfill attempt happens — a redundant MB + call is cheaper than missing the backfill entirely. + """ + conn = None + try: + conn = self.db._get_connection() + cursor = conn.cursor() + cursor.execute("SELECT aliases FROM artists WHERE id = ? LIMIT 1", (artist_id,)) + row = cursor.fetchone() + if not row: + return False # Row doesn't exist — nothing to backfill + value = row[0] + return value is None or value == '' or value == '[]' + except Exception: + return True + finally: + if conn: + conn.close() + def _process_item(self, item: Dict[str, Any]): """Process a single item (artist, album, or track)""" try: @@ -301,6 +326,27 @@ class MusicBrainzWorker: try: if item_type == 'artist': self.mb_service.update_artist_mbid(item_id, existing_id, 'matched') + # Issue #442 — one-time backfill for artists + # enriched before alias support landed. Users with + # pre-existing libraries on day-one of this PR have + # MBIDs but NULL aliases. Fetch ONLY when the + # column is empty so re-scan cycles after backfill + # don't re-query MB. Best-effort: failures are + # logged at debug, don't regress the match outcome. + try: + if self._artist_aliases_empty(item_id): + aliases = self.mb_service.fetch_artist_aliases(existing_id) + if aliases: + self.mb_service.update_artist_aliases(item_id, aliases) + logger.debug( + "Backfilled %d aliases for artist '%s'", + len(aliases), item_name, + ) + except Exception as backfill_err: + logger.debug( + "Alias backfill failed for artist '%s': %s", + item_name, backfill_err, + ) elif item_type == 'album': self.mb_service.update_album_mbid(item_id, existing_id, 'matched') elif item_type == 'track': diff --git a/tests/matching/test_acoustid_verification_aliases.py b/tests/matching/test_acoustid_verification_aliases.py index c14ef6fc..80e7ae01 100644 --- a/tests/matching/test_acoustid_verification_aliases.py +++ b/tests/matching/test_acoustid_verification_aliases.py @@ -274,3 +274,128 @@ class TestAliasLookupCalledOncePerVerify: verifier.verify_audio_file('/fake/path.mp3', 'X', 'Hiroyuki Sawano') assert fake_service.lookup_artist_aliases.call_count == 1 + + +# --------------------------------------------------------------------------- +# Lazy alias resolution — happy path skips MB lookup entirely +# --------------------------------------------------------------------------- + + +class TestLazyAliasResolution: + """Issue #442 perf followup: alias lookup should ONLY fire when + the direct artist comparison fails. Verifications where artist + names already match (the 95% common case for same-script + libraries) must NOT trigger the lookup chain — no wasted DB + query, no wasted MB call.""" + + def test_no_lookup_when_direct_artist_match_passes(self, stubbed_verifier): + """Exact-match Latin-script artist passes verification with + zero alias lookups — no DB query, no MB call. Same-script + libraries (the 95% common case) inherit zero perf cost from + this PR.""" + verifier, fake_service = stubbed_verifier + + verifier.acoustid_client.fingerprint_and_lookup.return_value = { + 'best_score': 0.95, + 'recordings': [ + {'title': 'Dirty White Boy', 'artist': 'Foreigner'}, + ], + } + + result, _ = verifier.verify_audio_file( + '/fake/path.mp3', 'Dirty White Boy', 'Foreigner', + ) + + assert result == VerificationResult.PASS + # Critical — alias lookup must NOT have been called for the + # happy path. Otherwise every successful verification adds a + # DB query for nothing. + fake_service.lookup_artist_aliases.assert_not_called() + + def test_lookup_fires_only_when_direct_artist_match_fails(self, stubbed_verifier): + """Cross-script case where direct sim is 0% → lookup fires + as expected.""" + verifier, fake_service = stubbed_verifier + + verifier.acoustid_client.fingerprint_and_lookup.return_value = { + 'best_score': 0.95, + 'recordings': [ + {'title': 'YAMANAIAME', 'artist': '澤野弘之'}, + ], + } + fake_service.lookup_artist_aliases.return_value = ['澤野弘之'] + + result, _ = verifier.verify_audio_file( + '/fake/path.mp3', 'YAMANAIAME', 'Hiroyuki Sawano', + ) + + assert result == VerificationResult.PASS + # Lookup fired BECAUSE direct match would have failed + fake_service.lookup_artist_aliases.assert_called_once() + + def test_lookup_memoised_across_three_comparison_sites(self, stubbed_verifier): + """When lookup DOES fire, the result must be reused across + the three artist-comparison sites in the verifier (best-match + scoring, secondary scan, fallback scan). One resolution per + verification — not three.""" + verifier, fake_service = stubbed_verifier + + # Force a code path that hits multiple sites: title matches + # several recordings but the best-match's artist sim is below + # threshold (forces secondary scan path). + verifier.acoustid_client.fingerprint_and_lookup.return_value = { + 'best_score': 0.95, + 'recordings': [ + {'title': 'X', 'artist': 'Different Latin Artist'}, # 0 alias hit + {'title': 'X', 'artist': '澤野弘之'}, # alias hit + ], + } + fake_service.lookup_artist_aliases.return_value = ['澤野弘之'] + + verifier.verify_audio_file('/fake/path.mp3', 'X', 'Hiroyuki Sawano') + + # Memoised — one resolution shared across all sites + assert fake_service.lookup_artist_aliases.call_count == 1 + + +# --------------------------------------------------------------------------- +# Provider-callable contract on the helper +# --------------------------------------------------------------------------- + + +class TestAliasProviderCallable: + """Pin the dual-shape contract on `_alias_aware_artist_sim`: + accepts an iterable OR a callable. Callable resolves lazily.""" + + def test_iterable_passed_directly(self): + """Plain list — used as-is, no lazy semantics.""" + score = _alias_aware_artist_sim( + 'Hiroyuki Sawano', '澤野弘之', aliases=['澤野弘之'], + ) + assert score == 1.0 + + def test_callable_resolves_lazily_only_when_direct_fails(self): + """Callable provider — invoked ONLY when direct sim falls + below threshold.""" + call_count = [0] + + def provider(): + call_count[0] += 1 + return ['澤野弘之'] + + # Direct match passes → provider NOT called + _alias_aware_artist_sim('Foreigner', 'Foreigner', aliases=provider) + assert call_count[0] == 0 + + # Direct match fails → provider IS called + _alias_aware_artist_sim('Hiroyuki Sawano', '澤野弘之', aliases=provider) + assert call_count[0] == 1 + + def test_callable_returning_empty_list_falls_back_to_direct(self): + """Provider returns empty (e.g. MB had no aliases) → + score = direct sim, no error.""" + score = _alias_aware_artist_sim( + 'Hiroyuki Sawano', '澤野弘之', aliases=lambda: [], + ) + # ~0 because direct cross-script comparison fails + assert score < 0.1 diff --git a/tests/matching/test_artist_alias_service.py b/tests/matching/test_artist_alias_service.py index 5c474309..e891814e 100644 --- a/tests/matching/test_artist_alias_service.py +++ b/tests/matching/test_artist_alias_service.py @@ -305,6 +305,79 @@ class TestWorkerAliasEnrichment: worker.mb_service.fetch_artist_aliases.assert_not_called() worker.mb_service.update_artist_aliases.assert_not_called() + def test_existing_mbid_path_backfills_aliases_when_column_empty(self, temp_db): + """Issue #442 perf followup: existing-MBID short-circuit path + was skipping alias enrichment entirely. Users with libraries + enriched BEFORE this PR shipped have MBIDs but NULL aliases. + Worker should fetch aliases on the existing-id path too — + one-time backfill on first re-scan post-deploy.""" + from core.musicbrainz_worker import MusicBrainzWorker + artist_id = _seed_artist(temp_db, 'Hiroyuki Sawano') + + worker = MusicBrainzWorker.__new__(MusicBrainzWorker) + worker.database = temp_db + worker.db = temp_db # _artist_aliases_empty uses self.db + worker.mb_service = MagicMock() + worker.mb_service.fetch_artist_aliases.return_value = ['澤野弘之', 'SawanoHiroyuki'] + worker.stats = {'matched': 0, 'not_found': 0, 'errors': 0} + # Existing MBID path + worker._get_existing_id = MagicMock(return_value='mb-existing-id') + + worker._process_item({'type': 'artist', 'id': artist_id, 'name': 'Hiroyuki Sawano'}) + + # MBID was preserved + worker.mb_service.update_artist_mbid.assert_called_once_with( + artist_id, 'mb-existing-id', 'matched', + ) + # Aliases backfilled + worker.mb_service.fetch_artist_aliases.assert_called_once_with('mb-existing-id') + worker.mb_service.update_artist_aliases.assert_called_once_with( + artist_id, ['澤野弘之', 'SawanoHiroyuki'], + ) + + def test_existing_mbid_path_skips_backfill_when_aliases_already_set(self, temp_db): + """If aliases are already populated, don't re-fetch — re-scan + cycles after backfill complete should be no-ops.""" + from core.musicbrainz_worker import MusicBrainzWorker + artist_id = _seed_artist( + temp_db, 'X', aliases=json.dumps(['existing-alias']), + ) + + worker = MusicBrainzWorker.__new__(MusicBrainzWorker) + worker.database = temp_db + worker.db = temp_db + worker.mb_service = MagicMock() + worker.stats = {'matched': 0, 'not_found': 0, 'errors': 0} + worker._get_existing_id = MagicMock(return_value='mb-x') + + worker._process_item({'type': 'artist', 'id': artist_id, 'name': 'X'}) + + # No alias work — column already populated + worker.mb_service.fetch_artist_aliases.assert_not_called() + worker.mb_service.update_artist_aliases.assert_not_called() + + def test_existing_mbid_backfill_failure_does_not_break_match(self, temp_db): + """Backfill is best-effort — failure to fetch aliases must + NOT prevent the MBID-preservation update from happening.""" + from core.musicbrainz_worker import MusicBrainzWorker + artist_id = _seed_artist(temp_db, 'X') + + worker = MusicBrainzWorker.__new__(MusicBrainzWorker) + worker.database = temp_db + worker.db = temp_db + worker.mb_service = MagicMock() + worker.mb_service.fetch_artist_aliases.side_effect = Exception("MB down") + worker.stats = {'matched': 0, 'not_found': 0, 'errors': 0} + worker._get_existing_id = MagicMock(return_value='mb-x') + + # Should NOT raise + worker._process_item({'type': 'artist', 'id': artist_id, 'name': 'X'}) + + # MBID still preserved despite alias backfill failure + worker.mb_service.update_artist_mbid.assert_called_once_with( + artist_id, 'mb-x', 'matched', + ) + def test_alias_fetch_failure_does_not_break_match(self, temp_db): """If alias fetch raises (network error, malformed response, whatever), the artist match still gets recorded — alias