mirror of https://github.com/Nezreka/SoulSync.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
499 lines
19 KiB
499 lines
19 KiB
"""Regression tests for issue #442 — AcoustID verifier alias awareness.
|
|
|
|
The reporter posted two exact cases:
|
|
|
|
Case 1 (Japanese kanji ↔ romanized):
|
|
File: YAMANAIAME by 澤野弘之
|
|
Expected: YAMANAIAME by Hiroyuki Sawano
|
|
Pre-fix: quarantined (artist_sim=0%)
|
|
Post-fix: passes verification because MB aliases bridge the
|
|
two spellings.
|
|
|
|
Case 2 (Cyrillic ↔ Latin):
|
|
File: On the Other Side by Sergey Lazarev
|
|
Expected: On the other side by Сергей Лазарев
|
|
Pre-fix: quarantined (artist_sim=7%)
|
|
Post-fix: passes via aliases.
|
|
|
|
These tests pin the verifier through the helper. AcoustID's
|
|
fingerprint call is stubbed (no network), MB service's
|
|
`lookup_artist_aliases` is stubbed to return the relevant aliases.
|
|
The verifier's pass/fail decision is the assertion.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from core.acoustid_verification import (
|
|
AcoustIDVerification,
|
|
VerificationResult,
|
|
_alias_aware_artist_sim,
|
|
_find_best_title_artist_match,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Pure helper — _alias_aware_artist_sim
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestAliasAwareArtistSim:
    """Unit tests for the pure `_alias_aware_artist_sim` scoring helper."""

    def test_returns_higher_score_when_alias_matches(self):
        """An alias identical to the other name lifts the score to 1.0."""
        known_aliases = ['澤野弘之', 'SawanoHiroyuki']

        similarity = _alias_aware_artist_sim(
            'Hiroyuki Sawano', '澤野弘之', aliases=known_aliases,
        )

        assert similarity == 1.0

    def test_no_aliases_falls_back_to_direct_similarity(self):
        """Cross-script pair with ``aliases=None`` scores ~0 (pre-fix behaviour)."""
        similarity = _alias_aware_artist_sim(
            'Hiroyuki Sawano', '澤野弘之', aliases=None,
        )

        assert similarity < 0.1

    def test_aliases_dont_mask_genuine_mismatch(self):
        """A completely different artist still scores low with aliases supplied.

        Aliases are meant to bridge alternate spellings of one artist, not
        to make unrelated artists look alike.
        """
        known_aliases = ['澤野弘之', 'SawanoHiroyuki']

        similarity = _alias_aware_artist_sim(
            'Hiroyuki Sawano', 'Khalil Turk & Friends', aliases=known_aliases,
        )

        assert similarity < 0.5
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# _find_best_title_artist_match — accepts aliases now
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestFindBestMatchWithAliases:
    """Tests for `_find_best_title_artist_match` with the alias argument.

    Every element of the returned ``(best, title_sim, artist_sim)`` triple
    is asserted, so no unpacked value goes unchecked.
    """

    def test_japanese_alias_picks_correct_recording(self):
        """Reporter's case 1: AcoustID returned a recording with a kanji
        artist. Without aliases the scorer ranks it low and the verifier
        later quarantines. With aliases it scores high."""
        recordings = [
            {'title': 'YAMANAIAME', 'artist': '澤野弘之'},
            {'title': 'Different Song', 'artist': 'Hiroyuki Sawano'},
        ]

        # Aliases provided — bridge to recording 0
        best, title_sim, artist_sim = _find_best_title_artist_match(
            recordings, 'YAMANAIAME', 'Hiroyuki Sawano',
            expected_artist_aliases=['澤野弘之', 'SawanoHiroyuki'],
        )

        assert best is recordings[0]
        # The winning recording's title is identical to the expected title.
        assert title_sim == 1.0
        assert artist_sim == 1.0

    def test_no_aliases_legacy_behaviour_preserved(self):
        """Omitting the alias argument behaves as before the fix.

        Critical for call paths not yet wired up to alias lookup.
        """
        recordings = [
            {'title': 'Track', 'artist': 'Artist'},
        ]

        best, title_sim, artist_sim = _find_best_title_artist_match(
            recordings, 'Track', 'Artist',
        )

        # Only one candidate, and it matches exactly on both fields.
        assert best is recordings[0]
        assert title_sim == 1.0
        assert artist_sim == 1.0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# End-to-end — reporter's cases through the full verifier
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.fixture
def stubbed_verifier(monkeypatch):
    """Build an `AcoustIDVerification` whose collaborators are mocks.

    The AcoustID client is replaced with a `MagicMock` that reports itself
    available, and `core.acoustid_verification._get_mb_service` is patched
    to hand back a mock MB service whose `lookup_artist_aliases` returns
    an empty list by default. Tests shape the AcoustID response and the
    alias list through the returned handles.

    Returns:
        ``(verifier, fake_service)``.
    """
    verifier = AcoustIDVerification()

    acoustid_stub = MagicMock()
    verifier.acoustid_client = acoustid_stub
    acoustid_stub.is_available.return_value = (True, '')

    # Default to "no aliases known"; individual tests override this.
    fake_service = MagicMock()
    fake_service.lookup_artist_aliases.return_value = []

    def _fake_get_mb_service():
        return fake_service

    monkeypatch.setattr(
        'core.acoustid_verification._get_mb_service', _fake_get_mb_service,
    )

    return verifier, fake_service
|
|
|
|
|
|
class TestIssue442Regression:
    """End-to-end checks of the two cases reported in issue #442.

    Both tests drive `verify_audio_file` with a stubbed AcoustID response
    and stubbed MB aliases, then assert the verdict and the argument the
    alias lookup received.
    """

    def test_japanese_kanji_artist_passes_verification(self, stubbed_verifier):
        """Reporter's case 1 — verbatim from the issue:

            File: YAMANAIAME by 澤野弘之
            Expected: YAMANAIAME by Hiroyuki Sawano
            Pre-fix: Quarantined (artist=0%)
        """
        verifier, fake_service = stubbed_verifier

        # AcoustID returns the recording with kanji artist
        verifier.acoustid_client.fingerprint_and_lookup.return_value = {
            'best_score': 0.95,
            'recordings': [
                {'title': 'YAMANAIAME', 'artist': '澤野弘之', 'mbid': 'rec-x'},
            ],
        }
        # MB knows Hiroyuki Sawano's aliases
        fake_service.lookup_artist_aliases.return_value = [
            '澤野弘之', 'SawanoHiroyuki', 'Sawano Hiroyuki',
        ]

        result, msg = verifier.verify_audio_file(
            '/fake/path.mp3', 'YAMANAIAME', 'Hiroyuki Sawano',
        )

        assert result == VerificationResult.PASS, (
            f"Reporter's exact case must pass verification post-fix; "
            f"got result={result.value!r} msg={msg!r}"
        )
        fake_service.lookup_artist_aliases.assert_called_once_with('Hiroyuki Sawano')

    def test_cyrillic_artist_passes_verification(self, stubbed_verifier):
        """Reporter's case 2 — Sergey Lazarev / Сергей Лазарев:

            File: On the Other Side by Sergey Lazarev
            Expected: On the other side by Сергей Лазарев
        """
        verifier, fake_service = stubbed_verifier

        # AcoustID returns the recording with the Latin-script artist
        verifier.acoustid_client.fingerprint_and_lookup.return_value = {
            'best_score': 0.95,
            'recordings': [
                {'title': 'On the Other Side', 'artist': 'Sergey Lazarev', 'mbid': 'rec-y'},
            ],
        }
        # Aliases for the expected (Cyrillic) artist include the Latin form
        fake_service.lookup_artist_aliases.return_value = [
            'Sergey Lazarev', 'Sergei Lazarev',
        ]

        result, msg = verifier.verify_audio_file(
            '/fake/path.flac', 'On the other side', 'Сергей Лазарев',
        )

        # Same diagnostics and lookup-argument check as the kanji case, so
        # a failure here is as easy to triage.
        assert result == VerificationResult.PASS, (
            f"Reporter's exact case must pass verification post-fix; "
            f"got result={result.value!r} msg={msg!r}"
        )
        fake_service.lookup_artist_aliases.assert_called_once_with('Сергей Лазарев')
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Backward compat — no aliases available → behavior identical to pre-fix
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestBackwardCompat:
    """With no aliases (or a failing alias lookup) verdicts match pre-fix."""

    def test_no_aliases_clear_artist_mismatch_still_fails(self, stubbed_verifier):
        """Pre-fix: clear mismatches (artist sim near 0, NOT a script
        difference) should FAIL. Post-fix with empty aliases must
        preserve this — aliases bridge synonyms, not unrelated
        artists."""
        verifier, fake_service = stubbed_verifier

        # Wrong artist entirely — Latin script both sides, sim ~0
        verifier.acoustid_client.fingerprint_and_lookup.return_value = {
            'best_score': 0.95,
            'recordings': [
                {'title': 'Some Track', 'artist': 'Khalil Turk & Friends'},
            ],
        }
        fake_service.lookup_artist_aliases.return_value = []  # No aliases

        result, msg = verifier.verify_audio_file(
            '/fake/path.mp3', 'Some Track', 'Foreigner',
        )

        assert result == VerificationResult.FAIL, (
            f"Unrelated artist must still fail; got result={result!r} msg={msg!r}"
        )

    def test_no_aliases_exact_match_still_passes(self, stubbed_verifier):
        """Exact title + artist match → PASS regardless of aliases."""
        verifier, fake_service = stubbed_verifier

        verifier.acoustid_client.fingerprint_and_lookup.return_value = {
            'best_score': 0.95,
            'recordings': [
                {'title': 'Dirty White Boy', 'artist': 'Foreigner'},
            ],
        }
        fake_service.lookup_artist_aliases.return_value = []

        result, _ = verifier.verify_audio_file(
            '/fake/path.mp3', 'Dirty White Boy', 'Foreigner',
        )
        assert result == VerificationResult.PASS

    def test_alias_lookup_failure_does_not_break_verification(self, stubbed_verifier):
        """A broken MB service does not affect an exact artist match.

        NOTE: the lazy-resolution tests in this file show that identical
        artist names never trigger the alias lookup, so the raising stub
        is not expected to be invoked for this input. The companion test
        below covers the case where the lookup really fires and raises.
        """
        verifier, fake_service = stubbed_verifier
        fake_service.lookup_artist_aliases.side_effect = Exception("MB down")

        verifier.acoustid_client.fingerprint_and_lookup.return_value = {
            'best_score': 0.95,
            'recordings': [
                {'title': 'Dirty White Boy', 'artist': 'Foreigner'},
            ],
        }

        result, _ = verifier.verify_audio_file(
            '/fake/path.mp3', 'Dirty White Boy', 'Foreigner',
        )
        # Should still pass — direct similarity works
        assert result == VerificationResult.PASS

    def test_alias_lookup_failure_when_lookup_actually_fires(self, stubbed_verifier):
        """MB service raises on a cross-script input → verifier still returns.

        Direct similarity is ~0 for this kanji/romanized pair, so the alias
        lookup is needed and the raising stub is really exercised. The
        verifier must swallow the failure and hand back a verdict. With no
        aliases to bridge the scripts, that verdict cannot be PASS.
        """
        verifier, fake_service = stubbed_verifier
        fake_service.lookup_artist_aliases.side_effect = Exception("MB down")

        verifier.acoustid_client.fingerprint_and_lookup.return_value = {
            'best_score': 0.95,
            'recordings': [
                {'title': 'YAMANAIAME', 'artist': '澤野弘之'},
            ],
        }

        result, msg = verifier.verify_audio_file(
            '/fake/path.mp3', 'YAMANAIAME', 'Hiroyuki Sawano',
        )

        # Proves the failure path ran rather than being skipped lazily.
        assert fake_service.lookup_artist_aliases.called
        assert isinstance(result, VerificationResult)
        assert result != VerificationResult.PASS, (
            f"No aliases were available, so a cross-script artist must not "
            f"pass; got result={result!r} msg={msg!r}"
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Performance contract — alias lookup fires ONCE per verification
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestAliasLookupCalledOncePerVerify:
    """Pins the one-alias-lookup-per-verification performance contract."""

    def test_single_lookup_call_regardless_of_recordings_count(self, stubbed_verifier):
        """`lookup_artist_aliases` runs once however many recordings come back.

        The verifier compares artists for several recordings and at up to
        three sites; a lookup per comparison would mean dozens of MB calls
        for a track with many AcoustID recordings.
        """
        verifier, fake_service = stubbed_verifier

        candidate_recordings = [
            {'title': 'X', 'artist': '澤野弘之'},
            {'title': 'X', 'artist': 'SawanoHiroyuki'},
            {'title': 'X', 'artist': 'Different Artist'},
        ]
        verifier.acoustid_client.fingerprint_and_lookup.return_value = {
            'best_score': 0.95,
            'recordings': candidate_recordings,
        }
        fake_service.lookup_artist_aliases.return_value = ['澤野弘之', 'SawanoHiroyuki']

        verifier.verify_audio_file('/fake/path.mp3', 'X', 'Hiroyuki Sawano')

        lookup_calls = fake_service.lookup_artist_aliases.call_count
        assert lookup_calls == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Lazy alias resolution — happy path skips MB lookup entirely
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestLazyAliasResolution:
    """Issue #442 perf follow-up: resolve aliases only when needed.

    The alias lookup should fire only when the direct artist comparison
    fails. Verifications whose artist names already match must not trigger
    the lookup chain at all.
    """

    def test_no_lookup_when_direct_artist_match_passes(self, stubbed_verifier):
        """An exact same-script artist match passes with zero alias lookups."""
        verifier, fake_service = stubbed_verifier

        acoustid_response = {
            'best_score': 0.95,
            'recordings': [
                {'title': 'Dirty White Boy', 'artist': 'Foreigner'},
            ],
        }
        verifier.acoustid_client.fingerprint_and_lookup.return_value = acoustid_response

        verdict, _ = verifier.verify_audio_file(
            '/fake/path.mp3', 'Dirty White Boy', 'Foreigner',
        )

        assert verdict == VerificationResult.PASS
        # The happy path must not pay for an alias lookup.
        fake_service.lookup_artist_aliases.assert_not_called()

    def test_lookup_fires_only_when_direct_artist_match_fails(self, stubbed_verifier):
        """A cross-script pair (direct sim ~0%) triggers exactly one lookup."""
        verifier, fake_service = stubbed_verifier

        acoustid_response = {
            'best_score': 0.95,
            'recordings': [
                {'title': 'YAMANAIAME', 'artist': '澤野弘之'},
            ],
        }
        verifier.acoustid_client.fingerprint_and_lookup.return_value = acoustid_response
        fake_service.lookup_artist_aliases.return_value = ['澤野弘之']

        verdict, _ = verifier.verify_audio_file(
            '/fake/path.mp3', 'YAMANAIAME', 'Hiroyuki Sawano',
        )

        assert verdict == VerificationResult.PASS
        # The lookup ran because the direct comparison could not succeed.
        fake_service.lookup_artist_aliases.assert_called_once()

    def test_lookup_memoised_across_three_comparison_sites(self, stubbed_verifier):
        """Once resolved, aliases are reused across every comparison site.

        The verifier compares artists during best-match scoring, the
        secondary scan and the fallback scan; all of them should share a
        single resolution.
        """
        verifier, fake_service = stubbed_verifier

        # Both recordings match on title, but only the second matches via
        # an alias, which pushes the verifier past its first comparison.
        no_alias_hit = {'title': 'X', 'artist': 'Different Latin Artist'}
        alias_hit = {'title': 'X', 'artist': '澤野弘之'}
        verifier.acoustid_client.fingerprint_and_lookup.return_value = {
            'best_score': 0.95,
            'recordings': [no_alias_hit, alias_hit],
        }
        fake_service.lookup_artist_aliases.return_value = ['澤野弘之']

        verifier.verify_audio_file('/fake/path.mp3', 'X', 'Hiroyuki Sawano')

        # Memoised: one resolution serves all sites.
        lookup_calls = fake_service.lookup_artist_aliases.call_count
        assert lookup_calls == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Provider-callable contract on the helper
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestAliasProviderCallable:
    """Pins the dual-shape `aliases` argument of `_alias_aware_artist_sim`.

    The helper accepts either an iterable of aliases or a zero-argument
    callable that produces them; the callable is resolved lazily.
    """

    def test_iterable_passed_directly(self):
        """A plain list is used as-is, with no lazy semantics."""
        similarity = _alias_aware_artist_sim(
            'Hiroyuki Sawano', '澤野弘之', aliases=['澤野弘之'],
        )

        assert similarity == 1.0

    def test_callable_resolves_lazily_only_when_direct_fails(self):
        """The provider runs only when the direct comparison falls short."""
        invocations = 0

        def provider():
            nonlocal invocations
            invocations += 1
            return ['澤野弘之']

        # Identical names: the direct match suffices, provider untouched.
        _alias_aware_artist_sim('Foreigner', 'Foreigner', aliases=provider)
        assert invocations == 0

        # Cross-script names: the direct match fails, provider consulted.
        _alias_aware_artist_sim('Hiroyuki Sawano', '澤野弘之', aliases=provider)
        assert invocations == 1

    def test_callable_returning_empty_list_falls_back_to_direct(self):
        """A provider yielding no aliases leaves the direct score, no error."""

        def empty_provider():
            return []

        similarity = _alias_aware_artist_sim(
            'Hiroyuki Sawano', '澤野弘之', aliases=empty_provider,
        )

        # ~0: the direct cross-script comparison has nothing in common.
        assert similarity < 0.1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Diagnostic logging — alias rescues are visible in logs
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestAliasRescueLogging:
    """Alias rescues must leave a trace in the logs.

    When an alias bridges a comparison that direct similarity would have
    failed, a log line is expected, so that a later report of a file
    passing verification incorrectly can be traced to the alias involved.

    Records are captured with a handler attached directly to the logger
    rather than pytest's caplog, which the original author found
    intermittently flaky for soulsync namespace loggers in full-suite runs.
    """

    @staticmethod
    def _capture_records():
        """Attach a list-backed handler to the verifier's logger.

        Returns:
            ``(records, teardown)``. ``records`` fills with every
            `LogRecord` the handler sees. ``teardown()`` detaches the
            handler and restores the logger's previous level.
        """
        import logging as _logging

        records: list = []

        class _ListHandler(_logging.Handler):
            def emit(self, record):
                records.append(record)

        # NOTE(review): the logger name is taken on trust from the original
        # comment, which attributes it to
        # `core.acoustid_verification`'s `get_logger("acoustid_verification")`
        # and stresses it is dot-separated, not underscored. Confirm it
        # against that module.
        verifier_logger = _logging.getLogger('soulsync.acoustid.verification')
        prior_level = verifier_logger.level

        handler = _ListHandler(level=_logging.INFO)
        verifier_logger.addHandler(handler)
        verifier_logger.setLevel(_logging.INFO)

        def teardown():
            verifier_logger.removeHandler(handler)
            verifier_logger.setLevel(prior_level)

        return records, teardown

    @staticmethod
    def _rescue_messages(records):
        """Return the formatted messages that mention an alias rescue."""
        formatted = (record.getMessage() for record in records)
        return [text for text in formatted if 'alias rescued' in text.lower()]

    def test_alias_rescue_emits_info_log(self):
        """A comparison saved by an alias produces at least one rescue line."""
        records, teardown = self._capture_records()
        try:
            _alias_aware_artist_sim(
                'Hiroyuki Sawano', '澤野弘之', aliases=['澤野弘之'],
            )
        finally:
            teardown()

        rescue_logs = self._rescue_messages(records)
        assert len(rescue_logs) >= 1, (
            f"Expected an INFO log line about alias rescue; got "
            f"{[r.getMessage() for r in records]}"
        )

    def test_no_log_when_direct_match_succeeds(self):
        """The happy path stays quiet; only rescues are logged."""
        records, teardown = self._capture_records()
        try:
            _alias_aware_artist_sim(
                'Foreigner', 'Foreigner', aliases=['ignored-alias'],
            )
        finally:
            teardown()

        assert self._rescue_messages(records) == []

    def test_no_log_when_alias_doesnt_help(self):
        """Aliases that fail to bridge the gap produce no rescue line."""
        records, teardown = self._capture_records()
        try:
            _alias_aware_artist_sim(
                'Hiroyuki Sawano', 'Khalil Turk',
                aliases=['Sergey Lazarev'],  # unrelated alias
            )
        finally:
            teardown()

        assert self._rescue_messages(records) == []
|