You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/tests/matching/test_artist_aliases.py

393 lines
14 KiB

"""Pin the alias-aware artist comparison helper.
Issue #442 — files tagged with one spelling of an artist's name
(Japanese kanji `澤野弘之`) were quarantined when SoulSync expected
the romanized spelling (`Hiroyuki Sawano`). MusicBrainz aliases
should bridge the two — this helper does the bridging.
These tests cover the helper in total isolation: no DB, no network,
no MusicBrainz client. Pure-function contract pinned at the right
boundary so every consumer (verifier, matching engine, future
callers) inherits the same correctness guarantees.
"""
from __future__ import annotations
import pytest
from core.matching.artist_aliases import (
DEFAULT_ARTIST_MATCH_THRESHOLD,
artist_names_match,
best_alias_match,
split_artist_credit,
)
# ---------------------------------------------------------------------------
# Direct compare path — no aliases
# ---------------------------------------------------------------------------
class TestDirectCompareNoAliases:
def test_exact_match(self):
matched, score = artist_names_match('Foreigner', 'Foreigner')
assert matched is True
assert score == 1.0
def test_case_insensitive(self):
matched, score = artist_names_match('foreigner', 'FOREIGNER')
assert matched is True
assert score == 1.0
def test_whitespace_tolerant(self):
matched, score = artist_names_match(' Foreigner ', 'Foreigner')
assert matched is True
def test_completely_different_artists(self):
matched, score = artist_names_match('Foreigner', 'Khalil Turk')
assert matched is False
assert score < DEFAULT_ARTIST_MATCH_THRESHOLD
def test_fuzzy_match_above_threshold(self):
# 'Beatles' vs 'The Beatles' — sim ~0.78
matched, score = artist_names_match('The Beatles', 'Beatles')
assert matched is True
assert score >= DEFAULT_ARTIST_MATCH_THRESHOLD
# ---------------------------------------------------------------------------
# Cross-script — the headline of issue #442
# ---------------------------------------------------------------------------
class TestCrossScriptWithAliases:
def test_japanese_kanji_to_romanized(self):
"""Reporter's case 1: file tagged 澤野弘之, expected
Hiroyuki Sawano. MusicBrainz alias `澤野弘之` on the artist
record bridges the two."""
matched, score = artist_names_match(
'Hiroyuki Sawano',
'澤野弘之',
aliases=['澤野弘之', 'SawanoHiroyuki', 'Sawano Hiroyuki'],
)
assert matched is True, (
f"Expected alias match for Japanese spelling; got matched=False score={score}"
)
def test_romanized_to_japanese_kanji(self):
"""Symmetric direction — file tagged Hiroyuki Sawano, expected
澤野弘之. Aliases should resolve either way."""
matched, score = artist_names_match(
'澤野弘之',
'Hiroyuki Sawano',
aliases=['Hiroyuki Sawano', 'SawanoHiroyuki'],
)
assert matched is True
def test_cyrillic_to_latin(self):
"""Reporter's case 2: file tagged Sergey Lazarev, expected
Сергей Лазарев."""
matched, score = artist_names_match(
'Сергей Лазарев',
'Sergey Lazarev',
aliases=['Sergey Lazarev', 'Sergei Lazarev'],
)
assert matched is True
def test_no_alias_match_falls_through_to_fail(self):
"""Aliases provided but none match the actual artist —
should still fail. Aliases bridge synonyms, they don't mask
genuine mismatches."""
matched, score = artist_names_match(
'Hiroyuki Sawano',
'Khalil Turk',
aliases=['澤野弘之', 'SawanoHiroyuki'],
)
assert matched is False
# ---------------------------------------------------------------------------
# Aliases input handling — defensive coercion
# ---------------------------------------------------------------------------
class TestAliasesInputCoercion:
def test_none_aliases_treated_as_empty(self):
matched, _ = artist_names_match('A', 'A', aliases=None)
assert matched is True
def test_empty_list_aliases(self):
matched, _ = artist_names_match('A', 'A', aliases=[])
assert matched is True
def test_aliases_can_be_set(self):
matched, _ = artist_names_match(
'Hiroyuki Sawano', '澤野弘之', aliases={'澤野弘之', 'SawanoHiroyuki'},
)
assert matched is True
def test_aliases_can_be_tuple(self):
matched, _ = artist_names_match(
'Hiroyuki Sawano', '澤野弘之', aliases=('澤野弘之',),
)
assert matched is True
def test_none_entries_in_aliases_skipped(self):
"""Defensive: caller might pass aliases pulled directly from
a partial MB response. None / empty entries shouldn't crash."""
matched, _ = artist_names_match(
'Hiroyuki Sawano', '澤野弘之',
aliases=[None, '', '澤野弘之', None],
)
assert matched is True
def test_non_string_entries_coerced(self):
"""Defensive: aliases parsed from JSON might surface as ints
or other non-string types. str() coercion in helper handles it."""
matched, _ = artist_names_match(
'A', '123', aliases=[123],
)
assert matched is True
# ---------------------------------------------------------------------------
# Threshold behaviour
# ---------------------------------------------------------------------------
class TestThreshold:
def test_default_threshold_matches_verifier(self):
"""Default threshold must equal the verifier's existing
ARTIST_MATCH_THRESHOLD so wiring the helper into the
verifier preserves current pass/fail semantics on the
no-alias path."""
assert DEFAULT_ARTIST_MATCH_THRESHOLD == 0.6
def test_custom_threshold_stricter(self):
# Direct comparison would normally pass at 0.6 default,
# but a stricter threshold should reject it.
matched, score = artist_names_match(
'The Beatles', 'Beatles', threshold=0.95,
)
assert matched is False
def test_custom_threshold_looser(self):
matched, score = artist_names_match(
'AAAAA', 'AAABB', threshold=0.4,
)
# ~0.6 sim, passes loose threshold
assert matched is True
# ---------------------------------------------------------------------------
# Custom similarity callable
# ---------------------------------------------------------------------------
class TestCustomSimilarity:
def test_custom_sim_used_for_direct_compare(self):
"""Caller (verifier) passes its own normaliser-aware
similarity. Helper must route through it instead of using
the default."""
def stricter(a, b):
# Always returns 0 — proves we're using the custom callable
return 0.0
matched, score = artist_names_match(
'Foreigner', 'Foreigner', similarity=stricter,
)
assert matched is False
assert score == 0.0
def test_custom_sim_used_for_alias_compare(self):
"""Custom similarity also applies to alias scoring — not just
the direct comparison."""
def alias_only_perfect(a, b):
# Returns 1.0 only when comparing the alias 'aliasX'
return 1.0 if 'aliasX' in (a, b) else 0.0
matched, score = artist_names_match(
'Foreigner', 'observed',
aliases=['aliasX'],
similarity=alias_only_perfect,
)
assert matched is True
assert score == 1.0
# ---------------------------------------------------------------------------
# Best-alias-match introspection helper
# ---------------------------------------------------------------------------
class TestBestAliasMatch:
def test_direct_wins_no_alias_winner(self):
winner, score = best_alias_match(
'Foreigner', 'Foreigner', aliases=['otherthing'],
)
assert winner is None
assert score == 1.0
def test_alias_wins_returns_alias(self):
winner, score = best_alias_match(
'Hiroyuki Sawano', '澤野弘之',
aliases=['澤野弘之', 'SawanoHiroyuki'],
)
assert winner == '澤野弘之'
assert score == 1.0
def test_no_aliases_just_direct_score(self):
winner, score = best_alias_match('A', 'B', aliases=None)
assert winner is None
assert isinstance(score, float)
# ---------------------------------------------------------------------------
# Backward compat — pre-fix behaviour preserved when no aliases
# ---------------------------------------------------------------------------
class TestBackwardCompatNoAliases:
"""When callers don't supply aliases (initial wiring, or live MB
unreachable), the helper must behave EXACTLY like a direct
similarity check — no surprises for paths that haven't been
wired up to alias lookup yet."""
@pytest.mark.parametrize('expected,actual,should_match', [
('Foreigner', 'Foreigner', True), # exact
('foreigner', 'FOREIGNER', True), # case
('The Beatles', 'Beatles', True), # fuzzy passes
('Foreigner', 'Khalil Turk', False), # different
('Hiroyuki Sawano', '澤野弘之', False), # cross-script no aliases → fail (pre-fix behaviour)
])
def test_no_alias_path_matches_direct_similarity(self, expected, actual, should_match):
matched, _ = artist_names_match(expected, actual)
assert matched is should_match
# ---------------------------------------------------------------------------
# Multi-value artist credit — Discord report from Foxxify
# ---------------------------------------------------------------------------
#
# AcoustID returns the FULL artist credit ("Okayracer, aldrch &
# poptropicaslutz!") while the library DB carries only the primary
# artist ("Okayracer"). Pre-fix raw similarity scored ~43% — well
# below the 0.6 threshold — and the scanner flagged the track as
# Wrong Song. Post-fix the helper splits the credit and the primary
# match wins at near-100%.
class TestSplitArtistCredit:
@pytest.mark.parametrize('credit,expected', [
('Okayracer, aldrch & poptropicaslutz!',
['Okayracer', 'aldrch', 'poptropicaslutz!']),
('Daft Punk feat. Pharrell',
['Daft Punk', 'Pharrell']),
('Daft Punk ft. Pharrell',
['Daft Punk', 'Pharrell']),
('Daft Punk featuring Pharrell',
['Daft Punk', 'Pharrell']),
('Beyoncé with JAY-Z',
['Beyoncé', 'JAY-Z']),
('Eminem vs. Jay-Z',
['Eminem', 'Jay-Z']),
('Artist1 / Artist2 / Artist3',
['Artist1', 'Artist2', 'Artist3']),
('Artist1; Artist2; Artist3',
['Artist1', 'Artist2', 'Artist3']),
('Artist1 + Artist2',
['Artist1', 'Artist2']),
('A x B',
['A', 'B']),
('Solo Artist',
['Solo Artist']), # single-token = self
('',
[]),
])
def test_splits_on_known_separators(self, credit, expected):
assert split_artist_credit(credit) == expected
def test_drops_empty_tokens(self):
# Trailing / leading separators don't introduce empty entries
assert split_artist_credit('Artist,, Other') == ['Artist', 'Other']
def test_strips_whitespace_per_token(self):
assert split_artist_credit(' A , B ') == ['A', 'B']
class TestMultiValueCreditMatching:
def test_reporters_exact_case_okayracer(self):
"""Discord report from Foxxify — verbatim from the screenshot:
Expected: Okayracer
AcoustID: Okayracer, aldrch & poptropicaslutz!
Pre-fix: artist match 43% → Wrong Song flag
Post-fix: primary in credit → 100% match
"""
matched, score = artist_names_match(
'Okayracer',
'Okayracer, aldrch & poptropicaslutz!',
)
assert matched is True, (
f"Expected primary-in-credit match; got matched=False score={score}"
)
assert score == 1.0
def test_primary_in_middle_of_credit(self):
"""Primary artist isn't always first in the credit."""
matched, score = artist_names_match(
'Pharrell',
'Daft Punk feat. Pharrell',
)
assert matched is True
assert score == 1.0
def test_primary_at_end_of_credit(self):
matched, score = artist_names_match(
'JAY-Z',
'Beyoncé with JAY-Z',
)
assert matched is True
def test_no_match_when_expected_artist_not_in_credit(self):
"""Multi-value path doesn't mask genuine mismatches. If
expected isn't in the credit, the comparison should still
fail."""
matched, _ = artist_names_match(
'Madonna',
'Daft Punk feat. Pharrell',
)
assert matched is False
def test_single_token_actual_falls_through_to_direct(self):
"""When actual has no separators, multi-value path is a
no-op — same as the direct compare."""
matched, _ = artist_names_match('Foreigner', 'Foreigner')
assert matched is True
# And different artists still fail
matched, _ = artist_names_match('Foreigner', 'Khalil Turk')
assert matched is False
def test_multi_value_combines_with_aliases(self):
"""Combination case: expected is romanized, actual credit
contains the kanji form alongside other artists. Both the
alias path AND the multi-value path must collaborate."""
matched, score = artist_names_match(
'Hiroyuki Sawano',
'澤野弘之, FeaturedJp Artist',
aliases=['澤野弘之', 'SawanoHiroyuki'],
)
assert matched is True
assert score == 1.0
def test_threshold_still_respected(self):
"""Multi-value path doesn't bypass the threshold — fuzzy
in-credit matches still need to clear it."""
matched, score = artist_names_match(
'XXXXXX',
'YYYYYY, ZZZZZZ',
threshold=0.99,
)
assert matched is False
assert score < 0.5