# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# SoulSync/tests/test_matching_engine_cjk.py
#
# 195 lines
# 8.2 KiB

"""Tests for ``MusicMatchingEngine.normalize_string`` CJK preservation.
Issue #722 (@Sokhii): downloading a Japanese OST through Apple Music
metadata + Tidal download produced duplicate tracks — same audio
landed under multiple track positions in the album.
Root cause: ``normalize_string`` detected CJK presence and SKIPPED
unidecode (correct — kanji→pinyin would have been gibberish), but
then ran ``re.sub(r'[^a-z0-9\\s$]', '', text)`` which stripped EVERY
CJK character. Every Japanese title normalised to ``''``. The
``similarity_score`` guard at ``if not str1 or not str2: return 0.0``
made every CJK-vs-CJK comparison return ``0.000``, so the matcher
fell back to duration+artist alone — multiple iTunes tracks mapped
to the same Tidal candidate, and the user got duplicate downloads
under different track positions.
These tests pin the new behaviour: CJK characters survive
normalisation, identical CJK titles score 1.0, disjoint CJK titles
score low, mixed CJK+Latin titles work, and Latin-only titles are
completely unaffected.
"""
from __future__ import annotations
import pytest
from core.matching_engine import MusicMatchingEngine
@pytest.fixture
def engine() -> MusicMatchingEngine:
    """Build a new ``MusicMatchingEngine`` for tests that request ``engine``."""
    matcher = MusicMatchingEngine()
    return matcher
# ---------------------------------------------------------------------------
# Normalisation preserves CJK ranges.
# ---------------------------------------------------------------------------
def test_normalize_preserves_kanji_characters(engine: MusicMatchingEngine):
    """A kanji title must come back from ``normalize_string`` unchanged
    rather than being stripped."""
    kanji_title = '命の灯火'
    normalised = engine.normalize_string(kanji_title)
    assert normalised == '命の灯火'
def test_normalize_preserves_hiragana_characters(engine: MusicMatchingEngine):
    """A hiragana-only string must come back from ``normalize_string``
    unchanged."""
    hiragana_text = 'あいうえお'
    normalised = engine.normalize_string(hiragana_text)
    assert normalised == 'あいうえお'
def test_normalize_preserves_katakana_characters(engine: MusicMatchingEngine):
    """A katakana title must come back from ``normalize_string`` unchanged.

    Katakana is common in Japanese song titles (foreign loanwords), so
    this was the most visible form of the pre-fix stripping bug.
    """
    katakana_title = 'ハッピーデイズ'
    normalised = engine.normalize_string(katakana_title)
    assert normalised == 'ハッピーデイズ'
def test_normalize_preserves_hangul_characters(engine: MusicMatchingEngine):
    """A Korean Hangul string must come back from ``normalize_string``
    unchanged (the same stripping bug affected Korean titles)."""
    hangul_text = '안녕하세요'
    normalised = engine.normalize_string(hangul_text)
    assert normalised == '안녕하세요'
def test_normalize_preserves_simplified_chinese_characters(engine: MusicMatchingEngine):
    """A Chinese hanzi string must come back from ``normalize_string``
    unchanged.

    Japanese, Korean and Chinese titles were all broken by the same
    stripping step, so each script gets its own pin.
    """
    hanzi_text = '你好世界'
    normalised = engine.normalize_string(hanzi_text)
    assert normalised == '你好世界'
def test_normalize_lowercases_cjk_branch_does_not_uppercase_ascii(engine: MusicMatchingEngine):
    """For a mixed Latin + CJK title, the Latin part must be lowercased
    while the CJK character is kept as-is."""
    mixed_title = 'Happy 命'
    expected = 'happy 命'
    normalised = engine.normalize_string(mixed_title)
    assert normalised == expected
def test_normalize_strips_latin_punctuation_in_cjk_branch(engine: MusicMatchingEngine):
    """Latin punctuation is still removed when the input contains CJK.

    Only the CJK characters are meant to be preserved; a trailing ``!``
    must be dropped just as it would be for a Latin-only title.
    """
    title_with_bang = '命の灯火!'
    expected = '命の灯火'
    normalised = engine.normalize_string(title_with_bang)
    assert normalised == expected
# ---------------------------------------------------------------------------
# Similarity scoring on CJK titles.
# ---------------------------------------------------------------------------
def test_identical_cjk_titles_score_perfect_match(engine: MusicMatchingEngine):
    """The same Japanese title compared with itself must score 1.0.

    Before the fix both sides normalised to ``''`` and the empty-string
    guard in ``similarity_score`` returned 0.0.
    """
    title = '命の灯火'
    left = engine.clean_title(title)
    right = engine.clean_title(title)
    score = engine.similarity_score(left, right)
    assert score == 1.0
def test_completely_disjoint_cjk_titles_score_low(engine: MusicMatchingEngine):
    """Two Japanese titles with no characters in common must score
    below 0.3.

    Before the fix both normalised to ``''`` and could not be told
    apart from each other.
    """
    first_title = engine.clean_title('命の灯火')
    second_title = engine.clean_title('無職転生')
    assert engine.similarity_score(first_title, second_title) < 0.3
def test_partially_overlapping_cjk_titles_score_partial(engine: MusicMatchingEngine):
    """Titles that share only some characters must land strictly between
    0.3 and 1.0, showing the scorer really compares the characters
    instead of collapsing to 0 or 1."""
    longer_title = engine.clean_title('命の灯火')
    shorter_title = engine.clean_title('命の音')
    overlap = engine.similarity_score(longer_title, shorter_title)
    lower_bound, upper_bound = 0.3, 1.0
    assert lower_bound < overlap < upper_bound
def test_cjk_title_does_not_falsely_match_unrelated_latin_title(engine: MusicMatchingEngine):
    """A Japanese title compared with an unrelated Latin title must score
    below 0.2.

    The pre-fix behaviour also produced a low score here (the CJK side
    normalised to ``''`` and hit the empty-string guard), so this does
    not expose the old bug directly; it pins that a real CJK string
    against a real Latin string is not a coincidental match.
    """
    cjk_title = engine.clean_title('命の灯火')
    latin_title = engine.clean_title('Happy Days')
    assert engine.similarity_score(cjk_title, latin_title) < 0.2
# ---------------------------------------------------------------------------
# Regression: Latin-only titles are untouched.
# ---------------------------------------------------------------------------
def test_latin_normalisation_unchanged_for_simple_title(engine: MusicMatchingEngine):
    """A plain Latin title with no CJK is still just lowercased."""
    latin_title = 'Happy Days'
    normalised = engine.normalize_string(latin_title)
    assert normalised == 'happy days'
def test_latin_normalisation_unchanged_for_unidecode_target(engine: MusicMatchingEngine):
    """An accented Latin name is still transliterated to plain ASCII and
    lowercased (``ö`` becomes ``o``)."""
    accented_name = 'Björk'
    normalised = engine.normalize_string(accented_name)
    assert normalised == 'bjork'
def test_latin_normalisation_unchanged_for_dollar_sign(engine: MusicMatchingEngine):
    """``$`` must survive normalisation of a Latin-only name such as
    ``A$AP Rocky``, so the CJK change cannot silently drop that rule."""
    normalised = engine.normalize_string('A$AP Rocky')
    dollar_kept = '$' in normalised
    assert dollar_kept
def test_latin_similarity_unchanged_for_baseline_comparison(engine: MusicMatchingEngine):
    """Latin-vs-Latin scoring keeps its baseline behaviour.

    An identical pair must score exactly 1.0 and an unrelated pair must
    score below 0.5, so a later normaliser change cannot quietly shift
    the Latin side.
    """
    # Identical pair.
    same_left = engine.clean_title('Happy Days')
    same_right = engine.clean_title('Happy Days')
    identical_score = engine.similarity_score(same_left, same_right)
    assert identical_score == 1.0

    # Unrelated pair.
    reference = engine.clean_title('Happy Days')
    unrelated = engine.clean_title('My Past Self')
    unrelated_score = engine.similarity_score(reference, unrelated)
    assert unrelated_score < 0.5
# ---------------------------------------------------------------------------
# Real-world scenarios from issue #722.
# ---------------------------------------------------------------------------
def test_mushoku_tensei_ost_track_titles_distinguishable():
    """Reproduce the issue #722 scenario end to end.

    Two different tracks from Mushoku Tensei Original Soundtrack II,
    rendered in Japanese, used to normalise to ``''`` and were therefore
    routed to the same Tidal candidate. They must now score as clearly
    different titles.

    The titles come from the iTunes album response for id 1753240110;
    they are what the matcher compares when the storefront returns
    Japanese titles instead of English romanisations (this depends on
    the user's region and storefront configuration).
    """
    matcher = MusicMatchingEngine()

    # Two distinct OST tracks, Japanese rendering.
    happy_days = matcher.clean_title('幸せの日々')  # 'Happy Days'
    home_search = matcher.clean_title('家探し')  # 'Home Search'

    # Must stay well under the 0.7+ acceptance threshold used by the
    # candidate scorer; otherwise both iTunes tracks would still choose
    # the same Tidal candidate and produce a duplicate.
    assert matcher.similarity_score(happy_days, home_search) < 0.5
if __name__ == '__main__':
    # Allow running this file directly. Propagate pytest's exit status so
    # a failing run produces a non-zero process exit code instead of
    # always exiting 0.
    raise SystemExit(pytest.main([__file__, '-v']))