mirror of https://github.com/Nezreka/SoulSync.git
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
80 lines
3.1 KiB
80 lines
3.1 KiB
"""Tests for core/matching/script_compat.py — writing-system detection.
|
|
|
|
Issue #797 — these pin the exact boundary that the AcoustID verifier
|
|
relies on: accented Latin is still Latin (no false cross-script
|
|
trigger), but genuinely different writing systems (CJK / Hangul /
|
|
Cyrillic / Greek / Arabic / Hebrew / Thai) ARE flagged so a
|
|
romanized-vs-native artist comparison isn't treated as a real mismatch.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from core.matching.script_compat import (
|
|
has_strong_nonlatin,
|
|
is_cross_script_mismatch,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# has_strong_nonlatin — accented Latin must NOT count
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.parametrize(
    'text',
    [
        # Accented Latin.
        'Beyoncé',
        'Sigur Rós',
        'Mötley Crüe',
        # Plain / romanized Latin.
        'Joe Hisaishi',
        'Kendrick Lamar',
        'Dmitry Yablonsky',
        # Latin letters mixed with punctuation.
        'AC/DC',
        'P!nk',
        # Empty, blank and digit-only input.
        '',
        ' ',
        '12345',
        'Café del Mar',
    ],
)
def test_latin_and_accented_latin_is_not_nonlatin(text):
    """Latin input, accented or not, must never be reported as non-Latin.

    The empty, blank and digit-only cases pin the same ``False`` result
    for input that contains no letters at all.
    """
    flagged = has_strong_nonlatin(text)
    # Identity check on purpose: the result must be exactly ``False``,
    # not merely falsy.
    assert flagged is False
|
|
|
|
|
|
@pytest.mark.parametrize(
    'text',
    [
        # kanji (Joe Hisaishi)
        '久石譲',
        # kana + kanji
        '残酷な天使のテーゼ',
        # Cyrillic
        'Дмитрий Яблонский',
        # Hangul (BTS)
        '방탄소년단',
        # Greek
        'Σωκράτης',
        # Arabic
        'عمرو دياب',
        # Hebrew
        'שלום',
        # Thai
        'ก้อง สหรัถ',
    ],
)
def test_real_nonlatin_scripts_detected(text):
    """Text written in a genuinely different writing system must be flagged."""
    flagged = has_strong_nonlatin(text)
    # Identity check on purpose: the result must be exactly ``True``,
    # not merely truthy.
    assert flagged is True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# is_cross_script_mismatch — the verifier's gate
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_romanized_vs_native_is_cross_script():
    """A romanized name against its native-script form is a cross-script mismatch.

    This is the case reported in #797: the expected artist is romanized
    while AcoustID returns the native spelling.
    """
    romanized_and_native = (
        ('Joe Hisaishi', '久石譲'),
        ('Dmitry Yablonsky', 'Дмитрий Яблонский'),
    )
    for romanized, native in romanized_and_native:
        assert is_cross_script_mismatch(romanized, native) is True
|
|
|
|
|
|
def test_is_symmetric():
    """Native-script first and romanized second is still reported as a mismatch."""
    verdict = is_cross_script_mismatch('久石譲', 'Joe Hisaishi')
    assert verdict is True
|
|
|
|
|
|
def test_same_latin_both_sides_is_not_mismatch():
    """Two Latin-script strings are never a cross-script mismatch.

    English-vs-English comparison is meaningful, so there is no bridge.
    This is the Kendrick R.O.T.C protection surface: the result must stay
    ``False`` so the verifier keeps its strict FAIL path.
    """
    latin_pairs = (
        ('Kendrick Lamar', 'Kendrick Lamar feat. BJ'),
        ('Crown', 'Crown of Thorns'),
    )
    for left, right in latin_pairs:
        assert is_cross_script_mismatch(left, right) is False
|
|
|
|
|
|
def test_same_nonlatin_both_sides_is_not_mismatch():
    """Two native-script strings are not a cross-script mismatch.

    Same-script similarity still works when both sides are native, so the
    check must not relax here, whether the names are identical or different.
    """
    native_pairs = (
        ('久石譲', '久石譲'),
        ('久石譲', '坂本龍一'),
    )
    for left, right in native_pairs:
        assert is_cross_script_mismatch(left, right) is False
|
|
|
|
|
|
def test_empty_or_letterless_other_side_is_not_mismatch():
    """No mismatch when the non-native side has no Latin letter.

    One side is non-Latin, but the other (empty, digits only, or blank)
    has no Latin LETTER to bridge to.
    """
    letterless_pairs = (
        ('', '久石譲'),
        ('12345', '久石譲'),
        ('久石譲', ' '),
    )
    for left, right in letterless_pairs:
        assert is_cross_script_mismatch(left, right) is False
|