You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/core/matching/script_compat.py

97 lines
3.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""Writing-system (script) compatibility helpers for metadata comparison.
Issue #797 — AcoustID returns a recording's title/artist in their
*original* script (e.g. ``久石譲`` for Joe Hisaishi) while SoulSync's
expected metadata is romanized / English (``Joe Hisaishi``). A raw
string-similarity comparison between two different writing systems
scores ~0 even when they name the very same artist, so correct
downloads of non-English artists get false-quarantined.
These pure helpers let callers DETECT that situation — "one side is
written in a non-Latin script, the other in Latin" — so the comparison
logic can stop treating an untranslatable title/artist as evidence the
file is wrong.
Deliberately conservative: a single accented Latin character (``é``,
``ñ``, ``ü``) is still Latin, NOT a script mismatch. Only genuinely
different writing systems (CJK, Hangul, Cyrillic, Greek, Arabic,
Hebrew, Thai, …) count as "non-Latin".
"""
from __future__ import annotations
# Unicode ranges for non-Latin writing systems we treat as a "hard"
# script difference. Latin (incl. Latin-1 Supplement / Extended with
# diacritics) is intentionally absent — accented Latin is still Latin.
# CJK ranges mirror core.matching_engine's issue #722 detection so the
# two stay consistent.
#
# Each entry is an inclusive (low, high) pair of single code points.
# Bounds are written as \uXXXX escapes instead of literal characters:
# several block boundaries are unassigned or visually ambiguous code
# points that viewers and copy/paste silently drop, and an empty-string
# bound breaks the ``lo <= c <= hi`` test ('' sorts before every
# character, so an empty lower bound would swallow ASCII, while an
# empty upper bound matches nothing).
#
# NOTE(review): the bounds are the standard Unicode block boundaries
# for the scripts named in the comments. The Hangul Syllables upper
# bound and the Halfwidth pair were reconstructed (U+FF65-U+FFDC starts
# after the fullwidth *Latin* forms, which must stay classified as
# Latin) — confirm against core.matching_engine.
_NONLATIN_RANGES = (
    ('\u0370', '\u03ff'),  # Greek and Coptic
    ('\u0400', '\u04ff'),  # Cyrillic
    ('\u0500', '\u052f'),  # Cyrillic Supplement
    ('\u0590', '\u05ff'),  # Hebrew
    ('\u0600', '\u06ff'),  # Arabic
    ('\u0750', '\u077f'),  # Arabic Supplement
    ('\u0e00', '\u0e7f'),  # Thai
    ('\u2e80', '\u2eff'),  # CJK Radicals Supplement
    ('\u3040', '\u309f'),  # Hiragana
    ('\u30a0', '\u30ff'),  # Katakana
    ('\u3400', '\u4dbf'),  # CJK Unified Ideographs Extension A
    ('\u4e00', '\u9fff'),  # CJK Unified Ideographs
    ('\uac00', '\ud7af'),  # Hangul Syllables
    ('\uf900', '\ufaff'),  # CJK Compatibility Ideographs
    ('\uff65', '\uffdc'),  # Halfwidth Katakana / Hangul
)
def _is_nonlatin_char(c: str) -> bool:
    """Report whether ``c`` falls inside any ``_NONLATIN_RANGES`` entry.

    Every entry is treated as an inclusive ``(low, high)`` pair and the
    check is ordinary string ordering, so ``c`` is expected to be a
    single character.
    """
    return any(low <= c <= high for low, high in _NONLATIN_RANGES)
def has_strong_nonlatin(text: str) -> bool:
    """Return True if any character of ``text`` is in a non-Latin script.

    Latin with diacritics (``Beyoncé``, ``Sigur Rós``, ``Mötley Crüe``)
    yields False because no character lands in a non-Latin range, while
    ``久石譲``, ``Дмитрий`` and ``방탄소년단`` yield True. Empty or
    otherwise falsy input yields False.
    """
    if text:
        # Stop at the first character that belongs to a non-Latin range.
        for ch in text:
            if _is_nonlatin_char(ch):
                return True
    return False
def _has_latin_letter(text: str) -> bool:
    """Return True if ``text`` holds at least one ASCII letter (A-Z / a-z).

    Digits, punctuation, whitespace and non-ASCII characters do not
    count. Empty or otherwise falsy input yields False.
    """
    # ``text or ""`` folds the falsy-input case into an empty iteration.
    for ch in text or "":
        if "a" <= ch <= "z" or "A" <= ch <= "Z":
            return True
    return False
def is_cross_script_mismatch(a: str, b: str) -> bool:
    """Return True when exactly one of ``a`` / ``b`` is non-Latin text.

    A mismatch is reported only if one side contains a non-Latin-script
    character (per ``has_strong_nonlatin``) and the opposite side
    contains at least one ASCII letter (per ``_has_latin_letter``).
    In that situation a raw similarity score between the two strings
    says nothing about whether they name the same thing — e.g. a
    romanized name against its native-script form.

    The result does not depend on argument order. False is returned
    when:
      - neither side has non-Latin characters (Latin vs Latin),
      - both sides have non-Latin characters,
      - the side without non-Latin characters has no ASCII letter
        (empty, digits or punctuation only).
    """
    a_foreign = has_strong_nonlatin(a)
    b_foreign = has_strong_nonlatin(b)

    if a_foreign != b_foreign:
        # Exactly one side is non-Latin; the other must contain real
        # Latin letters for this to be a genuine cross-script pair.
        latin_candidate = b if a_foreign else a
        return _has_latin_letter(latin_candidate)

    # Both sides are in the same script class, so an ordinary
    # similarity comparison remains meaningful.
    return False