"""Writing-system (script) compatibility helpers for metadata comparison. Issue #797 — AcoustID returns a recording's title/artist in their *original* script (e.g. ``久石譲`` for Joe Hisaishi) while SoulSync's expected metadata is romanized / English (``Joe Hisaishi``). A raw string-similarity comparison between two different writing systems scores ~0 even when they name the very same artist, so correct downloads of non-English artists get false-quarantined. These pure helpers let callers DETECT that situation — "one side is written in a non-Latin script, the other in Latin" — so the comparison logic can stop treating an untranslatable title/artist as evidence the file is wrong. Deliberately conservative: a single accented Latin character (``é``, ``ñ``, ``ü``) is still Latin, NOT a script mismatch. Only genuinely different writing systems (CJK, Hangul, Cyrillic, Greek, Arabic, Hebrew, Thai, …) count as "non-Latin". """ from __future__ import annotations # Unicode ranges for non-Latin writing systems we treat as a "hard" # script difference. Latin (incl. Latin-1 Supplement / Extended with # diacritics) is intentionally absent — accented Latin is still Latin. # CJK ranges mirror core.matching_engine's issue #722 detection so the # two stay consistent. _NONLATIN_RANGES = ( ('Ͱ', 'Ͽ'), # Greek and Coptic ('Ѐ', 'ӿ'), # Cyrillic ('Ԁ', 'ԯ'), # Cyrillic Supplement ('֐', '׿'), # Hebrew ('؀', 'ۿ'), # Arabic ('ݐ', 'ݿ'), # Arabic Supplement ('฀', '๿'), # Thai ('⺀', '⻿'), # CJK Radicals Supplement ('぀', 'ゟ'), # Hiragana ('゠', 'ヿ'), # Katakana ('㐀', '䶿'), # CJK Unified Ideographs Extension A ('一', '鿿'), # CJK Unified Ideographs ('가', '힯'), # Hangul Syllables ('豈', '﫿'), # CJK Compatibility Ideographs ('ヲ', 'ᅵ'), # Halfwidth Katakana / Hangul ) def _is_nonlatin_char(c: str) -> bool: """True when ``c`` belongs to a non-Latin writing system.""" for lo, hi in _NONLATIN_RANGES: if lo <= c <= hi: return True return False def has_strong_nonlatin(text: str) -> bool: """True when ``text`` contains at least one non-Latin-script letter. Accented Latin (``Beyoncé``, ``Sigur Rós``, ``Mötley Crüe``) returns False — those are Latin. ``久石譲``, ``Дмитрий``, ``방탄소년단`` return True. """ if not text: return False return any(_is_nonlatin_char(c) for c in text) def _has_latin_letter(text: str) -> bool: """True when ``text`` contains an ASCII A–Z / a–z letter.""" if not text: return False return any(('a' <= c <= 'z') or ('A' <= c <= 'Z') for c in text) def is_cross_script_mismatch(a: str, b: str) -> bool: """True when ``a`` and ``b`` are written in different scripts. Specifically: exactly one side uses a non-Latin writing system while the other is genuine Latin text. This is the signal that a raw similarity score between the two is meaningless (a romanized name vs its native-script form), NOT that they name different things. Symmetric. Returns False when: - both sides are Latin (ordinary English-vs-English comparison), - both sides are non-Latin (same-script comparison still works), - either side is empty / has no comparable letters. """ a_nonlatin = has_strong_nonlatin(a) b_nonlatin = has_strong_nonlatin(b) if a_nonlatin == b_nonlatin: # Same script class on both sides (or neither has non-Latin) — # similarity comparison is meaningful, no script bridge needed. return False # Exactly one side is non-Latin. It's only a true cross-script case # if the OTHER side is real Latin text (not punctuation / digits). if a_nonlatin: return _has_latin_letter(b) return _has_latin_letter(a)