You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
SoulSync/core/text/normalize.py

42 lines
1.2 KiB

"""Shared text-normalization helpers.
Extracted from `MusicDatabase._normalize_for_comparison` so callers
outside the database layer (matching engine, sync candidate pool,
import comparisons) don't have to reach across the module boundary
into a leading-underscore "private" method.
Pure functions, no I/O.
"""
from __future__ import annotations
import logging
import unicodedata

logger = logging.getLogger(__name__)

# ``unidecode`` is an optional third-party dependency. When it is missing we
# still import cleanly, record that fact in ``_HAS_UNIDECODE`` and warn once
# at import time; ``normalize_for_comparison`` then uses a stdlib fallback.
try:
    from unidecode import unidecode as _unidecode
    _HAS_UNIDECODE = True
except ImportError:
    _unidecode = None  # type: ignore[assignment]
    _HAS_UNIDECODE = False
    logger.warning("unidecode not available, accent matching may be limited")


def _strip_diacritics(text: str) -> str:
    """Return *text* with combining accent marks removed (stdlib only).

    The text is decomposed with NFKD so that an accented letter becomes a
    base letter followed by combining marks, and the combining marks are
    then dropped. Characters without a decomposition (``ø``, ``ß``, CJK,
    ...) are kept unchanged rather than discarded, so distinct non-Latin
    names do not collapse into the same key.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def normalize_for_comparison(text: str) -> str:
    """Lowercase + strip whitespace + fold accents.

    ``é → e``, ``ñ → n``, ``Björk → bjork``. Used as the dictionary key
    for the sync candidate pool and for fuzzy library lookups where
    diacritic differences must NOT split a single artist into two pool
    entries.

    Accent folding uses ``unidecode`` (full ASCII transliteration) when
    it is installed. Otherwise a stdlib fallback removes combining
    diacritics, so common accented Latin letters still fold to their base
    letter; letters with no Unicode decomposition are left as-is in that
    mode.

    Args:
        text: The string to normalize. Falsy values (``""``, ``None``)
            are accepted.

    Returns:
        The normalized string. Empty / falsy input returns ``""`` so
        callers can blindly key dicts with the result.
    """
    if not text:
        return ""
    if _HAS_UNIDECODE:
        text = _unidecode(text)
    else:
        # Without unidecode the original code skipped folding entirely,
        # which let "Björk" and "Bjork" produce different keys.
        text = _strip_diacritics(text)
    return text.lower().strip()