# SoulSync/core/text/normalize.py

"""Shared text-normalization helpers.

Extracted from `MusicDatabase._normalize_for_comparison` so callers
outside the database layer (matching engine, sync candidate pool,
import comparisons) don't have to reach across the module boundary
into a leading-underscore "private" method.

Pure functions, no I/O.
"""

from __future__ import annotations

import logging
import unicodedata

logger = logging.getLogger(__name__)

# ``unidecode`` is an optional dependency. When it is missing we record that
# fact once at import time and fall back to a stdlib-only accent fold (see
# ``_fold_accents_stdlib``), which covers fewer characters.
try:
    from unidecode import unidecode as _unidecode
    _HAS_UNIDECODE = True
except ImportError:
    _unidecode = None  # type: ignore[assignment]
    _HAS_UNIDECODE = False
    logger.warning("unidecode not available, accent matching may be limited")


def _fold_accents_stdlib(text: str) -> str:
    """Strip Latin-style diacritics using only the standard library.

    The text is canonically decomposed (NFD), code points in the Combining
    Diacritical Marks block (U+0300-U+036F) are dropped, and the result is
    recomposed (NFC). Restricting the strip to that block leaves marks from
    other scripts (e.g. the kana voicing mark U+3099) attached, so letters
    that differ only by such a mark are not merged.

    Letters with no canonical decomposition (``ø``, ``ł``, ``đ``) are left
    unchanged; this is why the fallback is weaker than ``unidecode``.
    """
    decomposed = unicodedata.normalize("NFD", text)
    without_marks = "".join(
        ch for ch in decomposed if not "\u0300" <= ch <= "\u036f"
    )
    return unicodedata.normalize("NFC", without_marks)


def normalize_for_comparison(text: str) -> str:
    """Lowercase + strip whitespace + fold accents.

    ``é → e``, ``ñ → n``, ``Björk → bjork``. Used as the dictionary key
    for the sync candidate pool and for fuzzy library lookups where
    diacritic differences must NOT split a single artist into two pool
    entries.

    When ``unidecode`` is installed the text is transliterated to ASCII
    with it. Otherwise a stdlib fallback removes combining diacritics so
    the examples above still hold; characters that have no canonical
    decomposition are kept as-is in that mode.

    Args:
        text: The string to normalize.

    Returns:
        The normalized key. Empty / falsy input returns ``""`` so callers
        can blindly key dicts with the result.
    """
    if not text:
        return ""
    if _HAS_UNIDECODE:
        text = _unidecode(text)
    else:
        text = _fold_accents_stdlib(text)
    # Strip after folding: the fold step itself may leave edge whitespace.
    return text.lower().strip()