SoulSync/core/library/path_resolve.py

"""Confusable-tolerant filesystem path resolution (#833).

the-hang-man: a track titled "I'm Upset" was written to disk with an ASCII
apostrophe (U+0027) but the library DB stored the title with a typographic one
(U+2019, the form Spotify/Apple metadata uses). Deleting rebuilt the unlink
target from the DB path, so ``os.path.exists`` compared U+2019-bytes against a
U+0027 filename — always a miss — and the file survived ("could not be deleted").

The same byte-exact mismatch hits any on-disk operation that starts from stored
metadata (delete, sidecar cleanup, dead-file checks). The fix is to resolve the
*real* on-disk name: descend the path component by component, taking an exact
match when present and otherwise folding a small set of typographic confusables
(curly vs straight quotes, en/em dash, ellipsis, nbsp) for the comparison ONLY.
We never rename — we just find the file that's actually there.

Case is deliberately preserved: on a case-sensitive dataset (ext4/ZFS) two
tracks can differ only by case, so folding case could delete the wrong file.
The reported failure is purely typographic, so that's all we fold.
"""

from __future__ import annotations

import os
import unicodedata

# Typographic characters that routinely differ between DB metadata (Unicode,
# from streaming-service catalogs) and the ASCII filename on disk. Folded to a
# common form for COMPARISON only.
_CONFUSABLES = {
    '‘': "'", '’': "'", 'ʼ': "'", '′': "'",   # ‘ ’ ʼ ′ → '
    '“': '"', '”': '"', '″': '"',                   # “ ” ″ → "
    '–': '-', '—': '-', '‒': '-', '―': '-',    # – — ‒ ― → -
    '…': '...',                                                # … → ...
    ' ': ' ',                                                  # nbsp → space
}


def fold_confusables(name: str) -> str:
    """Fold typographic confusables + NFC-normalize so a DB name and the real
    on-disk name compare equal despite curly-vs-straight quotes, dashes, etc.
    Case and everything else are left untouched."""
    if not name:
        return ''
    name = unicodedata.normalize('NFC', name)
    for bad, good in _CONFUSABLES.items():
        if bad in name:
            name = name.replace(bad, good)
    return name


def find_on_disk(base_dir: str, suffix_parts):
    """Descend ``base_dir`` following ``suffix_parts`` (the path components of a
    stored file path). Each component is matched exactly when it exists, else by
    confusable-folded comparison against the directory's real entries. Returns
    the real absolute path, or None if any component can't be resolved.

    Exact matches always win — the folded scan only runs for a component that
    isn't present byte-for-byte, so this never changes behaviour for paths that
    already resolve.
    """
    if not base_dir or not os.path.isdir(base_dir):
        return None
    current = base_dir
    for part in suffix_parts:
        if not part:
            continue
        exact = os.path.join(current, part)
        if os.path.exists(exact):
            current = exact
            continue
        target = fold_confusables(part)
        match = None
        try:
            for entry in os.listdir(current):
                if fold_confusables(entry) == target:
                    match = os.path.join(current, entry)
                    break
        except OSError:
            return None
        if match is None:
            return None
        current = match
    return current if current != base_dir else None


__all__ = ['fold_confusables', 'find_on_disk']