SoulSync/core/metadata/canonical_version.py

"""Pick the canonical album release by best-fit to the user's actual files.

Issue #765 / #767-Bug2: SoulSync never pins ONE canonical album version per
album, so the Library Reorganizer, Track Number Repair, and tagging each
re-resolve independently and can land on different releases (standard vs
deluxe; Spotify vs MusicBrainz track numbering) and contradict each other.

This module is the pure, testable heart of the fix: given the metadata of the
files actually on disk and a set of candidate releases, score each release by
how well it FITS those files and pick the best. "Best-fit to the files" means:

  - track-count fit   — a 17-track deluxe is a poor fit for 11 files on disk
  - duration alignment — each file should line up with a release track by length
  - title overlap      — a tiebreaker / sanity check

What this does and does NOT solve:
  - It DOES pick the right EDITION (standard vs deluxe) — the discriminating
    signal is track count + durations.
  - It does NOT (and cannot) decide which of two listings of the SAME album is
    "more correct" when they differ only in track numbering (same files match
    both equally). Instead ``pick_canonical_release`` is DETERMINISTIC and
    breaks ties toward the earlier candidate — so the caller passes candidates
    in source-priority order and every tool that reads the pinned result agrees
    on the same release. Agreement is what resolves #765, not picking a
    "winner" of the numbering disagreement.

Pure, no I/O. Callers fetch candidate tracklists and read on-disk file metadata;
this module only scores.
"""

from __future__ import annotations

import re
from difflib import SequenceMatcher
from typing import Any, Dict, List, Optional, Tuple

# Weights for the three fit signals. Count + duration dominate because "matches
# my files" is fundamentally about having the right NUMBER of the right-LENGTH
# tracks; title is a tiebreaker. Missing signals are dropped and the present
# ones renormalized (see _combine).
_W_COUNT = 0.4
_W_DURATION = 0.4
_W_TITLE = 0.2

_DEFAULT_DURATION_TOLERANCE_MS = 3000  # ±3s — covers encode/version length jitter
_DEFAULT_MIN_SCORE = 0.5               # never pin below this — leave unresolved
_TITLE_FUZZY_THRESHOLD = 0.85


def _norm_title(text: str) -> str:
    """Lowercase, drop bracketed qualifiers ((feat. …), [Remastered]), strip
    punctuation, collapse whitespace."""
    if not text:
        return ""
    t = str(text).lower()
    t = re.sub(r"[\(\[].*?[\)\]]", "", t)
    t = re.sub(r"[^a-z0-9 ]", " ", t)
    return " ".join(t.split())


def _count_fit(n_files: int, n_release: int) -> float:
    """1.0 when track counts match; decays with the relative difference."""
    if n_files <= 0 or n_release <= 0:
        return 0.0
    return 1.0 - min(1.0, abs(n_files - n_release) / max(n_files, n_release))


def _duration_fit(
    file_tracks: List[Dict[str, Any]],
    release_tracks: List[Dict[str, Any]],
    tolerance_ms: int,
) -> Optional[float]:
    """Fraction of tracks that line up by duration (greedy nearest match within
    tolerance), over the larger of the two track counts — so missing or extra
    tracks are penalised. Returns ``None`` when neither side has durations."""
    f_durs = [int(f["duration_ms"]) for f in file_tracks if f.get("duration_ms")]
    r_durs = [int(r["duration_ms"]) for r in release_tracks if r.get("duration_ms")]
    if not f_durs or not r_durs:
        return None
    used = [False] * len(r_durs)
    matched = 0
    for fd in f_durs:
        best_j, best_diff = -1, tolerance_ms + 1
        for j, rd in enumerate(r_durs):
            if used[j]:
                continue
            diff = abs(fd - rd)
            if diff <= tolerance_ms and diff < best_diff:
                best_diff, best_j = diff, j
        if best_j >= 0:
            used[best_j] = True
            matched += 1
    denom = max(len(file_tracks), len(release_tracks))
    return matched / denom if denom else 0.0


def _title_fit(
    file_tracks: List[Dict[str, Any]],
    release_tracks: List[Dict[str, Any]],
) -> Optional[float]:
    """Fraction of files whose title matches some release title (exact-normalised
    or fuzzy), over the larger track count. ``None`` when titles are absent."""
    f_titles = [_norm_title(f.get("title", "")) for f in file_tracks]
    f_titles = [t for t in f_titles if t]
    r_titles = [_norm_title(r.get("title", "")) for r in release_tracks]
    r_titles = [t for t in r_titles if t]
    if not f_titles or not r_titles:
        return None
    r_set = set(r_titles)
    matched = 0
    for ft in f_titles:
        if ft in r_set or any(
            SequenceMatcher(None, ft, rt).ratio() >= _TITLE_FUZZY_THRESHOLD
            for rt in r_titles
        ):
            matched += 1
    denom = max(len(file_tracks), len(release_tracks))
    return matched / denom if denom else 0.0


def _combine(parts: List[Tuple[Optional[float], float]]) -> float:
    """Weighted mean over present (non-None) components, renormalising weights."""
    present = [(v, w) for v, w in parts if v is not None]
    total_w = sum(w for _, w in present)
    if total_w <= 0:
        return 0.0
    return sum(v * w for v, w in present) / total_w


def score_release_against_files(
    file_tracks: List[Dict[str, Any]],
    release_tracks: List[Dict[str, Any]],
    *,
    duration_tolerance_ms: int = _DEFAULT_DURATION_TOLERANCE_MS,
) -> float:
    """Score 0.0–1.0 of how well ``release_tracks`` fits the on-disk
    ``file_tracks``. Each track dict may carry ``duration_ms`` and ``title``;
    missing signals are dropped and the rest renormalised so the function never
    crashes on sparse metadata (it just leans on what's available)."""
    if not file_tracks or not release_tracks:
        return 0.0
    count = _count_fit(len(file_tracks), len(release_tracks))
    dur = _duration_fit(file_tracks, release_tracks, duration_tolerance_ms)
    title = _title_fit(file_tracks, release_tracks)
    return _combine([(count, _W_COUNT), (dur, _W_DURATION), (title, _W_TITLE)])


def score_release_detail(
    file_tracks: List[Dict[str, Any]],
    release_tracks: List[Dict[str, Any]],
    *,
    duration_tolerance_ms: int = _DEFAULT_DURATION_TOLERANCE_MS,
) -> Dict[str, Any]:
    """Like ``score_release_against_files`` but returns the per-signal breakdown
    so a UI can show WHY a release scored the way it did. ``duration_fit`` /
    ``title_fit`` are ``None`` when that signal was absent."""
    if not file_tracks or not release_tracks:
        return {
            'score': 0.0, 'count_fit': 0.0, 'duration_fit': None, 'title_fit': None,
            'release_track_count': len(release_tracks), 'file_track_count': len(file_tracks),
        }
    count = _count_fit(len(file_tracks), len(release_tracks))
    dur = _duration_fit(file_tracks, release_tracks, duration_tolerance_ms)
    title = _title_fit(file_tracks, release_tracks)
    score = _combine([(count, _W_COUNT), (dur, _W_DURATION), (title, _W_TITLE)])
    return {
        'score': round(score, 4),
        'count_fit': round(count, 3),
        'duration_fit': round(dur, 3) if dur is not None else None,
        'title_fit': round(title, 3) if title is not None else None,
        'release_track_count': len(release_tracks),
        'file_track_count': len(file_tracks),
    }


def pick_canonical_release(
    file_tracks: List[Dict[str, Any]],
    candidates: List[Dict[str, Any]],
    *,
    min_score: float = _DEFAULT_MIN_SCORE,
    duration_tolerance_ms: int = _DEFAULT_DURATION_TOLERANCE_MS,
) -> Tuple[Optional[Dict[str, Any]], float]:
    """Choose the best-fit candidate release for the on-disk files.

    ``candidates`` is a list of dicts each with a ``'tracks'`` list (plus any
    caller fields like ``source``/``album_id``, returned untouched). **Pass
    candidates in source-priority order** — ties break toward the EARLIER one,
    so the choice is deterministic and priority-respecting (this is what makes
    every tool agree, #765).

    Returns ``(best_candidate, score)``, or ``(None, best_score)`` when nothing
    clears ``min_score`` — so a low-confidence guess is never pinned (the caller
    leaves the album unresolved and falls back to today's behaviour)."""
    best: Optional[Dict[str, Any]] = None
    best_score = 0.0
    for cand in candidates:
        score = score_release_against_files(
            file_tracks, cand.get("tracks") or [],
            duration_tolerance_ms=duration_tolerance_ms,
        )
        # Strictly-greater so equal scores keep the earlier (higher-priority)
        # candidate — deterministic tiebreak.
        if score > best_score + 1e-9:
            best, best_score = cand, score
    if best is None or best_score < min_score:
        return None, best_score
    return best, best_score


__all__ = ["score_release_against_files", "pick_canonical_release"]