You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/core/metadata/canonical_version.py

211 lines
8.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""Pick the canonical album release by best-fit to the user's actual files.
Issue #765 / #767-Bug2: SoulSync never pins ONE canonical album version per
album, so the Library Reorganizer, Track Number Repair, and tagging each
re-resolve independently and can land on different releases (standard vs
deluxe; Spotify vs MusicBrainz track numbering) and contradict each other.
This module is the pure, testable heart of the fix: given the metadata of the
files actually on disk and a set of candidate releases, score each release by
how well it FITS those files and pick the best. "Best-fit to the files" means:
- track-count fit — a 17-track deluxe is a poor fit for 11 files on disk
- duration alignment — each file should line up with a release track by length
- title overlap — a tiebreaker / sanity check
What this does and does NOT solve:
- It DOES pick the right EDITION (standard vs deluxe) — the discriminating
signal is track count + durations.
- It does NOT (and cannot) decide which of two listings of the SAME album is
"more correct" when they differ only in track numbering (same files match
both equally). Instead ``pick_canonical_release`` is DETERMINISTIC and
breaks ties toward the earlier candidate — so the caller passes candidates
in source-priority order and every tool that reads the pinned result agrees
on the same release. Agreement is what resolves #765, not picking a
"winner" of the numbering disagreement.
Pure, no I/O. Callers fetch candidate tracklists and read on-disk file metadata;
this module only scores.
"""
from __future__ import annotations
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List, Optional, Tuple
# Weights for the three fit signals. Count + duration dominate because "matches
# my files" is fundamentally about having the right NUMBER of the right-LENGTH
# tracks; title is a tiebreaker. Missing signals are dropped and the present
# ones renormalized (see _combine).
# The three weights sum to 1.0, so when every signal is present the combined
# score is a plain weighted average.
_W_COUNT = 0.4
_W_DURATION = 0.4
_W_TITLE = 0.2
# Default for the ``duration_tolerance_ms`` keyword of the scoring functions.
_DEFAULT_DURATION_TOLERANCE_MS = 3000 # ±3s — covers encode/version length jitter
# Default for the ``min_score`` keyword of ``pick_canonical_release``.
_DEFAULT_MIN_SCORE = 0.5 # never pin below this — leave unresolved
# Minimum ``SequenceMatcher.ratio()`` for two normalised titles to count as a
# fuzzy match in ``_title_fit``.
_TITLE_FUZZY_THRESHOLD = 0.85
def _norm_title(text: str) -> str:
"""Lowercase, drop bracketed qualifiers ((feat. …), [Remastered]), strip
punctuation, collapse whitespace."""
if not text:
return ""
t = str(text).lower()
t = re.sub(r"[\(\[].*?[\)\]]", "", t)
t = re.sub(r"[^a-z0-9 ]", " ", t)
return " ".join(t.split())
def _count_fit(n_files: int, n_release: int) -> float:
"""1.0 when track counts match; decays with the relative difference."""
if n_files <= 0 or n_release <= 0:
return 0.0
return 1.0 - min(1.0, abs(n_files - n_release) / max(n_files, n_release))
def _duration_fit(
file_tracks: List[Dict[str, Any]],
release_tracks: List[Dict[str, Any]],
tolerance_ms: int,
) -> Optional[float]:
"""Fraction of tracks that line up by duration (greedy nearest match within
tolerance), over the larger of the two track counts — so missing or extra
tracks are penalised. Returns ``None`` when neither side has durations."""
f_durs = [int(f["duration_ms"]) for f in file_tracks if f.get("duration_ms")]
r_durs = [int(r["duration_ms"]) for r in release_tracks if r.get("duration_ms")]
if not f_durs or not r_durs:
return None
used = [False] * len(r_durs)
matched = 0
for fd in f_durs:
best_j, best_diff = -1, tolerance_ms + 1
for j, rd in enumerate(r_durs):
if used[j]:
continue
diff = abs(fd - rd)
if diff <= tolerance_ms and diff < best_diff:
best_diff, best_j = diff, j
if best_j >= 0:
used[best_j] = True
matched += 1
denom = max(len(file_tracks), len(release_tracks))
return matched / denom if denom else 0.0
def _title_fit(
    file_tracks: List[Dict[str, Any]],
    release_tracks: List[Dict[str, Any]],
) -> Optional[float]:
    """Share of tracks whose titles line up.

    A file counts as matched when its normalised title equals some normalised
    release title, or is at least ``_TITLE_FUZZY_THRESHOLD`` similar to one.
    The match count is divided by the larger of the two track counts.

    Returns ``None`` when either side has no non-empty normalised titles.
    """
    file_names = [
        name
        for name in (_norm_title(track.get("title", "")) for track in file_tracks)
        if name
    ]
    release_names = [
        name
        for name in (_norm_title(track.get("title", "")) for track in release_tracks)
        if name
    ]
    if not file_names or not release_names:
        return None

    exact_names = set(release_names)

    def _has_counterpart(name: str) -> bool:
        # Cheap exact lookup first; fall back to the fuzzy scan.
        if name in exact_names:
            return True
        for other in release_names:
            if SequenceMatcher(None, name, other).ratio() >= _TITLE_FUZZY_THRESHOLD:
                return True
        return False

    hits = sum(1 for name in file_names if _has_counterpart(name))
    larger = max(len(file_tracks), len(release_tracks))
    return hits / larger if larger else 0.0
def _combine(parts: List[Tuple[Optional[float], float]]) -> float:
"""Weighted mean over present (non-None) components, renormalising weights."""
present = [(v, w) for v, w in parts if v is not None]
total_w = sum(w for _, w in present)
if total_w <= 0:
return 0.0
return sum(v * w for v, w in present) / total_w
def score_release_against_files(
    file_tracks: List[Dict[str, Any]],
    release_tracks: List[Dict[str, Any]],
    *,
    duration_tolerance_ms: int = _DEFAULT_DURATION_TOLERANCE_MS,
) -> float:
    """Score (0.0–1.0) how well ``release_tracks`` fits the on-disk
    ``file_tracks``.

    Track dicts may carry ``duration_ms`` and ``title``. A signal that is
    unavailable is left out and the remaining weights are renormalised, so
    sparse metadata just shifts the score onto whatever is present. Returns
    0.0 when either track list is empty.
    """
    if not (file_tracks and release_tracks):
        return 0.0
    signals = [
        (_count_fit(len(file_tracks), len(release_tracks)), _W_COUNT),
        (_duration_fit(file_tracks, release_tracks, duration_tolerance_ms), _W_DURATION),
        (_title_fit(file_tracks, release_tracks), _W_TITLE),
    ]
    return _combine(signals)
def score_release_detail(
    file_tracks: List[Dict[str, Any]],
    release_tracks: List[Dict[str, Any]],
    *,
    duration_tolerance_ms: int = _DEFAULT_DURATION_TOLERANCE_MS,
) -> Dict[str, Any]:
    """Like ``score_release_against_files`` but returns the per-signal breakdown
    so a UI can show WHY a release scored the way it did.

    Returned keys: ``score`` (rounded to 4 places), ``count_fit``,
    ``duration_fit``, ``title_fit`` (each rounded to 3 places),
    ``release_track_count`` and ``file_track_count``. ``duration_fit`` /
    ``title_fit`` are ``None`` when that signal was absent.

    When either track list is empty or ``None`` the score is 0.0 and the
    counts report 0 for the missing side instead of raising.
    """
    if not file_tracks or not release_tracks:
        # ``or []`` so a ``None`` track list reports a count of 0 rather than
        # crashing in ``len()``; the sibling scorer also tolerates ``None``.
        return {
            'score': 0.0, 'count_fit': 0.0, 'duration_fit': None, 'title_fit': None,
            'release_track_count': len(release_tracks or []),
            'file_track_count': len(file_tracks or []),
        }
    count = _count_fit(len(file_tracks), len(release_tracks))
    dur = _duration_fit(file_tracks, release_tracks, duration_tolerance_ms)
    title = _title_fit(file_tracks, release_tracks)
    score = _combine([(count, _W_COUNT), (dur, _W_DURATION), (title, _W_TITLE)])
    return {
        'score': round(score, 4),
        'count_fit': round(count, 3),
        'duration_fit': round(dur, 3) if dur is not None else None,
        'title_fit': round(title, 3) if title is not None else None,
        'release_track_count': len(release_tracks),
        'file_track_count': len(file_tracks),
    }
def pick_canonical_release(
    file_tracks: List[Dict[str, Any]],
    candidates: List[Dict[str, Any]],
    *,
    min_score: float = _DEFAULT_MIN_SCORE,
    duration_tolerance_ms: int = _DEFAULT_DURATION_TOLERANCE_MS,
) -> Tuple[Optional[Dict[str, Any]], float]:
    """Select the candidate release that best fits the on-disk files.

    Each entry of ``candidates`` is a dict with a ``'tracks'`` list; any other
    keys are the caller's and the chosen dict is returned as-is. **Order the
    candidates by source priority**: on equal scores the EARLIER candidate
    wins, which keeps the choice deterministic and lets every tool agree on
    the same release (#765).

    Returns ``(best_candidate, score)``. When no candidate reaches
    ``min_score`` the result is ``(None, best_score)``, so a low-confidence
    guess is never pinned and the caller can leave the album unresolved.
    """
    winner: Optional[Dict[str, Any]] = None
    top_score = 0.0
    for candidate in candidates:
        tracks = candidate.get("tracks") or []
        fit = score_release_against_files(
            file_tracks, tracks, duration_tolerance_ms=duration_tolerance_ms,
        )
        # Only a strictly better score (beyond float noise) displaces the
        # current winner, so ties stay with the higher-priority candidate.
        if fit > top_score + 1e-9:
            winner = candidate
            top_score = fit
    rejected = winner is None or top_score < min_score
    return (None if rejected else winner), top_score
# Public API of this module. ``score_release_detail`` is a public, documented
# scorer (per-signal breakdown), so it is exported alongside the other two.
__all__ = [
    "score_release_against_files",
    "score_release_detail",
    "pick_canonical_release",
]