diff --git a/core/library/duplicate_keep.py b/core/library/duplicate_keep.py new file mode 100644 index 00000000..281bf1d7 --- /dev/null +++ b/core/library/duplicate_keep.py @@ -0,0 +1,57 @@ +"""Choosing which copy of a duplicate track to keep. + +The keeper is the highest-quality copy, and **format/lossless is the primary +criterion**: a lossless FLAC must beat a lossy MP3 regardless of the recorded +bitrate number — a FLAC whose bitrate the library scan never populated (a +common gap) still has to win over a 282 kbps MP3. Only *within* the same format +tier do bitrate, then duration, then track number break the tie. + +This was lifted out of the repair worker so the deletion path and the UI's +"Keep Best" recommendation rank identically (previously both ranked by bitrate +first, which kept the MP3 when the FLAC's bitrate was missing), and so the +ranking is unit-testable in isolation. +""" + +from __future__ import annotations + +import os +from typing import Dict, List, Optional, Tuple + +# Format quality rank by file extension (higher = better). Mirrors +# ``auto_import_worker._quality_rank`` so lossless always outranks lossy. +_FORMAT_RANK = { + ".flac": 10, ".wav": 9, ".aiff": 9, ".aif": 9, ".ape": 8, + ".m4a": 7, ".ogg": 6, ".opus": 6, ".mp3": 5, ".aac": 5, ".wma": 3, +} + + +def format_rank_for_path(file_path: Optional[str]) -> int: + """Quality rank for a file by extension (higher = better, unknown = 1).""" + if not file_path: + return 1 + ext = os.path.splitext(str(file_path))[1].lower() + return _FORMAT_RANK.get(ext, 1) + + +def duplicate_keep_sort_key(track: Dict) -> Tuple[int, int, float, int]: + """Sort key for picking the keeper — the higher tuple wins. + + Order of precedence: format/lossless tier, then bitrate, then duration, + then track number (a real number over a placeholder ``01``). Putting format + first is the whole point — it makes FLAC beat MP3 even when the FLAC's + bitrate is 0/missing in the DB. + """ + return ( + format_rank_for_path(track.get("file_path")), + track.get("bitrate") or 0, + track.get("duration") or 0, + track.get("track_number") or 0, + ) + + +def pick_duplicate_to_keep(tracks: List[Dict]) -> Optional[Dict]: + """Return the copy to keep from a duplicate group (highest sort key), or + ``None`` if the group is empty.""" + if not tracks: + return None + return max(tracks, key=duplicate_keep_sort_key) diff --git a/core/repair_worker.py b/core/repair_worker.py index 9af2ff30..87ba8abb 100644 --- a/core/repair_worker.py +++ b/core/repair_worker.py @@ -1364,8 +1364,11 @@ class RepairWorker: return {'success': False, 'error': f'Selected track ID {keep_id} not found in duplicates'} best_id = keep_id else: - # Auto-pick: highest bitrate, then longest duration, then highest track number (correct > 01) - best = max(tracks, key=lambda t: (t.get('bitrate', 0) or 0, t.get('duration', 0) or 0, t.get('track_number', 0) or 0)) + # Auto-pick the keeper: lossless format first (so a FLAC beats an + # MP3 even when the FLAC's bitrate is missing in the DB), then + # bitrate, duration, and track number as tie-breakers. + from core.library.duplicate_keep import pick_duplicate_to_keep + best = pick_duplicate_to_keep(tracks) best_id = best.get('track_id') or best.get('id') if not best_id: diff --git a/tests/test_duplicate_keep.py b/tests/test_duplicate_keep.py new file mode 100644 index 00000000..8c8bf0d4 --- /dev/null +++ b/tests/test_duplicate_keep.py @@ -0,0 +1,95 @@ +"""Tests for duplicate-keeper selection (core/library/duplicate_keep.py). + +The headline contract: lossless format wins over lossy regardless of the +recorded bitrate — the bug a user hit was a FLAC (no bitrate in the DB) being +deleted in favor of a 282 kbps MP3 because the old ranking compared bitrate +first. +""" + +from __future__ import annotations + +from core.library.duplicate_keep import ( + duplicate_keep_sort_key, + format_rank_for_path, + pick_duplicate_to_keep, +) + + +def _t(path, bitrate=None, duration=None, track_number=None, tid=1): + return {"id": tid, "file_path": path, "bitrate": bitrate, + "duration": duration, "track_number": track_number} + + +# --- the reported regression -------------------------------------------------- + + +def test_flac_with_missing_bitrate_beats_282kbps_mp3(): + # Havok "Prepare For Attack": FLAC has no bitrate recorded, MP3 is 282 kbps. + mp3 = _t("/music/Havok/01 - Prepare For Attack.mp3", bitrate=282, duration=236, tid=1) + flac = _t("/music/Havok/01 - Prepare for Attack.flac", bitrate=None, duration=236, tid=2) + keep = pick_duplicate_to_keep([mp3, flac]) + assert keep["id"] == 2 # the FLAC + + +def test_flac_beats_mp3_regardless_of_order(): + mp3 = _t("/x/a.mp3", bitrate=320, tid=1) + flac = _t("/x/a.flac", bitrate=0, tid=2) + assert pick_duplicate_to_keep([mp3, flac])["id"] == 2 + assert pick_duplicate_to_keep([flac, mp3])["id"] == 2 + + +# --- format ranking ----------------------------------------------------------- + + +def test_format_rank_lossless_outranks_lossy(): + assert format_rank_for_path("a.flac") > format_rank_for_path("a.mp3") + assert format_rank_for_path("a.wav") > format_rank_for_path("a.aac") + assert format_rank_for_path("a.m4a") > format_rank_for_path("a.mp3") + + +def test_format_rank_unknown_and_missing(): + assert format_rank_for_path("a.xyz") == 1 + assert format_rank_for_path("noext") == 1 + assert format_rank_for_path(None) == 1 + assert format_rank_for_path("") == 1 + + +def test_format_rank_case_insensitive(): + assert format_rank_for_path("A.FLAC") == format_rank_for_path("a.flac") + + +# --- tie-breakers within the same format ------------------------------------- + + +def test_same_format_higher_bitrate_wins(): + lo = _t("/x/a.mp3", bitrate=192, tid=1) + hi = _t("/x/b.mp3", bitrate=320, tid=2) + assert pick_duplicate_to_keep([lo, hi])["id"] == 2 + + +def test_same_format_same_bitrate_longer_duration_wins(): + short = _t("/x/a.flac", bitrate=900, duration=200, tid=1) + long = _t("/x/b.flac", bitrate=900, duration=240, tid=2) + assert pick_duplicate_to_keep([short, long])["id"] == 2 + + +def test_track_number_is_final_tiebreak(): + a = _t("/x/a.flac", bitrate=900, duration=240, track_number=1, tid=1) + b = _t("/x/b.flac", bitrate=900, duration=240, track_number=7, tid=2) + assert pick_duplicate_to_keep([a, b])["id"] == 2 + + +# --- shape / edge cases ------------------------------------------------------- + + +def test_sort_key_tuple_order_is_format_first(): + key = duplicate_keep_sort_key(_t("/x/a.flac", bitrate=100, duration=5, track_number=3)) + assert key == (10, 100, 5, 3) + + +def test_missing_numeric_fields_default_to_zero(): + assert duplicate_keep_sort_key(_t("/x/a.mp3")) == (5, 0, 0, 0) + + +def test_empty_group_returns_none(): + assert pick_duplicate_to_keep([]) is None diff --git a/webui/static/enrichment.js b/webui/static/enrichment.js index 0cf9ced6..b82708d7 100644 --- a/webui/static/enrichment.js +++ b/webui/static/enrichment.js @@ -2859,12 +2859,21 @@ function _renderFindingDetail(f) { case 'duplicate_tracks': if (!d.tracks || !d.tracks.length) return _gridRows([['Count', d.count || '?']]); - // Determine best copy (same logic as backend: highest bitrate, then duration, then track number) + // Determine best copy — same logic as the backend + // (core/library/duplicate_keep.py): lossless format first, so a FLAC + // beats an MP3 even when the FLAC's bitrate is missing, then bitrate, + // duration, track number. + const _dupFmtRank = (fp) => { + const r = { flac: 10, wav: 9, aiff: 9, aif: 9, ape: 8, m4a: 7, ogg: 6, opus: 6, mp3: 5, aac: 5, wma: 3 }; + return r[String(fp || '').split('.').pop().toLowerCase()] || 1; + }; + const _dupKey = (t) => [_dupFmtRank(t.file_path), t.bitrate || 0, t.duration || 0, t.track_number || 0]; const bestDup = d.tracks.reduce((best, t) => { - const bBr = best.bitrate || 0, tBr = t.bitrate || 0; - const bDur = best.duration || 0, tDur = t.duration || 0; - const bTn = best.track_number || 0, tTn = t.track_number || 0; - return (tBr > bBr || (tBr === bBr && tDur > bDur) || (tBr === bBr && tDur === bDur && tTn > bTn)) ? t : best; + const bk = _dupKey(best), tk = _dupKey(t); + for (let i = 0; i < bk.length; i++) { + if (tk[i] !== bk[i]) return tk[i] > bk[i] ? t : best; + } + return best; }, d.tracks[0]); const findingId = f.id; return media + `
${d.tracks.map((t, i) => {