From 28850672a66e1c4efb2a4b263d42037dc3cb97da Mon Sep 17 00:00:00 2001 From: BoulderBadgeDad Date: Mon, 1 Jun 2026 12:49:34 -0700 Subject: [PATCH] Fix: duplicate detector kept lossy over lossless (rank format first) The Duplicate Detector's 'Keep Best' auto-selection ranked copies by highest bitrate -> duration -> track number, with no notion of format. A FLAC whose bitrate the library scan never populated (a common gap) therefore lost to a 282 kbps MP3: 282 > 0, so the MP3 was kept and the FLAC deleted (reported on Havok 'Prepare For Attack', and again on Kendrick GNX). Fix: rank by format/lossless tier FIRST, then bitrate, duration, track number. A lossless file now always beats a lossy one regardless of the recorded bitrate; bitrate/duration/track# only break ties within the same format. - core/library/duplicate_keep.py (new): pure, importable pick_duplicate_to_keep + duplicate_keep_sort_key + format_rank_for_path (extension rank mirroring auto_import_worker._quality_rank: flac=10 ... mp3=5 ... unknown=1). - core/repair_worker.py: _fix_duplicates auto-pick now calls pick_duplicate_to_keep instead of the bitrate-first max(). - webui/static/enrichment.js: the KEEP/REMOVE recommendation mirrors the same format-first ranking so the badge matches what the backend will delete. Parity: Python uses '.ext' keys (os.path.splitext), JS uses 'ext' (split('.').pop()) -> identical results; both keep the first copy on a full tie. Verified the only other dedup path (the standalone Duplicate Cleaner automation, core/library/duplicate_cleaner.py) was already format-priority-first and correct -- no change needed there. Tests: tests/test_duplicate_keep.py (11 -- incl. the exact FLAC-with-missing- bitrate vs 282 kbps MP3 case, format ranking, within-format tie-breakers, and edge cases). 147 repair/duplicate tests still pass. Note: why FLAC bitrate is NULL in the DB is a separate library-scan gap; format-first ranking makes the keep decision correct regardless. --- core/library/duplicate_keep.py | 57 ++++++++++++++++++++ core/repair_worker.py | 7 ++- tests/test_duplicate_keep.py | 95 ++++++++++++++++++++++++++++++++++ webui/static/enrichment.js | 19 +++++-- 4 files changed, 171 insertions(+), 7 deletions(-) create mode 100644 core/library/duplicate_keep.py create mode 100644 tests/test_duplicate_keep.py diff --git a/core/library/duplicate_keep.py b/core/library/duplicate_keep.py new file mode 100644 index 00000000..281bf1d7 --- /dev/null +++ b/core/library/duplicate_keep.py @@ -0,0 +1,57 @@ +"""Choosing which copy of a duplicate track to keep. + +The keeper is the highest-quality copy, and **format/lossless is the primary +criterion**: a lossless FLAC must beat a lossy MP3 regardless of the recorded +bitrate number — a FLAC whose bitrate the library scan never populated (a +common gap) still has to win over a 282 kbps MP3. Only *within* the same format +tier do bitrate, then duration, then track number break the tie. + +This was lifted out of the repair worker so the deletion path and the UI's +"Keep Best" recommendation rank identically (previously both ranked by bitrate +first, which kept the MP3 when the FLAC's bitrate was missing), and so the +ranking is unit-testable in isolation. +""" + +from __future__ import annotations + +import os +from typing import Dict, List, Optional, Tuple + +# Format quality rank by file extension (higher = better). Mirrors +# ``auto_import_worker._quality_rank`` so lossless always outranks lossy. +_FORMAT_RANK = { + ".flac": 10, ".wav": 9, ".aiff": 9, ".aif": 9, ".ape": 8, + ".m4a": 7, ".ogg": 6, ".opus": 6, ".mp3": 5, ".aac": 5, ".wma": 3, +} + + +def format_rank_for_path(file_path: Optional[str]) -> int: + """Quality rank for a file by extension (higher = better, unknown = 1).""" + if not file_path: + return 1 + ext = os.path.splitext(str(file_path))[1].lower() + return _FORMAT_RANK.get(ext, 1) + + +def duplicate_keep_sort_key(track: Dict) -> Tuple[int, int, float, int]: + """Sort key for picking the keeper — the higher tuple wins. + + Order of precedence: format/lossless tier, then bitrate, then duration, + then track number (a real number over a placeholder ``01``). Putting format + first is the whole point — it makes FLAC beat MP3 even when the FLAC's + bitrate is 0/missing in the DB. + """ + return ( + format_rank_for_path(track.get("file_path")), + track.get("bitrate") or 0, + track.get("duration") or 0, + track.get("track_number") or 0, + ) + + +def pick_duplicate_to_keep(tracks: List[Dict]) -> Optional[Dict]: + """Return the copy to keep from a duplicate group (highest sort key), or + ``None`` if the group is empty.""" + if not tracks: + return None + return max(tracks, key=duplicate_keep_sort_key) diff --git a/core/repair_worker.py b/core/repair_worker.py index 9af2ff30..87ba8abb 100644 --- a/core/repair_worker.py +++ b/core/repair_worker.py @@ -1364,8 +1364,11 @@ class RepairWorker: return {'success': False, 'error': f'Selected track ID {keep_id} not found in duplicates'} best_id = keep_id else: - # Auto-pick: highest bitrate, then longest duration, then highest track number (correct > 01) - best = max(tracks, key=lambda t: (t.get('bitrate', 0) or 0, t.get('duration', 0) or 0, t.get('track_number', 0) or 0)) + # Auto-pick the keeper: lossless format first (so a FLAC beats an + # MP3 even when the FLAC's bitrate is missing in the DB), then + # bitrate, duration, and track number as tie-breakers. + from core.library.duplicate_keep import pick_duplicate_to_keep + best = pick_duplicate_to_keep(tracks) best_id = best.get('track_id') or best.get('id') if not best_id: diff --git a/tests/test_duplicate_keep.py b/tests/test_duplicate_keep.py new file mode 100644 index 00000000..8c8bf0d4 --- /dev/null +++ b/tests/test_duplicate_keep.py @@ -0,0 +1,95 @@ +"""Tests for duplicate-keeper selection (core/library/duplicate_keep.py). + +The headline contract: lossless format wins over lossy regardless of the +recorded bitrate — the bug a user hit was a FLAC (no bitrate in the DB) being +deleted in favor of a 282 kbps MP3 because the old ranking compared bitrate +first. +""" + +from __future__ import annotations + +from core.library.duplicate_keep import ( + duplicate_keep_sort_key, + format_rank_for_path, + pick_duplicate_to_keep, +) + + +def _t(path, bitrate=None, duration=None, track_number=None, tid=1): + return {"id": tid, "file_path": path, "bitrate": bitrate, + "duration": duration, "track_number": track_number} + + +# --- the reported regression -------------------------------------------------- + + +def test_flac_with_missing_bitrate_beats_282kbps_mp3(): + # Havok "Prepare For Attack": FLAC has no bitrate recorded, MP3 is 282 kbps. + mp3 = _t("/music/Havok/01 - Prepare For Attack.mp3", bitrate=282, duration=236, tid=1) + flac = _t("/music/Havok/01 - Prepare for Attack.flac", bitrate=None, duration=236, tid=2) + keep = pick_duplicate_to_keep([mp3, flac]) + assert keep["id"] == 2 # the FLAC + + +def test_flac_beats_mp3_regardless_of_order(): + mp3 = _t("/x/a.mp3", bitrate=320, tid=1) + flac = _t("/x/a.flac", bitrate=0, tid=2) + assert pick_duplicate_to_keep([mp3, flac])["id"] == 2 + assert pick_duplicate_to_keep([flac, mp3])["id"] == 2 + + +# --- format ranking ----------------------------------------------------------- + + +def test_format_rank_lossless_outranks_lossy(): + assert format_rank_for_path("a.flac") > format_rank_for_path("a.mp3") + assert format_rank_for_path("a.wav") > format_rank_for_path("a.aac") + assert format_rank_for_path("a.m4a") > format_rank_for_path("a.mp3") + + +def test_format_rank_unknown_and_missing(): + assert format_rank_for_path("a.xyz") == 1 + assert format_rank_for_path("noext") == 1 + assert format_rank_for_path(None) == 1 + assert format_rank_for_path("") == 1 + + +def test_format_rank_case_insensitive(): + assert format_rank_for_path("A.FLAC") == format_rank_for_path("a.flac") + + +# --- tie-breakers within the same format ------------------------------------- + + +def test_same_format_higher_bitrate_wins(): + lo = _t("/x/a.mp3", bitrate=192, tid=1) + hi = _t("/x/b.mp3", bitrate=320, tid=2) + assert pick_duplicate_to_keep([lo, hi])["id"] == 2 + + +def test_same_format_same_bitrate_longer_duration_wins(): + short = _t("/x/a.flac", bitrate=900, duration=200, tid=1) + long = _t("/x/b.flac", bitrate=900, duration=240, tid=2) + assert pick_duplicate_to_keep([short, long])["id"] == 2 + + +def test_track_number_is_final_tiebreak(): + a = _t("/x/a.flac", bitrate=900, duration=240, track_number=1, tid=1) + b = _t("/x/b.flac", bitrate=900, duration=240, track_number=7, tid=2) + assert pick_duplicate_to_keep([a, b])["id"] == 2 + + +# --- shape / edge cases ------------------------------------------------------- + + +def test_sort_key_tuple_order_is_format_first(): + key = duplicate_keep_sort_key(_t("/x/a.flac", bitrate=100, duration=5, track_number=3)) + assert key == (10, 100, 5, 3) + + +def test_missing_numeric_fields_default_to_zero(): + assert duplicate_keep_sort_key(_t("/x/a.mp3")) == (5, 0, 0, 0) + + +def test_empty_group_returns_none(): + assert pick_duplicate_to_keep([]) is None diff --git a/webui/static/enrichment.js b/webui/static/enrichment.js index 0cf9ced6..b82708d7 100644 --- a/webui/static/enrichment.js +++ b/webui/static/enrichment.js @@ -2859,12 +2859,21 @@ function _renderFindingDetail(f) { case 'duplicate_tracks': if (!d.tracks || !d.tracks.length) return _gridRows([['Count', d.count || '?']]); - // Determine best copy (same logic as backend: highest bitrate, then duration, then track number) + // Determine best copy — same logic as the backend + // (core/library/duplicate_keep.py): lossless format first, so a FLAC + // beats an MP3 even when the FLAC's bitrate is missing, then bitrate, + // duration, track number. + const _dupFmtRank = (fp) => { + const r = { flac: 10, wav: 9, aiff: 9, aif: 9, ape: 8, m4a: 7, ogg: 6, opus: 6, mp3: 5, aac: 5, wma: 3 }; + return r[String(fp || '').split('.').pop().toLowerCase()] || 1; + }; + const _dupKey = (t) => [_dupFmtRank(t.file_path), t.bitrate || 0, t.duration || 0, t.track_number || 0]; const bestDup = d.tracks.reduce((best, t) => { - const bBr = best.bitrate || 0, tBr = t.bitrate || 0; - const bDur = best.duration || 0, tDur = t.duration || 0; - const bTn = best.track_number || 0, tTn = t.track_number || 0; - return (tBr > bBr || (tBr === bBr && tDur > bDur) || (tBr === bBr && tDur === bDur && tTn > bTn)) ? t : best; + const bk = _dupKey(best), tk = _dupKey(t); + for (let i = 0; i < bk.length; i++) { + if (tk[i] !== bk[i]) return tk[i] > bk[i] ? t : best; + } + return best; }, d.tracks[0]); const findingId = f.id; return media + `
${d.tracks.map((t, i) => {