Fix: duplicate detector kept lossy over lossless (rank format first)

The Duplicate Detector's 'Keep Best' auto-selection ranked copies by highest
bitrate -> duration -> track number, with no notion of format. A FLAC whose
bitrate the library scan never populated (a common gap) therefore lost to a
282 kbps MP3: 282 > 0, so the MP3 was kept and the FLAC deleted (reported on
Havok 'Prepare For Attack', and again on Kendrick GNX).

Fix: rank by format/lossless tier FIRST, then bitrate, duration, track number.
A lossless file now always beats a lossy one regardless of the recorded
bitrate; bitrate/duration/track# only break ties within the same format.

- core/library/duplicate_keep.py (new): pure, importable pick_duplicate_to_keep
  + duplicate_keep_sort_key + format_rank_for_path (extension rank mirroring
  auto_import_worker._quality_rank: flac=10 ... mp3=5 ... unknown=1).
- core/repair_worker.py: _fix_duplicates auto-pick now calls
  pick_duplicate_to_keep instead of the bitrate-first max().
- webui/static/enrichment.js: the KEEP/REMOVE recommendation mirrors the same
  format-first ranking so the badge matches what the backend will delete.

Parity: Python uses '.ext' keys (os.path.splitext), JS uses 'ext'
(split('.').pop()) -> identical results; both keep the first copy on a full
tie. Verified the only other dedup path (the standalone Duplicate Cleaner
automation, core/library/duplicate_cleaner.py) was already format-priority-first
and correct -- no change needed there.

Tests: tests/test_duplicate_keep.py (11 -- incl. the exact FLAC-with-missing-
bitrate vs 282 kbps MP3 case, format ranking, within-format tie-breakers, and
edge cases). 147 repair/duplicate tests still pass.

Note: why FLAC bitrate is NULL in the DB is a separate library-scan gap;
format-first ranking makes the keep decision correct regardless.
pull/761/head
BoulderBadgeDad 1 week ago
parent b202c176f7
commit 28850672a6

@ -0,0 +1,57 @@
"""Choosing which copy of a duplicate track to keep.
The keeper is the highest-quality copy, and **format/lossless is the primary
criterion**: a lossless FLAC must beat a lossy MP3 regardless of the recorded
bitrate number a FLAC whose bitrate the library scan never populated (a
common gap) still has to win over a 282 kbps MP3. Only *within* the same format
tier do bitrate, then duration, then track number break the tie.
This was lifted out of the repair worker so the deletion path and the UI's
"Keep Best" recommendation rank identically (previously both ranked by bitrate
first, which kept the MP3 when the FLAC's bitrate was missing), and so the
ranking is unit-testable in isolation.
"""
from __future__ import annotations
import os
from typing import Dict, List, Optional, Tuple
# Format quality rank by file extension (higher = better). Mirrors
# ``auto_import_worker._quality_rank`` so lossless always outranks lossy.
_FORMAT_RANK = {
".flac": 10, ".wav": 9, ".aiff": 9, ".aif": 9, ".ape": 8,
".m4a": 7, ".ogg": 6, ".opus": 6, ".mp3": 5, ".aac": 5, ".wma": 3,
}
def format_rank_for_path(file_path: Optional[str]) -> int:
"""Quality rank for a file by extension (higher = better, unknown = 1)."""
if not file_path:
return 1
ext = os.path.splitext(str(file_path))[1].lower()
return _FORMAT_RANK.get(ext, 1)
def duplicate_keep_sort_key(track: Dict) -> Tuple[int, int, float, int]:
"""Sort key for picking the keeper — the higher tuple wins.
Order of precedence: format/lossless tier, then bitrate, then duration,
then track number (a real number over a placeholder ``01``). Putting format
first is the whole point it makes FLAC beat MP3 even when the FLAC's
bitrate is 0/missing in the DB.
"""
return (
format_rank_for_path(track.get("file_path")),
track.get("bitrate") or 0,
track.get("duration") or 0,
track.get("track_number") or 0,
)
def pick_duplicate_to_keep(tracks: List[Dict]) -> Optional[Dict]:
"""Return the copy to keep from a duplicate group (highest sort key), or
``None`` if the group is empty."""
if not tracks:
return None
return max(tracks, key=duplicate_keep_sort_key)

@ -1364,8 +1364,11 @@ class RepairWorker:
return {'success': False, 'error': f'Selected track ID {keep_id} not found in duplicates'}
best_id = keep_id
else:
# Auto-pick: highest bitrate, then longest duration, then highest track number (correct > 01)
best = max(tracks, key=lambda t: (t.get('bitrate', 0) or 0, t.get('duration', 0) or 0, t.get('track_number', 0) or 0))
# Auto-pick the keeper: lossless format first (so a FLAC beats an
# MP3 even when the FLAC's bitrate is missing in the DB), then
# bitrate, duration, and track number as tie-breakers.
from core.library.duplicate_keep import pick_duplicate_to_keep
best = pick_duplicate_to_keep(tracks)
best_id = best.get('track_id') or best.get('id')
if not best_id:

@ -0,0 +1,95 @@
"""Tests for duplicate-keeper selection (core/library/duplicate_keep.py).
The headline contract: lossless format wins over lossy regardless of the
recorded bitrate the bug a user hit was a FLAC (no bitrate in the DB) being
deleted in favor of a 282 kbps MP3 because the old ranking compared bitrate
first.
"""
from __future__ import annotations
from core.library.duplicate_keep import (
duplicate_keep_sort_key,
format_rank_for_path,
pick_duplicate_to_keep,
)
def _t(path, bitrate=None, duration=None, track_number=None, tid=1):
return {"id": tid, "file_path": path, "bitrate": bitrate,
"duration": duration, "track_number": track_number}
# --- the reported regression --------------------------------------------------
def test_flac_with_missing_bitrate_beats_282kbps_mp3():
# Havok "Prepare For Attack": FLAC has no bitrate recorded, MP3 is 282 kbps.
mp3 = _t("/music/Havok/01 - Prepare For Attack.mp3", bitrate=282, duration=236, tid=1)
flac = _t("/music/Havok/01 - Prepare for Attack.flac", bitrate=None, duration=236, tid=2)
keep = pick_duplicate_to_keep([mp3, flac])
assert keep["id"] == 2 # the FLAC
def test_flac_beats_mp3_regardless_of_order():
mp3 = _t("/x/a.mp3", bitrate=320, tid=1)
flac = _t("/x/a.flac", bitrate=0, tid=2)
assert pick_duplicate_to_keep([mp3, flac])["id"] == 2
assert pick_duplicate_to_keep([flac, mp3])["id"] == 2
# --- format ranking -----------------------------------------------------------
def test_format_rank_lossless_outranks_lossy():
assert format_rank_for_path("a.flac") > format_rank_for_path("a.mp3")
assert format_rank_for_path("a.wav") > format_rank_for_path("a.aac")
assert format_rank_for_path("a.m4a") > format_rank_for_path("a.mp3")
def test_format_rank_unknown_and_missing():
assert format_rank_for_path("a.xyz") == 1
assert format_rank_for_path("noext") == 1
assert format_rank_for_path(None) == 1
assert format_rank_for_path("") == 1
def test_format_rank_case_insensitive():
assert format_rank_for_path("A.FLAC") == format_rank_for_path("a.flac")
# --- tie-breakers within the same format -------------------------------------
def test_same_format_higher_bitrate_wins():
lo = _t("/x/a.mp3", bitrate=192, tid=1)
hi = _t("/x/b.mp3", bitrate=320, tid=2)
assert pick_duplicate_to_keep([lo, hi])["id"] == 2
def test_same_format_same_bitrate_longer_duration_wins():
short = _t("/x/a.flac", bitrate=900, duration=200, tid=1)
long = _t("/x/b.flac", bitrate=900, duration=240, tid=2)
assert pick_duplicate_to_keep([short, long])["id"] == 2
def test_track_number_is_final_tiebreak():
a = _t("/x/a.flac", bitrate=900, duration=240, track_number=1, tid=1)
b = _t("/x/b.flac", bitrate=900, duration=240, track_number=7, tid=2)
assert pick_duplicate_to_keep([a, b])["id"] == 2
# --- shape / edge cases -------------------------------------------------------
def test_sort_key_tuple_order_is_format_first():
key = duplicate_keep_sort_key(_t("/x/a.flac", bitrate=100, duration=5, track_number=3))
assert key == (10, 100, 5, 3)
def test_missing_numeric_fields_default_to_zero():
assert duplicate_keep_sort_key(_t("/x/a.mp3")) == (5, 0, 0, 0)
def test_empty_group_returns_none():
assert pick_duplicate_to_keep([]) is None

@ -2859,12 +2859,21 @@ function _renderFindingDetail(f) {
case 'duplicate_tracks':
if (!d.tracks || !d.tracks.length) return _gridRows([['Count', d.count || '?']]);
// Determine best copy (same logic as backend: highest bitrate, then duration, then track number)
// Determine best copy — same logic as the backend
// (core/library/duplicate_keep.py): lossless format first, so a FLAC
// beats an MP3 even when the FLAC's bitrate is missing, then bitrate,
// duration, track number.
const _dupFmtRank = (fp) => {
const r = { flac: 10, wav: 9, aiff: 9, aif: 9, ape: 8, m4a: 7, ogg: 6, opus: 6, mp3: 5, aac: 5, wma: 3 };
return r[String(fp || '').split('.').pop().toLowerCase()] || 1;
};
const _dupKey = (t) => [_dupFmtRank(t.file_path), t.bitrate || 0, t.duration || 0, t.track_number || 0];
const bestDup = d.tracks.reduce((best, t) => {
const bBr = best.bitrate || 0, tBr = t.bitrate || 0;
const bDur = best.duration || 0, tDur = t.duration || 0;
const bTn = best.track_number || 0, tTn = t.track_number || 0;
return (tBr > bBr || (tBr === bBr && tDur > bDur) || (tBr === bBr && tDur === bDur && tTn > bTn)) ? t : best;
const bk = _dupKey(best), tk = _dupKey(t);
for (let i = 0; i < bk.length; i++) {
if (tk[i] !== bk[i]) return tk[i] > bk[i] ? t : best;
}
return best;
}, d.tracks[0]);
const findingId = f.id;
return media + `<div class="repair-detail-sublist">${d.tracks.map((t, i) => {

Loading…
Cancel
Save