mirror of https://github.com/Nezreka/SoulSync.git
The Duplicate Detector's 'Keep Best' auto-selection ranked copies by highest
bitrate -> duration -> track number, with no notion of format. A FLAC whose
bitrate the library scan never populated (a common gap) therefore lost to a
282 kbps MP3: 282 > 0, so the MP3 was kept and the FLAC deleted (reported on
Havok 'Prepare For Attack', and again on Kendrick GNX).
Fix: rank by format/lossless tier FIRST, then bitrate, duration, track number.
A lossless file now always beats a lossy one regardless of the recorded
bitrate; bitrate/duration/track# only break ties within the same format.
- core/library/duplicate_keep.py (new): pure, importable pick_duplicate_to_keep
+ duplicate_keep_sort_key + format_rank_for_path (extension rank mirroring
auto_import_worker._quality_rank: flac=10 ... mp3=5 ... unknown=1).
- core/repair_worker.py: _fix_duplicates auto-pick now calls
pick_duplicate_to_keep instead of the bitrate-first max().
- webui/static/enrichment.js: the KEEP/REMOVE recommendation mirrors the same
format-first ranking so the badge matches what the backend will delete.
Parity: Python uses '.ext' keys (os.path.splitext), JS uses 'ext'
(split('.').pop()) -> identical results; both keep the first copy on a full
tie. Verified the only other dedup path (the standalone Duplicate Cleaner
automation, core/library/duplicate_cleaner.py) was already format-priority-first
and correct -- no change needed there.
Tests: tests/test_duplicate_keep.py (11 -- incl. the exact FLAC-with-missing-
bitrate vs 282 kbps MP3 case, format ranking, within-format tie-breakers, and
edge cases). 147 repair/duplicate tests still pass.
Note: why FLAC bitrate is NULL in the DB is a separate library-scan gap;
format-first ranking makes the keep decision correct regardless.
pull/761/head
parent
b202c176f7
commit
28850672a6
@ -0,0 +1,57 @@
|
||||
"""Choosing which copy of a duplicate track to keep.
|
||||
|
||||
The keeper is the highest-quality copy, and **format/lossless is the primary
|
||||
criterion**: a lossless FLAC must beat a lossy MP3 regardless of the recorded
|
||||
bitrate number — a FLAC whose bitrate the library scan never populated (a
|
||||
common gap) still has to win over a 282 kbps MP3. Only *within* the same format
|
||||
tier do bitrate, then duration, then track number break the tie.
|
||||
|
||||
This was lifted out of the repair worker so the deletion path and the UI's
|
||||
"Keep Best" recommendation rank identically (previously both ranked by bitrate
|
||||
first, which kept the MP3 when the FLAC's bitrate was missing), and so the
|
||||
ranking is unit-testable in isolation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
# Format quality rank by file extension (higher = better). Mirrors
|
||||
# ``auto_import_worker._quality_rank`` so lossless always outranks lossy.
|
||||
_FORMAT_RANK = {
|
||||
".flac": 10, ".wav": 9, ".aiff": 9, ".aif": 9, ".ape": 8,
|
||||
".m4a": 7, ".ogg": 6, ".opus": 6, ".mp3": 5, ".aac": 5, ".wma": 3,
|
||||
}
|
||||
|
||||
|
||||
def format_rank_for_path(file_path: Optional[str]) -> int:
|
||||
"""Quality rank for a file by extension (higher = better, unknown = 1)."""
|
||||
if not file_path:
|
||||
return 1
|
||||
ext = os.path.splitext(str(file_path))[1].lower()
|
||||
return _FORMAT_RANK.get(ext, 1)
|
||||
|
||||
|
||||
def duplicate_keep_sort_key(track: Dict) -> Tuple[int, int, float, int]:
|
||||
"""Sort key for picking the keeper — the higher tuple wins.
|
||||
|
||||
Order of precedence: format/lossless tier, then bitrate, then duration,
|
||||
then track number (a real number over a placeholder ``01``). Putting format
|
||||
first is the whole point — it makes FLAC beat MP3 even when the FLAC's
|
||||
bitrate is 0/missing in the DB.
|
||||
"""
|
||||
return (
|
||||
format_rank_for_path(track.get("file_path")),
|
||||
track.get("bitrate") or 0,
|
||||
track.get("duration") or 0,
|
||||
track.get("track_number") or 0,
|
||||
)
|
||||
|
||||
|
||||
def pick_duplicate_to_keep(tracks: List[Dict]) -> Optional[Dict]:
|
||||
"""Return the copy to keep from a duplicate group (highest sort key), or
|
||||
``None`` if the group is empty."""
|
||||
if not tracks:
|
||||
return None
|
||||
return max(tracks, key=duplicate_keep_sort_key)
|
||||
@ -0,0 +1,95 @@
|
||||
"""Tests for duplicate-keeper selection (core/library/duplicate_keep.py).
|
||||
|
||||
The headline contract: lossless format wins over lossy regardless of the
|
||||
recorded bitrate — the bug a user hit was a FLAC (no bitrate in the DB) being
|
||||
deleted in favor of a 282 kbps MP3 because the old ranking compared bitrate
|
||||
first.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from core.library.duplicate_keep import (
|
||||
duplicate_keep_sort_key,
|
||||
format_rank_for_path,
|
||||
pick_duplicate_to_keep,
|
||||
)
|
||||
|
||||
|
||||
def _t(path, bitrate=None, duration=None, track_number=None, tid=1):
|
||||
return {"id": tid, "file_path": path, "bitrate": bitrate,
|
||||
"duration": duration, "track_number": track_number}
|
||||
|
||||
|
||||
# --- the reported regression --------------------------------------------------
|
||||
|
||||
|
||||
def test_flac_with_missing_bitrate_beats_282kbps_mp3():
|
||||
# Havok "Prepare For Attack": FLAC has no bitrate recorded, MP3 is 282 kbps.
|
||||
mp3 = _t("/music/Havok/01 - Prepare For Attack.mp3", bitrate=282, duration=236, tid=1)
|
||||
flac = _t("/music/Havok/01 - Prepare for Attack.flac", bitrate=None, duration=236, tid=2)
|
||||
keep = pick_duplicate_to_keep([mp3, flac])
|
||||
assert keep["id"] == 2 # the FLAC
|
||||
|
||||
|
||||
def test_flac_beats_mp3_regardless_of_order():
|
||||
mp3 = _t("/x/a.mp3", bitrate=320, tid=1)
|
||||
flac = _t("/x/a.flac", bitrate=0, tid=2)
|
||||
assert pick_duplicate_to_keep([mp3, flac])["id"] == 2
|
||||
assert pick_duplicate_to_keep([flac, mp3])["id"] == 2
|
||||
|
||||
|
||||
# --- format ranking -----------------------------------------------------------
|
||||
|
||||
|
||||
def test_format_rank_lossless_outranks_lossy():
|
||||
assert format_rank_for_path("a.flac") > format_rank_for_path("a.mp3")
|
||||
assert format_rank_for_path("a.wav") > format_rank_for_path("a.aac")
|
||||
assert format_rank_for_path("a.m4a") > format_rank_for_path("a.mp3")
|
||||
|
||||
|
||||
def test_format_rank_unknown_and_missing():
|
||||
assert format_rank_for_path("a.xyz") == 1
|
||||
assert format_rank_for_path("noext") == 1
|
||||
assert format_rank_for_path(None) == 1
|
||||
assert format_rank_for_path("") == 1
|
||||
|
||||
|
||||
def test_format_rank_case_insensitive():
|
||||
assert format_rank_for_path("A.FLAC") == format_rank_for_path("a.flac")
|
||||
|
||||
|
||||
# --- tie-breakers within the same format -------------------------------------
|
||||
|
||||
|
||||
def test_same_format_higher_bitrate_wins():
|
||||
lo = _t("/x/a.mp3", bitrate=192, tid=1)
|
||||
hi = _t("/x/b.mp3", bitrate=320, tid=2)
|
||||
assert pick_duplicate_to_keep([lo, hi])["id"] == 2
|
||||
|
||||
|
||||
def test_same_format_same_bitrate_longer_duration_wins():
|
||||
short = _t("/x/a.flac", bitrate=900, duration=200, tid=1)
|
||||
long = _t("/x/b.flac", bitrate=900, duration=240, tid=2)
|
||||
assert pick_duplicate_to_keep([short, long])["id"] == 2
|
||||
|
||||
|
||||
def test_track_number_is_final_tiebreak():
|
||||
a = _t("/x/a.flac", bitrate=900, duration=240, track_number=1, tid=1)
|
||||
b = _t("/x/b.flac", bitrate=900, duration=240, track_number=7, tid=2)
|
||||
assert pick_duplicate_to_keep([a, b])["id"] == 2
|
||||
|
||||
|
||||
# --- shape / edge cases -------------------------------------------------------
|
||||
|
||||
|
||||
def test_sort_key_tuple_order_is_format_first():
|
||||
key = duplicate_keep_sort_key(_t("/x/a.flac", bitrate=100, duration=5, track_number=3))
|
||||
assert key == (10, 100, 5, 3)
|
||||
|
||||
|
||||
def test_missing_numeric_fields_default_to_zero():
|
||||
assert duplicate_keep_sort_key(_t("/x/a.mp3")) == (5, 0, 0, 0)
|
||||
|
||||
|
||||
def test_empty_group_returns_none():
|
||||
assert pick_duplicate_to_keep([]) is None
|
||||
Loading…
Reference in new issue