diff --git a/core/auto_import_worker.py b/core/auto_import_worker.py
index be80a7d2..b024120a 100644
--- a/core/auto_import_worker.py
+++ b/core/auto_import_worker.py
@@ -961,24 +961,32 @@ class AutoImportWorker:
         for f in candidate.audio_files:
             file_tags[f] = _read_file_tags(f)
 
-        # Resolve quality duplicates — if multiple files match same track, keep best
-        # Group by probable track (using track number from tags)
-        seen_track_nums = {}
+        # Resolve quality duplicates — if multiple files match the same
+        # (disc, track) position, keep the higher-quality one. Key on
+        # the (disc_number, track_number) tuple — keying on track_number
+        # alone breaks multi-disc albums where every disc has tracks
+        # numbered 1..N, since disc 2 track 1 looks identical to disc 1
+        # track 1 to a track-number-only dedup. (Reported on Mr. Morale
+        # & The Big Steppers — discs 1+2 dumped loose in staging, dedup
+        # collapsed every same-numbered pair into one survivor.)
+        seen_positions = {}
         deduped_files = []
         for f in candidate.audio_files:
             tn = file_tags[f]['track_number']
+            dn = file_tags[f].get('disc_number', 1) or 1
             ext = os.path.splitext(f)[1].lower()
-            if tn > 0 and tn in seen_track_nums:
-                prev_f = seen_track_nums[tn]
+            position_key = (dn, tn)
+            if tn > 0 and position_key in seen_positions:
+                prev_f = seen_positions[position_key]
                 prev_ext = os.path.splitext(prev_f)[1].lower()
                 if _quality_rank(ext) > _quality_rank(prev_ext):
                     deduped_files.remove(prev_f)
                     deduped_files.append(f)
-                    seen_track_nums[tn] = f
+                    seen_positions[position_key] = f
             else:
                 deduped_files.append(f)
                 if tn > 0:
-                    seen_track_nums[tn] = f
+                    seen_positions[position_key] = f
 
         # Match files to tracks using weighted scoring
         matches = []
@@ -988,6 +996,15 @@ class AutoImportWorker:
         for track in tracks:
             track_name = track.get('name', '')
             track_num = track.get('track_number', 0)
+            # Track disc number — Spotify uses `disc_number`, Deezer
+            # `disk_number`, iTunes `discNumber`. Default to 1 when
+            # missing so single-disc albums still match.
+            track_disc = (
+                track.get('disc_number')
+                or track.get('disk_number')
+                or track.get('discNumber')
+                or 1
+            )
             track_artists = track.get('artists', [])
             track_artist = ''
             if track_artists:
@@ -1012,11 +1029,27 @@ class AutoImportWorker:
                 if ft['artist'] and track_artist:
                     score += _similarity(ft['artist'], track_artist) * 0.15
 
-                # Track number (30%)
+                # Position match (30%) — gates the bonus on (disc, track)
+                # tuple, NOT track_number alone. Multi-disc albums with
+                # parallel numbering (every disc has tracks 1..N) used
+                # to mismatch here: file with track_number=6 from disc 2
+                # got the full bonus when matched against disc 1 track 6
+                # because disc was ignored. Result: wrong files matched
+                # to wrong tracks, integrity check rejected them all.
                 if ft['track_number'] > 0 and track_num > 0:
-                    if ft['track_number'] == track_num:
+                    ft_disc = ft.get('disc_number', 1) or 1
+                    if ft['track_number'] == track_num and ft_disc == track_disc:
                         score += 0.30
-                    elif abs(ft['track_number'] - track_num) <= 1:
+                    elif (
+                        ft['track_number'] == track_num
+                        and ft_disc != track_disc
+                    ):
+                        # Same track number, different disc — award only a
+                        # small consolation bonus so the title/artist
+                        # similarity has to carry the match. Cross-disc
+                        # collisions are common in deluxe releases.
+                        score += 0.05
+                    elif abs(ft['track_number'] - track_num) <= 1 and ft_disc == track_disc:
                         score += 0.12
 
                 # Album tag bonus (10%)
diff --git a/tests/imports/test_auto_import_multi_disc_matching.py b/tests/imports/test_auto_import_multi_disc_matching.py
new file mode 100644
index 00000000..3e86f87d
--- /dev/null
+++ b/tests/imports/test_auto_import_multi_disc_matching.py
@@ -0,0 +1,298 @@
+"""Regression test for the multi-disc auto-import matching bug.
+
+User report (2026-05-08, Mr. Morale & The Big Steppers): an album with
+multiple discs got dumped into staging — discs 1 and 2 loose in the
+root, disc 3 in its own folder, every file perfectly tagged with
+``disc_number`` + ``track_number`` + ``title``. Auto-import processed
+only 9 tracks total instead of all 27.
+
+Two bugs in ``AutoImportWorker._match_tracks`` caused it:
+
+1. **Quality dedup keyed on track_number alone.** The dedup loop kept
+   ``seen_track_nums[track_number] = file`` and dropped any later file
+   with the same number, treating it as a quality duplicate. On a
+   multi-disc release where every disc has tracks 1..N, that collapses
+   the album to one disc's worth of files before matching even runs —
+   half (or more) of the tracks vanish before the matcher sees them.
+
+2. **Match scoring ignored disc_number.** The 30% track-number bonus
+   fired whenever ``ft['track_number'] == track_num`` regardless of disc.
+   File with tag ``(disc=2, track=6)`` (Auntie Diaries, 281s) got the
+   full bonus when matched against API track ``(disc=1, track=6)`` (Rich
+   Interlude, 103s) — wrong file → wrong destination → integrity
+   check correctly rejected and quarantined the file.
+
+Fix in this PR: dedup keys on ``(disc_number, track_number)`` tuples;
+match scoring only awards the 30% bonus when BOTH disc and track
+numbers agree, with a small consolation bonus for same-track-number
+cross-disc collisions so title similarity still drives the match.
+
+These tests pin both behaviors so multi-disc albums stay intact
+through the full import pipeline.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from core import auto_import_worker as aiw
+
+
+# ---------------------------------------------------------------------------
+# Fixtures — tagged file fakes + worker setup
+# ---------------------------------------------------------------------------
+
+
+def _file_tags(*, disc: int, track: int, title: str, artist: str = 'Kendrick Lamar',
+               album: str = 'Mr. Morale & The Big Steppers') -> Dict[str, Any]:
+    """Build the tag dict shape ``_read_file_tags`` returns."""
+    return {
+        'title': title,
+        'artist': artist,
+        'album': album,
+        'track_number': track,
+        'disc_number': disc,
+        'year': '2022',
+    }
+
+
+def _api_track(disc: int, track: int, title: str, artist: str = 'Kendrick Lamar') -> Dict[str, Any]:
+    """Build a track dict matching the shape the metadata source returns."""
+    return {
+        'name': title,
+        'track_number': track,
+        'disc_number': disc,
+        'artists': [{'name': artist}],
+        'id': f'{disc}-{track}',
+    }
+
+
+@pytest.fixture
+def worker():
+    """A worker instance — the matching logic doesn't actually need
+    most of the worker's deps so we instantiate raw."""
+    w = aiw.AutoImportWorker.__new__(aiw.AutoImportWorker)
+    return w
+
+
+# ---------------------------------------------------------------------------
+# Test 1 — dedup must NOT collapse same-track-numbers across discs
+# ---------------------------------------------------------------------------
+
+
+def test_dedup_preserves_files_with_same_track_number_across_different_discs(worker, monkeypatch):
+    """The bug: dedup keyed by track_number alone treated disc 1 track 6
+    and disc 2 track 6 as quality duplicates, dropped one. Fix: key by
+    (disc_number, track_number) tuple — both files survive dedup.
+
+    User scenario: 18 loose files in staging root, all tagged with
+    ``disc_number`` 1 or 2 and ``track_number`` 1..9. Pre-fix the
+    matcher only saw 9 of them after dedup. Post-fix all 18.
+    """
+    # 18 fake files: discs 1 + 2, tracks 1..9 each
+    files = [f"/fake/d{disc}_t{track}.flac" for disc in (1, 2) for track in range(1, 10)]
+    file_tags = {
+        f: _file_tags(disc=disc, track=track,
+                      title=f'Track {disc}.{track}')
+        for f, (disc, track) in zip(
+            files, [(d, t) for d in (1, 2) for t in range(1, 10)],
+        )
+    }
+
+    # Mock _read_file_tags to return our test tags
+    monkeypatch.setattr(aiw, '_read_file_tags', lambda f: file_tags[f])
+
+    # Mock the metadata client + album fetch to return 18 tracks
+    api_tracks = [_api_track(disc, track, f'Track {disc}.{track}')
+                  for disc in (1, 2) for track in range(1, 10)]
+    fake_client = MagicMock()
+    fake_client.get_album = MagicMock(return_value={
+        'id': 'album-1',
+        'name': 'Mr. Morale & The Big Steppers',
+        'tracks': {'items': api_tracks},
+    })
+
+    candidate = aiw.FolderCandidate(
+        path='/staging',
+        name='staging',
+        audio_files=files,
+        folder_hash='hash1',
+    )
+
+    identification = {
+        'source': 'spotify',
+        'album_id': 'album-1',
+        'album_name': 'Mr. Morale & The Big Steppers',
+        'artist_name': 'Kendrick Lamar',
+        'identification_confidence': 1.0,
+    }
+
+    with patch('core.metadata_service.get_client_for_source', return_value=fake_client), \
+         patch('core.metadata_service.get_album_tracks_for_source', return_value=None):
+        result = worker._match_tracks(candidate, identification)
+
+    assert result is not None
+    # All 18 files must end up matched — pre-fix only 9 survived dedup,
+    # then 4 of those got mismatched and integrity-rejected.
+    assert len(result['matches']) == 18, (
+        f"Expected 18 matches across both discs, got {len(result['matches'])}. "
+        f"Dedup probably collapsed same-track-numbers across discs."
+    )
+    # No file should be in unmatched
+    assert not result['unmatched_files']
+
+
+# ---------------------------------------------------------------------------
+# Test 2 — match scoring respects disc_number
+# ---------------------------------------------------------------------------
+
+
+def test_match_scoring_pairs_files_to_correct_disc(worker, monkeypatch):
+    """The bug: the 30% track-number bonus fired regardless of disc, so
+    files got matched to the wrong-disc track when both shared a track
+    number. Fix: bonus only when (disc, track) BOTH match.
+
+    Pin behavior: file tagged (disc=2, track=6, title='Auntie Diaries')
+    must match the API's (disc=2, track=6) track, NOT the (disc=1,
+    track=6) track even though both have track_number=6.
+    """
+    files = [
+        '/fake/disc1_06.flac',  # Rich (Interlude)
+        '/fake/disc2_06.flac',  # Auntie Diaries
+    ]
+    file_tags = {
+        '/fake/disc1_06.flac': _file_tags(disc=1, track=6, title='Rich (Interlude)'),
+        '/fake/disc2_06.flac': _file_tags(disc=2, track=6, title='Auntie Diaries'),
+    }
+    monkeypatch.setattr(aiw, '_read_file_tags', lambda f: file_tags[f])
+
+    api_tracks = [
+        _api_track(1, 6, 'Rich (Interlude)'),
+        _api_track(2, 6, 'Auntie Diaries'),
+    ]
+    fake_client = MagicMock()
+    fake_client.get_album = MagicMock(return_value={
+        'id': 'album-1', 'name': 'Mr. Morale',
+        'tracks': {'items': api_tracks},
+    })
+
+    candidate = aiw.FolderCandidate(
+        path='/staging', name='staging',
+        audio_files=files, folder_hash='hash2',
+    )
+    identification = {
+        'source': 'spotify', 'album_id': 'album-1',
+        'album_name': 'Mr. Morale', 'artist_name': 'Kendrick Lamar',
+        'identification_confidence': 1.0,
+    }
+
+    with patch('core.metadata_service.get_client_for_source', return_value=fake_client), \
+         patch('core.metadata_service.get_album_tracks_for_source', return_value=None):
+        result = worker._match_tracks(candidate, identification)
+
+    assert result is not None
+    assert len(result['matches']) == 2
+
+    # Build a {track_disc: matched_file} map for assertion
+    by_disc = {
+        m['track']['disc_number']: m['file'] for m in result['matches']
+    }
+    assert by_disc[1] == '/fake/disc1_06.flac', (
+        "API track (disc=1, track=6) should match the disc-1 file, "
+        "not get cross-matched to the disc-2 file just because they "
+        "share track_number=6."
+    )
+    assert by_disc[2] == '/fake/disc2_06.flac', (
+        "API track (disc=2, track=6) should match the disc-2 file."
+    )
+
+
+# ---------------------------------------------------------------------------
+# Test 3 — single-disc albums still work (regression guard)
+# ---------------------------------------------------------------------------
+
+
+def test_single_disc_albums_still_match_normally(worker, monkeypatch):
+    """The disc-aware fix mustn't break single-disc albums where every
+    file has disc_number=1 (or no disc tag at all → defaults to 1)."""
+    files = [f'/fake/track_{i:02d}.flac' for i in range(1, 6)]
+    file_tags = {
+        f'/fake/track_{i:02d}.flac': _file_tags(disc=1, track=i, title=f'Song {i}')
+        for i in range(1, 6)
+    }
+    monkeypatch.setattr(aiw, '_read_file_tags', lambda f: file_tags[f])
+
+    api_tracks = [_api_track(1, i, f'Song {i}') for i in range(1, 6)]
+    fake_client = MagicMock()
+    fake_client.get_album = MagicMock(return_value={
+        'id': 'album-1', 'name': 'Test Album',
+        'tracks': {'items': api_tracks},
+    })
+
+    candidate = aiw.FolderCandidate(
+        path='/staging', name='Album',
+        audio_files=files, folder_hash='hash3',
+    )
+    identification = {
+        'source': 'spotify', 'album_id': 'album-1',
+        'album_name': 'Test Album', 'artist_name': 'Test Artist',
+        'identification_confidence': 1.0,
+    }
+
+    with patch('core.metadata_service.get_client_for_source', return_value=fake_client), \
+         patch('core.metadata_service.get_album_tracks_for_source', return_value=None):
+        result = worker._match_tracks(candidate, identification)
+
+    assert result is not None
+    assert len(result['matches']) == 5
+    # Each track i matched to track_0i.flac
+    for m in result['matches']:
+        track_num = m['track']['track_number']
+        assert m['file'] == f'/fake/track_{track_num:02d}.flac'
+
+
+# ---------------------------------------------------------------------------
+# Test 4 — quality dedup still works WITHIN a single (disc, track) position
+# ---------------------------------------------------------------------------
+
+
+def test_quality_dedup_still_picks_higher_quality_for_same_position(worker, monkeypatch):
+    """Two files at (disc=1, track=1) — one .mp3, one .flac. Dedup must
+    keep the .flac. The fix only changed the dedup KEY (added disc_number
+    to the tuple), not the quality-comparison logic — pin the quality
+    behavior."""
+    files = ['/fake/disc1_track1.mp3', '/fake/disc1_track1.flac']
+    file_tags = {
+        '/fake/disc1_track1.mp3': _file_tags(disc=1, track=1, title='Song 1'),
+        '/fake/disc1_track1.flac': _file_tags(disc=1, track=1, title='Song 1'),
+    }
+    monkeypatch.setattr(aiw, '_read_file_tags', lambda f: file_tags[f])
+
+    api_tracks = [_api_track(1, 1, 'Song 1')]
+    fake_client = MagicMock()
+    fake_client.get_album = MagicMock(return_value={
+        'id': 'album-1', 'name': 'Test Album',
+        'tracks': {'items': api_tracks},
+    })
+
+    candidate = aiw.FolderCandidate(
+        path='/staging', name='Album',
+        audio_files=files, folder_hash='hash4',
+    )
+    identification = {
+        'source': 'spotify', 'album_id': 'album-1',
+        'album_name': 'Test Album', 'artist_name': 'Test Artist',
+        'identification_confidence': 1.0,
+    }
+
+    with patch('core.metadata_service.get_client_for_source', return_value=fake_client), \
+         patch('core.metadata_service.get_album_tracks_for_source', return_value=None):
+        result = worker._match_tracks(candidate, identification)
+
+    assert result is not None
+    assert len(result['matches']) == 1
+    # FLAC must win
+    assert result['matches'][0]['file'].endswith('.flac')
diff --git a/webui/static/helper.js b/webui/static/helper.js
index 37e74f55..c1058796 100644
--- a/webui/static/helper.js
+++ b/webui/static/helper.js
@@ -3419,6 +3419,7 @@ const WHATS_NEW = {
         { title: 'Manual Search In The Failed-Track Candidates Modal', desc: 'when a download fails or returns "not found" the user can already click the status cell to open a modal showing whatever search candidates the auto-search left over and pick a different one. that modal now ALSO has a manual search bar. type any query, hit search, get a fresh round of results from the download sources without having to start the whole download flow over from the search page. solves the case where the auto-query was bad (featured artist not in title, parentheticals like "(remastered 2019)" tripping the matcher, slight artist-name variants) but the file genuinely exists on the source. source picker is smart per download mode: single-source mode (soulseek-only / youtube-only / etc) shows a "searching X" label, no dropdown; hybrid mode shows a dropdown with "all sources" default plus every configured source — picking "all" runs parallel searches across all of them and tags each result row with its source badge. only configured sources show up; unconfigured ones are hidden. results stream in as each source completes via NDJSON instead of blocking on the slowest source — the table starts populating the moment the first source returns. clicking a result reuses the existing retry-download flow → same path, same acoustid verification on the file when it lands, no shortcut around the safety net. additive in the truest sense: the existing modal layout / candidates table / download buttons are byte-identical when the user doesn\'t use manual search. backend extends the candidates endpoint with `download_mode` + `available_sources` + a `source` field per candidate (purely additive — old fields untouched), and adds a new `POST /api/downloads/task//manual-search` that streams NDJSON (one header line, one source_results line per source as completed, one done terminator) so the frontend renderer can append rows incrementally. 11 tests pin the streaming contract: query length / source whitelist / task 404 validation, single-source dispatch, parallel "all" dispatch, one-event-per-source streaming shape, unconfigured-source skip + reject, header metadata, and per-source exception isolation (one source raising emits a `source_error` event but doesn\'t fail the stream).', page: 'downloads' },
         { title: 'Manual Picks Don\'t Auto-Retry Anymore (And The Modal Always Opens)', desc: 'three follow-on fixes to the manual-search feature once people started actually using it. (1) when the user picked a candidate and that download failed (e.g. soundcloud 404 on a stale track url), the auto-retry monitor would treat it like any other failed auto-attempt — yank the task back to "searching" and pick a different candidate. felt completely wrong from the user\'s perspective: "i picked THIS one, why is it searching for something else?" now manual picks are tagged with a `_user_manual_pick` flag and the auto-retry path bails on it. failure surfaces to the user instead of getting silently fallen-back. (2) non-soulseek manual picks (youtube / tidal / qobuz / hifi / deezer / soundcloud / lidarr) were getting stuck at "downloading 0%" forever even after their engine reported terminal failure. cause: status polling went into a "let monitor handle retry" branch that never fired because manual picks bail on retry — task was orphaned in downloading state. fix: when the engine reports Errored on a manual pick, mark the task failed directly, don\'t defer to the monitor. plus an engine-state fallback path covers the rare race where the orchestrator\'s pre-populated transfer lookup is missing the entry. (3) failed / not_found rows were only clickable when the auto-search had cached candidates — but the whole point of opening the modal now is to RUN a manual search, which doesn\'t need pre-existing candidates. now every failed / not_found / cancelled row opens the modal regardless. (4) one nasty deadlock fix in the process: the new "mark failed" path was synchronously calling `on_download_completed` while holding `tasks_lock`, which itself re-acquires the same lock — `threading.Lock` is non-reentrant so the polling thread wedged forever. while wedged the lock was held → every other endpoint that needed it (including /candidates → can\'t open OTHER modals) hung waiting. moved completion callbacks onto a daemon thread so the lock releases first. (5) manual download worker now runs on its own dedicated thread instead of competing with the batch\'s 3-worker `missing_download_executor` pool — saturated batches no longer queue manual picks indefinitely. all changes are scoped to manual picks only via the `_user_manual_pick` flag — auto-attempt flow is byte-identical to before. 17 unit tests pin the gate behavior (status engine fallback / monitor retry skip / IF-branch failure transition / auto-attempt skip).', page: 'downloads' },
         { title: 'Manual Import: Stop Writing "Unknown Artist / album_id / 0 tracks" Garbage', desc: 'github issue #524 (radoslav-orlov): clicking an album in the import page → all imported albums landed in the library as "Unknown Artist" with the raw 10-digit album id as the title and 0 tracks. cause: the click handler `importPageSelectAlbum(albumId)` was passing only the id to the `/api/import/album/match` POST. the search response carried `source` (which metadata source the album_id came from) + `album_name` + `album_artist`, but the click discarded everything except the id. backend `get_artist_album_tracks` then guessed the source via the configured primary-source priority chain — for a non-deezer-primary user clicking a deezer search result, the chain tries spotify/itunes/discogs first against a deezer numeric id, all return None, and the lookup falls through to the failure-fallback dict (`name = album_id`, no artist field, `total_tracks = 0`). that broken metadata then flowed through the import pipeline → soulsync standalone library got the garbage rows. fix: cache album lookup by id when the suggestions / search renderers run, then have `importPageSelectAlbum` pull `source` + `name` + `artist` from the cache and include them in the match POST. backend now also logs a clear warning when source is missing from the match request, so any future caller dropping it shows up in app.log instead of silently corrupting library imports.', page: 'import' },
+        { title: 'Auto-Import: Multi-Disc Albums No Longer Lose Half The Tracks', desc: 'caught while testing #524 with kendrick lamar mr morale & the big steppers (3 discs). dropped discs 1+2 loose in staging root + disc 3 in its own folder, all perfectly tagged → only 9 tracks ended up imported, the rest got integrity-rejected and quarantined. two related bugs in `auto_import_worker._match_tracks`: (1) the "quality dedup" loop kept `seen_track_nums[track_number] = file` and dropped any later file with the same number as a quality duplicate. on a multi-disc release where every disc has tracks 1..N, that collapses the album to one disc\'s worth of files BEFORE the matcher even runs. fix: dedup keys on `(disc_number, track_number)` tuples instead. (2) the 30% track-number bonus in the match scoring fired whenever `ft[track_number] == track_num` regardless of disc — file tagged (disc=2, track=6, "Auntie Diaries") got the full bonus matching API track (disc=1, track=6, "Rich Interlude"), wrong file → integrity check correctly rejected and quarantined. fix: 30% bonus only when BOTH disc and track numbers agree, with a small consolation bonus for cross-disc collisions so title similarity has to carry the match. 4 new tests pin: dedup preserves all files across discs (18-file regression case), match scoring pairs to correct disc, single-disc albums still match normally, and quality dedup within a single (disc, track) position still picks the higher quality file.', page: 'import' },
     ],
     '2.4.3': [ // --- May 8, 2026 — patch release ---