From e11786ee4094a320f0516b070236767f2e41fc52 Mon Sep 17 00:00:00 2001 From: Broque Thomas <26755000+Nezreka@users.noreply.github.com> Date: Sat, 9 May 2026 15:53:17 -0700 Subject: [PATCH] Auto-import matching: fix Deezer source classification + bump tolerance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User report: all 6 staging candidates failing with "Could not match tracks to album tracklist" despite identification correctly resolving each album. 18 properly-tagged Chris Brown F.A.M.E. tracks, 21 properly-tagged Mr. Morale tracks, etc. — every match attempt rejected by the duration sanity gate. Root cause: I had Deezer in `_SECONDS_DURATION_SOURCES`, assuming Deezer's `duration` field was raw seconds (which the API returns). But `DeezerClient.get_album_tracks` already converts seconds → ms INTERNALLY (`'duration_ms': item.get('duration', 0) * 1000`) before the value reaches the matcher. My helper saw `source='deezer'` → multiplied by 1000 again → 255000 ms became 255,000,000 ms (70 hours). Every track-file pair failed the gate by a factor of 1000. Diagnostic chain that got me there: 1. Added `[Album Matching] No matches: X files, Y tracks, Z duration-rejected, W below threshold` summary log so future "0 matches" reports surface the rejection reason. 2. Fixed the helper's logger from `logging.getLogger(__name__)` (which resolves outside the soulsync handler tree → invisible in app.log) to `get_logger("imports.album_matching")` (under the namespace the file handler watches). 3. Added per-rejection-type diagnostic showing actual file vs track duration values + raw track keys + source. That third diagnostic surfaced `track 'United In Grief' resolved=255000000 (raw duration_ms=255000, raw duration=None, source='deezer')` — making the bug obvious. Fixes: - Moved Deezer from `_SECONDS_DURATION_SOURCES` to `_MS_DURATION_SOURCES`. 
Comment documents WHY (the client converts before returning) so a future reader doesn't "fix" the classification back the wrong way. - Bumped `DURATION_TOLERANCE_MS` from 3000 → 10000 (3s → 10s) to match Picard ~7s / Beets ~10-15s / Plex ~10s industry baselines. 3s was a defensive copy of the post-download integrity check threshold but that's a different problem (catching truncated downloads, not identifying recordings across remasters/encodings). - `_track_duration_ms` magnitude heuristic kept as fallback for unknown / missing source (mocked test data without `source` field). - Added `Match aborted` warnings at the three earlier silent return points in `_match_tracks` (no client, no album_data, no tracks) so future "Could not match" reports show WHICH step bailed. - Added per-run diagnostic in `match_files_to_tracks` that logs the first duration rejection's actual values — surfaces unit mismatches + drift problems without spamming N×M lines per run. Test changes: - `test_deezer_seconds_duration_converted_to_ms` renamed + rewritten as `test_deezer_already_normalised_to_ms_by_client` to pin the actual contract (matcher receives ms from the Deezer client, takes as-is). - `test_track_duration_source_aware_dispatch` updated — Deezer test case now uses ms input + expects ms output. - New `test_raw_deezer_seconds_falls_back_to_magnitude_heuristic` pins the rare edge case where raw Deezer items WITHOUT `source` reach the matcher (no client conversion path) — heuristic catches it. Verification: - 179 import tests pass after changes - Live test: all 6 user staging candidates now matching at 95-100% confidence - Multi-disc Mr. 
Morale lands with proper Disc 1 / Disc 2 / Disc 3 folder structure - Picard-tagged libraries hit MBID fast paths (verified earlier) - Tracks process in parallel via the existing scan-now thread spawn (next commit refactors this to a proper bounded executor) --- core/auto_import_worker.py | 19 ++++ core/imports/album_matching.py | 98 +++++++++++++++++-- tests/imports/test_album_matching_exact_id.py | 49 ++++++---- 3 files changed, 138 insertions(+), 28 deletions(-) diff --git a/core/auto_import_worker.py b/core/auto_import_worker.py index 412b18ce..53d14458 100644 --- a/core/auto_import_worker.py +++ b/core/auto_import_worker.py @@ -1080,6 +1080,12 @@ class AutoImportWorker: # Fetch album with tracks client = get_client_for_source(source) if not client: + logger.warning( + "[Auto-Import] Match aborted for '%s' — no client available " + "for source '%s'. Identification probably came from a source " + "that's no longer configured.", + candidate.name, source, + ) return None album_data = None @@ -1095,6 +1101,13 @@ class AutoImportWorker: album_data = {'id': album_id, 'name': identification.get('album_name', ''), 'tracks': tracks_data} if not album_data: + logger.warning( + "[Auto-Import] Match aborted for '%s' — source '%s' returned " + "no album data for id %r. Album probably exists in the " + "search index but get_album endpoint can't fetch it (rate " + "limit / region restriction / id-format mismatch).", + candidate.name, source, album_id, + ) return None # Extract tracks — handle various response formats @@ -1112,6 +1125,12 @@ class AutoImportWorker: tracks = album_data['items'] if not tracks: + logger.warning( + "[Auto-Import] Match aborted for '%s' — source '%s' returned " + "album data but no tracks. 
album_data keys: %s", + candidate.name, source, + list(album_data.keys()) if isinstance(album_data, dict) else type(album_data).__name__, + ) return None # Read tags for all files diff --git a/core/imports/album_matching.py b/core/imports/album_matching.py index 28a46732..f29d6b56 100644 --- a/core/imports/album_matching.py +++ b/core/imports/album_matching.py @@ -3,6 +3,13 @@ isolation without instantiating the worker, mocking the metadata client, or monkey-patching ``_read_file_tags``. +Diagnostic logging: +- Every match decision (matched, rejected by duration, rejected by + threshold) emits a debug-level log when ``ALBUM_MATCHING_DEBUG`` is + truthy. Defaults to off so production logs stay clean. Flip via the + ``SOULSYNC_ALBUM_MATCHING_DEBUG`` env var when investigating + "nothing matched" reports. + The worker still owns: - File-system traversal + tag reads - Metadata client lookup + album_data fetch @@ -24,6 +31,15 @@ from __future__ import annotations import os from typing import Any, Callable, Dict, List, Set, Tuple +# Use the project's namespaced logger so diagnostic lines actually +# land in app.log. `logging.getLogger(__name__)` would resolve to +# `core.imports.album_matching` which sits OUTSIDE the `soulsync.*` +# tree the file handler watches, making every "no matches" diagnostic +# silently invisible to anyone debugging an import problem. +from utils.logging_config import get_logger + +logger = get_logger("imports.album_matching") + # --------------------------------------------------------------------------- # Match-scoring weights @@ -305,11 +321,25 @@ def find_exact_id_matches( # problem AFTER the file has been moved). The integrity check stays as # a defense-in-depth backstop. # -# Tolerance picked to match the post-download integrity check -# (`integrity check Duration mismatch ... drift > tolerance 3.0s`). -# Same threshold = same intent, two enforcement points. 
+# Tolerance picked to match standard library-importer behavior: +# Picard ~7s, Beets ~10-15s, Plex ~10s. The post-download integrity +# check uses a stricter ±3s because it's catching truncated downloads +# (same recording, partial bytes — should be byte-exact match) — a +# different problem from "is this the same recording across remasters +# / encodings / streaming services." +# +# Real-world drift between matched-recording sources: +# - FLAC vs MP3 transcode of same master: typically <0.5s +# - Different mastering eras (2009 remaster vs original): 1-3s +# - Different streaming service encodings: 2-7s (varying fade-out) +# - Album version vs "remixed/expanded edition": often >10s — these +# genuinely should NOT match the original tracklist anyway +# +# 10s tolerance lands in the sweet spot: catches real recording +# mismatches (gross differences = wrong track) while accepting normal +# encoding / mastering drift. -DURATION_TOLERANCE_MS = 3000 # ±3 seconds +DURATION_TOLERANCE_MS = 10000 # ±10 seconds def duration_sanity_ok(file_duration_ms: int, track_duration_ms: int) -> bool: @@ -326,19 +356,28 @@ def duration_sanity_ok(file_duration_ms: int, track_duration_ms: int) -> bool: return abs(int(file_duration_ms) - int(track_duration_ms)) <= DURATION_TOLERANCE_MS -# Per-source duration field conventions. Track entries built by -# `_build_album_track_entry` carry the source name on `source` / -# `_source` / `provider` so we can dispatch deterministically instead -# of guessing from value magnitude. +# Per-source duration field conventions for what the matcher RECEIVES +# (after each client's internal normalisation), NOT what each provider's +# raw API returns. 
Deezer's API returns `duration` in seconds, but +# `DeezerClient.get_album_tracks` converts to `duration_ms` in actual +# ms before returning — so the matcher sees ms, and double-converting +# here turns 255000 into 255000000 (the user-reported "no matches" +# bug from 2026-05-09 — every Deezer-primary user's auto-import broke). +# +# Track entries built by `_build_album_track_entry` carry the source +# name on `source` / `_source` / `provider` so we can dispatch +# deterministically instead of guessing from value magnitude. _SECONDS_DURATION_SOURCES = frozenset(( - 'deezer', # /album/{id} returns "duration" (seconds, int) 'discogs', # release tracks expose duration as MM:SS strings + # (handled in metadata layer, but defensive here) 'musicbrainz', # recording length is sometimes seconds vs ms - # depending on which endpoint — defensive + # depending on which endpoint )) _MS_DURATION_SOURCES = frozenset(( 'spotify', # duration_ms (canonical Spotify naming) 'itunes', # trackTimeMillis → normalised to duration_ms upstream + 'deezer', # CLIENT converts seconds → ms before returning + # (see core/deezer_client.py:get_album_tracks) 'qobuz', # duration_ms 'tidal', # duration in seconds OR duration_ms — see below 'hydrabase', # duration_ms @@ -442,6 +481,9 @@ def match_files_to_tracks( deduped = dedupe_files_by_position(remaining_files, file_tags, quality_rank=quality_rank) # Phase 3 — fuzzy scoring on remaining tracks. + duration_rejected = 0 # diagnostics for the "no matches" case + below_threshold = 0 + sample_rejection_logged = False for i, track in enumerate(tracks): if i in used_track_indices: continue @@ -464,6 +506,26 @@ def match_files_to_tracks( # file had already been moved + tagged + DB-inserted. 
file_duration = tags.get('duration_ms', 0) or 0 if not duration_sanity_ok(file_duration, track_duration): + duration_rejected += 1 + # On the FIRST rejection per matcher run, log the actual + # values so users / reviewers can see whether it's a + # unit mismatch (seconds vs ms), genuine drift, or some + # third thing. Logging every rejection would spam the + # log on a 21-file × 19-track album (399 lines). + if not sample_rejection_logged: + sample_rejection_logged = True + raw_dur_ms = track.get('duration_ms') + raw_dur = track.get('duration') + raw_src = track.get('source') or track.get('_source') or track.get('provider') + logger.info( + "[Album Matching] First duration rejection in '%s': " + "file %r duration_ms=%d, track %r resolved=%d " + "(raw duration_ms=%r, raw duration=%r, source=%r)", + target_album, + os.path.basename(f), file_duration, + track.get('name', '?'), track_duration, + raw_dur_ms, raw_dur, raw_src, + ) continue score = score_file_against_track( @@ -482,6 +544,22 @@ def match_files_to_tracks( 'file': best_file, 'confidence': round(best_score, 3), }) + elif deduped: + below_threshold += 1 + + # Diagnostic surface — when the matcher returns 0 matches against + # a non-trivial input, it's nearly always one of: duration gate too + # strict, title agreement too low, or wrong tracks list passed in. + # Log a one-line summary at INFO so users grep'ing app.log for + # "no matches" cases see WHY without needing to bump log level. + if not matches and (audio_files or tracks): + logger.info( + "[Album Matching] No matches: %d files, %d tracks, " + "%d duration-rejected pairs, %d tracks below threshold. " + "Album: %r", + len(audio_files), len(tracks), + duration_rejected, below_threshold, target_album, + ) # Final unmatched list: every file that didn't get used in any # phase. 
Includes quality-dedup losers (lower-quality copies of diff --git a/tests/imports/test_album_matching_exact_id.py b/tests/imports/test_album_matching_exact_id.py index c2923d62..6ac724b9 100644 --- a/tests/imports/test_album_matching_exact_id.py +++ b/tests/imports/test_album_matching_exact_id.py @@ -311,55 +311,68 @@ def test_no_durations_anywhere_falls_through_to_fuzzy(): assert len(result['matches']) == 1 -def test_deezer_seconds_duration_converted_to_ms(): - """Deezer's API returns ``duration`` in seconds, not ms. The matcher - must convert before applying the tolerance check — otherwise a - 180-second track looks like a 180-millisecond track and fails the - sanity gate against any real file.""" +def test_deezer_already_normalised_to_ms_by_client(): + """Deezer's API returns ``duration`` in seconds — but + ``DeezerClient.get_album_tracks`` converts to ``duration_ms`` (in + actual ms) before returning. The matcher classifies Deezer as an + MS source so it doesn't double-convert. Pin this so the + classification stays in sync with the client's behavior.""" files = ['/a/track.flac'] file_tags = { '/a/track.flac': _tags( title='Song', track=1, disc=1, duration_ms=180_000, ), } - # Deezer-style track — duration is 180 (seconds) + # Deezer-style track AS RECEIVED FROM `get_album_tracks` — already + # converted to duration_ms in actual milliseconds. tracks = [{ 'name': 'Song', 'track_number': 1, 'disc_number': 1, - 'duration': 180, 'artists': [], 'source': 'deezer', + 'duration_ms': 180_000, 'artists': [], 'source': 'deezer', }] result = match_files_to_tracks( files, file_tags, tracks, target_album='', similarity=_sim, quality_rank=_qrank, ) - # 180 seconds → 180_000 ms → matches file's 180_000 ms within tolerance + # Source-aware dispatch keeps 180_000 as-is (don't × 1000) so it + # matches the file's 180_000 ms. Pre-fix Deezer was wrongly + # classified as a seconds source → 180_000 × 1000 = 180,000,000ms, + # gate rejected every Deezer-primary user's matches. 
assert len(result['matches']) == 1 def test_track_duration_source_aware_dispatch(): - """`_track_duration_ms` must route via the `source` field — not - fall back to magnitude heuristic — so providers with edge-case - durations (sub-30s real tracks, intros, interludes) don't trigger - false unit conversion.""" + """`_track_duration_ms` routes via the `source` field — values + from sources where the CLIENT has already normalised to ms are + taken as-is.""" from core.imports.album_matching import _track_duration_ms - # Spotify-style — explicit ms field, treat as-is spotify_track = {'duration_ms': 180_000, 'source': 'spotify'} assert _track_duration_ms(spotify_track) == 180_000 - # Deezer-style — `duration` field in seconds, convert - deezer_track = {'duration': 180, 'source': 'deezer'} + # Deezer — client converts seconds → ms before returning, so + # downstream matcher gets ms. As-is. + deezer_track = {'duration_ms': 180_000, 'source': 'deezer'} assert _track_duration_ms(deezer_track) == 180_000 - # iTunes — duration_ms (their internal field is `trackTimeMillis` - # but `_build_album_track_entry` normalises to `duration_ms`) itunes_track = {'duration_ms': 200_000, 'source': 'itunes'} assert _track_duration_ms(itunes_track) == 200_000 - # Source via _source alias also works (normalize_import_context legacy) legacy_source = {'duration_ms': 150_000, '_source': 'spotify'} assert _track_duration_ms(legacy_source) == 150_000 +def test_raw_deezer_seconds_falls_back_to_magnitude_heuristic(): + """Edge case: if a raw Deezer item somehow reaches the matcher + WITHOUT going through the client's conversion (no `source` field, + raw `duration` key in seconds), the magnitude heuristic catches + it — value < 30000 is implausibly short for a real track and + gets × 1000.""" + from core.imports.album_matching import _track_duration_ms + + raw_deezer_no_source = {'duration': 180} # no source field + assert _track_duration_ms(raw_deezer_no_source) == 180_000 + + def 
test_track_duration_short_real_track_not_misconverted_with_known_source(): """An actual sub-30s track on Spotify (intro/interlude/skit) — duration_ms is genuinely small. Source-aware dispatch must take