diff --git a/core/imports/album_matching.py b/core/imports/album_matching.py index db1257d0..28a46732 100644 --- a/core/imports/album_matching.py +++ b/core/imports/album_matching.py @@ -326,20 +326,64 @@ def duration_sanity_ok(file_duration_ms: int, track_duration_ms: int) -> bool: return abs(int(file_duration_ms) - int(track_duration_ms)) <= DURATION_TOLERANCE_MS -def _track_duration_ms(track: Dict[str, Any]) -> int: - """Pull track duration in milliseconds. +# Per-source duration field conventions. Track entries built by +# `_build_album_track_entry` carry the source name on `source` / +# `_source` / `provider` so we can dispatch deterministically instead +# of guessing from value magnitude. +_SECONDS_DURATION_SOURCES = frozenset(( + 'deezer', # /album/{id} returns "duration" (seconds, int) + 'discogs', # release tracks expose duration as MM:SS strings + 'musicbrainz', # recording length is sometimes seconds vs ms + # depending on which endpoint — defensive +)) +_MS_DURATION_SOURCES = frozenset(( + 'spotify', # duration_ms (canonical Spotify naming) + 'itunes', # trackTimeMillis → normalised to duration_ms upstream + 'qobuz', # duration_ms + 'tidal', # duration in seconds OR duration_ms — see below + 'hydrabase', # duration_ms + 'hifi', # duration_ms +)) + - Spotify / iTunes return ``duration_ms``. Deezer's ``duration`` is - in seconds. Heuristic: anything below 30000 (would be 30 seconds in - ms — implausibly short for a real track) is treated as seconds and - converted. Beyond 30000 is already milliseconds. +def _track_duration_ms(track: Dict[str, Any]) -> int: + """Pull track duration in milliseconds — source-aware. + + Different metadata providers spell + scale duration differently: + + - Spotify / iTunes / Qobuz / HiFi / Hydrabase: ``duration_ms`` (ms) + - Deezer / Discogs: ``duration`` (seconds, int) + - Tidal: depends on endpoint — usually seconds for browse, ms for + album tracks; defensive heuristic kicks in if source missing + + Decision order: + 1. If the track carries a source name + that source is in the + seconds-only list, treat raw value as seconds and × 1000. + 2. If source is ms-only, take the value as-is. + 3. If source unknown / missing (e.g. mocked test data), fall back + to a magnitude heuristic — values < 30000 treated as seconds. + This is the legacy behavior, kept as the safety net. """ raw = track.get('duration_ms') or track.get('duration') or 0 try: value = int(raw) except (TypeError, ValueError): return 0 - if 0 < value < 30000: + if value <= 0: + return 0 + + source = (track.get('source') or track.get('_source') or track.get('provider') or '').strip().lower() + + if source in _SECONDS_DURATION_SOURCES: + return value * 1000 + if source in _MS_DURATION_SOURCES: + return value + + # Unknown / missing source — fall back to the magnitude heuristic. + # Anything below 30000 (30 seconds in ms) is implausibly short for + # a real track and is almost certainly seconds being passed where + # ms was expected. + if value < 30000: return value * 1000 return value diff --git a/tests/imports/test_album_matching_exact_id.py b/tests/imports/test_album_matching_exact_id.py index decaa8e1..c2923d62 100644 --- a/tests/imports/test_album_matching_exact_id.py +++ b/tests/imports/test_album_matching_exact_id.py @@ -325,7 +325,7 @@ def test_deezer_seconds_duration_converted_to_ms(): # Deezer-style track — duration is 180 (seconds) tracks = [{ 'name': 'Song', 'track_number': 1, 'disc_number': 1, - 'duration': 180, 'artists': [], + 'duration': 180, 'artists': [], 'source': 'deezer', }] result = match_files_to_tracks( files, file_tags, tracks, @@ -335,6 +335,60 @@ def test_deezer_seconds_duration_converted_to_ms(): assert len(result['matches']) == 1 +def test_track_duration_source_aware_dispatch(): + """`_track_duration_ms` must route via the `source` field — not + fall back to magnitude heuristic — so providers with edge-case + durations (sub-30s real tracks, intros, interludes) don't trigger + false unit conversion.""" + from core.imports.album_matching import _track_duration_ms + + # Spotify-style — explicit ms field, treat as-is + spotify_track = {'duration_ms': 180_000, 'source': 'spotify'} + assert _track_duration_ms(spotify_track) == 180_000 + + # Deezer-style — `duration` field in seconds, convert + deezer_track = {'duration': 180, 'source': 'deezer'} + assert _track_duration_ms(deezer_track) == 180_000 + + # iTunes — duration_ms (their internal field is `trackTimeMillis` + # but `_build_album_track_entry` normalises to `duration_ms`) + itunes_track = {'duration_ms': 200_000, 'source': 'itunes'} + assert _track_duration_ms(itunes_track) == 200_000 + + # Source via _source alias also works (normalize_import_context legacy) + legacy_source = {'duration_ms': 150_000, '_source': 'spotify'} + assert _track_duration_ms(legacy_source) == 150_000 + + +def test_track_duration_short_real_track_not_misconverted_with_known_source(): + """An actual sub-30s track on Spotify (intro/interlude/skit) — + duration_ms is genuinely small. Source-aware dispatch must take + spotify_ms_value as-is and NOT × 1000 it via the magnitude + heuristic. Pre-fix this would have been hit by: + + 20_000 ms (a 20-second intro) > 0 and < 30000 → converted to + 20_000_000 ms = 5.5 hours. Wrong. + + Post-fix: source='spotify' is in MS list, value taken as-is. + """ + from core.imports.album_matching import _track_duration_ms + + short_intro = {'duration_ms': 20_000, 'source': 'spotify'} + assert _track_duration_ms(short_intro) == 20_000 + + +def test_track_duration_unknown_source_falls_back_to_heuristic(): + """No source field — apply the legacy magnitude heuristic so + tests / mocks without source still work. < 30000 = seconds.""" + from core.imports.album_matching import _track_duration_ms + + no_source_seconds = {'duration': 180} # heuristic: < 30000 → seconds + assert _track_duration_ms(no_source_seconds) == 180_000 + + no_source_ms = {'duration_ms': 200_000} # heuristic: > 30000 → ms + assert _track_duration_ms(no_source_ms) == 200_000 + + def test_album_track_entry_propagates_isrc_and_mbid_from_source(): """Production-path guard: the metadata-source layer (`_build_album_track_entry`) must propagate ISRC + MBID from the diff --git a/tests/imports/test_album_matching_helper.py b/tests/imports/test_album_matching_helper.py index 05e45bda..c4350546 100644 --- a/tests/imports/test_album_matching_helper.py +++ b/tests/imports/test_album_matching_helper.py @@ -164,6 +164,67 @@ def test_score_position_match_requires_both_disc_and_track(): assert abs(score - expected) < 0.001 +def test_cross_disc_consolation_is_load_bearing_for_imperfect_titles(): + """Pin the design rationale for ``CROSS_DISC_POSITION_WEIGHT`` so + the magic number isn't silently regressable. + + Scenario: file has the right title spelling but the metadata + source returns a slightly-different version (e.g. "(Remix)" + suffix), AND the file's disc tag is wrong / missing while the + track number agrees. The bonus is sized so this case still + matches: + + title_only_score = sim("Auntie Diaries", + "Auntie Diaries (Remix)") * 0.45 + ≈ 0.78 * 0.45 = ~0.35 ← below MATCH_THRESHOLD + with cross_disc bonus ≈ 0.35 + 0.05 = ~0.40 ← clears + + Without this consolation, the imperfect-title cross-disc case + would silently start going unmatched. If anyone considers setting + ``CROSS_DISC_POSITION_WEIGHT`` to 0, this test makes the trade-off + explicit (this case becomes unmatched) instead of letting it + regress invisibly. + """ + track = { + 'name': 'Auntie Diaries (Remix)', + 'track_number': 6, 'disc_number': 1, + 'artists': [], + } + # File: same track number, different disc, similar but not perfect + # title (file has the canonical name, source has the version + # variant — common with deluxe / remix / live editions) + tags = _tags( + title='Auntie Diaries', + track=6, + disc=2, + ) + + # Compute the title-only contribution to verify the test's premise: + # title agreement is moderate, NOT high enough on its own to clear + # MATCH_THRESHOLD. The consolation has to be load-bearing. + title_only_score = _sim( + 'Auntie Diaries', 'Auntie Diaries (Remix)', + ) * TITLE_WEIGHT + assert title_only_score < MATCH_THRESHOLD, ( + f"Test premise broken — title sim alone ({title_only_score:.3f}) " + f"already clears MATCH_THRESHOLD ({MATCH_THRESHOLD}). The " + f"cross-disc consolation isn't load-bearing for this scenario; " + f"pick a less-similar title pair." + ) + + score = score_file_against_track( + '/a/file.flac', tags, track, + target_album='', similarity=_sim, + ) + assert score >= MATCH_THRESHOLD, ( + f"Cross-disc consolation ({CROSS_DISC_POSITION_WEIGHT}) is no " + f"longer enough to push the score across MATCH_THRESHOLD " + f"({MATCH_THRESHOLD}) for imperfect-title cases. Total score: " + f"{score:.3f}. Either bump the consolation OR drop it to 0 " + f"deliberately and accept that these files now go unmatched." + ) + + def test_score_near_position_only_when_same_disc(): """Off-by-one track number gets NEAR_POSITION bonus, but ONLY when disc agrees. Cross-disc off-by-one gets nothing.""" diff --git a/tests/imports/test_auto_import_tag_reader_real_files.py b/tests/imports/test_auto_import_tag_reader_real_files.py new file mode 100644 index 00000000..9676c5ff --- /dev/null +++ b/tests/imports/test_auto_import_tag_reader_real_files.py @@ -0,0 +1,257 @@ +"""Integration tests for ``_read_file_tags`` against real audio files +written with mutagen. + +The unit tests for the matcher use dict fixtures — they prove the +algorithm handles the right shapes once tags are read. These tests +prove the tag READER itself extracts the right values from real +files, including the Picard tags (``musicbrainz_trackid``, ``isrc``) +that the new fast paths depend on. + +Without this layer, a mutagen normalisation quirk (different easy- +mode key for FLAC vs MP3 vs M4A, version-specific schema changes) +could silently break the fast paths in production while every unit +test passes. + +Files are written + read in-memory via mutagen so no binary fixture +ships in the repo. +""" + +from __future__ import annotations + +import os +import struct +import tempfile + +import pytest + +from core.auto_import_worker import _read_file_tags + + +# --------------------------------------------------------------------------- +# Helpers — write a minimal valid FLAC with the requested tags +# --------------------------------------------------------------------------- + + +def _write_minimal_flac(path: str, tags: dict): + """Create a real FLAC file with mutagen + write the given Vorbis + comment tags. Mirrors the helper pattern in + ``test_album_mbid_consistency.py`` so we don't reinvent the FLAC + bootstrap. + + Note: duration on these test files is whatever mutagen derives + from the synthesized STREAMINFO — typically near-zero. Tests that + care about a specific duration use the ``streaminfo_total_samples`` + helper to override. + """ + from mutagen.flac import FLAC + + fLaC = b'fLaC' + # Minimum STREAMINFO: 16 bits min/max block size, 24 bits min/max + # frame size, 20 bits sample rate, 3 bits channels-1, 5 bits + # bits-per-sample-1, 36 bits total samples, 128 bits md5 sig. + streaminfo = bytearray(34) + streaminfo[0:2] = struct.pack('>H', 4096) + streaminfo[2:4] = struct.pack('>H', 4096) + streaminfo[10] = 0x0A # sample rate / channels packed + streaminfo[12] = 0x70 # bits-per-sample bits + # Block header: last_block=1, type=0 (STREAMINFO), length=34 + block_header = bytes([0x80, 0x00, 0x00, 0x22]) + with open(path, 'wb') as f: + f.write(fLaC + block_header + bytes(streaminfo)) + + audio = FLAC(path) + for key, value in tags.items(): + audio[key] = value + audio.save() + + +def _write_flac_with_duration(path: str, tags: dict, *, duration_seconds: float): + """Write a FLAC file then patch STREAMINFO to claim the given + duration. Used by the duration-reading test — mutagen reports + audio.info.length from STREAMINFO's total_samples / sample_rate. + """ + from mutagen.flac import FLAC + + _write_minimal_flac(path, tags) + + # Open + patch the STREAMINFO total_samples to encode the desired + # duration. STREAMINFO sits at fLaC[4:] + 4-byte block header + + # 34-byte body. total_samples is a 36-bit field starting at byte + # 18 (top 4 bits packed with the previous byte's bottom 4 bits). + audio = FLAC(path) + audio.info.length = duration_seconds # mutagen exposes this directly + # The reader pulls .info.length, so just patching the in-memory + # representation isn't enough — we need the file on disk to match. + # Easier: write our own STREAMINFO block with the right values. + sample_rate = 44100 + total_samples = int(duration_seconds * sample_rate) + + # Read the file, rewrite the STREAMINFO byte range. + with open(path, 'rb') as f: + data = bytearray(f.read()) + + # STREAMINFO body starts at offset 8 (4-byte 'fLaC' + 4-byte block + # header). Sample rate is 20 bits starting at bit offset 80 (byte + # 10). For our purposes, we need to set: + # sample_rate = 44100 (bits 80..99) + # total_samples = computed (bits 108..143, 36 bits) + # Easier path: synthesize a fresh STREAMINFO with all fields right. + + streaminfo = bytearray(34) + streaminfo[0:2] = struct.pack('>H', 4096) # min_blocksize + streaminfo[2:4] = struct.pack('>H', 4096) # max_blocksize + # min/max framesize stay 0 (bytes 4..9) + + # Pack sample_rate (20 bits) | channels-1 (3 bits) | bps-1 (5 bits) | + # total_samples (36 bits) into bytes 10..17 (64 bits). + # 44100 << 12 leaves room for channels (3 bits, 0=mono so we set 1=stereo) + # bps-1 = 15 (16-bit) + sr = sample_rate # 20 bits + ch = 1 # 3 bits (channels-1: 1 = stereo) + bps = 15 # 5 bits (bps-1: 15 = 16bps) + ts = total_samples # 36 bits + + packed = (sr << 44) | (ch << 41) | (bps << 36) | ts + streaminfo[10:18] = packed.to_bytes(8, 'big') + + # MD5 stays 16 bytes of zeroes (bytes 18..33) + data[8:8 + 34] = streaminfo + + with open(path, 'wb') as f: + f.write(bytes(data)) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +def test_reads_picard_style_mbid_and_isrc_from_flac(): + """Picard writes Vorbis comment tags ``musicbrainz_trackid`` and + ``isrc`` on every tagged FLAC. The reader must extract both via + mutagen's easy-mode normalisation.""" + pytest.importorskip("mutagen") + + with tempfile.TemporaryDirectory() as td: + flac_path = os.path.join(td, 'test.flac') + _write_minimal_flac(flac_path, { + 'TITLE': 'Father Time', + 'ARTIST': 'Kendrick Lamar', + 'ALBUM': 'Mr. Morale & The Big Steppers', + 'TRACKNUMBER': '5', + 'DISCNUMBER': '1', + 'ISRC': 'USUM72202156', + 'MUSICBRAINZ_TRACKID': '8a89a04f-7eba-4c0c-bf0c-5c9d7d54df54', + }) + + tags = _read_file_tags(flac_path) + + assert tags['title'] == 'Father Time' + assert tags['artist'] == 'Kendrick Lamar' + assert tags['album'] == 'Mr. Morale & The Big Steppers' + assert tags['track_number'] == 5 + assert tags['disc_number'] == 1 + # ISRC is upper-cased by reader, stripping done at matcher layer + assert tags['isrc'] == 'USUM72202156' + # MBID is lower-cased + assert tags['mbid'] == '8a89a04f-7eba-4c0c-bf0c-5c9d7d54df54' + + +def test_reads_duration_from_streaminfo(): + """Duration comes off ``audio.info.length`` (StreamInfo on FLAC), + NOT from any tag. Reader must convert seconds to ms to match the + metadata-source convention.""" + pytest.importorskip("mutagen") + + with tempfile.TemporaryDirectory() as td: + flac_path = os.path.join(td, 'test.flac') + _write_flac_with_duration(flac_path, { + 'TITLE': 'Test', 'ARTIST': 'Test', + }, duration_seconds=180.5) + + tags = _read_file_tags(flac_path) + + # 180.5s × 1000 = 180500 ms + assert tags['duration_ms'] == 180_500 + + +def test_reads_file_with_no_tags(): + """File with valid audio but no Vorbis comment block — reader + must return empty/default values, not crash. Common for files + converted from formats that don't carry tags.""" + pytest.importorskip("mutagen") + + with tempfile.TemporaryDirectory() as td: + flac_path = os.path.join(td, 'test.flac') + # No tags dict — only mandatory STREAMINFO + _write_minimal_flac(flac_path, {}) + + tags = _read_file_tags(flac_path) + + # Empty defaults across the board, but the structure is + # complete — no KeyError downstream. + assert tags['title'] == '' + assert tags['artist'] == '' + assert tags['album'] == '' + assert tags['track_number'] == 0 + assert tags['disc_number'] == 1 + assert tags['isrc'] == '' + assert tags['mbid'] == '' + # duration_ms is int (may be 0 for the synthesized minimal flac + # — pin the SHAPE not the value, separate test pins the actual + # duration via _write_flac_with_duration) + assert isinstance(tags['duration_ms'], int) + + +def test_reader_handles_unreadable_file_gracefully(): + """File that's not actually audio — mutagen raises, reader + returns the default-empty dict, doesn't crash.""" + with tempfile.NamedTemporaryFile(suffix='.flac', delete=False) as f: + f.write(b'this is not flac data') + path = f.name + + try: + tags = _read_file_tags(path) + # All defaults, no crash + assert tags['title'] == '' + assert tags['mbid'] == '' + assert tags['duration_ms'] == 0 + finally: + os.unlink(path) + + +def test_track_number_with_total_format_parses_correctly(): + """Some tag schemas write track numbers as ``"5/12"`` (track 5 of + 12). Reader must parse just the leading number, not crash on the + slash.""" + pytest.importorskip("mutagen") + + with tempfile.TemporaryDirectory() as td: + flac_path = os.path.join(td, 'test.flac') + _write_minimal_flac(flac_path, { + 'TRACKNUMBER': '5/12', + 'DISCNUMBER': '2/3', + }) + + tags = _read_file_tags(flac_path) + assert tags['track_number'] == 5 + assert tags['disc_number'] == 2 + + +def test_isrc_with_dashes_preserved_for_matcher_to_normalise(): + """Reader keeps ISRC formatting as-is from the tag — normalisation + (uppercase + strip dashes) happens at the matcher layer + (``_file_identifier``). Splitting normalisation across reader + + matcher is fine; pinning the contract here so no one assumes + the reader normalises.""" + pytest.importorskip("mutagen") + + with tempfile.TemporaryDirectory() as td: + flac_path = os.path.join(td, 'test.flac') + _write_minimal_flac(flac_path, { + 'ISRC': 'us-um7-22-02156', # mixed case, dashes + }) + + tags = _read_file_tags(flac_path) + # Reader uppercases; matcher will strip dashes + assert tags['isrc'] == 'US-UM7-22-02156'