diff --git a/core/auto_import_worker.py b/core/auto_import_worker.py index b024120a..e71fa381 100644 --- a/core/auto_import_worker.py +++ b/core/auto_import_worker.py @@ -961,112 +961,23 @@ class AutoImportWorker: for f in candidate.audio_files: file_tags[f] = _read_file_tags(f) - # Resolve quality duplicates — if multiple files match the same - # (disc, track) position, keep the higher-quality one. Key on - # the (disc_number, track_number) tuple — keying on track_number - # alone breaks multi-disc albums where every disc has tracks - # numbered 1..N, since disc 2 track 1 looks identical to disc 1 - # track 1 to a track-number-only dedup. (Reported on Mr. Morale - # & The Big Steppers — discs 1+2 dumped loose in staging, dedup - # collapsed every same-numbered pair into one survivor.) - seen_positions = {} - deduped_files = [] - for f in candidate.audio_files: - tn = file_tags[f]['track_number'] - dn = file_tags[f].get('disc_number', 1) or 1 - ext = os.path.splitext(f)[1].lower() - position_key = (dn, tn) - if tn > 0 and position_key in seen_positions: - prev_f = seen_positions[position_key] - prev_ext = os.path.splitext(prev_f)[1].lower() - if _quality_rank(ext) > _quality_rank(prev_ext): - deduped_files.remove(prev_f) - deduped_files.append(f) - seen_positions[position_key] = f - else: - deduped_files.append(f) - if tn > 0: - seen_positions[position_key] = f - - # Match files to tracks using weighted scoring - matches = [] - used_files = set() + # Dedupe + match — both lifted into core.imports.album_matching + # so the matching algorithm is unit-testable in isolation + # (no worker instantiation, no metadata-client mocking, no + # _read_file_tags monkeypatch). Worker still owns I/O + + # metadata fetch; the helper is a pure function over dicts. + from core.imports.album_matching import match_files_to_tracks target_album = identification.get('album_name', '') - - for track in tracks: - track_name = track.get('name', '') - track_num = track.get('track_number', 0) - # Track disc number — Spotify uses `disc_number`, Deezer - # `disk_number`, iTunes `discNumber`. Default to 1 when - # missing so single-disc albums still match. - track_disc = ( - track.get('disc_number') - or track.get('disk_number') - or track.get('discNumber') - or 1 - ) - track_artists = track.get('artists', []) - track_artist = '' - if track_artists: - a = track_artists[0] - track_artist = a.get('name', str(a)) if isinstance(a, dict) else str(a) - - best_file = None - best_score = 0 - - for f in deduped_files: - if f in used_files: - continue - - ft = file_tags[f] - score = 0 - - # Title similarity (45%) - title = ft['title'] or os.path.splitext(os.path.basename(f))[0] - score += _similarity(title, track_name) * 0.45 - - # Artist similarity (15%) - if ft['artist'] and track_artist: - score += _similarity(ft['artist'], track_artist) * 0.15 - - # Position match (30%) — gates the bonus on (disc, track) - # tuple, NOT track_number alone. Multi-disc albums with - # parallel numbering (every disc has tracks 1..N) used - # to mismatch here: file with track_number=6 from disc 2 - # got the full bonus when matched against disc 1 track 6 - # because disc was ignored. Result: wrong files matched - # to wrong tracks, integrity check rejected them all. - if ft['track_number'] > 0 and track_num > 0: - ft_disc = ft.get('disc_number', 1) or 1 - if ft['track_number'] == track_num and ft_disc == track_disc: - score += 0.30 - elif ( - ft['track_number'] == track_num - and ft_disc != track_disc - ): - # Same track number, different disc — treat as - # a mild penalty case so the title/artist - # similarity has to carry the match. Cross-disc - # collisions are common in deluxe releases. - score += 0.05 - elif abs(ft['track_number'] - track_num) <= 1 and ft_disc == track_disc: - score += 0.12 - - # Album tag bonus (10%) - if ft['album']: - score += _similarity(ft['album'], target_album) * 0.10 - - if score > best_score and score >= 0.4: - best_score = score - best_file = f - - if best_file: - used_files.add(best_file) - matches.append({ - 'track': track, - 'file': best_file, - 'confidence': round(best_score, 3), - }) + match_result = match_files_to_tracks( + candidate.audio_files, + file_tags, + tracks, + target_album=target_album, + similarity=_similarity, + quality_rank=_quality_rank, + ) + matches = match_result['matches'] + unmatched_files = match_result['unmatched_files'] if not matches: return None @@ -1079,7 +990,7 @@ class AutoImportWorker: return { 'matches': matches, - 'unmatched_files': [f for f in deduped_files if f not in used_files], + 'unmatched_files': unmatched_files, 'total_tracks': len(tracks), 'matched_count': len(matches), 'coverage': round(coverage, 3), diff --git a/core/imports/album_matching.py b/core/imports/album_matching.py new file mode 100644 index 00000000..a63b4a80 --- /dev/null +++ b/core/imports/album_matching.py @@ -0,0 +1,255 @@ +"""Album-track matching helpers — lifted out of +``AutoImportWorker._match_tracks`` so the matching logic is testable in +isolation without instantiating the worker, mocking the metadata +client, or monkey-patching ``_read_file_tags``. + +The worker still owns: +- File-system traversal + tag reads +- Metadata client lookup + album_data fetch +- Album-vs-single routing + +This module owns: +- Quality-aware deduplication keyed on the ``(disc_number, track_number)`` + position tuple +- Weighted match scoring against the album's tracklist +- Returning the list of (track, file, confidence) matches + leftover + unmatched files + +Both behaviors are pure functions over already-fetched data, so the +test surface is just dicts in / dicts out. +""" + +from __future__ import annotations + +import os +from typing import Any, Callable, Dict, List, Set, Tuple + + +# --------------------------------------------------------------------------- +# Match-scoring weights +# --------------------------------------------------------------------------- +# Each weight is a fraction of the 0..1 confidence score the matcher +# accumulates per (file, track) pair. Sum of all maximum-bonus paths +# equals 1.0 in the happy case (perfect title + artist + position + +# album tag agreement). +# +# History note: the position bonus (30%) used to fire on track_number +# alone, which broke multi-disc albums where every disc has tracks 1..N. +# Disc-aware split (POSITION + CROSS_DISC) shipped 2026-05-09 after +# user reported Mr. Morale & The Big Steppers losing half its tracks +# during auto-import. + +TITLE_WEIGHT = 0.45 # case-folded fuzzy title similarity +ARTIST_WEIGHT = 0.15 # albumartist (or artist) similarity +POSITION_WEIGHT = 0.30 # exact (disc_number, track_number) match +NEAR_POSITION_WEIGHT = 0.12 # off-by-one track number, same disc +CROSS_DISC_POSITION_WEIGHT = 0.05 # same track_number, different disc +ALBUM_WEIGHT = 0.10 # album tag similarity to target album + +# A file scoring below this threshold against every track is treated +# as unmatched. Threshold sits below the per-component partial-match +# floor (~0.5 × 0.45 = 0.22) plus a small position consolation, so +# files with weak title agreement still need at least one strong signal. +MATCH_THRESHOLD = 0.4 + + +SimilarityFn = Callable[[str, str], float] +QualityRankFn = Callable[[str], int] + + +def dedupe_files_by_position( + audio_files: List[str], + file_tags: Dict[str, Dict[str, Any]], + *, + quality_rank: QualityRankFn, +) -> List[str]: + """Drop quality-duplicate files at the same ``(disc, track)`` + position, keeping the higher-quality one. + + The position key is ``(disc_number, track_number)`` — NOT + ``track_number`` alone. Multi-disc albums where every disc has + tracks 1..N would otherwise collapse to one disc's worth of files + here, before the matcher even sees the rest. + + Files with ``track_number == 0`` (no tag) all pass through — + can't dedupe positions we don't know. + """ + seen_positions: Dict[Tuple[int, int], str] = {} + deduped: List[str] = [] + + for f in audio_files: + tags = file_tags.get(f, {}) + track_num = tags.get('track_number', 0) or 0 + disc_num = tags.get('disc_number', 1) or 1 + ext = os.path.splitext(f)[1].lower() + position_key = (disc_num, track_num) + + if track_num > 0 and position_key in seen_positions: + prev_f = seen_positions[position_key] + prev_ext = os.path.splitext(prev_f)[1].lower() + if quality_rank(ext) > quality_rank(prev_ext): + deduped.remove(prev_f) + deduped.append(f) + seen_positions[position_key] = f + else: + deduped.append(f) + if track_num > 0: + seen_positions[position_key] = f + + return deduped + + +def _extract_track_disc(track: Dict[str, Any]) -> int: + """Pull disc number off an API track dict. + + Different metadata sources spell the field differently: + Spotify ``disc_number``, Deezer ``disk_number``, iTunes + ``discNumber``. Default to 1 when missing so single-disc albums + still match. + """ + return ( + track.get('disc_number') + or track.get('disk_number') + or track.get('discNumber') + or 1 + ) + + +def _extract_track_artist(track: Dict[str, Any]) -> str: + artists = track.get('artists') or [] + if not artists: + return '' + a = artists[0] + return a.get('name', str(a)) if isinstance(a, dict) else str(a) + + +def score_file_against_track( + file_path: str, + file_tags: Dict[str, Any], + track: Dict[str, Any], + *, + target_album: str, + similarity: SimilarityFn, +) -> float: + """Compute the 0..1 confidence score for matching ``file_path`` + (with its tags) to ``track`` (an API track dict). + + Pure scoring — caller decides what to do with the score (compare + against ``MATCH_THRESHOLD``, pick best-per-track, etc). + """ + score = 0.0 + + # Title similarity (TITLE_WEIGHT). Falls back to filename stem when + # the file has no title tag. + title = file_tags.get('title') or os.path.splitext(os.path.basename(file_path))[0] + track_name = track.get('name', '') + score += similarity(title, track_name) * TITLE_WEIGHT + + # Artist similarity (ARTIST_WEIGHT). Skipped if either side missing. + file_artist = file_tags.get('artist', '') + track_artist = _extract_track_artist(track) + if file_artist and track_artist: + score += similarity(file_artist, track_artist) * ARTIST_WEIGHT + + # Position match (POSITION_WEIGHT / NEAR_POSITION_WEIGHT / + # CROSS_DISC_POSITION_WEIGHT). Gates on the (disc, track) tuple + # rather than track_number alone — see the module docstring's + # multi-disc history note. + file_track_num = file_tags.get('track_number', 0) or 0 + track_num = track.get('track_number', 0) or 0 + if file_track_num > 0 and track_num > 0: + file_disc = file_tags.get('disc_number', 1) or 1 + track_disc = _extract_track_disc(track) + if file_track_num == track_num and file_disc == track_disc: + score += POSITION_WEIGHT + elif file_track_num == track_num and file_disc != track_disc: + # Same track number, different disc — small consolation so + # title/artist similarity has to carry the match. Common + # collision in deluxe / multi-disc releases where every + # disc has tracks numbered 1..N. + score += CROSS_DISC_POSITION_WEIGHT + elif abs(file_track_num - track_num) <= 1 and file_disc == track_disc: + score += NEAR_POSITION_WEIGHT + + # Album tag bonus (ALBUM_WEIGHT). Helps disambiguate when the + # target_album name is a strong signal. + file_album = file_tags.get('album', '') + if file_album: + score += similarity(file_album, target_album) * ALBUM_WEIGHT + + return score + + +def match_files_to_tracks( + audio_files: List[str], + file_tags: Dict[str, Dict[str, Any]], + tracks: List[Dict[str, Any]], + *, + target_album: str, + similarity: SimilarityFn, + quality_rank: QualityRankFn, +) -> Dict[str, Any]: + """Match staging files to album tracks. + + Returns a dict with: + - ``matches``: list of ``{'track': dict, 'file': str, 'confidence': float}``, + one per track that found a file scoring at or above + ``MATCH_THRESHOLD`` + - ``unmatched_files``: files left over after every track found its + best (or none) + + Each file matches at most one track (best-scoring track that + accepted it wins). Each track matches at most one file (the highest- + scoring still-unused file). + + Pure function — no side effects, no I/O, no metadata client. Easy + to unit-test by feeding tag dicts and track dicts directly. + """ + deduped = dedupe_files_by_position(audio_files, file_tags, quality_rank=quality_rank) + + matches: List[Dict[str, Any]] = [] + used_files: Set[str] = set() + + for track in tracks: + best_file = None + best_score = 0.0 + + for f in deduped: + if f in used_files: + continue + tags = file_tags.get(f, {}) + score = score_file_against_track( + f, tags, track, + target_album=target_album, + similarity=similarity, + ) + if score > best_score and score >= MATCH_THRESHOLD: + best_score = score + best_file = f + + if best_file: + used_files.add(best_file) + matches.append({ + 'track': track, + 'file': best_file, + 'confidence': round(best_score, 3), + }) + + return { + 'matches': matches, + 'unmatched_files': [f for f in deduped if f not in used_files], + } + + +__all__ = [ + 'TITLE_WEIGHT', + 'ARTIST_WEIGHT', + 'POSITION_WEIGHT', + 'NEAR_POSITION_WEIGHT', + 'CROSS_DISC_POSITION_WEIGHT', + 'ALBUM_WEIGHT', + 'MATCH_THRESHOLD', + 'dedupe_files_by_position', + 'score_file_against_track', + 'match_files_to_tracks', +] diff --git a/tests/imports/test_album_matching_helper.py b/tests/imports/test_album_matching_helper.py new file mode 100644 index 00000000..05e45bda --- /dev/null +++ b/tests/imports/test_album_matching_helper.py @@ -0,0 +1,378 @@ +"""Direct unit tests for ``core.imports.album_matching`` — the lifted +helper that powers ``AutoImportWorker._match_tracks``. + +The original test file (``test_auto_import_multi_disc_matching.py``) +exercised the matching logic via the worker, requiring monkeypatches +on ``_read_file_tags`` + mocks on the metadata client. These tests +exercise the helper directly with dict inputs / dict outputs — no I/O, +no class instantiation, no patches. + +Together with the worker-level tests, the helper has full behavior +coverage: +- Dedup: same-(disc, track) collapses, cross-disc preserves +- Match: per-component scoring, threshold, position weights, cross-disc + consolation, near-position bonus +- Edge cases: tag-less files (track_number=0), missing artist tags, + cross-disc collision when one side has no disc tag +""" + +from __future__ import annotations + +from difflib import SequenceMatcher + +from core.imports.album_matching import ( + ALBUM_WEIGHT, + ARTIST_WEIGHT, + CROSS_DISC_POSITION_WEIGHT, + MATCH_THRESHOLD, + NEAR_POSITION_WEIGHT, + POSITION_WEIGHT, + TITLE_WEIGHT, + dedupe_files_by_position, + match_files_to_tracks, + score_file_against_track, +) + + +# --------------------------------------------------------------------------- +# Stand-in similarity + quality_rank — match real worker behavior closely +# enough that test scores reflect production behavior. +# --------------------------------------------------------------------------- + + +def _sim(a: str, b: str) -> float: + """Mirror of the worker's _similarity (case-folded SequenceMatcher).""" + return SequenceMatcher(None, (a or '').lower(), (b or '').lower()).ratio() + + +def _qrank(ext: str) -> int: + """Mirror of the worker's _quality_rank.""" + ranks = {'.flac': 100, '.alac': 95, '.wav': 80, '.aac': 60, + '.ogg': 50, '.opus': 50, '.m4a': 60, '.mp3': 30, '.wma': 20} + return ranks.get((ext or '').lower(), 0) + + +def _tags(*, title='', artist='Artist', album='Album', track=0, disc=1): + return { + 'title': title, 'artist': artist, 'album': album, + 'track_number': track, 'disc_number': disc, 'year': '', + } + + +# --------------------------------------------------------------------------- +# Constants — pin the weights so accidental tweaks fail at the test boundary +# --------------------------------------------------------------------------- + + +def test_constants_sum_to_one(): + """Sum of TITLE + ARTIST + POSITION + ALBUM should equal 1.0 in + the happy case (perfect agreement). Catches accidental drift if + someone edits one weight without checking the rest. Float tolerance + because 0.45 + 0.15 + 0.30 + 0.10 has a 1e-16 rounding error.""" + total = TITLE_WEIGHT + ARTIST_WEIGHT + POSITION_WEIGHT + ALBUM_WEIGHT + assert abs(total - 1.0) < 1e-9 + + +def test_match_threshold_requires_more_than_position_alone(): + """Pin the design intent: a file matching ONLY on position + (perfect track + disc, zero title similarity) should NOT meet + the threshold. The matcher requires meaningful title agreement + AT LEAST in addition to position. Catches accidental threshold + drops that would let position-only matches sneak through.""" + assert MATCH_THRESHOLD > POSITION_WEIGHT + + +# --------------------------------------------------------------------------- +# dedupe_files_by_position — pure-function tests +# --------------------------------------------------------------------------- + + +def test_dedupe_keeps_higher_quality_at_same_position(): + files = ['/a/track1.mp3', '/a/track1.flac'] + file_tags = { + '/a/track1.mp3': _tags(track=1, disc=1), + '/a/track1.flac': _tags(track=1, disc=1), + } + result = dedupe_files_by_position(files, file_tags, quality_rank=_qrank) + assert result == ['/a/track1.flac'] + + +def test_dedupe_preserves_same_track_across_discs(): + """The fix for the multi-disc bug: track_number=1 on disc 1 and + track_number=1 on disc 2 are different positions, both survive.""" + files = ['/a/d1t1.flac', '/a/d2t1.flac'] + file_tags = { + '/a/d1t1.flac': _tags(track=1, disc=1), + '/a/d2t1.flac': _tags(track=1, disc=2), + } + result = dedupe_files_by_position(files, file_tags, quality_rank=_qrank) + assert set(result) == {'/a/d1t1.flac', '/a/d2t1.flac'} + + +def test_dedupe_passes_through_files_with_no_track_number(): + """Files with track_number=0 (no tag) can't be deduped — keep them + all so the matcher gets a chance to title-match them.""" + files = ['/a/no_tag_a.mp3', '/a/no_tag_b.mp3', '/a/no_tag_c.mp3'] + file_tags = {f: _tags(title='Untagged', track=0, disc=1) for f in files} + result = dedupe_files_by_position(files, file_tags, quality_rank=_qrank) + assert set(result) == set(files) + + +def test_dedupe_keeps_first_when_quality_equal(): + """Two files at same position, same quality — first one wins.""" + files = ['/a/first.flac', '/a/second.flac'] + file_tags = { + '/a/first.flac': _tags(track=1, disc=1), + '/a/second.flac': _tags(track=1, disc=1), + } + result = dedupe_files_by_position(files, file_tags, quality_rank=_qrank) + assert result == ['/a/first.flac'] + + +# --------------------------------------------------------------------------- +# score_file_against_track — per-component scoring +# --------------------------------------------------------------------------- + + +def test_score_perfect_agreement_equals_one(): + """Title + artist + (disc, track) + album all match → score = 1.0.""" + track = { + 'name': 'Song', 'track_number': 5, 'disc_number': 2, + 'artists': [{'name': 'Artist'}], + } + tags = _tags(title='Song', artist='Artist', album='Album', track=5, disc=2) + score = score_file_against_track( + '/a/file.flac', tags, track, + target_album='Album', similarity=_sim, + ) + assert abs(score - 1.0) < 0.001 + + +def test_score_position_match_requires_both_disc_and_track(): + """Same track number, different disc → only CROSS_DISC bonus, not + full POSITION bonus. This is the regression fix for multi-disc + cross-collisions.""" + track = {'name': 'X', 'track_number': 6, 'disc_number': 1, 'artists': []} + # File for disc 2 track 6 — same number, wrong disc + tags = _tags(title='X', track=6, disc=2) + score = score_file_against_track( + '/a/file.flac', tags, track, + target_album='', similarity=_sim, + ) + # Title weight (1.0) + cross-disc consolation (0.05) + nothing else + expected = TITLE_WEIGHT + CROSS_DISC_POSITION_WEIGHT + assert abs(score - expected) < 0.001 + + +def test_score_near_position_only_when_same_disc(): + """Off-by-one track number gets NEAR_POSITION bonus, but ONLY when + disc agrees. Cross-disc off-by-one gets nothing.""" + track = {'name': 'Y', 'track_number': 5, 'disc_number': 1, 'artists': []} + + same_disc = _tags(title='Y', track=6, disc=1) # off by 1 on same disc + score_same = score_file_against_track( + '/a/f.flac', same_disc, track, target_album='', similarity=_sim, + ) + expected_same = TITLE_WEIGHT + NEAR_POSITION_WEIGHT + assert abs(score_same - expected_same) < 0.001 + + diff_disc = _tags(title='Y', track=6, disc=2) # off by 1, different disc + score_diff = score_file_against_track( + '/a/f.flac', diff_disc, track, target_album='', similarity=_sim, + ) + # No position bonus at all (off-by-one + cross-disc) + expected_diff = TITLE_WEIGHT + assert abs(score_diff - expected_diff) < 0.001 + + +def test_score_handles_missing_track_artist(): + """Track with no artists list — artist component just contributes 0.""" + track = {'name': 'Z', 'track_number': 1, 'disc_number': 1, 'artists': []} + tags = _tags(title='Z', artist='Real Artist', track=1, disc=1) + score = score_file_against_track( + '/a/f.flac', tags, track, target_album='', similarity=_sim, + ) + # Title (1.0) + position (0.30) + no artist bonus + no album + expected = TITLE_WEIGHT + POSITION_WEIGHT + assert abs(score - expected) < 0.001 + + +def test_score_handles_missing_file_artist(): + """File with no artist tag — same as missing track artist, no bonus.""" + track = {'name': 'Z', 'track_number': 1, 'disc_number': 1, + 'artists': [{'name': 'Artist'}]} + tags = _tags(title='Z', artist='', track=1, disc=1) + score = score_file_against_track( + '/a/f.flac', tags, track, target_album='', similarity=_sim, + ) + expected = TITLE_WEIGHT + POSITION_WEIGHT + assert abs(score - expected) < 0.001 + + +def test_score_disc_field_aliases(): + """API track disc number can come from disc_number / disk_number / + discNumber depending on source. All three should be honored.""" + tags = _tags(title='X', track=1, disc=2) + for disc_field in ('disc_number', 'disk_number', 'discNumber'): + track = {'name': 'X', 'track_number': 1, disc_field: 2, 'artists': []} + score = score_file_against_track( + '/a/f.flac', tags, track, target_album='', similarity=_sim, + ) + # Should get full POSITION bonus + expected = TITLE_WEIGHT + POSITION_WEIGHT + assert abs(score - expected) < 0.001, ( + f"Disc field '{disc_field}' should be recognised (score={score})" + ) + + +def test_score_filename_fallback_when_title_tag_missing(): + """File with no title tag falls back to the filename stem for the + title-similarity comparison.""" + track = {'name': 'Filename Title', 'track_number': 0, 'artists': []} + tags = _tags(title='', track=0, disc=1) + score = score_file_against_track( + '/a/Filename Title.flac', tags, track, + target_album='', similarity=_sim, + ) + # Title fallback gives perfect match → TITLE_WEIGHT + assert abs(score - TITLE_WEIGHT) < 0.001 + + +# --------------------------------------------------------------------------- +# match_files_to_tracks — end-to-end (still pure) +# --------------------------------------------------------------------------- + + +def test_match_pairs_files_to_correct_tracks(): + """Happy path — 3 files, 3 tracks, all match perfectly.""" + files = ['/a/01.flac', '/a/02.flac', '/a/03.flac'] + file_tags = { + '/a/01.flac': _tags(title='A', track=1, disc=1), + '/a/02.flac': _tags(title='B', track=2, disc=1), + '/a/03.flac': _tags(title='C', track=3, disc=1), + } + tracks = [ + {'name': 'A', 'track_number': 1, 'disc_number': 1, 'artists': [{'name': 'Artist'}]}, + {'name': 'B', 'track_number': 2, 'disc_number': 1, 'artists': [{'name': 'Artist'}]}, + {'name': 'C', 'track_number': 3, 'disc_number': 1, 'artists': [{'name': 'Artist'}]}, + ] + result = match_files_to_tracks( + files, file_tags, tracks, + target_album='Album', similarity=_sim, quality_rank=_qrank, + ) + assert len(result['matches']) == 3 + assert not result['unmatched_files'] + + +def test_match_each_file_used_at_most_once(): + """Two tracks competing for the same file — only one wins, the + other gets no match.""" + files = ['/a/only.flac'] + file_tags = {'/a/only.flac': _tags(title='Track Name', track=1, disc=1)} + tracks = [ + {'name': 'Track Name', 'track_number': 1, 'disc_number': 1, 'artists': []}, + {'name': 'Track Name', 'track_number': 1, 'disc_number': 1, 'artists': []}, # dup + ] + result = match_files_to_tracks( + files, file_tags, tracks, + target_album='', similarity=_sim, quality_rank=_qrank, + ) + assert len(result['matches']) == 1 + + +def test_match_below_threshold_files_left_unmatched(): + """File with weak title + no other signals should be left in + unmatched_files, not force-matched.""" + files = ['/a/random.flac'] + file_tags = {'/a/random.flac': _tags(title='Totally Different', track=0, disc=1)} + tracks = [ + {'name': 'Specific Track', 'track_number': 99, 'disc_number': 1, 'artists': []}, + ] + result = match_files_to_tracks( + files, file_tags, tracks, + target_album='', similarity=_sim, quality_rank=_qrank, + ) + assert not result['matches'] + assert result['unmatched_files'] == ['/a/random.flac'] + + +# --------------------------------------------------------------------------- +# Edge case Cin would flag: tag-less file matching against multi-disc API +# --------------------------------------------------------------------------- + + +def test_tagless_file_matches_disc1_track_with_perfect_title(): + """User has a perfectly-named file with no embedded tags — file + title in the filename matches the metadata title exactly. The + matcher should still pair it correctly even though disc info is + missing on the file side (defaults to disc 1).""" + files = ['/a/Auntie Diaries.flac'] + file_tags = { + '/a/Auntie Diaries.flac': _tags(title='', track=0, disc=1), # no tags + } + tracks = [ + {'name': 'Auntie Diaries', 'track_number': 6, 'disc_number': 2, + 'artists': [{'name': 'Kendrick Lamar'}]}, + ] + result = match_files_to_tracks( + files, file_tags, tracks, + target_album='Mr. Morale & The Big Steppers', + similarity=_sim, quality_rank=_qrank, + ) + # Perfect title sim (1.0 × 0.45 = 0.45) > MATCH_THRESHOLD (0.4) + # → file matches the track even with missing position info + assert len(result['matches']) == 1 + assert result['matches'][0]['file'] == '/a/Auntie Diaries.flac' + + +def test_tagless_files_against_multidisc_album_partial_match(): + """Two tag-less files with strong filename titles (one matches a + disc-1 track, one matches a disc-2 track). Both should match + correctly via title — no disc info needed.""" + files = ['/a/Father Time.flac', '/a/Mother I Sober.flac'] + file_tags = {f: _tags(title='', track=0, disc=1) for f in files} + tracks = [ + {'name': 'Father Time', 'track_number': 5, 'disc_number': 1, 'artists': []}, + {'name': 'Mother I Sober', 'track_number': 8, 'disc_number': 2, 'artists': []}, + ] + result = match_files_to_tracks( + files, file_tags, tracks, + target_album='Mr. Morale', similarity=_sim, quality_rank=_qrank, + ) + assert len(result['matches']) == 2 + by_track = {m['track']['name']: m['file'] for m in result['matches']} + assert by_track['Father Time'] == '/a/Father Time.flac' + assert by_track['Mother I Sober'] == '/a/Mother I Sober.flac' + + +def test_tagless_file_with_weak_title_unmatched_in_multidisc(): + """Edge case Cin would flag: tag-less file (so disc defaults to 1) + with a weak filename title against a disc-2-only API track. Pre-fix, + the position bonus fired on track_number alone, so files like this + would sneak matches via just track_number agreement. Post-fix, the + cross-disc consolation (5%) plus weak title can fall below + MATCH_THRESHOLD → file goes unmatched. + + This is the BEHAVIOR CHANGE worth pinning. For correctly-tagged + files in multi-disc albums (the user's actual case) this is the + right call. For users with weak tags this is a regression — they + now have to rely on title or fix their tags.""" + files = ['/a/track06.flac'] # weak title, no tags + file_tags = { + '/a/track06.flac': _tags(title='', track=6, disc=1), # disc defaults to 1 + } + tracks = [ + # API has only this disc-2 track 6 — file's disc-1-track-6 + # signal would have fired full position bonus pre-fix + {'name': 'Auntie Diaries', 'track_number': 6, 'disc_number': 2, + 'artists': [{'name': 'Kendrick Lamar'}]}, + ] + result = match_files_to_tracks( + files, file_tags, tracks, + target_album='Mr. Morale', similarity=_sim, quality_rank=_qrank, + ) + # Title sim "track06" vs "Auntie Diaries" is near zero (~0.10) + # × 0.45 = ~0.045. Plus cross-disc 0.05 = ~0.095. Below 0.4 + # threshold → no match. + assert not result['matches'] + assert result['unmatched_files'] == ['/a/track06.flac'] diff --git a/tests/test_import_album_match_endpoint.py b/tests/test_import_album_match_endpoint.py new file mode 100644 index 00000000..78cca0e6 --- /dev/null +++ b/tests/test_import_album_match_endpoint.py @@ -0,0 +1,118 @@ +"""Pin the ``/api/import/album/match`` endpoint's source-routing +behavior — github issue #524 regression guard. + +The bug: clicking an album in the import page POSTed only ``album_id``, +dropping the ``source`` field that the backend needs to route the +lookup to the correct metadata client. The backend silently fell back +to its primary-source-priority chain, which fails for cross-source +album_ids (Deezer numeric id vs Spotify primary, etc.) → broken +fallback dict written to the library DB. + +The frontend fix populates source on every match POST. These tests +pin the BACKEND defense: when source is dropped (curl, third-party, +regression in another caller), a clear warning lands in the logs so +the regression is grep-able instead of silent. +""" + +from __future__ import annotations + +import logging +from unittest.mock import patch + +import pytest + + +@pytest.fixture +def import_match_client(monkeypatch): + """Flask test client, with the album-match payload builder mocked + so we don't have to spin up real metadata clients.""" + with patch("web_server.add_activity_item"): + with patch("web_server.SpotifyClient"): + with patch("core.tidal_client.TidalClient"): + from web_server import app as flask_app + flask_app.config['TESTING'] = True + yield flask_app.test_client() + + +def test_missing_source_logs_warning(import_match_client, caplog): + """When the match POST omits source, backend logs a clear warning + so the regression is visible in app.log even though the request + still proceeds (best-effort lookup via primary-source priority). + """ + fake_payload = {'success': True, 'album': {}, 'matches': [], 'unmatched_files': []} + with caplog.at_level(logging.WARNING, logger='soulsync'): + with patch( + 'web_server.build_album_import_match_payload', + return_value=fake_payload, + ): + resp = import_match_client.post( + '/api/import/album/match', + json={'album_id': '1234567890'}, # no source + ) + + assert resp.status_code == 200 + # The defensive log must mention the missing source AND the album_id + # so ops can grep app.log for the offending caller. + assert any( + "Missing 'source'" in r.message and '1234567890' in r.message + for r in caplog.records + ), ( + "Expected a warning naming the missing source + album_id. " + "Got records: " + repr([r.message for r in caplog.records]) + ) + + +def test_source_provided_does_not_warn(import_match_client, caplog): + """When source IS provided (the common path), no warning fires. + Catches regression where the warning becomes noisy from firing on + every legit request.""" + fake_payload = {'success': True, 'album': {}, 'matches': [], 'unmatched_files': []} + with caplog.at_level(logging.WARNING, logger='soulsync'): + with patch( + 'web_server.build_album_import_match_payload', + return_value=fake_payload, + ): + resp = import_match_client.post( + '/api/import/album/match', + json={ + 'album_id': '1234567890', + 'source': 'deezer', + 'album_name': 'Test Album', + 'album_artist': 'Test Artist', + }, + ) + + assert resp.status_code == 200 + missing_source_warnings = [ + r for r in caplog.records if "Missing 'source'" in r.message + ] + assert not missing_source_warnings, ( + "When source is supplied, no missing-source warning should fire. " + f"Got: {[r.message for r in missing_source_warnings]}" + ) + + +def test_source_passed_through_to_payload_builder(import_match_client): + """Verify the endpoint actually forwards source to the underlying + payload builder. Without this, we'd be logging the warning correctly + but still doing the wrong lookup.""" + fake_payload = {'success': True, 'album': {}, 'matches': [], 'unmatched_files': []} + with patch( + 'web_server.build_album_import_match_payload', + return_value=fake_payload, + ) as mock_builder: + import_match_client.post( + '/api/import/album/match', + json={ + 'album_id': 'abc123', + 'source': 'spotify', + 'album_name': 'X', + 'album_artist': 'Y', + }, + ) + + mock_builder.assert_called_once() + call_kwargs = mock_builder.call_args.kwargs + assert call_kwargs['source'] == 'spotify' + assert call_kwargs['album_name'] == 'X' + assert call_kwargs['album_artist'] == 'Y' diff --git a/tests/test_import_page_album_lookup_pattern.py b/tests/test_import_page_album_lookup_pattern.py new file mode 100644 index 00000000..54bf142d --- /dev/null +++ b/tests/test_import_page_album_lookup_pattern.py @@ -0,0 +1,112 @@ +"""Pin the import-page album-lookup cache pattern in +``webui/static/stats-automations.js`` — github issue #524 regression +guard at the source-text level. + +Why a structural test instead of a behavioral JS test: + +``stats-automations.js`` is a ~7k-line file with a lot of global state ++ inline DOM rendering. Loading it into a sandboxed Node `vm` context +(the pattern used in `tests/static/test_discover_section_controller.mjs`) +would require stubbing dozens of unrelated dependencies. The file +needs to be modularized before behavioral tests are practical for +arbitrary functions in it. + +Until then, this test fails the suite if the critical pattern from +the #524 fix gets removed: + +1. The album cache (``_albumLookup`` field on ``importPageState``) +2. Card renderers populating the cache before emitting the onclick +3. The match-POST builder reading source/name/artist from the cache + +If anyone deletes the cache, the click handler, or the cache writes, +this test catches it before the regression ships. +""" + +from __future__ import annotations + +import re +from pathlib import Path + +import pytest + + +_REPO_ROOT = Path(__file__).resolve().parents[1] +_SOURCE = _REPO_ROOT / "webui" / "static" / "stats-automations.js" + + +@pytest.fixture(scope="module") +def js_source() -> str: + return _SOURCE.read_text(encoding="utf-8") + + +def test_album_lookup_cache_field_exists_on_state(js_source: str): + """importPageState must have an `_albumLookup` field. Without it, + card renderers have nowhere to stash source/name/artist for the + click handler to read.""" + assert "_albumLookup:" in js_source, ( + "importPageState._albumLookup field missing — the album cache " + "that backs the source-routing fix for issue #524 has been " + "removed. The click handler will fall back to passing only " + "album_id and the backend will silently misroute lookups again." + ) + + +def test_select_album_handler_reads_cache(js_source: str): + """importPageSelectAlbum must read source / name / artist from + the cache and include them in the match POST body. The whole + point of the fix.""" + # Find the function body + match = re.search( + r"async function importPageSelectAlbum\([^)]*\) \{(.*?)^\}", + js_source, re.DOTALL | re.MULTILINE, + ) + assert match, "importPageSelectAlbum function not found" + body = match.group(1) + + # Must read from the lookup cache + assert "_albumLookup[" in body, ( + "importPageSelectAlbum no longer reads from " + "importPageState._albumLookup — match POST will drop source " + "again, see issue #524." + ) + + # Must build a matchBody that includes source + album_name + album_artist + for required_field in ("source:", "album_name:", "album_artist:"): + assert required_field in body, ( + f"matchBody missing required field {required_field!r}. " + "Backend's get_artist_album_tracks needs source to route " + "the lookup to the correct metadata client. Without it, " + "cross-source album_ids fall through to the failure-fallback " + "dict (Unknown Artist / album_id-as-title / 0 tracks). " + "See issue #524 for the original symptom." + ) + + +def test_card_renderers_populate_cache_before_onclick(js_source: str): + """Both renderers (suggestion card + search-result card) must write + to ``_albumLookup`` before emitting the onclick — otherwise the + click handler reads an empty cache for newly-displayed albums.""" + cache_writes = re.findall( + r"_albumLookup\[a\.id\]\s*=\s*\{", + js_source, + ) + assert len(cache_writes) >= 2, ( + f"Expected >=2 _albumLookup writes (one per card renderer - " + f"suggestions + search results), found {len(cache_writes)}. " + "Adding a new card-rendering site without populating the cache " + "regresses issue #524 for that path." + ) + + +def test_cache_entry_carries_source_field(js_source: str): + """The cache must store `source:` per entry — not just id/name/artist.""" + write_blocks = re.findall( + r"_albumLookup\[a\.id\]\s*=\s*\{[^}]*\}", + js_source, + ) + assert write_blocks, "no _albumLookup writes found" + assert any("source:" in block for block in write_blocks), ( + "_albumLookup cache entries must include `source` — that's the " + "field the click handler forwards to /api/import/album/match " + "to route the lookup to the correct provider." + )