You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
SoulSync/tests/imports/test_album_matching_helper.py

379 lines
16 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""Direct unit tests for ``core.imports.album_matching`` — the lifted
helper that powers ``AutoImportWorker._match_tracks``.
The original test file (``test_auto_import_multi_disc_matching.py``)
exercised the matching logic via the worker, requiring monkeypatches
on ``_read_file_tags`` + mocks on the metadata client. These tests
exercise the helper directly with dict inputs / dict outputs — no I/O,
no class instantiation, no patches.
Together with the worker-level tests, the helper has full behavior
coverage:
- Dedup: same-(disc, track) collapses, cross-disc preserves
- Match: per-component scoring, threshold, position weights, cross-disc
consolation, near-position bonus
- Edge cases: tag-less files (track_number=0), missing artist tags,
cross-disc collision when one side has no disc tag
"""
from __future__ import annotations
from difflib import SequenceMatcher
from core.imports.album_matching import (
ALBUM_WEIGHT,
ARTIST_WEIGHT,
CROSS_DISC_POSITION_WEIGHT,
MATCH_THRESHOLD,
NEAR_POSITION_WEIGHT,
POSITION_WEIGHT,
TITLE_WEIGHT,
dedupe_files_by_position,
match_files_to_tracks,
score_file_against_track,
)
# ---------------------------------------------------------------------------
# Stand-in similarity + quality_rank — match real worker behavior closely
# enough that test scores reflect production behavior.
# ---------------------------------------------------------------------------
def _sim(a: str, b: str) -> float:
    """Mirror of the worker's _similarity (lower-cased SequenceMatcher).

    Falsy inputs (``None`` or ``''``) are treated as the empty string, so
    the function never raises on a missing tag value.

    Returns:
        The ``SequenceMatcher.ratio()`` of the two lower-cased strings,
        a float in ``[0.0, 1.0]``.
    """
    left = (a or '').lower()
    right = (b or '').lower()
    return SequenceMatcher(None, left, right).ratio()
def _qrank(ext: str) -> int:
    """Mirror of the worker's _quality_rank.

    Args:
        ext: File extension including the leading dot (e.g. ``'.flac'``).
            Matching is case-insensitive; a falsy value is treated as ``''``.

    Returns:
        The rank for the extension (higher wins in dedup), or ``0`` when
        the extension is not in the table.
    """
    ranks = {
        '.flac': 100, '.alac': 95, '.wav': 80, '.aac': 60,
        '.ogg': 50, '.opus': 50, '.m4a': 60, '.mp3': 30, '.wma': 20,
    }
    return ranks.get((ext or '').lower(), 0)
def _tags(*, title='', artist='Artist', album='Album', track=0, disc=1):
    """Build a file-tag dict in the shape the matching helper is fed.

    All arguments are keyword-only. Note the defaults: ``artist`` and
    ``album`` are populated unless explicitly overridden, ``track``
    defaults to 0 and ``disc`` to 1. ``year`` is always the empty string.
    """
    return {
        'title': title,
        'artist': artist,
        'album': album,
        'track_number': track,
        'disc_number': disc,
        'year': '',
    }
# ---------------------------------------------------------------------------
# Constants — pin the weights so accidental tweaks fail at the test boundary
# ---------------------------------------------------------------------------
def test_constants_sum_to_one():
    """Sum of TITLE + ARTIST + POSITION + ALBUM should equal 1.0 in
    the happy case (perfect agreement). Catches accidental drift if
    someone edits one weight without checking the rest. A float
    tolerance is used because summing the weights can leave a tiny
    binary rounding error."""
    total = TITLE_WEIGHT + ARTIST_WEIGHT + POSITION_WEIGHT + ALBUM_WEIGHT
    assert abs(total - 1.0) < 1e-9
def test_match_threshold_requires_more_than_position_alone():
    """Pin the design intent: a file matching ONLY on position
    (perfect track + disc, zero title similarity) should NOT meet
    the threshold. The matcher requires at least some meaningful
    title agreement in addition to position. Catches accidental
    threshold drops that would let position-only matches sneak through."""
    assert MATCH_THRESHOLD > POSITION_WEIGHT
# ---------------------------------------------------------------------------
# dedupe_files_by_position — pure-function tests
# ---------------------------------------------------------------------------
def test_dedupe_keeps_higher_quality_at_same_position():
    """Two files at the same (disc, track) — the one with the higher
    quality rank (.flac over .mp3) is the only survivor."""
    files = ['/a/track1.mp3', '/a/track1.flac']
    file_tags = {
        '/a/track1.mp3': _tags(track=1, disc=1),
        '/a/track1.flac': _tags(track=1, disc=1),
    }
    result = dedupe_files_by_position(files, file_tags, quality_rank=_qrank)
    assert result == ['/a/track1.flac']
def test_dedupe_preserves_same_track_across_discs():
    """The fix for the multi-disc bug: track_number=1 on disc 1 and
    track_number=1 on disc 2 are different positions, both survive."""
    files = ['/a/d1t1.flac', '/a/d2t1.flac']
    file_tags = {
        '/a/d1t1.flac': _tags(track=1, disc=1),
        '/a/d2t1.flac': _tags(track=1, disc=2),
    }
    result = dedupe_files_by_position(files, file_tags, quality_rank=_qrank)
    # Order is not part of the contract being pinned here, so compare as sets.
    assert set(result) == {'/a/d1t1.flac', '/a/d2t1.flac'}
def test_dedupe_passes_through_files_with_no_track_number():
    """Files with track_number=0 (no tag) can't be deduped — keep them
    all so the matcher gets a chance to title-match them."""
    files = ['/a/no_tag_a.mp3', '/a/no_tag_b.mp3', '/a/no_tag_c.mp3']
    file_tags = {f: _tags(title='Untagged', track=0, disc=1) for f in files}
    result = dedupe_files_by_position(files, file_tags, quality_rank=_qrank)
    assert set(result) == set(files)
def test_dedupe_keeps_first_when_quality_equal():
    """Two files at same position, same quality — first one wins."""
    files = ['/a/first.flac', '/a/second.flac']
    file_tags = {
        '/a/first.flac': _tags(track=1, disc=1),
        '/a/second.flac': _tags(track=1, disc=1),
    }
    result = dedupe_files_by_position(files, file_tags, quality_rank=_qrank)
    assert result == ['/a/first.flac']
# ---------------------------------------------------------------------------
# score_file_against_track — per-component scoring
# ---------------------------------------------------------------------------
def test_score_perfect_agreement_equals_one():
    """Title + artist + (disc, track) + album all match → score = 1.0."""
    track = {
        'name': 'Song', 'track_number': 5, 'disc_number': 2,
        'artists': [{'name': 'Artist'}],
    }
    tags = _tags(title='Song', artist='Artist', album='Album', track=5, disc=2)
    score = score_file_against_track(
        '/a/file.flac', tags, track,
        target_album='Album', similarity=_sim,
    )
    assert abs(score - 1.0) < 0.001
def test_score_position_match_requires_both_disc_and_track():
    """Same track number, different disc → only CROSS_DISC bonus, not
    full POSITION bonus. This is the regression fix for multi-disc
    cross-collisions."""
    track = {'name': 'X', 'track_number': 6, 'disc_number': 1, 'artists': []}
    # File for disc 2 track 6 — same number, wrong disc
    tags = _tags(title='X', track=6, disc=2)
    score = score_file_against_track(
        '/a/file.flac', tags, track,
        target_album='', similarity=_sim,
    )
    # Title similarity 1.0 × TITLE_WEIGHT, plus the cross-disc consolation,
    # and nothing else (track has no artists, target_album is empty).
    expected = TITLE_WEIGHT + CROSS_DISC_POSITION_WEIGHT
    assert abs(score - expected) < 0.001
def test_score_near_position_only_when_same_disc():
    """Off-by-one track number gets NEAR_POSITION bonus, but ONLY when
    disc agrees. Cross-disc off-by-one gets nothing."""
    track = {'name': 'Y', 'track_number': 5, 'disc_number': 1, 'artists': []}

    same_disc = _tags(title='Y', track=6, disc=1)  # off by 1 on same disc
    score_same = score_file_against_track(
        '/a/f.flac', same_disc, track, target_album='', similarity=_sim,
    )
    expected_same = TITLE_WEIGHT + NEAR_POSITION_WEIGHT
    assert abs(score_same - expected_same) < 0.001

    diff_disc = _tags(title='Y', track=6, disc=2)  # off by 1, different disc
    score_diff = score_file_against_track(
        '/a/f.flac', diff_disc, track, target_album='', similarity=_sim,
    )
    # No position bonus at all (off-by-one + cross-disc)
    expected_diff = TITLE_WEIGHT
    assert abs(score_diff - expected_diff) < 0.001
def test_score_handles_missing_track_artist():
    """Track with no artists list — artist component just contributes 0."""
    track = {'name': 'Z', 'track_number': 1, 'disc_number': 1, 'artists': []}
    tags = _tags(title='Z', artist='Real Artist', track=1, disc=1)
    score = score_file_against_track(
        '/a/f.flac', tags, track, target_album='', similarity=_sim,
    )
    # Title similarity 1.0 × TITLE_WEIGHT + full position bonus;
    # no artist bonus and no album bonus.
    expected = TITLE_WEIGHT + POSITION_WEIGHT
    assert abs(score - expected) < 0.001
def test_score_handles_missing_file_artist():
    """File with no artist tag — same as missing track artist, no bonus."""
    track = {
        'name': 'Z', 'track_number': 1, 'disc_number': 1,
        'artists': [{'name': 'Artist'}],
    }
    tags = _tags(title='Z', artist='', track=1, disc=1)
    score = score_file_against_track(
        '/a/f.flac', tags, track, target_album='', similarity=_sim,
    )
    # Only title and position contribute.
    expected = TITLE_WEIGHT + POSITION_WEIGHT
    assert abs(score - expected) < 0.001
def test_score_disc_field_aliases():
    """API track disc number can come from disc_number / disk_number /
    discNumber depending on source. All three should be honored."""
    tags = _tags(title='X', track=1, disc=2)
    for disc_field in ('disc_number', 'disk_number', 'discNumber'):
        track = {'name': 'X', 'track_number': 1, disc_field: 2, 'artists': []}
        score = score_file_against_track(
            '/a/f.flac', tags, track, target_album='', similarity=_sim,
        )
        # Should get full POSITION bonus
        expected = TITLE_WEIGHT + POSITION_WEIGHT
        assert abs(score - expected) < 0.001, (
            f"Disc field '{disc_field}' should be recognised (score={score})"
        )
def test_score_filename_fallback_when_title_tag_missing():
    """File with no title tag falls back to the filename stem for the
    title-similarity comparison."""
    track = {'name': 'Filename Title', 'track_number': 0, 'artists': []}
    tags = _tags(title='', track=0, disc=1)
    score = score_file_against_track(
        '/a/Filename Title.flac', tags, track,
        target_album='', similarity=_sim,
    )
    # Title fallback gives perfect match → TITLE_WEIGHT
    assert abs(score - TITLE_WEIGHT) < 0.001
# ---------------------------------------------------------------------------
# match_files_to_tracks — end-to-end (still pure)
# ---------------------------------------------------------------------------
def test_match_pairs_files_to_correct_tracks():
    """Happy path — 3 files, 3 tracks, all match perfectly, and each
    file is paired with the track that shares its title and position."""
    files = ['/a/01.flac', '/a/02.flac', '/a/03.flac']
    file_tags = {
        '/a/01.flac': _tags(title='A', track=1, disc=1),
        '/a/02.flac': _tags(title='B', track=2, disc=1),
        '/a/03.flac': _tags(title='C', track=3, disc=1),
    }
    tracks = [
        {'name': 'A', 'track_number': 1, 'disc_number': 1, 'artists': [{'name': 'Artist'}]},
        {'name': 'B', 'track_number': 2, 'disc_number': 1, 'artists': [{'name': 'Artist'}]},
        {'name': 'C', 'track_number': 3, 'disc_number': 1, 'artists': [{'name': 'Artist'}]},
    ]
    result = match_files_to_tracks(
        files, file_tags, tracks,
        target_album='Album', similarity=_sim, quality_rank=_qrank,
    )
    assert len(result['matches']) == 3
    assert not result['unmatched_files']
    # Counting matches alone would pass even if files were shuffled across
    # tracks; pin the actual pairing the test name promises.
    by_track = {m['track']['name']: m['file'] for m in result['matches']}
    assert by_track == {
        'A': '/a/01.flac',
        'B': '/a/02.flac',
        'C': '/a/03.flac',
    }
def test_match_each_file_used_at_most_once():
    """Two tracks competing for the same file — only one wins, the
    other gets no match."""
    files = ['/a/only.flac']
    file_tags = {'/a/only.flac': _tags(title='Track Name', track=1, disc=1)}
    tracks = [
        {'name': 'Track Name', 'track_number': 1, 'disc_number': 1, 'artists': []},
        {'name': 'Track Name', 'track_number': 1, 'disc_number': 1, 'artists': []},  # dup
    ]
    result = match_files_to_tracks(
        files, file_tags, tracks,
        target_album='', similarity=_sim, quality_rank=_qrank,
    )
    assert len(result['matches']) == 1
    # The single match must be the only file that exists.
    assert result['matches'][0]['file'] == '/a/only.flac'
def test_match_below_threshold_files_left_unmatched():
    """File with weak title + no other signals should be left in
    unmatched_files, not force-matched."""
    files = ['/a/random.flac']
    file_tags = {'/a/random.flac': _tags(title='Totally Different', track=0, disc=1)}
    tracks = [
        {'name': 'Specific Track', 'track_number': 99, 'disc_number': 1, 'artists': []},
    ]
    result = match_files_to_tracks(
        files, file_tags, tracks,
        target_album='', similarity=_sim, quality_rank=_qrank,
    )
    assert not result['matches']
    assert result['unmatched_files'] == ['/a/random.flac']
# ---------------------------------------------------------------------------
# Edge case Cin would flag: tag-less file matching against multi-disc API
# ---------------------------------------------------------------------------
def test_tagless_file_matches_disc1_track_with_perfect_title():
    """User has a perfectly-named file with no title/track tags — the
    title in the filename matches the metadata title exactly. The
    matcher should still pair it correctly even though disc info is
    missing on the file side (defaults to disc 1).

    Note: despite the function name, the API track in this fixture sits
    on disc 2; the "disc1" refers to the file's defaulted disc value.
    """
    files = ['/a/Auntie Diaries.flac']
    file_tags = {
        # No title / track tags; disc left at the default of 1.
        '/a/Auntie Diaries.flac': _tags(title='', track=0, disc=1),
    }
    tracks = [
        {'name': 'Auntie Diaries', 'track_number': 6, 'disc_number': 2,
         'artists': [{'name': 'Kendrick Lamar'}]},
    ]
    result = match_files_to_tracks(
        files, file_tags, tracks,
        target_album='Mr. Morale & The Big Steppers',
        similarity=_sim, quality_rank=_qrank,
    )
    # Perfect title sim (1.0 × 0.45 = 0.45) > MATCH_THRESHOLD (0.4)
    # → file matches the track even with missing position info
    assert len(result['matches']) == 1
    assert result['matches'][0]['file'] == '/a/Auntie Diaries.flac'
def test_tagless_files_against_multidisc_album_partial_match():
    """Two tag-less files with strong filename titles (one matches a
    disc-1 track, one matches a disc-2 track). Both should match
    correctly via title — no disc info needed."""
    files = ['/a/Father Time.flac', '/a/Mother I Sober.flac']
    file_tags = {f: _tags(title='', track=0, disc=1) for f in files}
    tracks = [
        {'name': 'Father Time', 'track_number': 5, 'disc_number': 1, 'artists': []},
        {'name': 'Mother I Sober', 'track_number': 8, 'disc_number': 2, 'artists': []},
    ]
    result = match_files_to_tracks(
        files, file_tags, tracks,
        target_album='Mr. Morale', similarity=_sim, quality_rank=_qrank,
    )
    assert len(result['matches']) == 2
    # Index matches by track name so the pairing can be checked directly.
    by_track = {m['track']['name']: m['file'] for m in result['matches']}
    assert by_track['Father Time'] == '/a/Father Time.flac'
    assert by_track['Mother I Sober'] == '/a/Mother I Sober.flac'
def test_tagless_file_with_weak_title_unmatched_in_multidisc():
    """Edge case Cin would flag: file with no title or disc tag (so disc
    defaults to 1) and a weak filename title, against a disc-2-only API
    track. Pre-fix, the position bonus fired on track_number alone, so
    files like this would sneak matches via just track_number agreement.
    Post-fix, the cross-disc consolation (5%) plus weak title can fall
    below MATCH_THRESHOLD → file goes unmatched.

    This is the BEHAVIOR CHANGE worth pinning. For correctly-tagged
    files in multi-disc albums (the user's actual case) this is the
    right call. For users with weak tags this is a regression — they
    now have to rely on title or fix their tags."""
    files = ['/a/track06.flac']  # weak filename title, no title tag
    file_tags = {
        # Track number 6 is present; the disc value is the default of 1.
        '/a/track06.flac': _tags(title='', track=6, disc=1),
    }
    tracks = [
        # API has only this disc-2 track 6 — file's disc-1-track-6
        # signal would have fired full position bonus pre-fix
        {'name': 'Auntie Diaries', 'track_number': 6, 'disc_number': 2,
         'artists': [{'name': 'Kendrick Lamar'}]},
    ]
    result = match_files_to_tracks(
        files, file_tags, tracks,
        target_album='Mr. Morale', similarity=_sim, quality_rank=_qrank,
    )
    # Title sim "track06" vs "Auntie Diaries" is near zero (~0.10)
    # × 0.45 = ~0.045. Plus cross-disc 0.05 = ~0.095. Below 0.4
    # threshold → no match.
    assert not result['matches']
    assert result['unmatched_files'] == ['/a/track06.flac']