From 78c6f09e136e50b628d030fcf78ac446c0cff33c Mon Sep 17 00:00:00 2001
From: BoulderBadgeDad
Date: Sat, 30 May 2026 19:08:03 -0700
Subject: [PATCH] Album picker #730: don't reject the right album over an edition suffix

Self-review found a false-negative in the title-relevance gate I just
added: it scored 'fraction of the ALBUM-NAME's words present in the
title', so a stored album name with an edition/remaster suffix the
torrent lacks ('Currents (Deluxe)', 'Heroes (2017 Remaster)') scored
BELOW the 0.6 floor and the correct release was wrongly refused -> fell
back to per-track. The very first issue example ('Heroes 2017 Remaster')
would have regressed.

Fix: strip edition/format/year NOISE words (deluxe, remaster, edition,
flac, years, bitrates, ...) before scoring, via _significant_words(),
with a fallback to the raw words so an album literally named '1989' or
'Deluxe' isn't emptied to match-everything. Verified both directions:
edition suffixes now KEPT, while the wrong-album rejection (Scary
Monsters for a Heroes request, Superheroes) still scores 0.

Tests: +2 regression tests (edition-suffix kept; noise/number-only album
name). 125 album-bundle/dispatch/plugin tests pass; compile + ruff clean.
---
 core/download_plugins/album_bundle.py | 40 ++++++++++++++++++++++-----
 tests/test_album_bundle.py            | 18 ++++++++++++
 2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/core/download_plugins/album_bundle.py b/core/download_plugins/album_bundle.py
index 7db3eac4..398a45dc 100644
--- a/core/download_plugins/album_bundle.py
+++ b/core/download_plugins/album_bundle.py
@@ -119,16 +119,42 @@ def _normalize_release_text(text: str) -> str:
     return re.sub(r"\s+", " ", cleaned).strip()
 
 
+# Edition / format / qualifier words that appear in stored album names or
+# release titles but say nothing about WHICH album it is. Stripped before
+# scoring so "Currents" matches "Currents (Deluxe)" and "Heroes" matches
+# "Heroes (2017 Remaster)" — the #730 fix must not reject the RIGHT album just
+# because the DB name carries an edition suffix the torrent title lacks.
+_ALBUM_NOISE_WORDS = frozenset({
+    "deluxe", "edition", "remaster", "remastered", "remasters", "remix",
+    "expanded", "anniversary", "bonus", "version", "explicit", "clean",
+    "reissue", "special", "limited", "collectors", "collector", "the",
+    "ep", "lp", "album", "single", "disc", "cd", "vol", "volume",
+    "flac", "mp3", "aac", "ogg", "wav", "alac", "m4a", "320", "256", "192",
+    "web", "vinyl", "hi", "res", "hires", "24bit", "16bit", "original",
+    "soundtrack", "ost",
+})
+
+
+def _significant_words(normalized: str) -> list:
+    """Words that actually identify an album: drop pure-digit tokens (years,
+    bitrates) and edition/format noise. Keeps at least the raw words if the
+    filter would empty it (e.g. an album literally named '1989' or 'Deluxe')."""
+    words = [w for w in normalized.split()
+             if w not in _ALBUM_NOISE_WORDS and not w.isdigit()]
+    return words or normalized.split()
+
+
 def album_title_relevance(candidate_title: str, album_name: str) -> float:
     """How well a release title matches the requested album, 0.0–1.0.
 
-    Word-coverage: the fraction of the album-name's words that appear as
-    whole words in the candidate title. Word-boundary (not substring) so
-    "Heroes" does NOT match "Superheroes", and a request for "Heroes" is not
-    satisfied by a different Bowie album whose title shares no words.
+    Scores the fraction of the album's SIGNIFICANT words (edition/format/year
+    noise removed) that appear as whole words in the candidate title.
+    Word-boundary, not substring, so "Heroes" does NOT match "Superheroes" and
+    a different album sharing no significant words scores 0 — while "Currents"
+    still matches "Currents (Deluxe)" and "Heroes" matches the "2017 Remaster".
 
-    Returns 1.0 when there's no album name to check against (can't gate on
-    nothing — preserves old behavior for callers that don't pass a title).
+    Returns 1.0 when there's no album name to check (can't gate on nothing —
+    preserves old behavior for callers that don't pass a title).
     """
     norm_album = _normalize_release_text(album_name)
     if not norm_album:
@@ -136,7 +162,7 @@ def album_title_relevance(candidate_title: str, album_name: str) -> float:
     norm_title = _normalize_release_text(candidate_title)
     if not norm_title:
         return 0.0
-    album_words = norm_album.split()
+    album_words = _significant_words(norm_album)
     title_words = set(norm_title.split())
     if not album_words:
         return 1.0
diff --git a/tests/test_album_bundle.py b/tests/test_album_bundle.py
index f693a506..41ee3982 100644
--- a/tests/test_album_bundle.py
+++ b/tests/test_album_bundle.py
@@ -153,6 +153,24 @@ def test_relevance_no_album_name_is_neutral():
     assert album_title_relevance("anything at all", "") == 1.0
 
 
+def test_relevance_ignores_edition_suffix_on_album_name():
+    """The RIGHT torrent must not be rejected just because the stored album
+    name carries an edition/remaster/format suffix the title lacks. (Caught in
+    review — the naive 'all album words' version wrongly rejected these.)"""
+    floor = 0.6
+    assert album_title_relevance("Tame Impala - Currents [FLAC]", "Currents (Deluxe)") >= floor
+    assert album_title_relevance("David Bowie - Heroes [FLAC]", "Heroes (2017 Remaster)") >= floor
+    assert album_title_relevance("Daft Punk - Discovery [FLAC]", "Discovery (Remastered Edition)") >= floor
+
+
+def test_relevance_album_named_only_with_noise_or_number():
+    # If the album name is JUST a noise/number word, don't strip it to nothing
+    # and match everything — keep the literal word.
+    assert album_title_relevance("Taylor Swift - 1989 [FLAC]", "1989") == 1.0
+    assert album_title_relevance("Taylor Swift - Red [FLAC]", "1989") == 0.0
+    assert album_title_relevance("Various - Deluxe [FLAC]", "Deluxe") == 1.0
+
+
 def test_picker_refuses_wrong_album_falls_back():
     """The #730 scenario: a hugely-popular WRONG album must NOT be picked over
     a less-popular RIGHT one — and if nothing matches, return None so the