#808: album-context qualifiers stop blocking library-presence matching

carlosjfcasero: 'Champagne Supernova (OurVinyl Sessions)' is in the library
but the artist page shows it unowned and wishlist cleanup never removes it.
Measured with the real catalogs: Deezer/iTunes title the TRACK with the
qualifier while the library track is bare (the qualifier lives in the album
title) — and _calculate_track_confidence crushed that pair to ~0.17: the
"clean" titles keep parenthetical words, so the length-ratio penalty treats
'Champagne Supernova' vs 'Champagne Supernova (OurVinyl Sessions)' as
different songs. (Also confirmed: the OurVinyl release is absent from
Deezer's discography for the artist, so the standard page's 25-release list
not showing it is the source catalog, not a bug.)

Fix 1 — core.text.title_match.strip_redundant_context_qualifiers: a
parenthetical qualifier whose text appears (word-bounded) in the db track's
ALBUM title — or in the other title — restates release context and is
stripped for a comparison variant scored with its own length guard. Genuine
version markers keep their penalty: '(Live)' on a studio album appears in no
context and still blocks; '(Live)' on 'Live at Wembley' correctly matches —
owning the live album IS owning the live cut. Wired into
_calculate_track_confidence, so every check_track_exists consumer (wishlist
cleanup, discography dedup, repair jobs) benefits.

Fix 2 — the artist-page ownership endpoint's album gate: when album-aware
narrowing eliminates EVERY library candidate (the source's album naming just
doesn't resemble the library's — 'Jillette Johnson | OurVinyl Sessions' vs
'Champagne Supernova (OurVinyl Sessions)' ~0.5), fall back to artist-wide
title matching instead of declaring everything unowned off a failed
album-NAME comparison.

Tests: 8 — the exact reported pair end-to-end through check_track_exists,
word-boundary containment ('live' in 'alive' doesn't count), version-marker
safety both ways, and prefix songs still blocked. 1125 matching/wishlist/
library tests pass.
pull/812/head
BoulderBadgeDad 5 days ago
parent 157d19f3b9
commit f250eaa228

@ -78,4 +78,41 @@ def titles_plausibly_same(
return not ta.isdisjoint(tb)
__all__ = ["titles_plausibly_same"]
# One parenthesised or bracketed group; group(1) is the text between the
# delimiters (nested delimiters are not supported).
_QUALIFIER_RE = re.compile(r"[\(\[]([^\)\]]*)[\)\]]")


def strip_redundant_context_qualifiers(title: str, *context_texts: str) -> str:
    """Remove parenthetical/bracket qualifiers that merely restate known context.

    A qualifier whose text appears as a whole phrase in one of
    ``context_texts`` — typically the release's album title, or the other
    side of a comparison — is album context, not a version difference.
    #808: the wishlist held 'Champagne Supernova (OurVinyl Sessions)' while
    the library track was the bare 'Champagne Supernova' on the album
    '… (OurVinyl Sessions)'; the qualifier restated the album, but the
    length-ratio penalty treated the pair as different songs and the cleanup
    never recognised the owned edition. Version markers that do NOT appear in
    any context ('(Live)', '(Remix)' on a studio album) are kept, so their
    mismatch penalty stands.

    Args:
        title: The title to clean. Falsy values are returned unchanged.
        *context_texts: Strings that may confirm a qualifier. Falsy entries
            are ignored; comparison is case-insensitive (casefolded).

    Returns:
        ``title`` unchanged when it is empty or no usable context is given.
        Otherwise the title with confirmed (and empty) qualifiers removed,
        whitespace runs collapsed to single spaces, and the ends stripped.
    """
    if not title:
        return title
    contexts = [c.casefold() for c in context_texts if c]
    if not contexts:
        return title

    def _drop(match: re.Match) -> str:
        inner = match.group(1).strip().casefold()
        if not inner:
            # Empty '()' / '[]' carries no information; always drop it.
            return " "
        # Lookarounds instead of \b: \b needs a word character on one side,
        # so a qualifier that starts or ends with punctuation
        # ('from "frozen"', 'live!') could never be confirmed by a context
        # ending in that same text. For qualifiers with alphanumeric edges
        # the two forms are equivalent ('live' inside 'alive' is rejected).
        pattern = re.compile(r"(?<!\w)" + re.escape(inner) + r"(?!\w)")
        if any(pattern.search(ctx) for ctx in contexts):
            return " "
        return match.group(0)

    out = _QUALIFIER_RE.sub(_drop, title)
    # Dropped qualifiers leave a space behind; tidy the result.
    return re.sub(r"\s+", " ", out).strip()
__all__ = ["titles_plausibly_same", "strip_redundant_context_qualifiers"]

@ -7539,6 +7539,31 @@ class MusicDatabase:
# Titles differ in length by more than 30% — penalize heavily
best_title_similarity *= len_ratio
# #808: a parenthetical qualifier that merely RESTATES the release
# context is album context, not a version difference. Wishlist
# title 'Champagne Supernova (OurVinyl Sessions)' vs the library's
# bare 'Champagne Supernova' on the album '… (OurVinyl Sessions)':
# the qualifier appears in the album title, yet the length-ratio
# penalty above crushed the pair to ~0.17 and wishlist cleanup
# never recognised the owned edition. Strip qualifiers confirmed
# by the db album title (or by the other title) and score that
# variant with its OWN length guard — genuine version markers
# ('(Live)' on a studio album) appear in no context, keep their
# qualifier, and keep their penalty.
db_album_norm = self._normalize_for_comparison(
getattr(db_track, 'album_title', '') or '')
from core.text.title_match import strip_redundant_context_qualifiers
ctx_search = strip_redundant_context_qualifiers(
search_title_norm, db_album_norm, db_title_norm)
ctx_db = strip_redundant_context_qualifiers(
db_title_norm, db_album_norm, search_title_norm)
if (ctx_search, ctx_db) != (search_title_norm, db_title_norm) and ctx_search and ctx_db:
ctx_sim = self._string_similarity(ctx_search, ctx_db)
ctx_ratio = min(len(ctx_search), len(ctx_db)) / max(len(ctx_search), len(ctx_db))
if ctx_ratio < 0.7:
ctx_sim *= ctx_ratio # 'Believe' vs 'Believe In Me' still penalised
best_title_similarity = max(best_title_similarity, ctx_sim)
# Word-level guard: SequenceMatcher's char ratio over-credits
# different songs that share a long substring or only a stopword
# ("Dani California" vs "Californication" = 0.67; "Under The Bridge"

@ -0,0 +1,104 @@
"""#808: parenthetical qualifiers that restate album context must not block
library-presence matching.
carlosjfcasero's case: the wishlist held 'Champagne Supernova (OurVinyl
Sessions)' (Deezer/iTunes title) while the library track was the bare
'Champagne Supernova' on the album 'Champagne Supernova (OurVinyl Sessions)'.
When one side's title carries the qualifier and the other doesn't, the
length-ratio penalty crushed the pair to ~0.17, so wishlist cleanup never
recognised the owned edition and the track re-appeared every cycle. The
qualifier appearing in the (db) album title proves it's album context, not a
different version.
"""
from __future__ import annotations
import os
import pytest
from core.text.title_match import strip_redundant_context_qualifiers
from database.music_database import MusicDatabase
# ── the pure helper ──────────────────────────────────────────────────────────
def test_qualifier_confirmed_by_album_is_stripped():
    """A qualifier repeated in the db album title is removed from the title."""
    qualified_title = 'champagne supernova (ourvinyl sessions)'
    db_album_title = 'champagne supernova (ourvinyl sessions)'
    cleaned = strip_redundant_context_qualifiers(qualified_title, db_album_title)
    assert cleaned == 'champagne supernova'
def test_version_marker_on_unrelated_album_is_kept():
    """A marker the album title does not mention must survive untouched."""
    unconfirmed = (
        ('song (live)', 'studio album'),
        ('song (remix)', 'the album'),
    )
    for title, album in unconfirmed:
        assert strip_redundant_context_qualifiers(title, album) == title
def test_version_marker_confirmed_by_album_is_stripped():
    """'(live)' is dropped when the album title itself says 'live'."""
    # Owning 'Song (Live)' on the album 'Live at Wembley' IS owning that cut.
    cleaned = strip_redundant_context_qualifiers('song (live)', 'live at wembley')
    assert cleaned == 'song'
def test_word_boundary_containment():
    """Substring hits inside a longer word do not confirm a qualifier."""
    # 'live' inside 'alive' must NOT count as context confirmation.
    title = 'song (live)'
    cleaned = strip_redundant_context_qualifiers(title, 'alive and well')
    assert cleaned == 'song (live)'
def test_no_context_or_title_untouched():
    """No qualifier, empty title, or no context: the input comes back as-is."""
    passthrough_cases = [
        (('plain title', 'anything'), 'plain title'),
        (('', 'ctx'), ''),
        (('song (x)',), 'song (x)'),
    ]
    for call_args, expected in passthrough_cases:
        assert strip_redundant_context_qualifiers(*call_args) == expected
# ── end to end through check_track_exists (the wishlist-cleanup contract) ────
@pytest.fixture()
def lib_db(tmp_path):
    """Build a throwaway MusicDatabase seeded with one artist and two albums.

    Rows inserted (all with server_source 'plex'):
      * artist 'Jillette Johnson'
      * album 'Champagne Supernova (OurVinyl Sessions)' holding the bare
        track 'Champagne Supernova' — the #808 shape
      * album 'Water In A Whale' holding the track 'Cameron' — the
        version-safety control: neither its title nor its album mentions
        'live', so a '(Live)' search against it has no context confirmation
    """
    db = MusicDatabase(str(tmp_path / 'm.db'))
    # Order matters: artist first, then each album followed by its track.
    seed_statements = (
        "INSERT INTO artists (id, name, server_source) VALUES ('a1', 'Jillette Johnson', 'plex')",
        """INSERT INTO albums (id, title, artist_id, server_source)
VALUES ('al1', 'Champagne Supernova (OurVinyl Sessions)', 'a1', 'plex')""",
        """INSERT INTO tracks (id, album_id, artist_id, title, file_path, server_source)
VALUES ('t1', 'al1', 'a1', 'Champagne Supernova', '/m/cs.mp3', 'plex')""",
        """INSERT INTO albums (id, title, artist_id, server_source)
VALUES ('al2', 'Water In A Whale', 'a1', 'plex')""",
        """INSERT INTO tracks (id, album_id, artist_id, title, file_path, server_source)
VALUES ('t2', 'al2', 'a1', 'Cameron', '/m/c.mp3', 'plex')""",
    )
    connection = db._get_connection()
    cursor = connection.cursor()
    for statement in seed_statements:
        cursor.execute(statement)
    connection.commit()
    connection.close()
    return db
def test_808_qualified_search_matches_bare_library_track(lib_db):
    """The reported direction end to end.

    The searched (source/wishlist) title carries the qualifier, the library
    title is bare, and the library ALBUM carries the qualifier; the lookup
    must still find the track at or above the threshold.
    """
    found, confidence = lib_db.check_track_exists(
        'Champagne Supernova (OurVinyl Sessions)',
        'Jillette Johnson',
        confidence_threshold=0.7,
        server_source='plex',
        album='Jillette Johnson | OurVinyl Sessions',
    )
    assert found is not None
    assert confidence >= 0.7
def test_version_marker_still_blocks_without_album_confirmation(lib_db):
    """'Cameron (Live)' must NOT match the library's plain 'Cameron'.

    The qualifier appears in no album context, so the mismatch penalty
    stands and the confidence stays under the threshold.
    """
    lookup = lib_db.check_track_exists(
        'Cameron (Live)',
        'Jillette Johnson',
        confidence_threshold=0.7,
        server_source='plex',
    )
    _found, confidence = lookup
    assert confidence < 0.7
def test_different_song_prefix_still_blocked(lib_db):
    """A bare prefix is treated as a different song.

    'Champagne' alone is a different (hypothetical) song — the length
    penalty on the reduced forms still applies.
    """
    lookup = lib_db.check_track_exists(
        'Champagne',
        'Jillette Johnson',
        confidence_threshold=0.7,
        server_source='plex',
    )
    _found, confidence = lookup
    assert confidence < 0.7

@ -9201,6 +9201,15 @@ def library_check_tracks():
album_entries.append(entry)
else:
other_entries.append(entry)
# #808: when the album gate narrows to NOTHING, the source's album
# naming simply doesn't resemble the library's (Deezer's
# 'Jillette Johnson | OurVinyl Sessions' vs the library's
# 'Champagne Supernova (OurVinyl Sessions)' scores ~0.5). Marking
# every track unowned off a failed ALBUM-name comparison is wrong —
# fall back to artist-wide title matching, which is exactly the
# pre-album-aware behavior and still holds the 0.7 title bar.
if not album_entries:
album_entries = other_entries
else:
other_entries = db_title_entries

Loading…
Cancel
Save