# Matches one parenthetical or bracketed qualifier, e.g. "(Live)" or
# "[Remastered]"; group 1 is the text between the delimiters.
_QUALIFIER_RE = re.compile(r"[\(\[]([^\)\]]*)[\)\]]")


def strip_redundant_context_qualifiers(title: str, *context_texts: str) -> str:
    """Remove parenthetical/bracket qualifiers that merely restate known context.

    A qualifier whose text appears (word-bounded) in one of ``context_texts``
    — typically the release's album title, or the other side of a comparison —
    is album context, not a version difference. #808: the wishlist held
    'Champagne Supernova (OurVinyl Sessions)' while the library track was the
    bare 'Champagne Supernova' on the album '… (OurVinyl Sessions)'; the
    qualifier restated the album, but the length-ratio penalty treated the
    pair as different songs and the cleanup never recognised the owned
    edition. Version markers that do NOT appear in any context ('(Live)',
    '(Remix)' on a studio album) are kept, so their mismatch penalty stands.

    Matching is case-insensitive (both sides are casefolded) and tolerant of
    differing whitespace runs between the qualifier's words.

    Args:
        title: The track title to clean.
        *context_texts: Strings the qualifier may be restating. Empty or
            ``None`` entries are ignored.

    Returns:
        ``title`` unchanged when it is empty or no usable context is given;
        otherwise the title with confirmed (and empty) qualifiers removed,
        whitespace runs collapsed to single spaces, and the ends stripped.
        The result may be an empty string if the title consisted only of
        confirmed qualifiers.
    """
    if not title:
        return title

    contexts = [c.casefold() for c in context_texts if c]
    if not contexts:
        return title

    def _drop(match: re.Match) -> str:
        inner = match.group(1).strip().casefold()
        if not inner:
            # "()" / "[]" carry no information at all — always drop them.
            return " "
        # Lookarounds instead of \b: \b only matches next to a word
        # character, so a qualifier that starts or ends with punctuation
        # ("deluxe ed.", "#1 hits") could never be confirmed even when it
        # appears verbatim in the context. (?<!\w)/(?!\w) is identical for
        # word-edged qualifiers and still rejects 'live' inside 'alive'.
        # Words are joined with \s+ so spacing differences do not matter.
        words = r"\s+".join(re.escape(word) for word in inner.split())
        bounded = re.compile(r"(?<!\w)" + words + r"(?!\w)")
        if any(bounded.search(ctx) for ctx in contexts):
            return " "
        return match.group(0)

    out = _QUALIFIER_RE.sub(_drop, title)
    return re.sub(r"\s+", " ", out).strip()
"""#808: parenthetical qualifiers that restate album context must not block
library-presence matching.

carlosjfcasero's case: the wishlist held 'Champagne Supernova (OurVinyl
Sessions)' (Deezer/iTunes title) while the library track was on the album
'Champagne Supernova (OurVinyl Sessions)'. When one side's title carries the
qualifier and the other doesn't, the length-ratio penalty crushed the pair to
~0.17 — wishlist cleanup never recognised the owned edition and the track
re-appeared every cycle. The qualifier appearing in the (db) album title
proves it's album context, not a different version.
"""

from __future__ import annotations

import os

import pytest

from core.text.title_match import strip_redundant_context_qualifiers
from database.music_database import MusicDatabase


# ── the pure helper ──────────────────────────────────────────────────────────

def test_qualifier_confirmed_by_album_is_stripped():
    """A qualifier repeated in the db album title is removed from the title."""
    track_title = 'champagne supernova (ourvinyl sessions)'
    db_album_title = 'champagne supernova (ourvinyl sessions)'
    stripped = strip_redundant_context_qualifiers(track_title, db_album_title)
    assert stripped == 'champagne supernova'


def test_version_marker_on_unrelated_album_is_kept():
    """Markers that the album title does not mention survive untouched."""
    unconfirmed_pairs = (
        ('song (live)', 'studio album'),
        ('song (remix)', 'the album'),
    )
    for title, album in unconfirmed_pairs:
        assert strip_redundant_context_qualifiers(title, album) == title


def test_version_marker_confirmed_by_album_is_stripped():
    """Owning 'Song (Live)' on the album 'Live at Wembley' IS owning that cut."""
    stripped = strip_redundant_context_qualifiers('song (live)', 'live at wembley')
    assert stripped == 'song'


def test_word_boundary_containment():
    """'live' inside 'alive' must NOT count as context confirmation."""
    title = 'song (live)'
    assert strip_redundant_context_qualifiers(title, 'alive and well') == title


def test_no_context_or_title_untouched():
    """No qualifier, an empty title, or no context each leave the input as is."""
    no_qualifier = strip_redundant_context_qualifiers('plain title', 'anything')
    empty_title = strip_redundant_context_qualifiers('', 'ctx')
    no_context = strip_redundant_context_qualifiers('song (x)')
    assert no_qualifier == 'plain title'
    assert empty_title == ''
    assert no_context == 'song (x)'


# ── end to end through check_track_exists (the wishlist-cleanup contract) ────

@pytest.fixture()
def lib_db(tmp_path):
    """Build a throwaway library: one artist, two albums, one track on each."""
    database = MusicDatabase(str(tmp_path / 'm.db'))
    connection = database._get_connection()
    cursor = connection.cursor()
    cursor.execute("INSERT INTO artists (id, name, server_source) VALUES ('a1', 'Jillette Johnson', 'plex')")
    # The #808 shape: bare track title, qualifier only on the album title.
    cursor.execute("""INSERT INTO albums (id, title, artist_id, server_source)
                 VALUES ('al1', 'Champagne Supernova (OurVinyl Sessions)', 'a1', 'plex')""")
    cursor.execute("""INSERT INTO tracks (id, album_id, artist_id, title, file_path, server_source)
                 VALUES ('t1', 'al1', 'a1', 'Champagne Supernova', '/m/cs.mp3', 'plex')""")
    # Version-safety control: a live cut on a studio-named album.
    cursor.execute("""INSERT INTO albums (id, title, artist_id, server_source)
                 VALUES ('al2', 'Water In A Whale', 'a1', 'plex')""")
    cursor.execute("""INSERT INTO tracks (id, album_id, artist_id, title, file_path, server_source)
                 VALUES ('t2', 'al2', 'a1', 'Cameron', '/m/c.mp3', 'plex')""")
    connection.commit()
    connection.close()
    return database


def test_808_qualified_search_matches_bare_library_track(lib_db):
    """The reported direction: source/wishlist title carries the qualifier,
    library title is bare, the library ALBUM carries the qualifier."""
    found, confidence = lib_db.check_track_exists(
        'Champagne Supernova (OurVinyl Sessions)',
        'Jillette Johnson',
        confidence_threshold=0.7,
        server_source='plex',
        album='Jillette Johnson | OurVinyl Sessions',
    )
    assert found is not None
    assert confidence >= 0.7


def test_version_marker_still_blocks_without_album_confirmation(lib_db):
    """'Cameron (Live)' must NOT match the studio 'Cameron' — the qualifier
    appears in no album context, so the mismatch penalty stands."""
    _found, confidence = lib_db.check_track_exists(
        'Cameron (Live)',
        'Jillette Johnson',
        confidence_threshold=0.7,
        server_source='plex',
    )
    assert confidence < 0.7


def test_different_song_prefix_still_blocked(lib_db):
    """'Champagne' alone is a different (hypothetical) song — the length
    penalty on the reduced forms still applies."""
    _found, confidence = lib_db.check_track_exists(
        'Champagne',
        'Jillette Johnson',
        confidence_threshold=0.7,
        server_source='plex',
    )
    assert confidence < 0.7
Marking + # every track unowned off a failed ALBUM-name comparison is wrong — + # fall back to artist-wide title matching, which is exactly the + # pre-album-aware behavior and still holds the 0.7 title bar. + if not album_entries: + album_entries = other_entries else: other_entries = db_title_entries