Match featured-artist tracks across discography completion

Discord-reported scenario: a single "Super Single" by Artist1 feat. Artist2 is also on Artist1's "Super Album". When the album is fully owned, Artist1's discography correctly shows the single as complete, but Artist2's discography (where the same track also appears as a single) shows it as missing. Two layers were needed for the fix. (1) Scanner: the Jellyfin/Emby path was keeping only ArtistItems[0], which is almost always equal to the album artist — so the distinguishing per-track credit was silently suppressed. It now joins every ArtistItems entry with "; " and stores the value when there are multiple credits OR when the single credit differs from the album artist. Plex's originalTitle already carries the full multi-artist tag, so Plex users benefit without needing the scanner change. (2) Scorer: _calculate_track_confidence now splits track_artist on the common multi-artist delimiters that real-world tags use (",", ";", "&", "feat.", "ft.", "featuring", "vs.", "x") and scores each piece independently against the search artist, taking the maximum, with the whole-string similarity as the floor. This never reduces a score — it is purely additive matching for previously missed featured-artist credits. Adds 12 regression tests covering the reported scenario, primary-artist back-compat, every delimiter variant (parametrized), no regression on exact match, and the scanner storing every ArtistItem. Existing Jellyfin-scanned rows keep their old single-artist value until the next library scan rewrites them; Plex rows benefit immediately on the next match without needing a rescan.
1 month ago · ddef904414
parent cbafd07009
commit ddef904414
2 changed files with 246 additions and 6 deletions
--- a/database/music_database.py
+++ b/database/music_database.py
@ -4944,15 +4944,29 @@ class MusicDatabase:
                plex_original = getattr(track_obj, 'originalTitle', None)
                if plex_original and plex_original.strip():
                    track_artist = plex_original.strip()
-                # Jellyfin/Emby: ArtistItems[0] is the track artist, may differ from album artist
+                # Jellyfin/Emby: store ALL ArtistItems, not just [0]. A track
+                # like "Super Single" by Artist1 feat. Artist2 has both names in
+                # ArtistItems; if we kept only the first, completion checks for
+                # Artist2's discography (where the same track also appears as a
+                # single) would never find this row in the library. Joining with
+                # "; " matches Jellyfin's own UI convention and lets the search
+                # path treat each name as a separate artist credit.
                if not track_artist and hasattr(track_obj, '_data'):
                    raw = getattr(track_obj, '_data', {}) or {}
                    artist_items = raw.get('ArtistItems', [])
                    if artist_items:
-                        jf_track_artist = artist_items[0].get('Name', '')
+                        jf_track_artist_names = [
+                            a.get('Name', '') for a in artist_items if a.get('Name')
+                        ]
+                        jf_track_artist = '; '.join(jf_track_artist_names)
                        album_artists = raw.get('AlbumArtists', [])
                        jf_album_artist = album_artists[0].get('Name', '') if album_artists else ''
-                        if jf_track_artist and jf_track_artist != jf_album_artist:
+                        # Store when the track has multiple artists OR when the
+                        # single-artist credit differs from the album artist.
+                        if jf_track_artist and (
+                            len(jf_track_artist_names) > 1
+                            or jf_track_artist != jf_album_artist
+                        ):
                            track_artist = jf_track_artist
                # Navidrome/Subsonic: artist attribute is per-track
                if not track_artist and hasattr(track_obj, 'artist') and isinstance(getattr(track_obj, 'artist', None), str):
@ -6288,13 +6302,33 @@ class MusicDatabase:
            # Lin-Manuel Miranda but "Where You Are" is performed by Christopher
            # Jackson). Score against tracks.track_artist too and take the better
            # match so playlist sync can find these.
+            #
+            # Featured artists: tracks with multiple credits ("Artist1, Artist2",
+            # "Artist1 feat. Artist2", "Artist1 & Artist2") split on common
+            # delimiters and score each piece independently. Without this, a
+            # discography completion check for Artist2 would miss a track stored
+            # in the library under Artist1's album with a "feat. Artist2" credit.
            db_track_artist = getattr(db_track, 'track_artist', None)
            if db_track_artist:
                db_track_artist_norm = self._normalize_for_comparison(db_track_artist)
-                artist_similarity = max(
-                    artist_similarity,
-                    self._string_similarity(search_artist_norm, db_track_artist_norm),
+                # Whole-string similarity first as the floor.
+                track_artist_sim = self._string_similarity(search_artist_norm, db_track_artist_norm)
+                # Then split on multi-artist delimiters and score each piece —
+                # Spotify's "feat.", plus "ft.", "featuring", "vs.", commas, semicolons,
+                # ampersands, and "x" between names all show up in real-world tags.
+                pieces = re.split(
+                    r'\s*(?:[;,&]|\bfeat\.?\b|\bft\.?\b|\bfeaturing\b|\bvs\.?\b|\bx\b)\s*',
+                    db_track_artist_norm,
+                    flags=re.IGNORECASE,
                )
+                for piece in pieces:
+                    piece = piece.strip()
+                    if not piece:
+                        continue
+                    piece_sim = self._string_similarity(search_artist_norm, piece)
+                    if piece_sim > track_artist_sim:
+                        track_artist_sim = piece_sim
+                artist_similarity = max(artist_similarity, track_artist_sim)
            
            # Also try with cleaned versions (removing parentheses, brackets, etc.)
            clean_search_title = self._clean_track_title_for_comparison(search_title)
--- a/tests/test_featured_artist_completion.py
+++ b/tests/test_featured_artist_completion.py
@ -0,0 +1,206 @@
+"""Regression tests for featured-artist track matching.
+
+Discord-reported scenario: a single "Super Single" by Artist1 feat.
+Artist2 also appears on the album "Super Album" (Artist1). When the
+album is fully owned, Artist1's discography shows the single as
+complete, but Artist2's discography (which lists the same track as
+their own single) shows it as missing — even though the same
+recording exists in the library under Artist1's album.
+
+Two layers of fix pinned by these tests:
+
+- Scanner: store ALL Jellyfin/Emby ArtistItems in tracks.track_artist
+  (joined with "; "), not just ArtistItems[0]. The first artist
+  often equals the album artist and used to suppress the row.
+- Scoring: split track_artist on common multi-artist delimiters
+  (",", ";", "&", "feat.", "ft.", "featuring", "vs.", "x") and
+  score each piece independently against the search artist.
+"""
+
+import sqlite3
+from pathlib import Path
+
+import pytest
+
+from database.music_database import DatabaseTrack, MusicDatabase
+
+
+@pytest.fixture
+def db_with_feat_track(tmp_path: Path):
+    """Build a real MusicDatabase with the featured-artist scenario.
+
+    "Super Single" with track_artist "Artist1; Artist2", stored under
+    Artist1's album. Mirrors what the Jellyfin scanner now writes when
+    a track has multiple ArtistItems.
+    """
+    db_path = tmp_path / "feat.db"
+    db = MusicDatabase(database_path=str(db_path))
+    conn = db._get_connection()
+    cursor = conn.cursor()
+    cursor.execute(
+        "INSERT INTO artists (id, name, server_source) VALUES (?, ?, ?)",
+        ("ar-1", "Artist1", "jellyfin"),
+    )
+    cursor.execute(
+        "INSERT INTO albums (id, artist_id, title, server_source) VALUES (?, ?, ?, ?)",
+        ("al-1", "ar-1", "Super Album", "jellyfin"),
+    )
+    cursor.execute(
+        """
+        INSERT INTO tracks (
+            id, album_id, artist_id, title, track_number, duration,
+            file_path, bitrate, server_source, track_artist
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        """,
+        ("tr-1", "al-1", "ar-1", "Super Single", 3, 200000,
+         "/m/super.mp3", 320, "jellyfin", "Artist1; Artist2"),
+    )
+    conn.commit()
+    conn.close()
+    return db
+
+
+# ---------------------------------------------------------------------------
+# Scoring: featured artist matches via split
+# ---------------------------------------------------------------------------
+
+
+def test_featured_artist_matches_via_track_artist_split(db_with_feat_track: MusicDatabase) -> None:
+    """The reported scenario: searching for the featured artist
+    (Artist2) finds the track stored under the primary artist's
+    album because track_artist contains both names."""
+    track, confidence = db_with_feat_track.check_track_exists(
+        title="Super Single",
+        artist="Artist2",
+        confidence_threshold=0.7,
+    )
+    assert track is not None
+    assert confidence >= 0.7
+
+
+def test_primary_artist_still_matches(db_with_feat_track: MusicDatabase) -> None:
+    """Backward compat: searching for the primary artist must still
+    work — the original album-artist match path is preserved."""
+    track, confidence = db_with_feat_track.check_track_exists(
+        title="Super Single",
+        artist="Artist1",
+        confidence_threshold=0.7,
+    )
+    assert track is not None
+    assert confidence >= 0.7
+
+
+@pytest.mark.parametrize("track_artist_value", [
+    "Artist1, Artist2",
+    "Artist1; Artist2",
+    "Artist1 & Artist2",
+    "Artist1 feat. Artist2",
+    "Artist1 ft. Artist2",
+    "Artist1 featuring Artist2",
+    "Artist1 vs. Artist2",
+    "Artist1 x Artist2",
+])
+def test_scoring_handles_common_multi_artist_separators(
+    db_with_feat_track: MusicDatabase, track_artist_value: str,
+) -> None:
+    """Score must find the featured artist regardless of which
+    delimiter the metadata source / tag uses."""
+    track = DatabaseTrack(
+        id="x", album_id="y", artist_id="z",
+        title="Super Single", track_number=1, duration=200000,
+        file_path="/m/x.mp3", bitrate=320,
+    )
+    track.artist_name = "Artist1"
+    track.track_artist = track_artist_value
+    conf = db_with_feat_track._calculate_track_confidence(
+        "Super Single", "Artist2", track,
+    )
+    assert conf >= 0.7, (
+        f"separator '{track_artist_value}' should still let Artist2 match"
+    )
+
+
+def test_split_does_not_inflate_score_beyond_whole_string_floor(
+    db_with_feat_track: MusicDatabase,
+) -> None:
+    """Splitting must only ADD to the score (best-of), never pull it
+    below the whole-string baseline. Same artist on both sides should
+    score 1.0 the same way it always did, with or without delimiters."""
+    track = DatabaseTrack(
+        id="x", album_id="y", artist_id="z",
+        title="Solo Song", track_number=1, duration=200000,
+        file_path="/m/x.mp3", bitrate=320,
+    )
+    track.artist_name = "Solo Artist"
+    track.track_artist = "Solo Artist"  # No delimiters at all
+    conf = db_with_feat_track._calculate_track_confidence(
+        "Solo Song", "Solo Artist", track,
+    )
+    assert conf >= 0.99, "exact-match score must not regress"
+
+
+# ---------------------------------------------------------------------------
+# Scanner: Jellyfin ArtistItems propagation
+# ---------------------------------------------------------------------------
+
+
+class _StubJellyfinTrack:
+    """Minimal stub mimicking JellyfinTrack: real attributes the scanner
+    reads (ratingKey, title, trackNumber, duration, path, bitRate) plus
+    the ``_data`` raw dict where ArtistItems live."""
+    def __init__(self, track_id, title, track_artists, album_artist,
+                 track_number=1, duration=200000, file_path="/m/x.mp3",
+                 bit_rate=320):
+        self.ratingKey = track_id
+        self.title = title
+        self.trackNumber = track_number
+        self.duration = duration
+        self.path = file_path
+        self.bitRate = bit_rate
+        self._data = {
+            'ArtistItems': [{'Name': n} for n in track_artists],
+            'AlbumArtists': [{'Name': album_artist}],
+        }
+
+
+def test_jellyfin_scanner_stores_all_track_artists(tmp_path: Path) -> None:
+    """The scanner must persist EVERY name from ArtistItems, not just
+    the first. Pre-fix the scanner kept only [0] which was usually
+    equal to the album artist, so nothing distinguishing was stored.
+    """
+    db = MusicDatabase(database_path=str(tmp_path / "scan.db"))
+    conn = db._get_connection()
+    cursor = conn.cursor()
+
+    # Seed the artist + album the track will hang off
+    cursor.execute(
+        "INSERT INTO artists (id, name, server_source) VALUES (?, ?, ?)",
+        ("ar-1", "Artist1", "jellyfin"),
+    )
+    cursor.execute(
+        "INSERT INTO albums (id, artist_id, title, server_source) VALUES (?, ?, ?, ?)",
+        ("al-1", "ar-1", "Super Album", "jellyfin"),
+    )
+    conn.commit()
+    conn.close()
+
+    track_obj = _StubJellyfinTrack(
+        track_id="tr-1",
+        title="Super Single",
+        track_artists=["Artist1", "Artist2"],
+        album_artist="Artist1",
+    )
+    db.insert_or_update_media_track(
+        track_obj, album_id="al-1", artist_id="ar-1", server_source="jellyfin",
+    )
+
+    conn = db._get_connection()
+    cursor = conn.cursor()
+    cursor.execute("SELECT track_artist FROM tracks WHERE id = ?", ("tr-1",))
+    row = cursor.fetchone()
+    conn.close()
+    assert row is not None
+    assert row[0] is not None, "scanner should not drop multi-artist track credits"
+    assert "Artist2" in row[0], (
+        f"track_artist must contain every ArtistItem — got {row[0]!r}"
+    )