From e77dbf3f1ef646b0f68345d39367446e78d96030 Mon Sep 17 00:00:00 2001
From: Broque Thomas <batmanesignal@gmail.com>
Date: Wed, 25 Feb 2026 22:43:33 -0800
Subject: [PATCH] Refactor matching + use improved discovery scoring

Introduce a generic score_track_match(...) in core/matching_engine.py and make calculate_match_confidence(...) delegate to it. The new scorer is source-agnostic, consolidates artist/title/duration logic (core-title fast path, cleaned similarity, weighted 60/30/10 scoring) and improves artist matching.

In web_server.py add cache-validation (_validate_discovery_cache_artist) and a reusable _discovery_score_candidates(...) helper that calls the new scorer. Propagate per-match confidence through discovery flows (Tidal, YouTube, ListenBrainz, Beatport), increase Spotify/iTunes search limits, add an extended high-limit search strategy, tighten per-source thresholds, and save match confidence to the discovery cache. Overall this centralizes and standardizes matching logic and improves accuracy/validation for cached discovery results.
---
 core/matching_engine.py |  84 ++--
 web_server.py           | 849 +++++++++++++++++-----------------------
 2 files changed, 410 insertions(+), 523 deletions(-)

diff --git a/core/matching_engine.py b/core/matching_engine.py
index 75a95336..bc7ec1ed 100644
--- a/core/matching_engine.py
+++ b/core/matching_engine.py
@@ -238,49 +238,67 @@ class MusicMatchingEngine:
         diff_ratio = abs(duration1 - duration2) / max(duration1, duration2)
         return max(0, 1.0 - diff_ratio * 5)
 
-    def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]:
-        """Calculates a confidence score using a prioritized model, starting with a strict 'core' title check."""
-        
-        # --- Artist Scoring (calculated once) ---
-        spotify_artists_cleaned = [self.clean_artist(a) for a in spotify_track.artists if a]
-        plex_artist_normalized = self.normalize_string(plex_track.artist)
-        plex_artist_cleaned = self.clean_artist(plex_track.artist)
+    def score_track_match(self, source_title: str, source_artists: List[str],
+                          source_duration_ms: int, candidate_title: str,
+                          candidate_artists: List[str], candidate_duration_ms: int) -> Tuple[float, str]:
+        """Generic, type-agnostic track matching; calculate_match_confidence delegates to this method.
+
+        Works for any two tracks regardless of source (Spotify, iTunes, YouTube, Tidal, etc.).
+        Uses clean_title/clean_artist for proper feat. stripping, core title fast path,
+        duration similarity, and 60/30/10 weighted scoring.
+
+        Returns (confidence, match_type) tuple.
+        """
+        # --- Artist Scoring ---
+        source_artists_cleaned = [self.clean_artist(a) for a in source_artists if a]
 
         best_artist_score = 0.0
-        for spotify_artist in spotify_artists_cleaned:
-            if spotify_artist and spotify_artist in plex_artist_normalized:
-                best_artist_score = 1.0
+        for src_artist in source_artists_cleaned:
+            for raw_cand_artist in candidate_artists:
+                if not raw_cand_artist:
+                    continue
+                cand_artist_normalized = self.normalize_string(raw_cand_artist)
+                cand_artist_cleaned = self.clean_artist(raw_cand_artist)
+                # Check containment (e.g., "drake" in "drake 21 savage")
+                if src_artist and src_artist in cand_artist_normalized:
+                    best_artist_score = 1.0
+                    break
+                score = self.similarity_score(src_artist, cand_artist_cleaned)
+                if score > best_artist_score:
+                    best_artist_score = score
+            if best_artist_score >= 1.0:
                 break
-            score = self.similarity_score(spotify_artist, plex_artist_cleaned)
-            if score > best_artist_score:
-                best_artist_score = score
         artist_score = best_artist_score
-        
-        # --- Priority 1: Core Title Match (for exact matches like "Girls", "APT.", "LIL DEMON") ---
-        spotify_core_title = self.get_core_string(spotify_track.name)
-        plex_core_title = self.get_core_string(plex_track.title)
 
-        if spotify_core_title and spotify_core_title == plex_core_title:
-            # SAFETY CHECK: Only give high confidence if artist also matches reasonably well
-            # This prevents "Artist A - Girls" from matching "Artist Z - Girls" with high confidence
-            if artist_score >= 0.75:  # Require decent artist match
-                # If the core titles are identical and artists match, we are highly confident
-                confidence = 0.90 + (artist_score * 0.09) # Max score of 0.99
+        # --- Priority 1: Core Title Match ---
+        source_core_title = self.get_core_string(source_title)
+        candidate_core_title = self.get_core_string(candidate_title)
+
+        if source_core_title and source_core_title == candidate_core_title:
+            if artist_score >= 0.75:
+                confidence = 0.90 + (artist_score * 0.09)
                 return confidence, "core_title_match"
-            # If artist score is too low, fall through to standard weighted calculation
 
-        # --- Priority 2: Fuzzy Title Match (for variations, typos, etc.) ---
-        spotify_title_cleaned = self.clean_title(spotify_track.name)
-        plex_title_cleaned = self.clean_title(plex_track.title)
-        
-        title_score = self.similarity_score(spotify_title_cleaned, plex_title_cleaned)
-        duration_score = self.duration_similarity(spotify_track.duration_ms, plex_track.duration if plex_track.duration else 0)
+        # --- Priority 2: Fuzzy Title Match ---
+        source_title_cleaned = self.clean_title(source_title)
+        candidate_title_cleaned = self.clean_title(candidate_title)
+
+        title_score = self.similarity_score(source_title_cleaned, candidate_title_cleaned)
+        duration_score = self.duration_similarity(source_duration_ms, candidate_duration_ms)
 
-        # Use a standard weighted calculation if the core titles didn't match
         confidence = (title_score * 0.60) + (artist_score * 0.30) + (duration_score * 0.10)
-        match_type = "standard_match"
+        return confidence, "standard_match"
 
-        return confidence, match_type
+    def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]:
+        """Calculates a confidence score using a prioritized model, starting with a strict 'core' title check."""
+        return self.score_track_match(
+            source_title=spotify_track.name,
+            source_artists=spotify_track.artists,
+            source_duration_ms=spotify_track.duration_ms,
+            candidate_title=plex_track.title,
+            candidate_artists=[plex_track.artist] if plex_track.artist else [],
+            candidate_duration_ms=plex_track.duration if plex_track.duration else 0
+        )
     
     def find_best_match(self, spotify_track: SpotifyTrack, plex_tracks: List[PlexTrackInfo]) -> MatchResult:
         """Finds the best Plex track match from a list of candidates."""
diff --git a/web_server.py b/web_server.py
index a3926db7..80ebb146 100644
--- a/web_server.py
+++ b/web_server.py
@@ -18075,6 +18075,107 @@ def _get_discovery_cache_key(title, artist):
     return (norm_title, norm_artist)
 
 
+def _validate_discovery_cache_artist(source_artist, cached_match):
+    """Check if a cached discovery match has a valid artist. Returns False if the
+    cached result's artist doesn't match the source artist (stale/wrong cache entry)."""
+    min_artist_similarity = 0.4
+    source_artist_cleaned = matching_engine.clean_artist(source_artist)
+    if not source_artist_cleaned:
+        return True  # No source artist to validate against
+
+    cached_artists = cached_match.get('artists', [])
+    if not cached_artists:
+        return True  # No cached artist to check
+
+    best_sim = 0.0
+    for cand_artist in cached_artists:
+        if not cand_artist:
+            continue
+        cand_normalized = matching_engine.normalize_string(cand_artist)
+        if source_artist_cleaned in cand_normalized:
+            return True
+        cand_cleaned = matching_engine.clean_artist(cand_artist)
+        sim = matching_engine.similarity_score(source_artist_cleaned, cand_cleaned)
+        if sim > best_sim:
+            best_sim = sim
+
+    if best_sim < min_artist_similarity:
+        print(f"🚫 Cache artist mismatch: source='{source_artist}' vs cached='{cached_artists[0]}' (sim={best_sim:.2f}), re-searching")
+        return False
+    return True
+
+
+def _discovery_score_candidates(source_title, source_artist, source_duration_ms, search_results):
+    """Score search results against a source track using the matching engine.
+
+    Uses matching_engine.score_track_match() which applies clean_title/clean_artist,
+    core title fast path, duration similarity, and 60/30/10 weighted scoring.
+
+    Includes a minimum artist similarity check to prevent "right title, wrong artist"
+    matches (e.g., "Yoga" by bbno$ matching "Yoga" by Sleep Music Lullabies).
+
+    Args:
+        source_title: The source track title (already cleaned for YouTube, raw for others)
+        source_artist: The source track primary artist
+        source_duration_ms: The source track duration in ms (0 if unknown)
+        search_results: List of Track objects (Spotify or iTunes) from search
+
+    Returns:
+        (best_match, best_confidence, best_index), or (None, 0.0, -1) if search_results is empty or no candidate is accepted
+    """
+    best_match = None
+    best_confidence = 0.0
+    best_index = -1
+    min_artist_similarity = 0.4  # Reject candidates where artist doesn't match at all
+
+    source_artist_cleaned = matching_engine.clean_artist(source_artist)
+
+    for idx, result in enumerate(search_results):
+        try:
+            result_artists = result.artists if hasattr(result, 'artists') and result.artists else []
+            result_name = result.name if hasattr(result, 'name') else ''
+            result_duration = result.duration_ms if hasattr(result, 'duration_ms') else 0
+
+            # Quick artist sanity check — reject if artist doesn't match at all
+            # This prevents "right title, wrong artist" false positives
+            best_artist_sim = 0.0
+            for cand_artist in result_artists:
+                if not cand_artist:
+                    continue
+                cand_cleaned = matching_engine.clean_artist(cand_artist)
+                # Check containment (e.g., "drake" in "drake 21 savage")
+                cand_normalized = matching_engine.normalize_string(cand_artist)
+                if source_artist_cleaned and source_artist_cleaned in cand_normalized:
+                    best_artist_sim = 1.0
+                    break
+                sim = matching_engine.similarity_score(source_artist_cleaned, cand_cleaned)
+                if sim > best_artist_sim:
+                    best_artist_sim = sim
+
+            if best_artist_sim < min_artist_similarity:
+                continue
+
+            confidence, match_type = matching_engine.score_track_match(
+                source_title=source_title,
+                source_artists=[source_artist],
+                source_duration_ms=source_duration_ms,
+                candidate_title=result_name,
+                candidate_artists=result_artists,
+                candidate_duration_ms=result_duration
+            )
+
+            if confidence > best_confidence:
+                best_confidence = confidence
+                best_match = result
+                best_index = idx
+
+        except Exception as e:
+            print(f"⚠️ Error scoring candidate {idx}: {e}")
+            continue
+
+    return best_match, best_confidence, best_index
+
+
 def _run_tidal_discovery_worker(playlist_id):
     """Background worker for Tidal discovery process (Spotify preferred, iTunes fallback)"""
     try:
@@ -18096,10 +18197,6 @@ def _run_tidal_discovery_worker(playlist_id):
         # Store discovery source in state for frontend
         state['discovery_source'] = discovery_source
 
-        # Import matching engine for validation (like sync.py)
-        from core.matching_engine import MusicMatchingEngine
-        matching_engine = MusicMatchingEngine()
-
         successful_discoveries = 0
 
         for i, tidal_track in enumerate(playlist.tracks):
@@ -18114,7 +18211,7 @@ def _run_tidal_discovery_worker(playlist_id):
                 try:
                     cache_db = get_database()
                     cached_match = cache_db.get_discovery_cache_match(cache_key[0], cache_key[1], discovery_source)
-                    if cached_match:
+                    if cached_match and _validate_discovery_cache_artist(tidal_track.artists[0] if tidal_track.artists else '', cached_match):
                         print(f"⚡ CACHE HIT [{i+1}/{len(playlist.tracks)}]: {tidal_track.name} by {', '.join(tidal_track.artists)}")
                         result = {
                             'tidal_track': {
@@ -18159,51 +18256,38 @@ def _run_tidal_discovery_worker(playlist_id):
                     'discovery_source': discovery_source
                 }
 
+                match_confidence = 0.0
+
                 if use_spotify and isinstance(track_result, tuple):
-                    # Spotify: Function returns (Track, raw_data)
-                    track_obj, raw_track_data = track_result
-                    # Use full album object from raw API response
+                    # Spotify: Function returns (Track, raw_data, confidence)
+                    track_obj, raw_track_data, match_confidence = track_result
                     album_obj = raw_track_data.get('album', {}) if raw_track_data else {}
 
                     match_data = {
                         'id': track_obj.id,
                         'name': track_obj.name,
-                        'artists': track_obj.artists,  # Already a list of strings
-                        'album': album_obj,  # Full album object with images
+                        'artists': track_obj.artists,
+                        'album': album_obj,
                         'duration_ms': track_obj.duration_ms,
                         'external_urls': track_obj.external_urls,
                         'source': 'spotify'
                     }
-                    result['spotify_data'] = match_data  # Backwards compatibility
+                    result['spotify_data'] = match_data
                     result['match_data'] = match_data
                     result['status'] = 'found'
+                    result['confidence'] = match_confidence
                     successful_discoveries += 1
                     state['spotify_matches'] = successful_discoveries
 
                 elif not use_spotify and track_result and isinstance(track_result, dict):
-                    # iTunes: Function returns a dict with track data
+                    # iTunes: Function returns a dict with track data (includes 'confidence' key)
+                    match_confidence = track_result.pop('confidence', 0.80)
                     match_data = track_result
                     match_data['source'] = 'itunes'
-                    result['spotify_data'] = match_data  # Use same field for frontend compatibility
-                    result['match_data'] = match_data
-                    result['status'] = 'found'
-                    successful_discoveries += 1
-                    state['spotify_matches'] = successful_discoveries
-
-                elif use_spotify and track_result:
-                    # Spotify fallback for old format (shouldn't happen after update)
-                    match_data = {
-                        'id': track_result.id,
-                        'name': track_result.name,
-                        'artists': track_result.artists,
-                        'album': {'name': track_result.album, 'album_type': 'album', 'images': []},
-                        'duration_ms': track_result.duration_ms,
-                        'external_urls': track_result.external_urls,
-                        'source': 'spotify'
-                    }
                     result['spotify_data'] = match_data
                     result['match_data'] = match_data
                     result['status'] = 'found'
+                    result['confidence'] = match_confidence
                     successful_discoveries += 1
                     state['spotify_matches'] = successful_discoveries
 
@@ -18212,11 +18296,11 @@ def _run_tidal_discovery_worker(playlist_id):
                     try:
                         cache_db = get_database()
                         cache_db.save_discovery_cache_match(
-                            cache_key[0], cache_key[1], discovery_source, 0.80,
+                            cache_key[0], cache_key[1], discovery_source, match_confidence,
                             result['match_data'], tidal_track.name,
                             tidal_track.artists[0] if tidal_track.artists else ''
                         )
-                        print(f"💾 CACHE SAVED: {tidal_track.name}")
+                        print(f"💾 CACHE SAVED: {tidal_track.name} (confidence: {match_confidence:.3f})")
                     except Exception as cache_err:
                         print(f"⚠️ Cache save error: {cache_err}")
 
@@ -18269,8 +18353,8 @@ def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client
         itunes_client: iTunes client instance (required when use_spotify=False)
 
     Returns:
-        For Spotify: (Track, raw_data) tuple or None
-        For iTunes: dict with track data or None
+        For Spotify: (Track, raw_data, confidence) tuple or None (raw_data may be None, e.g. for extended-search matches)
+        For iTunes: dict with track data (includes 'confidence' key) or None
     """
     if use_spotify:
         if not spotify_client or not spotify_client.is_authenticated():
@@ -18288,13 +18372,13 @@ def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client
             return None
 
         artist_name = artists[0]  # Use primary artist
+        source_duration = getattr(tidal_track, 'duration_ms', 0) or 0
         source_name = "Spotify" if use_spotify else "iTunes"
 
         print(f"🔍 Tidal track: '{artist_name}' - '{track_name}' (searching {source_name})")
 
         # Use matching engine to generate search queries (with fallback)
         try:
-            # Create a temporary SpotifyTrack-like object for the matching engine
             temp_track = type('TempTrack', (), {
                 'name': track_name,
                 'artists': [artist_name],
@@ -18304,133 +18388,53 @@ def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client
             print(f"🔍 Generated {len(search_queries)} search queries for Tidal track")
         except Exception as e:
             print(f"⚠️ Matching engine failed for Tidal, falling back to basic queries: {e}")
-            # Fallback to original simple queries
-            search_queries = [
-                f'track:"{track_name}" artist:"{artist_name}"',
-                f'"{track_name}" "{artist_name}"',
-                f'{track_name} {artist_name}'
-            ]
+            if use_spotify:
+                search_queries = [
+                    f'track:"{track_name}" artist:"{artist_name}"',
+                    f'"{track_name}" "{artist_name}"',
+                    f'{track_name} {artist_name}'
+                ]
+            else:
+                search_queries = [
+                    f'{artist_name} {track_name}',
+                    f'{track_name} {artist_name}',
+                    track_name
+                ]
 
-        # Find best match using confidence scoring
         best_match = None
-        best_match_raw = None  # Store raw Spotify API data for full album info
+        best_match_raw = None
         best_confidence = 0.0
-        min_confidence = 0.7  # Higher threshold for Tidal since data is cleaner
+        min_confidence = 0.7
 
         for query_idx, search_query in enumerate(search_queries):
             try:
                 print(f"🔍 Tidal query {query_idx + 1}/{len(search_queries)}: {search_query} ({source_name})")
 
                 if use_spotify:
-                    # SPOTIFY PATH: Get raw Spotify API response to access full album object with images
-                    raw_results = spotify_client.sp.search(q=search_query, type='track', limit=5)
+                    raw_results = spotify_client.sp.search(q=search_query, type='track', limit=10)
                     if not raw_results or 'tracks' not in raw_results or not raw_results['tracks']['items']:
                         continue
-
-                    # Also get Track objects for matching logic
-                    results = spotify_client.search_tracks(search_query, limit=5)
-
+                    results = spotify_client.search_tracks(search_query, limit=10)
                     if not results:
                         continue
-
-                    # Score each result using matching engine
-                    for idx, result in enumerate(results):
-                        raw_track = raw_results['tracks']['items'][idx] if idx < len(raw_results['tracks']['items']) else None
-                        try:
-                            # Calculate confidence using matching engine's similarity scoring (with fallback)
-                            try:
-                                artist_confidence = 0.0
-                                if result.artists:
-                                    # Get best artist match confidence
-                                    for result_artist in result.artists:
-                                        artist_sim = matching_engine.similarity_score(
-                                            matching_engine.normalize_string(artist_name),
-                                            matching_engine.normalize_string(result_artist)
-                                        )
-                                        artist_confidence = max(artist_confidence, artist_sim)
-
-                                # Calculate title confidence
-                                title_confidence = matching_engine.similarity_score(
-                                    matching_engine.normalize_string(track_name),
-                                    matching_engine.normalize_string(result.name)
-                                )
-
-                                # Combined confidence (equal weighting for Tidal clean data)
-                                combined_confidence = (artist_confidence * 0.5 + title_confidence * 0.5)
-                            except Exception as e:
-                                print(f"⚠️ Matching engine scoring failed for Tidal, using first match: {e}")
-                                # Fallback: just take the first result if matching engine fails
-                                combined_confidence = 1.0  # Set high to accept this match
-                                best_match = result
-                                break
-
-                            print(f"🔍 Tidal candidate: '{result.artists[0]}' - '{result.name}' (confidence: {combined_confidence:.3f})")
-
-                            # Update best match if this is better
-                            if combined_confidence > best_confidence and combined_confidence >= min_confidence:
-                                best_confidence = combined_confidence
-                                best_match = result
-                                best_match_raw = raw_track  # Store raw data with full album object
-                                print(f"✅ New best Tidal match: {result.artists[0]} - {result.name} (confidence: {combined_confidence:.3f})")
-
-                        except Exception as e:
-                            print(f"❌ Error processing Tidal search result: {e}")
-                            continue
-
                 else:
-                    # ITUNES PATH: Search using iTunes client
-                    # For iTunes, use a simpler query format
-                    simple_query = f"{artist_name} {track_name}"
-                    itunes_results = itunes_client.search_tracks(simple_query, limit=5)
-
-                    if not itunes_results:
+                    raw_results = None
+                    results = itunes_client.search_tracks(search_query, limit=10)
+                    if not results:
                         continue
 
-                    # Score each iTunes result
-                    # Note: iTunes returns Track dataclass objects with 'artists' (list), not 'artist'
-                    for result in itunes_results:
-                        try:
-                            # Calculate confidence using matching engine
-                            try:
-                                artist_confidence = 0.0
-                                # iTunes Track has 'artists' as a list
-                                result_artists = result.artists if hasattr(result, 'artists') else []
-                                result_artist = result_artists[0] if result_artists else ''
-                                if result_artist:
-                                    artist_sim = matching_engine.similarity_score(
-                                        matching_engine.normalize_string(artist_name),
-                                        matching_engine.normalize_string(result_artist)
-                                    )
-                                    artist_confidence = artist_sim
-
-                                # Calculate title confidence
-                                result_name = result.name if hasattr(result, 'name') else ''
-                                title_confidence = matching_engine.similarity_score(
-                                    matching_engine.normalize_string(track_name),
-                                    matching_engine.normalize_string(result_name)
-                                )
-
-                                combined_confidence = (artist_confidence * 0.5 + title_confidence * 0.5)
-                            except Exception as e:
-                                print(f"⚠️ Matching engine scoring failed for iTunes Tidal, using first match: {e}")
-                                combined_confidence = 1.0
-                                best_match = result
-                                break
-
-                            result_artist_display = result_artists[0] if result_artists else 'Unknown'
-                            result_name_display = result.name if hasattr(result, 'name') else 'Unknown'
-                            print(f"🔍 iTunes Tidal candidate: '{result_artist_display}' - '{result_name_display}' (confidence: {combined_confidence:.3f})")
-
-                            if combined_confidence > best_confidence and combined_confidence >= min_confidence:
-                                best_confidence = combined_confidence
-                                best_match = result
-                                print(f"✅ New best iTunes Tidal match: {result_artist_display} - {result_name_display} (confidence: {combined_confidence:.3f})")
+                # Score all results using the matching engine
+                match, confidence, match_idx = _discovery_score_candidates(
+                    track_name, artist_name, source_duration, results
+                )
 
-                        except Exception as e:
-                            print(f"❌ Error processing iTunes Tidal search result: {e}")
-                            continue
+                if match and confidence > best_confidence and confidence >= min_confidence:
+                    best_confidence = confidence
+                    best_match = match
+                    if use_spotify and raw_results:
+                        best_match_raw = raw_results['tracks']['items'][match_idx] if match_idx < len(raw_results['tracks']['items']) else None
+                    print(f"✅ New best Tidal match: {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
 
-                # If we found a very high confidence match, stop searching
                 if best_confidence >= 0.9:
                     print(f"🎯 High confidence Tidal match found ({best_confidence:.3f}), stopping search")
                     break
@@ -18439,19 +18443,33 @@ def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client
                 print(f"❌ Error in Tidal {source_name} search for query '{search_query}': {e}")
                 continue
 
+        # Strategy 4: Extended search with higher limit (last resort)
+        if not best_match:
+            print(f"🔄 Tidal Strategy 4: Extended search with limit=50")
+            query = f"{artist_name} {track_name}"
+            if use_spotify:
+                extended_results = spotify_client.search_tracks(query, limit=50)
+            else:
+                extended_results = itunes_client.search_tracks(query, limit=50)
+            if extended_results:
+                match, confidence, match_idx = _discovery_score_candidates(
+                    track_name, artist_name, source_duration, extended_results
+                )
+                if match and confidence >= min_confidence:
+                    best_match = match
+                    best_confidence = confidence
+                    print(f"✅ Strategy 4 Tidal match (extended): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
+
         if best_match:
             if use_spotify:
                 print(f"✅ Final Tidal Spotify match: {best_match.artists[0]} - {best_match.name} (confidence: {best_confidence:.3f})")
-                return (best_match, best_match_raw)  # Return both Track object and raw data
+                return (best_match, best_match_raw, best_confidence)
             else:
-                # For iTunes, return a dict with normalized data
-                # Note: iTunes Track dataclass has 'artists' (list) and 'image_url', not 'artist' and 'artwork_url'
                 result_artists = best_match.artists if hasattr(best_match, 'artists') else []
                 result_artist = result_artists[0] if result_artists else 'Unknown'
                 result_name = best_match.name if hasattr(best_match, 'name') else 'Unknown'
                 print(f"✅ Final Tidal iTunes match: {result_artist} - {result_name} (confidence: {best_confidence:.3f})")
 
-                # Build iTunes result dict with album info
                 album_name = best_match.album if hasattr(best_match, 'album') else 'Unknown Album'
                 image_url = best_match.image_url if hasattr(best_match, 'image_url') else ''
                 track_id = best_match.id if hasattr(best_match, 'id') else ''
@@ -18467,7 +18485,8 @@ def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client
                         'images': [{'url': image_url, 'height': 300, 'width': 300}] if image_url else []
                     },
                     'duration_ms': duration_ms,
-                    'source': 'itunes'
+                    'source': 'itunes',
+                    'confidence': best_confidence
                 }
         else:
             print(f"❌ No suitable Tidal match found (best confidence was {best_confidence:.3f}, required {min_confidence:.3f})")
@@ -18880,7 +18899,7 @@ def _run_youtube_discovery_worker(url_hash):
                 try:
                     cache_db = get_database()
                     cached_match = cache_db.get_discovery_cache_match(cache_key[0], cache_key[1], discovery_source)
-                    if cached_match:
+                    if cached_match and _validate_discovery_cache_artist(cleaned_artist, cached_match):
                         print(f"⚡ CACHE HIT [{i+1}/{len(tracks)}]: {cleaned_artist} - {cleaned_title}")
                         result = {
                             'index': i,
@@ -18902,14 +18921,15 @@ def _run_youtube_discovery_worker(url_hash):
                 except Exception as cache_err:
                     print(f"⚠️ Cache lookup error: {cache_err}")
 
-                # Try multiple search strategies using matching_engine for better accuracy
+                # Try multiple search strategies using matching engine
                 matched_track = None
                 best_confidence = 0.0
-                min_confidence = 0.6  # Keep same threshold as before
+                best_raw_track = None
+                min_confidence = 0.6
+                source_duration = track.get('duration_ms', 0) or 0
 
-                # Strategy 1: Use matching_engine search queries (with fallback)
+                # Strategy 1: Use matching_engine search queries
                 try:
-                    # Create a temporary SpotifyTrack-like object for the matching engine
                     temp_track = type('TempTrack', (), {
                         'name': cleaned_title,
                         'artists': [cleaned_artist],
@@ -18919,95 +18939,40 @@ def _run_youtube_discovery_worker(url_hash):
                     print(f"🔍 Generated {len(search_queries)} search queries for YouTube track")
                 except Exception as e:
                     print(f"⚠️ Matching engine failed for YouTube, falling back to basic query: {e}")
-                    # Fallback to original simple query
-                    search_queries = [f"artist:{cleaned_artist} track:{cleaned_title}"]
-
-                # Store raw data for best match
-                best_raw_track = None
+                    search_queries = [f"{cleaned_artist} {cleaned_title}", cleaned_title]
 
                 for query_idx, search_query in enumerate(search_queries):
                     try:
                         print(f"🔍 YouTube query {query_idx + 1}/{len(search_queries)}: {search_query}")
 
-                        # Search using appropriate provider
                         raw_results = None
                         search_results = None
 
                         if use_spotify:
-                            # Get raw Spotify API response to access full album object with images
-                            raw_results = spotify_client.sp.search(q=search_query, type='track', limit=5)
+                            raw_results = spotify_client.sp.search(q=search_query, type='track', limit=10)
                             if not raw_results or 'tracks' not in raw_results or not raw_results['tracks']['items']:
                                 continue
-                            search_results = spotify_client.search_tracks(search_query, limit=5)
+                            search_results = spotify_client.search_tracks(search_query, limit=10)
                         else:
-                            # Use iTunes search
-                            search_results = itunes_client.search_tracks(search_query, limit=5)
+                            search_results = itunes_client.search_tracks(search_query, limit=10)
 
                         if not search_results:
                             continue
 
-                        # Score each result using matching engine
-                        for result_idx, search_result in enumerate(search_results):
-                            raw_track = None
-                            if use_spotify and raw_results:
-                                raw_track = raw_results['tracks']['items'][result_idx] if result_idx < len(raw_results['tracks']['items']) else None
-                            try:
-                                # Calculate confidence using matching engine's similarity scoring (with fallback)
-                                try:
-                                    artist_confidence = 0.0
-                                    if search_result.artists:
-                                        # Get best artist match confidence
-                                        for result_artist in search_result.artists:
-                                            artist_sim = matching_engine.similarity_score(
-                                                matching_engine.normalize_string(cleaned_artist),
-                                                matching_engine.normalize_string(result_artist)
-                                            )
-                                            artist_confidence = max(artist_confidence, artist_sim)
-
-                                    # Calculate title confidence
-                                    title_confidence = matching_engine.similarity_score(
-                                        matching_engine.normalize_string(cleaned_title),
-                                        matching_engine.normalize_string(search_result.name)
-                                    )
-
-                                    # Combined confidence (70% title, 30% artist - same as original)
-                                    combined_confidence = (title_confidence * 0.7 + artist_confidence * 0.3)
-                                except Exception as e:
-                                    print(f"⚠️ Matching engine scoring failed for YouTube, using basic similarity: {e}")
-                                    # Fallback to original character overlap method
-                                    def _calculate_similarity_fallback(str1, str2):
-                                        if not str1 or not str2:
-                                            return 0
-                                        str1 = str1.lower().strip()
-                                        str2 = str2.lower().strip()
-                                        if str1 == str2:
-                                            return 1.0
-                                        set1 = set(str1.replace(' ', ''))
-                                        set2 = set(str2.replace(' ', ''))
-                                        if not set1 or not set2:
-                                            return 0
-                                        intersection = len(set1.intersection(set2))
-                                        union = len(set1.union(set2))
-                                        return intersection / union if union > 0 else 0
-
-                                    title_score = _calculate_similarity_fallback(cleaned_title, search_result.name)
-                                    artist_score = _calculate_similarity_fallback(cleaned_artist, search_result.artists[0] if search_result.artists else "")
-                                    combined_confidence = (title_score * 0.7) + (artist_score * 0.3)
-
-                                print(f"🔍 YouTube candidate: '{search_result.artists[0]}' - '{search_result.name}' (confidence: {combined_confidence:.3f})")
-
-                                # Update best match if this is better
-                                if combined_confidence > best_confidence and combined_confidence >= min_confidence:
-                                    best_confidence = combined_confidence
-                                    matched_track = search_result
-                                    best_raw_track = raw_track  # Store raw data with full album object (Spotify only)
-                                    print(f"✅ New best YouTube match: {search_result.artists[0]} - {search_result.name} (confidence: {combined_confidence:.3f})")
+                        # Score all results using the matching engine
+                        match, confidence, match_idx = _discovery_score_candidates(
+                            cleaned_title, cleaned_artist, source_duration, search_results
+                        )
 
-                            except Exception as e:
-                                print(f"❌ Error processing YouTube search result: {e}")
-                                continue
+                        if match and confidence > best_confidence and confidence >= min_confidence:
+                            best_confidence = confidence
+                            matched_track = match
+                            if use_spotify and raw_results:
+                                best_raw_track = raw_results['tracks']['items'][match_idx] if match_idx < len(raw_results['tracks']['items']) else None
+                            else:
+                                best_raw_track = None
+                            print(f"✅ New best YouTube match: {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
 
-                        # If we found a very high confidence match, stop searching
                         if best_confidence >= 0.9:
                             print(f"🎯 High confidence YouTube match found ({best_confidence:.3f}), stopping search")
                             break
@@ -19019,31 +18984,59 @@ def _run_youtube_discovery_worker(url_hash):
                 if matched_track:
                     print(f"✅ Strategy 1 YouTube match: {matched_track.artists[0]} - {matched_track.name} (confidence: {best_confidence:.3f})")
 
-                # Strategy 2: Swapped search (if first failed) - keep simple for fallback
+                # Strategy 2: Swapped search (if first failed) - score results properly
                 if not matched_track:
                     print("🔄 YouTube Strategy 2: Trying swapped search (artist/title reversed)")
-                    query = f"artist:{cleaned_title} track:{cleaned_artist}"
                     if use_spotify:
-                        fallback_results = spotify_client.search_tracks(query, limit=3)
+                        query = f"artist:{cleaned_title} track:{cleaned_artist}"
+                        fallback_results = spotify_client.search_tracks(query, limit=5)
                     else:
-                        fallback_results = itunes_client.search_tracks(query, limit=3)
+                        query = f"{cleaned_title} {cleaned_artist}"
+                        fallback_results = itunes_client.search_tracks(query, limit=5)
                     if fallback_results:
-                        matched_track = fallback_results[0]
-                        print(f"✅ Strategy 2 YouTube match (swapped): {matched_track.artists[0]} - {matched_track.name}")
+                        match, confidence, _ = _discovery_score_candidates(
+                            cleaned_title, cleaned_artist, source_duration, fallback_results
+                        )
+                        if match and confidence >= min_confidence:
+                            matched_track = match
+                            best_confidence = confidence
+                            print(f"✅ Strategy 2 YouTube match (swapped): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
 
-                # Strategy 3: Raw data search (if still failed) - keep simple for fallback
+                # Strategy 3: Raw data search (if still failed) - score results properly
                 if not matched_track:
-                    raw_title = track['raw_title']
-                    raw_artist = track['raw_artist']
+                    raw_title = track.get('raw_title', cleaned_title)
+                    raw_artist = track.get('raw_artist', cleaned_artist)
                     print(f"🔄 YouTube Strategy 3: Trying raw data search: '{raw_artist} {raw_title}'")
                     query = f"{raw_artist} {raw_title}"
                     if use_spotify:
-                        fallback_results = spotify_client.search_tracks(query, limit=3)
+                        fallback_results = spotify_client.search_tracks(query, limit=5)
                     else:
-                        fallback_results = itunes_client.search_tracks(query, limit=3)
+                        fallback_results = itunes_client.search_tracks(query, limit=5)
                     if fallback_results:
-                        matched_track = fallback_results[0]
-                        print(f"✅ Strategy 3 YouTube match (raw): {matched_track.artists[0]} - {matched_track.name}")
+                        match, confidence, _ = _discovery_score_candidates(
+                            cleaned_title, cleaned_artist, source_duration, fallback_results
+                        )
+                        if match and confidence >= min_confidence:
+                            matched_track = match
+                            best_confidence = confidence
+                            print(f"✅ Strategy 3 YouTube match (raw): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
+
+                # Strategy 4: Extended search with higher limit (last resort)
+                if not matched_track:
+                    print(f"🔄 YouTube Strategy 4: Extended search with limit=50")
+                    query = f"{cleaned_artist} {cleaned_title}"
+                    if use_spotify:
+                        extended_results = spotify_client.search_tracks(query, limit=50)
+                    else:
+                        extended_results = itunes_client.search_tracks(query, limit=50)
+                    if extended_results:
+                        match, confidence, _ = _discovery_score_candidates(
+                            cleaned_title, cleaned_artist, source_duration, extended_results
+                        )
+                        if match and confidence >= min_confidence:
+                            matched_track = match
+                            best_confidence = confidence
+                            print(f"✅ Strategy 4 YouTube match (extended): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
 
                 # Create result entry
                 result = {
@@ -19056,24 +19049,23 @@ def _run_youtube_discovery_worker(url_hash):
                     'spotify_artist': matched_track.artists[0] if matched_track else '',
                     'spotify_album': matched_track.album if matched_track else '',
                     'duration': f"{track['duration_ms'] // 60000}:{(track['duration_ms'] % 60000) // 1000:02d}" if track['duration_ms'] else '0:00',
-                    'discovery_source': discovery_source
+                    'discovery_source': discovery_source,
+                    'confidence': best_confidence
                 }
 
                 if matched_track:
-                    state['spotify_matches'] += 1  # Keep key name for compatibility
+                    state['spotify_matches'] += 1
 
                     # Build album data based on provider
                     if use_spotify and best_raw_track:
                         album_data = best_raw_track.get('album', {})
                     else:
-                        # For iTunes or when raw data unavailable
                         album_data = {
                             'name': matched_track.album,
                             'album_type': 'album',
                             'images': [{'url': matched_track.image_url}] if hasattr(matched_track, 'image_url') and matched_track.image_url else []
                         }
 
-                    # Store track data with source info
                     result['matched_data'] = {
                         'id': matched_track.id,
                         'name': matched_track.name,
@@ -19082,10 +19074,9 @@ def _run_youtube_discovery_worker(url_hash):
                         'duration_ms': matched_track.duration_ms,
                         'source': discovery_source
                     }
-                    # Keep spotify_data for backward compatibility
                     result['spotify_data'] = result['matched_data']
 
-                    # Save to discovery cache (only Strategy 1 high-confidence matches)
+                    # Save to discovery cache (only high-confidence matches)
                     if best_confidence >= 0.7:
                         try:
                             cache_db = get_database()
@@ -19103,7 +19094,6 @@ def _run_youtube_discovery_worker(url_hash):
 
             except Exception as e:
                 print(f"❌ Error processing track {i}: {e}")
-                # Add failed result
                 result = {
                     'index': i,
                     'yt_track': track['name'],
@@ -19116,19 +19106,18 @@ def _run_youtube_discovery_worker(url_hash):
                     'duration': '0:00'
                 }
                 state['discovery_results'].append(result)
-        
+
         # Complete discovery
         state['phase'] = 'discovered'
         state['status'] = 'complete'
         state['discovery_progress'] = 100
-        
-        # Add activity for discovery completion
+
         playlist_name = playlist['name']
         source_label = 'Spotify' if use_spotify else 'iTunes'
         add_activity_item("✅", f"YouTube Discovery Complete ({source_label})", f"'{playlist_name}' - {state['spotify_matches']}/{len(tracks)} tracks found", "Now")
 
         print(f"✅ YouTube discovery complete ({discovery_source}): {state['spotify_matches']}/{len(tracks)} tracks matched")
-        
+
     except Exception as e:
         print(f"❌ Error in YouTube discovery worker: {e}")
         state['status'] = 'error'
@@ -19173,7 +19162,7 @@ def _run_listenbrainz_discovery_worker(playlist_mbid):
                 try:
                     cache_db = get_database()
                     cached_match = cache_db.get_discovery_cache_match(cache_key[0], cache_key[1], discovery_source)
-                    if cached_match:
+                    if cached_match and _validate_discovery_cache_artist(cleaned_artist, cached_match):
                         print(f"⚡ CACHE HIT [{i+1}/{len(tracks)}]: {cleaned_artist} - {cleaned_title}")
                         result = {
                             'index': i,
@@ -19195,14 +19184,15 @@ def _run_listenbrainz_discovery_worker(playlist_mbid):
                 except Exception as cache_err:
                     print(f"⚠️ Cache lookup error: {cache_err}")
 
-                # Try multiple search strategies using matching_engine for better accuracy
+                # Try multiple search strategies using matching engine
                 matched_track = None
                 best_confidence = 0.0
-                min_confidence = 0.6  # Keep same threshold as YouTube
+                best_raw_track = None
+                min_confidence = 0.6
+                source_duration = duration_ms or 0
 
-                # Strategy 1: Use matching_engine search queries (with fallback)
+                # Strategy 1: Use matching_engine search queries
                 try:
-                    # Create a temporary SpotifyTrack-like object for the matching engine
                     temp_track = type('TempTrack', (), {
                         'name': cleaned_title,
                         'artists': [cleaned_artist],
@@ -19212,95 +19202,40 @@ def _run_listenbrainz_discovery_worker(playlist_mbid):
                     print(f"🔍 Generated {len(search_queries)} search queries for ListenBrainz track")
                 except Exception as e:
                     print(f"⚠️ Matching engine failed for ListenBrainz, falling back to basic query: {e}")
-                    # Fallback to original simple query
-                    search_queries = [f"artist:{cleaned_artist} track:{cleaned_title}"]
-
-                # Store raw data for best match
-                best_raw_track = None
+                    search_queries = [f"{cleaned_artist} {cleaned_title}", cleaned_title]
 
                 for query_idx, search_query in enumerate(search_queries):
                     try:
                         print(f"🔍 ListenBrainz query {query_idx + 1}/{len(search_queries)}: {search_query}")
 
-                        # Search using appropriate provider
                         raw_results = None
                         search_results = None
 
                         if use_spotify:
-                            # Get raw Spotify API response to access full album object with images
-                            raw_results = spotify_client.sp.search(q=search_query, type='track', limit=5)
+                            raw_results = spotify_client.sp.search(q=search_query, type='track', limit=10)
                             if not raw_results or 'tracks' not in raw_results or not raw_results['tracks']['items']:
                                 continue
-                            search_results = spotify_client.search_tracks(search_query, limit=5)
+                            search_results = spotify_client.search_tracks(search_query, limit=10)
                         else:
-                            # Use iTunes search
-                            search_results = itunes_client.search_tracks(search_query, limit=5)
+                            search_results = itunes_client.search_tracks(search_query, limit=10)
 
                         if not search_results:
                             continue
 
-                        # Score each result using matching engine
-                        for result_idx, search_result in enumerate(search_results):
-                            raw_track = None
-                            if use_spotify and raw_results:
-                                raw_track = raw_results['tracks']['items'][result_idx] if result_idx < len(raw_results['tracks']['items']) else None
-                            try:
-                                # Calculate confidence using matching engine's similarity scoring (with fallback)
-                                try:
-                                    artist_confidence = 0.0
-                                    if search_result.artists:
-                                        # Get best artist match confidence
-                                        for result_artist in search_result.artists:
-                                            artist_sim = matching_engine.similarity_score(
-                                                matching_engine.normalize_string(cleaned_artist),
-                                                matching_engine.normalize_string(result_artist)
-                                            )
-                                            artist_confidence = max(artist_confidence, artist_sim)
-
-                                    # Calculate title confidence
-                                    title_confidence = matching_engine.similarity_score(
-                                        matching_engine.normalize_string(cleaned_title),
-                                        matching_engine.normalize_string(search_result.name)
-                                    )
-
-                                    # Combined confidence (70% title, 30% artist - same as YouTube)
-                                    combined_confidence = (title_confidence * 0.7 + artist_confidence * 0.3)
-                                except Exception as e:
-                                    print(f"⚠️ Matching engine scoring failed for ListenBrainz, using basic similarity: {e}")
-                                    # Fallback to original character overlap method
-                                    def _calculate_similarity_fallback(str1, str2):
-                                        if not str1 or not str2:
-                                            return 0
-                                        str1 = str1.lower().strip()
-                                        str2 = str2.lower().strip()
-                                        if str1 == str2:
-                                            return 1.0
-                                        set1 = set(str1.replace(' ', ''))
-                                        set2 = set(str2.replace(' ', ''))
-                                        if not set1 or not set2:
-                                            return 0
-                                        intersection = len(set1.intersection(set2))
-                                        union = len(set1.union(set2))
-                                        return intersection / union if union > 0 else 0
-
-                                    title_score = _calculate_similarity_fallback(cleaned_title, search_result.name)
-                                    artist_score = _calculate_similarity_fallback(cleaned_artist, search_result.artists[0] if search_result.artists else "")
-                                    combined_confidence = (title_score * 0.7) + (artist_score * 0.3)
-
-                                print(f"🔍 ListenBrainz candidate: '{search_result.artists[0]}' - '{search_result.name}' (confidence: {combined_confidence:.3f})")
-
-                                # Update best match if this is better
-                                if combined_confidence > best_confidence and combined_confidence >= min_confidence:
-                                    best_confidence = combined_confidence
-                                    matched_track = search_result
-                                    best_raw_track = raw_track  # Store raw data with full album object (Spotify only)
-                                    print(f"✅ New best ListenBrainz match: {search_result.artists[0]} - {search_result.name} (confidence: {combined_confidence:.3f})")
+                        # Score all results using the matching engine
+                        match, confidence, match_idx = _discovery_score_candidates(
+                            cleaned_title, cleaned_artist, source_duration, search_results
+                        )
 
-                            except Exception as e:
-                                print(f"❌ Error processing ListenBrainz search result: {e}")
-                                continue
+                        if match and confidence > best_confidence and confidence >= min_confidence:
+                            best_confidence = confidence
+                            matched_track = match
+                            if use_spotify and raw_results:
+                                best_raw_track = raw_results['tracks']['items'][match_idx] if match_idx < len(raw_results['tracks']['items']) else None
+                            else:
+                                best_raw_track = None
+                            print(f"✅ New best ListenBrainz match: {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
 
-                        # If we found a very high confidence match, stop searching
                         if best_confidence >= 0.9:
                             print(f"🎯 High confidence ListenBrainz match found ({best_confidence:.3f}), stopping search")
                             break
@@ -19312,29 +19247,58 @@ def _run_listenbrainz_discovery_worker(playlist_mbid):
                 if matched_track:
                     print(f"✅ Strategy 1 ListenBrainz match: {matched_track.artists[0]} - {matched_track.name} (confidence: {best_confidence:.3f})")
 
-                # Strategy 2: Swapped search (if first failed) - keep simple for fallback
+                # Strategy 2: Swapped search (if first failed) - score results properly
                 if not matched_track:
                     print("🔄 ListenBrainz Strategy 2: Trying swapped search (artist/title reversed)")
-                    query = f"artist:{cleaned_title} track:{cleaned_artist}"
                     if use_spotify:
-                        fallback_results = spotify_client.search_tracks(query, limit=3)
+                        query = f"artist:{cleaned_title} track:{cleaned_artist}"
+                        fallback_results = spotify_client.search_tracks(query, limit=5)
                     else:
-                        fallback_results = itunes_client.search_tracks(query, limit=3)
+                        query = f"{cleaned_title} {cleaned_artist}"
+                        fallback_results = itunes_client.search_tracks(query, limit=5)
                     if fallback_results:
-                        matched_track = fallback_results[0]
-                        print(f"✅ Strategy 2 ListenBrainz match (swapped): {matched_track.artists[0]} - {matched_track.name}")
+                        match, confidence, _ = _discovery_score_candidates(
+                            cleaned_title, cleaned_artist, source_duration, fallback_results
+                        )
+                        if match and confidence >= min_confidence:
+                            matched_track = match
+                            best_confidence = confidence
+                            print(f"✅ Strategy 2 ListenBrainz match (swapped): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
 
-                # Strategy 3: Album-based search (if still failed and we have album name)
+                # Strategy 3: Album-based search (if still failed and we have album name) - score results properly
                 if not matched_track and album_name:
                     print(f"🔄 ListenBrainz Strategy 3: Trying album-based search: '{cleaned_artist} {album_name} {cleaned_title}'")
-                    query = f"artist:{cleaned_artist} album:{album_name} track:{cleaned_title}"
                     if use_spotify:
-                        fallback_results = spotify_client.search_tracks(query, limit=3)
+                        query = f"artist:{cleaned_artist} album:{album_name} track:{cleaned_title}"
+                        fallback_results = spotify_client.search_tracks(query, limit=5)
                     else:
-                        fallback_results = itunes_client.search_tracks(query, limit=3)
+                        query = f"{cleaned_artist} {album_name} {cleaned_title}"
+                        fallback_results = itunes_client.search_tracks(query, limit=5)
                     if fallback_results:
-                        matched_track = fallback_results[0]
-                        print(f"✅ Strategy 3 ListenBrainz match (album): {matched_track.artists[0]} - {matched_track.name}")
+                        match, confidence, _ = _discovery_score_candidates(
+                            cleaned_title, cleaned_artist, source_duration, fallback_results
+                        )
+                        if match and confidence >= min_confidence:
+                            matched_track = match
+                            best_confidence = confidence
+                            print(f"✅ Strategy 3 ListenBrainz match (album): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
+
+                # Strategy 4: Extended search with higher limit (last resort)
+                if not matched_track:
+                    print(f"🔄 ListenBrainz Strategy 4: Extended search with limit=50")
+                    query = f"{cleaned_artist} {cleaned_title}"
+                    if use_spotify:
+                        extended_results = spotify_client.search_tracks(query, limit=50)
+                    else:
+                        extended_results = itunes_client.search_tracks(query, limit=50)
+                    if extended_results:
+                        match, confidence, _ = _discovery_score_candidates(
+                            cleaned_title, cleaned_artist, source_duration, extended_results
+                        )
+                        if match and confidence >= min_confidence:
+                            matched_track = match
+                            best_confidence = confidence
+                            print(f"✅ Strategy 4 ListenBrainz match (extended): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
 
                 # Create result entry
                 result = {
@@ -19347,24 +19311,23 @@ def _run_listenbrainz_discovery_worker(playlist_mbid):
                     'spotify_artist': matched_track.artists[0] if matched_track else '',
                     'spotify_album': matched_track.album if matched_track else '',
                     'duration': f"{duration_ms // 60000}:{(duration_ms % 60000) // 1000:02d}" if duration_ms else '0:00',
-                    'discovery_source': discovery_source
+                    'discovery_source': discovery_source,
+                    'confidence': best_confidence
                 }
 
                 if matched_track:
-                    state['spotify_matches'] += 1  # Keep key name for compatibility
+                    state['spotify_matches'] += 1
 
                     # Build album data based on provider
                     if use_spotify and best_raw_track:
                         album_data = best_raw_track.get('album', {})
                     else:
-                        # For iTunes or when raw data unavailable
                         album_data = {
                             'name': matched_track.album,
                             'album_type': 'album',
                             'images': [{'url': matched_track.image_url}] if hasattr(matched_track, 'image_url') and matched_track.image_url else []
                         }
 
-                    # Store track data with source info
                     result['matched_data'] = {
                         'id': matched_track.id,
                         'name': matched_track.name,
@@ -19373,10 +19336,9 @@ def _run_listenbrainz_discovery_worker(playlist_mbid):
                         'duration_ms': matched_track.duration_ms,
                         'source': discovery_source
                     }
-                    # Keep spotify_data for backward compatibility
                     result['spotify_data'] = result['matched_data']
 
-                    # Save to discovery cache (only Strategy 1 high-confidence matches)
+                    # Save to discovery cache (any strategy's match, but only when confidence >= 0.7)
                     if best_confidence >= 0.7:
                         try:
                             cache_db = get_database()
@@ -19394,7 +19356,6 @@ def _run_listenbrainz_discovery_worker(playlist_mbid):
 
             except Exception as e:
                 print(f"❌ Error processing track {i}: {e}")
-                # Add failed result
                 result = {
                     'index': i,
                     'lb_track': track['track_name'],
@@ -19413,7 +19374,6 @@ def _run_listenbrainz_discovery_worker(playlist_mbid):
         state['status'] = 'complete'
         state['discovery_progress'] = 100
 
-        # Add activity for discovery completion
         playlist_name = playlist['name']
         source_label = 'Spotify' if use_spotify else 'iTunes'
         add_activity_item("✅", f"ListenBrainz Discovery Complete ({source_label})", f"'{playlist_name}' - {state['spotify_matches']}/{len(tracks)} tracks found", "Now")
@@ -25707,7 +25667,7 @@ def _run_beatport_discovery_worker(url_hash):
                 try:
                     cache_db = get_database()
                     cached_match = cache_db.get_discovery_cache_match(cache_key[0], cache_key[1], discovery_source)
-                    if cached_match:
+                    if cached_match and _validate_discovery_cache_artist(track_artist, cached_match):
                         print(f"⚡ CACHE HIT [{i+1}/{len(tracks)}]: {track_artist} - {track_title}")
                         # Convert artists from ['str'] to [{'name': 'str'}] for Beatport frontend format
                         beatport_artists = cached_match.get('artists', [])
@@ -25730,12 +25690,14 @@ def _run_beatport_discovery_worker(url_hash):
                 except Exception as cache_err:
                     print(f"⚠️ Cache lookup error: {cache_err}")
 
-                # Use matching engine for sophisticated track matching (like other discovery processes)
+                # Use matching engine for track matching
                 found_track = None
+                best_confidence = 0.0
+                best_raw_track = None
+                min_confidence = 0.75  # Higher threshold for Beatport to avoid bad matches
 
                 # Generate search queries using matching engine (with fallback)
                 try:
-                    # Create a temporary SpotifyTrack-like object for the matching engine
                     temp_track = type('TempTrack', (), {
                         'name': track_title,
                         'artists': [track_artist],
@@ -25745,154 +25707,51 @@ def _run_beatport_discovery_worker(url_hash):
                     print(f"🔍 Generated {len(search_queries)} search queries using matching engine")
                 except Exception as e:
                     print(f"⚠️ Matching engine failed for Beatport, falling back to basic queries: {e}")
-                    # Fallback to basic search queries
-                    search_queries = [
-                        f"{track_artist} {track_title}",
-                        f'artist:"{track_artist}" track:"{track_title}"',
-                        f'"{track_artist}" "{track_title}"'
-                    ]
-
-                # Try each search query until we find a good match
-                best_match = None
-                best_raw_track = None  # Store raw Spotify data for full album info
-                best_confidence = 0.0
-                min_confidence = 0.75  # Increased threshold to avoid bad matches like "Dolce" for "Fancy $hit"
+                    if use_spotify:
+                        search_queries = [
+                            f"{track_artist} {track_title}",
+                            f'artist:"{track_artist}" track:"{track_title}"',
+                            f'"{track_artist}" "{track_title}"'
+                        ]
+                    else:
+                        search_queries = [
+                            f"{track_artist} {track_title}",
+                            f"{track_title} {track_artist}",
+                            track_title
+                        ]
 
                 for query_idx, search_query in enumerate(search_queries):
                     try:
                         print(f"🔍 Query {query_idx + 1}/{len(search_queries)}: {search_query} ({discovery_source.upper()})")
 
+                        raw_results = None
+                        search_results = None
+
                         if use_spotify:
-                            # SPOTIFY PATH: Get raw Spotify API response to access full album object with images
                             raw_results = spotify_client.sp.search(q=search_query, type='track', limit=10)
                             if not raw_results or 'tracks' not in raw_results or not raw_results['tracks']['items']:
                                 continue
-
                             search_results = spotify_client.search_tracks(search_query, limit=10)
-
-                            if not search_results:
-                                continue
-
-                            # Use matching engine to find the best match from search results
-                            for result_idx, result in enumerate(search_results):
-                                raw_track = raw_results['tracks']['items'][result_idx] if result_idx < len(raw_results['tracks']['items']) else None
-                                try:
-                                    # Calculate confidence using matching engine's similarity scoring (with fallback)
-                                    try:
-                                        artist_confidence = 0.0
-                                        if result.artists:
-                                            # Get best artist match confidence
-                                            result_artist_names = [artist for artist in result.artists]
-                                            for result_artist in result_artist_names:
-                                                artist_sim = matching_engine.similarity_score(
-                                                    matching_engine.normalize_string(track_artist),
-                                                    matching_engine.normalize_string(result_artist)
-                                                )
-                                                artist_confidence = max(artist_confidence, artist_sim)
-
-                                        # Calculate title confidence
-                                        title_confidence = matching_engine.similarity_score(
-                                            matching_engine.normalize_string(track_title),
-                                            matching_engine.normalize_string(result.name)
-                                        )
-
-                                        # Combined confidence (more balanced to avoid bad matches from same artist)
-                                        combined_confidence = (artist_confidence * 0.4 + title_confidence * 0.6)
-                                    except Exception as e:
-                                        print(f"⚠️ Matching engine scoring failed for Beatport, using basic matching: {e}")
-                                        # Fallback to simple string matching
-                                        artist_match = any(track_artist.lower() in artist.lower() for artist in result.artists) if result.artists else False
-                                        title_match = track_title.lower() in result.name.lower() or result.name.lower() in track_title.lower()
-                                        combined_confidence = 0.8 if (artist_match and title_match) else 0.4 if (artist_match or title_match) else 0.1
-
-                                    print(f"🔍 Match candidate: '{result.artists[0]}' - '{result.name}'")
-                                    print(f"    Artist confidence: {artist_confidence:.3f} ('{track_artist}' vs '{result.artists[0]}')")
-                                    print(f"    Title confidence: {title_confidence:.3f} ('{track_title}' vs '{result.name}')")
-                                    print(f"    Combined confidence: {combined_confidence:.3f} (threshold: {min_confidence})")
-
-                                    # Additional check for core title similarity (excluding version keywords)
-                                    def remove_version_keywords(title):
-                                        keywords = ['extended mix', 'radio mix', 'club mix', 'remix', 'extended', 'version', 'mix', 'original']
-                                        clean_title = title.lower()
-                                        for keyword in keywords:
-                                            clean_title = clean_title.replace(keyword, '').strip(' -()[]')
-                                        return clean_title.strip()
-
-                                    core_title1 = remove_version_keywords(track_title)
-                                    core_title2 = remove_version_keywords(result.name)
-                                    core_title_confidence = matching_engine.similarity_score(core_title1, core_title2)
-
-                                    print(f"    Core title confidence: {core_title_confidence:.3f} ('{core_title1}' vs '{core_title2}')")
-
-                                    # Update best match if this is better AND meets all similarity requirements
-                                    min_title_confidence = 0.5  # Require at least 50% title similarity
-                                    min_core_title_confidence = 0.4  # Require at least 40% core title similarity
-                                    if (combined_confidence > best_confidence and
-                                        combined_confidence >= min_confidence and
-                                        title_confidence >= min_title_confidence and
-                                        core_title_confidence >= min_core_title_confidence):
-                                        best_confidence = combined_confidence
-                                        best_match = result
-                                        best_raw_track = raw_track  # Store raw data with full album object
-                                        print(f"✅ New best match: {result.artists[0]} - {result.name} (confidence: {combined_confidence:.3f})")
-
-                                except Exception as e:
-                                    print(f"❌ Error processing search result: {e}")
-                                    continue
-
                         else:
-                            # ITUNES PATH: Search using iTunes client
-                            simple_query = f"{track_artist} {track_title}"
-                            itunes_results = itunes_client_instance.search_tracks(simple_query, limit=10)
+                            search_results = itunes_client_instance.search_tracks(search_query, limit=10)
 
-                            if not itunes_results:
-                                continue
-
-                            # Score each iTunes result
-                            # Note: iTunes returns Track dataclass objects with 'artists' (list), not 'artist'
-                            for result in itunes_results:
-                                try:
-                                    # Calculate confidence using matching engine
-                                    try:
-                                        artist_confidence = 0.0
-                                        # iTunes Track has 'artists' as a list
-                                        result_artists = result.artists if hasattr(result, 'artists') else []
-                                        result_artist = result_artists[0] if result_artists else ''
-                                        if result_artist:
-                                            artist_sim = matching_engine.similarity_score(
-                                                matching_engine.normalize_string(track_artist),
-                                                matching_engine.normalize_string(result_artist)
-                                            )
-                                            artist_confidence = artist_sim
-
-                                        # Calculate title confidence
-                                        result_name = result.name if hasattr(result, 'name') else ''
-                                        title_confidence = matching_engine.similarity_score(
-                                            matching_engine.normalize_string(track_title),
-                                            matching_engine.normalize_string(result_name)
-                                        )
-
-                                        combined_confidence = (artist_confidence * 0.4 + title_confidence * 0.6)
-                                    except Exception as e:
-                                        print(f"⚠️ Matching engine scoring failed for iTunes Beatport, using first match: {e}")
-                                        combined_confidence = 1.0
-                                        best_match = result
-                                        break
-
-                                    result_artist_display = result_artists[0] if result_artists else 'Unknown'
-                                    result_name_display = result.name if hasattr(result, 'name') else 'Unknown'
-                                    print(f"🔍 iTunes Beatport candidate: '{result_artist_display}' - '{result_name_display}' (confidence: {combined_confidence:.3f})")
+                        if not search_results:
+                            continue
 
-                                    if combined_confidence > best_confidence and combined_confidence >= min_confidence:
-                                        best_confidence = combined_confidence
-                                        best_match = result
-                                        print(f"✅ New best iTunes Beatport match: {result_artist_display} - {result_name_display} (confidence: {combined_confidence:.3f})")
+                        # Score all results using the matching engine
+                        match, confidence, match_idx = _discovery_score_candidates(
+                            track_title, track_artist, 0, search_results
+                        )
 
-                                except Exception as e:
-                                    print(f"❌ Error processing iTunes Beatport search result: {e}")
-                                    continue
+                        if match and confidence > best_confidence and confidence >= min_confidence:
+                            best_confidence = confidence
+                            found_track = match
+                            if use_spotify and raw_results:
+                                best_raw_track = raw_results['tracks']['items'][match_idx] if match_idx < len(raw_results['tracks']['items']) else None
+                            else:
+                                best_raw_track = None
+                            print(f"✅ New best Beatport match: {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
 
-                        # If we found a very high confidence match, stop searching
                         if best_confidence >= 0.9:
                             print(f"🎯 High confidence match found ({best_confidence:.3f}), stopping search")
                             break
@@ -25901,16 +25760,25 @@ def _run_beatport_discovery_worker(url_hash):
                         print(f"❌ Error in {discovery_source.upper()} search for query '{search_query}': {e}")
                         continue
 
-                found_track = best_match
-                if found_track:
+                # Extended search (limit=50), tried only when no query above matched; logged as "Strategy 4"
+                if not found_track:
+                    print(f"🔄 Beatport Strategy 4: Extended search with limit=50")
+                    query = f"{track_artist} {track_title}"
                     if use_spotify:
-                        print(f"✅ Final Spotify match selected: {found_track.artists[0]} - {found_track.name} (confidence: {best_confidence:.3f})")
+                        extended_results = spotify_client.search_tracks(query, limit=50)
                     else:
-                        # iTunes Track has 'artists' (list), not 'artist'
-                        found_artists = found_track.artists if hasattr(found_track, 'artists') else []
-                        found_artist = found_artists[0] if found_artists else 'Unknown'
-                        found_name = found_track.name if hasattr(found_track, 'name') else 'Unknown'
-                        print(f"✅ Final iTunes match selected: {found_artist} - {found_name} (confidence: {best_confidence:.3f})")
+                        extended_results = itunes_client_instance.search_tracks(query, limit=50)
+                    if extended_results:
+                        match, confidence, _ = _discovery_score_candidates(
+                            track_title, track_artist, 0, extended_results
+                        )
+                        if match and confidence >= min_confidence:
+                            found_track = match
+                            best_confidence = confidence
+                            print(f"✅ Strategy 4 Beatport match (extended): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
+
+                if found_track:
+                    print(f"✅ Final Beatport match: {found_track.artists[0]} - {found_track.name} (confidence: {best_confidence:.3f})")
                 else:
                     print(f"❌ No suitable match found (best confidence was {best_confidence:.3f}, required {min_confidence:.3f})")
 
@@ -25922,8 +25790,9 @@ def _run_beatport_discovery_worker(url_hash):
                         'artist': track_artist
                     },
                     'status': 'found' if found_track else 'not_found',
-                    'status_class': 'found' if found_track else 'not-found',  # Add status class for CSS styling
-                    'discovery_source': discovery_source
+                    'status_class': 'found' if found_track else 'not-found',
+                    'discovery_source': discovery_source,
+                    'confidence': best_confidence
                 }
 
                 if found_track: