From e77dbf3f1ef646b0f68345d39367446e78d96030 Mon Sep 17 00:00:00 2001 From: Broque Thomas Date: Wed, 25 Feb 2026 22:43:33 -0800 Subject: [PATCH] Refactor matching + use improved discovery scoring Introduce a generic score_track_match(...) in core/matching_engine.py and make calculate_match_confidence(...) delegate to it. The new scorer is source-agnostic, consolidates artist/title/duration logic (core-title fast path, cleaned similarity, weighted 60/30/10 scoring) and improves artist matching. In web_server.py add cache-validation (_validate_discovery_cache_artist) and a reusable _discovery_score_candidates(...) helper that calls the new scorer. Propagate per-match confidence through discovery flows (Tidal, YouTube, ListenBrainz, Beatport), increase Spotify/iTunes search limits, add an extended high-limit search strategy, tighten per-source thresholds, and save match confidence to the discovery cache. Overall this centralizes and standardizes matching logic and improves accuracy/validation for cached discovery results. --- core/matching_engine.py | 84 ++-- web_server.py | 849 +++++++++++++++++----------------------- 2 files changed, 410 insertions(+), 523 deletions(-) diff --git a/core/matching_engine.py b/core/matching_engine.py index 75a95336..bc7ec1ed 100644 --- a/core/matching_engine.py +++ b/core/matching_engine.py @@ -238,49 +238,67 @@ class MusicMatchingEngine: diff_ratio = abs(duration1 - duration2) / max(duration1, duration2) return max(0, 1.0 - diff_ratio * 5) - def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]: - """Calculates a confidence score using a prioritized model, starting with a strict 'core' title check.""" - - # --- Artist Scoring (calculated once) --- - spotify_artists_cleaned = [self.clean_artist(a) for a in spotify_track.artists if a] - plex_artist_normalized = self.normalize_string(plex_track.artist) - plex_artist_cleaned = self.clean_artist(plex_track.artist) + def score_track_match(self, source_title: str, source_artists: List[str], + source_duration_ms: int, candidate_title: str, + candidate_artists: List[str], candidate_duration_ms: int) -> Tuple[float, str]: + """Generic track matching — same logic as calculate_match_confidence but type-agnostic. + + Works for any two tracks regardless of source (Spotify, iTunes, YouTube, Tidal, etc.). + Uses clean_title/clean_artist for proper feat. stripping, core title fast path, + duration similarity, and 60/30/10 weighted scoring. + + Returns (confidence, match_type) tuple. + """ + # --- Artist Scoring --- + source_artists_cleaned = [self.clean_artist(a) for a in source_artists if a] best_artist_score = 0.0 - for spotify_artist in spotify_artists_cleaned: - if spotify_artist and spotify_artist in plex_artist_normalized: - best_artist_score = 1.0 + for src_artist in source_artists_cleaned: + for raw_cand_artist in candidate_artists: + if not raw_cand_artist: + continue + cand_artist_normalized = self.normalize_string(raw_cand_artist) + cand_artist_cleaned = self.clean_artist(raw_cand_artist) + # Check containment (e.g., "drake" in "drake 21 savage") + if src_artist and src_artist in cand_artist_normalized: + best_artist_score = 1.0 + break + score = self.similarity_score(src_artist, cand_artist_cleaned) + if score > best_artist_score: + best_artist_score = score + if best_artist_score >= 1.0: break - score = self.similarity_score(spotify_artist, plex_artist_cleaned) - if score > best_artist_score: - best_artist_score = score artist_score = best_artist_score - - # --- Priority 1: Core Title Match (for exact matches like "Girls", "APT.", "LIL DEMON") --- - spotify_core_title = self.get_core_string(spotify_track.name) - plex_core_title = self.get_core_string(plex_track.title) - if spotify_core_title and spotify_core_title == plex_core_title: - # SAFETY CHECK: Only give high confidence if artist also matches reasonably well - # This prevents "Artist A - Girls" from matching "Artist Z - Girls" with high confidence - if artist_score >= 0.75: # Require decent artist match - # If the core titles are identical and artists match, we are highly confident - confidence = 0.90 + (artist_score * 0.09) # Max score of 0.99 + # --- Priority 1: Core Title Match --- + source_core_title = self.get_core_string(source_title) + candidate_core_title = self.get_core_string(candidate_title) + + if source_core_title and source_core_title == candidate_core_title: + if artist_score >= 0.75: + confidence = 0.90 + (artist_score * 0.09) return confidence, "core_title_match" - # If artist score is too low, fall through to standard weighted calculation - # --- Priority 2: Fuzzy Title Match (for variations, typos, etc.) --- - spotify_title_cleaned = self.clean_title(spotify_track.name) - plex_title_cleaned = self.clean_title(plex_track.title) - - title_score = self.similarity_score(spotify_title_cleaned, plex_title_cleaned) - duration_score = self.duration_similarity(spotify_track.duration_ms, plex_track.duration if plex_track.duration else 0) + # --- Priority 2: Fuzzy Title Match --- + source_title_cleaned = self.clean_title(source_title) + candidate_title_cleaned = self.clean_title(candidate_title) + + title_score = self.similarity_score(source_title_cleaned, candidate_title_cleaned) + duration_score = self.duration_similarity(source_duration_ms, candidate_duration_ms) - # Use a standard weighted calculation if the core titles didn't match confidence = (title_score * 0.60) + (artist_score * 0.30) + (duration_score * 0.10) - match_type = "standard_match" + return confidence, "standard_match" - return confidence, match_type + def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]: + """Calculates a confidence score using a prioritized model, starting with a strict 'core' title check.""" + return self.score_track_match( + source_title=spotify_track.name, + source_artists=spotify_track.artists, + source_duration_ms=spotify_track.duration_ms, + candidate_title=plex_track.title, + candidate_artists=[plex_track.artist] if plex_track.artist else [], + candidate_duration_ms=plex_track.duration if plex_track.duration else 0 + ) def find_best_match(self, spotify_track: SpotifyTrack, plex_tracks: List[PlexTrackInfo]) -> MatchResult: """Finds the best Plex track match from a list of candidates.""" diff --git a/web_server.py b/web_server.py index a3926db7..80ebb146 100644 --- a/web_server.py +++ b/web_server.py @@ -18075,6 +18075,107 @@ def _get_discovery_cache_key(title, artist): return (norm_title, norm_artist) +def _validate_discovery_cache_artist(source_artist, cached_match): + """Check if a cached discovery match has a valid artist. Returns False if the + cached result's artist doesn't match the source artist (stale/wrong cache entry).""" + min_artist_similarity = 0.4 + source_artist_cleaned = matching_engine.clean_artist(source_artist) + if not source_artist_cleaned: + return True # No source artist to validate against + + cached_artists = cached_match.get('artists', []) + if not cached_artists: + return True # No cached artist to check + + best_sim = 0.0 + for cand_artist in cached_artists: + if not cand_artist: + continue + cand_normalized = matching_engine.normalize_string(cand_artist) + if source_artist_cleaned in cand_normalized: + return True + cand_cleaned = matching_engine.clean_artist(cand_artist) + sim = matching_engine.similarity_score(source_artist_cleaned, cand_cleaned) + if sim > best_sim: + best_sim = sim + + if best_sim < min_artist_similarity: + print(f"🚫 Cache artist mismatch: source='{source_artist}' vs cached='{cached_artists[0]}' (sim={best_sim:.2f}), re-searching") + return False + return True + + +def _discovery_score_candidates(source_title, source_artist, source_duration_ms, search_results): + """Score search results against a source track using the matching engine. + + Uses matching_engine.score_track_match() which applies clean_title/clean_artist, + core title fast path, duration similarity, and 60/30/10 weighted scoring. + + Includes a minimum artist similarity check to prevent "right title, wrong artist" + matches (e.g., "Yoga" by bbno$ matching "Yoga" by Sleep Music Lullabies). + + Args: + source_title: The source track title (already cleaned for YouTube, raw for others) + source_artist: The source track primary artist + source_duration_ms: The source track duration in ms (0 if unknown) + search_results: List of Track objects (Spotify or iTunes) from search + + Returns: + (best_match, best_confidence, best_index) or (None, 0.0, -1) if no results + """ + best_match = None + best_confidence = 0.0 + best_index = -1 + min_artist_similarity = 0.4 # Reject candidates where artist doesn't match at all + + source_artist_cleaned = matching_engine.clean_artist(source_artist) + + for idx, result in enumerate(search_results): + try: + result_artists = result.artists if hasattr(result, 'artists') and result.artists else [] + result_name = result.name if hasattr(result, 'name') else '' + result_duration = result.duration_ms if hasattr(result, 'duration_ms') else 0 + + # Quick artist sanity check — reject if artist doesn't match at all + # This prevents "right title, wrong artist" false positives + best_artist_sim = 0.0 + for cand_artist in result_artists: + if not cand_artist: + continue + cand_cleaned = matching_engine.clean_artist(cand_artist) + # Check containment (e.g., "drake" in "drake 21 savage") + cand_normalized = matching_engine.normalize_string(cand_artist) + if source_artist_cleaned and source_artist_cleaned in cand_normalized: + best_artist_sim = 1.0 + break + sim = matching_engine.similarity_score(source_artist_cleaned, cand_cleaned) + if sim > best_artist_sim: + best_artist_sim = sim + + if best_artist_sim < min_artist_similarity: + continue + + confidence, match_type = matching_engine.score_track_match( + source_title=source_title, + source_artists=[source_artist], + source_duration_ms=source_duration_ms, + candidate_title=result_name, + candidate_artists=result_artists, + candidate_duration_ms=result_duration + ) + + if confidence > best_confidence: + best_confidence = confidence + best_match = result + best_index = idx + + except Exception as e: + print(f"⚠️ Error scoring candidate {idx}: {e}") + continue + + return best_match, best_confidence, best_index + + def _run_tidal_discovery_worker(playlist_id): """Background worker for Tidal discovery process (Spotify preferred, iTunes fallback)""" try: @@ -18096,10 +18197,6 @@ def _run_tidal_discovery_worker(playlist_id): # Store discovery source in state for frontend state['discovery_source'] = discovery_source - # Import matching engine for validation (like sync.py) - from core.matching_engine import MusicMatchingEngine - matching_engine = MusicMatchingEngine() - successful_discoveries = 0 for i, tidal_track in enumerate(playlist.tracks): @@ -18114,7 +18211,7 @@ def _run_tidal_discovery_worker(playlist_id): try: cache_db = get_database() cached_match = cache_db.get_discovery_cache_match(cache_key[0], cache_key[1], discovery_source) - if cached_match: + if cached_match and _validate_discovery_cache_artist(tidal_track.artists[0] if tidal_track.artists else '', cached_match): print(f"⚡ CACHE HIT [{i+1}/{len(playlist.tracks)}]: {tidal_track.name} by {', '.join(tidal_track.artists)}") result = { 'tidal_track': { @@ -18159,51 +18256,38 @@ def _run_tidal_discovery_worker(playlist_id): 'discovery_source': discovery_source } + match_confidence = 0.0 + if use_spotify and isinstance(track_result, tuple): - # Spotify: Function returns (Track, raw_data) - track_obj, raw_track_data = track_result - # Use full album object from raw API response + # Spotify: Function returns (Track, raw_data, confidence) + track_obj, raw_track_data, match_confidence = track_result album_obj = raw_track_data.get('album', {}) if raw_track_data else {} match_data = { 'id': track_obj.id, 'name': track_obj.name, - 'artists': track_obj.artists, # Already a list of strings - 'album': album_obj, # Full album object with images + 'artists': track_obj.artists, + 'album': album_obj, 'duration_ms': track_obj.duration_ms, 'external_urls': track_obj.external_urls, 'source': 'spotify' } - result['spotify_data'] = match_data # Backwards compatibility + result['spotify_data'] = match_data result['match_data'] = match_data result['status'] = 'found' + result['confidence'] = match_confidence successful_discoveries += 1 state['spotify_matches'] = successful_discoveries elif not use_spotify and track_result and isinstance(track_result, dict): - # iTunes: Function returns a dict with track data + # iTunes: Function returns a dict with track data (includes 'confidence' key) + match_confidence = track_result.pop('confidence', 0.80) match_data = track_result match_data['source'] = 'itunes' - result['spotify_data'] = match_data # Use same field for frontend compatibility - result['match_data'] = match_data - result['status'] = 'found' - successful_discoveries += 1 - state['spotify_matches'] = successful_discoveries - - elif use_spotify and track_result: - # Spotify fallback for old format (shouldn't happen after update) - match_data = { - 'id': track_result.id, - 'name': track_result.name, - 'artists': track_result.artists, - 'album': {'name': track_result.album, 'album_type': 'album', 'images': []}, - 'duration_ms': track_result.duration_ms, - 'external_urls': track_result.external_urls, - 'source': 'spotify' - } result['spotify_data'] = match_data result['match_data'] = match_data result['status'] = 'found' + result['confidence'] = match_confidence successful_discoveries += 1 state['spotify_matches'] = successful_discoveries @@ -18212,11 +18296,11 @@ def _run_tidal_discovery_worker(playlist_id): try: cache_db = get_database() cache_db.save_discovery_cache_match( - cache_key[0], cache_key[1], discovery_source, 0.80, + cache_key[0], cache_key[1], discovery_source, match_confidence, result['match_data'], tidal_track.name, tidal_track.artists[0] if tidal_track.artists else '' ) - print(f"💾 CACHE SAVED: {tidal_track.name}") + print(f"💾 CACHE SAVED: {tidal_track.name} (confidence: {match_confidence:.3f})") except Exception as cache_err: print(f"⚠️ Cache save error: {cache_err}") @@ -18269,8 +18353,8 @@ def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client itunes_client: iTunes client instance (required when use_spotify=False) Returns: - For Spotify: (Track, raw_data) tuple or None - For iTunes: dict with track data or None + For Spotify: (Track, raw_data, confidence) tuple or None + For iTunes: dict with track data (includes 'confidence' key) or None """ if use_spotify: if not spotify_client or not spotify_client.is_authenticated(): @@ -18288,13 +18372,13 @@ def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client return None artist_name = artists[0] # Use primary artist + source_duration = getattr(tidal_track, 'duration_ms', 0) or 0 source_name = "Spotify" if use_spotify else "iTunes" print(f"🔍 Tidal track: '{artist_name}' - '{track_name}' (searching {source_name})") # Use matching engine to generate search queries (with fallback) try: - # Create a temporary SpotifyTrack-like object for the matching engine temp_track = type('TempTrack', (), { 'name': track_name, 'artists': [artist_name], @@ -18304,133 +18388,53 @@ def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client print(f"🔍 Generated {len(search_queries)} search queries for Tidal track") except Exception as e: print(f"⚠️ Matching engine failed for Tidal, falling back to basic queries: {e}") - # Fallback to original simple queries - search_queries = [ - f'track:"{track_name}" artist:"{artist_name}"', - f'"{track_name}" "{artist_name}"', - f'{track_name} {artist_name}' - ] + if use_spotify: + search_queries = [ + f'track:"{track_name}" artist:"{artist_name}"', + f'"{track_name}" "{artist_name}"', + f'{track_name} {artist_name}' + ] + else: + search_queries = [ + f'{artist_name} {track_name}', + f'{track_name} {artist_name}', + track_name + ] - # Find best match using confidence scoring best_match = None - best_match_raw = None # Store raw Spotify API data for full album info + best_match_raw = None best_confidence = 0.0 - min_confidence = 0.7 # Higher threshold for Tidal since data is cleaner + min_confidence = 0.7 for query_idx, search_query in enumerate(search_queries): try: print(f"🔍 Tidal query {query_idx + 1}/{len(search_queries)}: {search_query} ({source_name})") if use_spotify: - # SPOTIFY PATH: Get raw Spotify API response to access full album object with images - raw_results = spotify_client.sp.search(q=search_query, type='track', limit=5) + raw_results = spotify_client.sp.search(q=search_query, type='track', limit=10) if not raw_results or 'tracks' not in raw_results or not raw_results['tracks']['items']: continue - - # Also get Track objects for matching logic - results = spotify_client.search_tracks(search_query, limit=5) - + results = spotify_client.search_tracks(search_query, limit=10) if not results: continue - - # Score each result using matching engine - for idx, result in enumerate(results): - raw_track = raw_results['tracks']['items'][idx] if idx < len(raw_results['tracks']['items']) else None - try: - # Calculate confidence using matching engine's similarity scoring (with fallback) - try: - artist_confidence = 0.0 - if result.artists: - # Get best artist match confidence - for result_artist in result.artists: - artist_sim = matching_engine.similarity_score( - matching_engine.normalize_string(artist_name), - matching_engine.normalize_string(result_artist) - ) - artist_confidence = max(artist_confidence, artist_sim) - - # Calculate title confidence - title_confidence = matching_engine.similarity_score( - matching_engine.normalize_string(track_name), - matching_engine.normalize_string(result.name) - ) - - # Combined confidence (equal weighting for Tidal clean data) - combined_confidence = (artist_confidence * 0.5 + title_confidence * 0.5) - except Exception as e: - print(f"⚠️ Matching engine scoring failed for Tidal, using first match: {e}") - # Fallback: just take the first result if matching engine fails - combined_confidence = 1.0 # Set high to accept this match - best_match = result - break - - print(f"🔍 Tidal candidate: '{result.artists[0]}' - '{result.name}' (confidence: {combined_confidence:.3f})") - - # Update best match if this is better - if combined_confidence > best_confidence and combined_confidence >= min_confidence: - best_confidence = combined_confidence - best_match = result - best_match_raw = raw_track # Store raw data with full album object - print(f"✅ New best Tidal match: {result.artists[0]} - {result.name} (confidence: {combined_confidence:.3f})") - - except Exception as e: - print(f"❌ Error processing Tidal search result: {e}") - continue - else: - # ITUNES PATH: Search using iTunes client - # For iTunes, use a simpler query format - simple_query = f"{artist_name} {track_name}" - itunes_results = itunes_client.search_tracks(simple_query, limit=5) - - if not itunes_results: + raw_results = None + results = itunes_client.search_tracks(search_query, limit=10) + if not results: continue - # Score each iTunes result - # Note: iTunes returns Track dataclass objects with 'artists' (list), not 'artist' - for result in itunes_results: - try: - # Calculate confidence using matching engine - try: - artist_confidence = 0.0 - # iTunes Track has 'artists' as a list - result_artists = result.artists if hasattr(result, 'artists') else [] - result_artist = result_artists[0] if result_artists else '' - if result_artist: - artist_sim = matching_engine.similarity_score( - matching_engine.normalize_string(artist_name), - matching_engine.normalize_string(result_artist) - ) - artist_confidence = artist_sim - - # Calculate title confidence - result_name = result.name if hasattr(result, 'name') else '' - title_confidence = matching_engine.similarity_score( - matching_engine.normalize_string(track_name), - matching_engine.normalize_string(result_name) - ) - - combined_confidence = (artist_confidence * 0.5 + title_confidence * 0.5) - except Exception as e: - print(f"⚠️ Matching engine scoring failed for iTunes Tidal, using first match: {e}") - combined_confidence = 1.0 - best_match = result - break - - result_artist_display = result_artists[0] if result_artists else 'Unknown' - result_name_display = result.name if hasattr(result, 'name') else 'Unknown' - print(f"🔍 iTunes Tidal candidate: '{result_artist_display}' - '{result_name_display}' (confidence: {combined_confidence:.3f})") - - if combined_confidence > best_confidence and combined_confidence >= min_confidence: - best_confidence = combined_confidence - best_match = result - print(f"✅ New best iTunes Tidal match: {result_artist_display} - {result_name_display} (confidence: {combined_confidence:.3f})") + # Score all results using the matching engine + match, confidence, match_idx = _discovery_score_candidates( + track_name, artist_name, source_duration, results + ) - except Exception as e: - print(f"❌ Error processing iTunes Tidal search result: {e}") - continue + if match and confidence > best_confidence and confidence >= min_confidence: + best_confidence = confidence + best_match = match + if use_spotify and raw_results: + best_match_raw = raw_results['tracks']['items'][match_idx] if match_idx < len(raw_results['tracks']['items']) else None + print(f"✅ New best Tidal match: {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") - # If we found a very high confidence match, stop searching if best_confidence >= 0.9: print(f"🎯 High confidence Tidal match found ({best_confidence:.3f}), stopping search") break @@ -18439,19 +18443,33 @@ def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client print(f"❌ Error in Tidal {source_name} search for query '{search_query}': {e}") continue + # Strategy 4: Extended search with higher limit (last resort) + if not best_match: + print(f"🔄 Tidal Strategy 4: Extended search with limit=50") + query = f"{artist_name} {track_name}" + if use_spotify: + extended_results = spotify_client.search_tracks(query, limit=50) + else: + extended_results = itunes_client.search_tracks(query, limit=50) + if extended_results: + match, confidence, match_idx = _discovery_score_candidates( + track_name, artist_name, source_duration, extended_results + ) + if match and confidence >= min_confidence: + best_match = match + best_confidence = confidence + print(f"✅ Strategy 4 Tidal match (extended): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") + if best_match: if use_spotify: print(f"✅ Final Tidal Spotify match: {best_match.artists[0]} - {best_match.name} (confidence: {best_confidence:.3f})") - return (best_match, best_match_raw) # Return both Track object and raw data + return (best_match, best_match_raw, best_confidence) else: - # For iTunes, return a dict with normalized data - # Note: iTunes Track dataclass has 'artists' (list) and 'image_url', not 'artist' and 'artwork_url' result_artists = best_match.artists if hasattr(best_match, 'artists') else [] result_artist = result_artists[0] if result_artists else 'Unknown' result_name = best_match.name if hasattr(best_match, 'name') else 'Unknown' print(f"✅ Final Tidal iTunes match: {result_artist} - {result_name} (confidence: {best_confidence:.3f})") - # Build iTunes result dict with album info album_name = best_match.album if hasattr(best_match, 'album') else 'Unknown Album' image_url = best_match.image_url if hasattr(best_match, 'image_url') else '' track_id = best_match.id if hasattr(best_match, 'id') else '' @@ -18467,7 +18485,8 @@ def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client 'images': [{'url': image_url, 'height': 300, 'width': 300}] if image_url else [] }, 'duration_ms': duration_ms, - 'source': 'itunes' + 'source': 'itunes', + 'confidence': best_confidence } else: print(f"❌ No suitable Tidal match found (best confidence was {best_confidence:.3f}, required {min_confidence:.3f})") @@ -18880,7 +18899,7 @@ def _run_youtube_discovery_worker(url_hash): try: cache_db = get_database() cached_match = cache_db.get_discovery_cache_match(cache_key[0], cache_key[1], discovery_source) - if cached_match: + if cached_match and _validate_discovery_cache_artist(cleaned_artist, cached_match): print(f"⚡ CACHE HIT [{i+1}/{len(tracks)}]: {cleaned_artist} - {cleaned_title}") result = { 'index': i, @@ -18902,14 +18921,15 @@ def _run_youtube_discovery_worker(url_hash): except Exception as cache_err: print(f"⚠️ Cache lookup error: {cache_err}") - # Try multiple search strategies using matching_engine for better accuracy + # Try multiple search strategies using matching engine matched_track = None best_confidence = 0.0 - min_confidence = 0.6 # Keep same threshold as before + best_raw_track = None + min_confidence = 0.6 + source_duration = track.get('duration_ms', 0) or 0 - # Strategy 1: Use matching_engine search queries (with fallback) + # Strategy 1: Use matching_engine search queries try: - # Create a temporary SpotifyTrack-like object for the matching engine temp_track = type('TempTrack', (), { 'name': cleaned_title, 'artists': [cleaned_artist], @@ -18919,95 +18939,40 @@ def _run_youtube_discovery_worker(url_hash): print(f"🔍 Generated {len(search_queries)} search queries for YouTube track") except Exception as e: print(f"⚠️ Matching engine failed for YouTube, falling back to basic query: {e}") - # Fallback to original simple query - search_queries = [f"artist:{cleaned_artist} track:{cleaned_title}"] - - # Store raw data for best match - best_raw_track = None + search_queries = [f"{cleaned_artist} {cleaned_title}", cleaned_title] for query_idx, search_query in enumerate(search_queries): try: print(f"🔍 YouTube query {query_idx + 1}/{len(search_queries)}: {search_query}") - # Search using appropriate provider raw_results = None search_results = None if use_spotify: - # Get raw Spotify API response to access full album object with images - raw_results = spotify_client.sp.search(q=search_query, type='track', limit=5) + raw_results = spotify_client.sp.search(q=search_query, type='track', limit=10) if not raw_results or 'tracks' not in raw_results or not raw_results['tracks']['items']: continue - search_results = spotify_client.search_tracks(search_query, limit=5) + search_results = spotify_client.search_tracks(search_query, limit=10) else: - # Use iTunes search - search_results = itunes_client.search_tracks(search_query, limit=5) + search_results = itunes_client.search_tracks(search_query, limit=10) if not search_results: continue - # Score each result using matching engine - for result_idx, search_result in enumerate(search_results): - raw_track = None - if use_spotify and raw_results: - raw_track = raw_results['tracks']['items'][result_idx] if result_idx < len(raw_results['tracks']['items']) else None - try: - # Calculate confidence using matching engine's similarity scoring (with fallback) - try: - artist_confidence = 0.0 - if search_result.artists: - # Get best artist match confidence - for result_artist in search_result.artists: - artist_sim = matching_engine.similarity_score( - matching_engine.normalize_string(cleaned_artist), - matching_engine.normalize_string(result_artist) - ) - artist_confidence = max(artist_confidence, artist_sim) - - # Calculate title confidence - title_confidence = matching_engine.similarity_score( - matching_engine.normalize_string(cleaned_title), - matching_engine.normalize_string(search_result.name) - ) - - # Combined confidence (70% title, 30% artist - same as original) - combined_confidence = (title_confidence * 0.7 + artist_confidence * 0.3) - except Exception as e: - print(f"⚠️ Matching engine scoring failed for YouTube, using basic similarity: {e}") - # Fallback to original character overlap method - def _calculate_similarity_fallback(str1, str2): - if not str1 or not str2: - return 0 - str1 = str1.lower().strip() - str2 = str2.lower().strip() - if str1 == str2: - return 1.0 - set1 = set(str1.replace(' ', '')) - set2 = set(str2.replace(' ', '')) - if not set1 or not set2: - return 0 - intersection = len(set1.intersection(set2)) - union = len(set1.union(set2)) - return intersection / union if union > 0 else 0 - - title_score = _calculate_similarity_fallback(cleaned_title, search_result.name) - artist_score = _calculate_similarity_fallback(cleaned_artist, search_result.artists[0] if search_result.artists else "") - combined_confidence = (title_score * 0.7) + (artist_score * 0.3) - - print(f"🔍 YouTube candidate: '{search_result.artists[0]}' - '{search_result.name}' (confidence: {combined_confidence:.3f})") - - # Update best match if this is better - if combined_confidence > best_confidence and combined_confidence >= min_confidence: - best_confidence = combined_confidence - matched_track = search_result - best_raw_track = raw_track # Store raw data with full album object (Spotify only) - print(f"✅ New best YouTube match: {search_result.artists[0]} - {search_result.name} (confidence: {combined_confidence:.3f})") + # Score all results using the matching engine + match, confidence, match_idx = _discovery_score_candidates( + cleaned_title, cleaned_artist, source_duration, search_results + ) - except Exception as e: - print(f"❌ Error processing YouTube search result: {e}") - continue + if match and confidence > best_confidence and confidence >= min_confidence: + best_confidence = confidence + matched_track = match + if use_spotify and raw_results: + best_raw_track = raw_results['tracks']['items'][match_idx] if match_idx < len(raw_results['tracks']['items']) else None + else: + best_raw_track = None + print(f"✅ New best YouTube match: {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") - # If we found a very high confidence match, stop searching if best_confidence >= 0.9: print(f"🎯 High confidence YouTube match found ({best_confidence:.3f}), stopping search") break @@ -19019,31 +18984,59 @@ def _run_youtube_discovery_worker(url_hash): if matched_track: print(f"✅ Strategy 1 YouTube match: {matched_track.artists[0]} - {matched_track.name} (confidence: {best_confidence:.3f})") - # Strategy 2: Swapped search (if first failed) - keep simple for fallback + # Strategy 2: Swapped search (if first failed) - score results properly if not matched_track: print("🔄 YouTube Strategy 2: Trying swapped search (artist/title reversed)") - query = f"artist:{cleaned_title} track:{cleaned_artist}" if use_spotify: - fallback_results = spotify_client.search_tracks(query, limit=3) + query = f"artist:{cleaned_title} track:{cleaned_artist}" + fallback_results = spotify_client.search_tracks(query, limit=5) else: - fallback_results = itunes_client.search_tracks(query, limit=3) + query = f"{cleaned_title} {cleaned_artist}" + fallback_results = itunes_client.search_tracks(query, limit=5) if fallback_results: - matched_track = fallback_results[0] - print(f"✅ Strategy 2 YouTube match (swapped): {matched_track.artists[0]} - {matched_track.name}") + match, confidence, _ = _discovery_score_candidates( + cleaned_title, cleaned_artist, source_duration, fallback_results + ) + if match and confidence >= min_confidence: + matched_track = match + best_confidence = confidence + print(f"✅ Strategy 2 YouTube match (swapped): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") - # Strategy 3: Raw data search (if still failed) - keep simple for fallback + # Strategy 3: Raw data search (if still failed) - score results properly if not matched_track: - raw_title = track['raw_title'] - raw_artist = track['raw_artist'] + raw_title = track.get('raw_title', cleaned_title) + raw_artist = track.get('raw_artist', cleaned_artist) print(f"🔄 YouTube Strategy 3: Trying raw data search: '{raw_artist} {raw_title}'") query = f"{raw_artist} {raw_title}" if use_spotify: - fallback_results = spotify_client.search_tracks(query, limit=3) + fallback_results = spotify_client.search_tracks(query, limit=5) else: - fallback_results = itunes_client.search_tracks(query, limit=3) + fallback_results = itunes_client.search_tracks(query, limit=5) if fallback_results: - matched_track = fallback_results[0] - print(f"✅ Strategy 3 YouTube match (raw): {matched_track.artists[0]} - {matched_track.name}") + match, confidence, _ = _discovery_score_candidates( + cleaned_title, cleaned_artist, source_duration, fallback_results + ) + if match and confidence >= min_confidence: + matched_track = match + best_confidence = confidence + print(f"✅ Strategy 3 YouTube match (raw): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") + + # Strategy 4: Extended search with higher limit (last resort) + if not matched_track: + print(f"🔄 YouTube Strategy 4: Extended search with limit=50") + query = f"{cleaned_artist} {cleaned_title}" + if use_spotify: + extended_results = spotify_client.search_tracks(query, limit=50) + else: + extended_results = itunes_client.search_tracks(query, limit=50) + if extended_results: + match, confidence, _ = _discovery_score_candidates( + cleaned_title, cleaned_artist, source_duration, extended_results + ) + if match and confidence >= min_confidence: + matched_track = match + best_confidence = confidence + print(f"✅ Strategy 4 YouTube match (extended): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") # Create result entry result = { @@ -19056,24 +19049,23 @@ def _run_youtube_discovery_worker(url_hash): 'spotify_artist': matched_track.artists[0] if matched_track else '', 'spotify_album': matched_track.album if matched_track else '', 'duration': f"{track['duration_ms'] // 60000}:{(track['duration_ms'] % 60000) // 1000:02d}" if track['duration_ms'] else '0:00', - 'discovery_source': discovery_source + 'discovery_source': discovery_source, + 'confidence': best_confidence } if matched_track: - state['spotify_matches'] += 1 # Keep key name for compatibility + state['spotify_matches'] += 1 # Build album data based on provider if use_spotify and best_raw_track: album_data = best_raw_track.get('album', {}) else: - # For iTunes or when raw data unavailable album_data = { 'name': matched_track.album, 'album_type': 'album', 'images': [{'url': matched_track.image_url}] if hasattr(matched_track, 'image_url') and matched_track.image_url else [] } - # Store track data with source info result['matched_data'] = { 'id': matched_track.id, 'name': matched_track.name, @@ -19082,10 +19074,9 @@ def _run_youtube_discovery_worker(url_hash): 'duration_ms': matched_track.duration_ms, 'source': discovery_source } - # Keep spotify_data for backward compatibility result['spotify_data'] = result['matched_data'] - # Save to discovery cache (only Strategy 1 high-confidence matches) + # Save to discovery cache (only high-confidence matches) if best_confidence >= 0.7: try: cache_db = get_database() @@ -19103,7 +19094,6 @@ def _run_youtube_discovery_worker(url_hash): except Exception as e: print(f"❌ Error processing track {i}: {e}") - # Add failed result result = { 'index': i, 'yt_track': track['name'], @@ -19116,19 +19106,18 @@ def _run_youtube_discovery_worker(url_hash): 'duration': '0:00' } state['discovery_results'].append(result) - + # Complete discovery state['phase'] = 'discovered' state['status'] = 'complete' state['discovery_progress'] = 100 - - # Add activity for discovery completion + playlist_name = playlist['name'] source_label = 'Spotify' if use_spotify else 'iTunes' add_activity_item("✅", f"YouTube Discovery Complete ({source_label})", f"'{playlist_name}' - {state['spotify_matches']}/{len(tracks)} tracks found", "Now") print(f"✅ YouTube discovery complete ({discovery_source}): {state['spotify_matches']}/{len(tracks)} tracks matched") - + except Exception as e: print(f"❌ Error in YouTube discovery worker: {e}") state['status'] = 'error' @@ -19173,7 +19162,7 @@ def _run_listenbrainz_discovery_worker(playlist_mbid): try: cache_db = get_database() cached_match = cache_db.get_discovery_cache_match(cache_key[0], cache_key[1], discovery_source) - if cached_match: + if cached_match and _validate_discovery_cache_artist(cleaned_artist, cached_match): print(f"⚡ CACHE HIT [{i+1}/{len(tracks)}]: {cleaned_artist} - {cleaned_title}") result = { 'index': i, @@ -19195,14 +19184,15 @@ def _run_listenbrainz_discovery_worker(playlist_mbid): except Exception as cache_err: print(f"⚠️ Cache lookup error: {cache_err}") - # Try multiple search strategies using matching_engine for better accuracy + # Try multiple search strategies using matching engine matched_track = None best_confidence = 0.0 - min_confidence = 0.6 # Keep same threshold as YouTube + best_raw_track = None + min_confidence = 0.6 + source_duration = duration_ms or 0 - # Strategy 1: Use matching_engine search queries (with fallback) + # Strategy 1: Use matching_engine search queries try: - # Create a temporary SpotifyTrack-like object for the matching engine temp_track = type('TempTrack', (), { 'name': cleaned_title, 'artists': [cleaned_artist], @@ -19212,95 +19202,40 @@ def _run_listenbrainz_discovery_worker(playlist_mbid): print(f"🔍 Generated {len(search_queries)} search queries for ListenBrainz track") except Exception as e: print(f"⚠️ Matching engine failed for ListenBrainz, falling back to basic query: {e}") - # Fallback to original simple query - search_queries = [f"artist:{cleaned_artist} track:{cleaned_title}"] - - # Store raw data for best match - best_raw_track = None + search_queries = [f"{cleaned_artist} {cleaned_title}", cleaned_title] for query_idx, search_query in enumerate(search_queries): try: print(f"🔍 ListenBrainz query {query_idx + 1}/{len(search_queries)}: {search_query}") - # Search using appropriate provider raw_results = None search_results = None if use_spotify: - # Get raw Spotify API response to access full album object with images - raw_results = spotify_client.sp.search(q=search_query, type='track', limit=5) + raw_results = spotify_client.sp.search(q=search_query, type='track', limit=10) if not raw_results or 'tracks' not in raw_results or not raw_results['tracks']['items']: continue - search_results = spotify_client.search_tracks(search_query, limit=5) + search_results = spotify_client.search_tracks(search_query, limit=10) else: - # Use iTunes search - search_results = itunes_client.search_tracks(search_query, limit=5) + search_results = itunes_client.search_tracks(search_query, limit=10) if not search_results: continue - # Score each result using matching engine - for result_idx, search_result in enumerate(search_results): - raw_track = None - if use_spotify and raw_results: - raw_track = raw_results['tracks']['items'][result_idx] if result_idx < len(raw_results['tracks']['items']) else None - try: - # Calculate confidence using matching engine's similarity scoring (with fallback) - try: - artist_confidence = 0.0 - if search_result.artists: - # Get best artist match confidence - for result_artist in search_result.artists: - artist_sim = matching_engine.similarity_score( - matching_engine.normalize_string(cleaned_artist), - matching_engine.normalize_string(result_artist) - ) - artist_confidence = max(artist_confidence, artist_sim) - - # Calculate title confidence - title_confidence = matching_engine.similarity_score( - matching_engine.normalize_string(cleaned_title), - matching_engine.normalize_string(search_result.name) - ) - - # Combined confidence (70% title, 30% artist - same as YouTube) - combined_confidence = (title_confidence * 0.7 + artist_confidence * 0.3) - except Exception as e: - print(f"⚠️ Matching engine scoring failed for ListenBrainz, using basic similarity: {e}") - # Fallback to original character overlap method - def _calculate_similarity_fallback(str1, str2): - if not str1 or not str2: - return 0 - str1 = str1.lower().strip() - str2 = str2.lower().strip() - if str1 == str2: - return 1.0 - set1 = set(str1.replace(' ', '')) - set2 = set(str2.replace(' ', '')) - if not set1 or not set2: - return 0 - intersection = len(set1.intersection(set2)) - union = len(set1.union(set2)) - return intersection / union if union > 0 else 0 - - title_score = _calculate_similarity_fallback(cleaned_title, search_result.name) - artist_score = _calculate_similarity_fallback(cleaned_artist, search_result.artists[0] if search_result.artists else "") - combined_confidence = (title_score * 0.7) + (artist_score * 0.3) - - print(f"🔍 ListenBrainz candidate: '{search_result.artists[0]}' - '{search_result.name}' (confidence: {combined_confidence:.3f})") - - # Update best match if this is better - if combined_confidence > best_confidence and combined_confidence >= min_confidence: - best_confidence = combined_confidence - matched_track = search_result - best_raw_track = raw_track # Store raw data with full album object (Spotify only) - print(f"✅ New best ListenBrainz match: {search_result.artists[0]} - {search_result.name} (confidence: {combined_confidence:.3f})") + # Score all results using the matching engine + match, confidence, match_idx = _discovery_score_candidates( + cleaned_title, cleaned_artist, source_duration, search_results + ) - except Exception as e: - print(f"❌ Error processing ListenBrainz search result: {e}") - continue + if match and confidence > best_confidence and confidence >= min_confidence: + best_confidence = confidence + matched_track = match + if use_spotify and raw_results: + best_raw_track = raw_results['tracks']['items'][match_idx] if match_idx < len(raw_results['tracks']['items']) else None + else: + best_raw_track = None + print(f"✅ New best ListenBrainz match: {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") - # If we found a very high confidence match, stop searching if best_confidence >= 0.9: print(f"🎯 High confidence ListenBrainz match found ({best_confidence:.3f}), stopping search") break @@ -19312,29 +19247,58 @@ def _run_listenbrainz_discovery_worker(playlist_mbid): if matched_track: print(f"✅ Strategy 1 ListenBrainz match: {matched_track.artists[0]} - {matched_track.name} (confidence: {best_confidence:.3f})") - # Strategy 2: Swapped search (if first failed) - keep simple for fallback + # Strategy 2: Swapped search (if first failed) - score results properly if not matched_track: print("🔄 ListenBrainz Strategy 2: Trying swapped search (artist/title reversed)") - query = f"artist:{cleaned_title} track:{cleaned_artist}" if use_spotify: - fallback_results = spotify_client.search_tracks(query, limit=3) + query = f"artist:{cleaned_title} track:{cleaned_artist}" + fallback_results = spotify_client.search_tracks(query, limit=5) else: - fallback_results = itunes_client.search_tracks(query, limit=3) + query = f"{cleaned_title} {cleaned_artist}" + fallback_results = itunes_client.search_tracks(query, limit=5) if fallback_results: - matched_track = fallback_results[0] - print(f"✅ Strategy 2 ListenBrainz match (swapped): {matched_track.artists[0]} - {matched_track.name}") + match, confidence, _ = _discovery_score_candidates( + cleaned_title, cleaned_artist, source_duration, fallback_results + ) + if match and confidence >= min_confidence: + matched_track = match + best_confidence = confidence + print(f"✅ Strategy 2 ListenBrainz match (swapped): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") - # Strategy 3: Album-based search (if still failed and we have album name) + # Strategy 3: Album-based search (if still failed and we have album name) - score results properly if not matched_track and album_name: print(f"🔄 ListenBrainz Strategy 3: Trying album-based search: '{cleaned_artist} {album_name} {cleaned_title}'") - query = f"artist:{cleaned_artist} album:{album_name} track:{cleaned_title}" if use_spotify: - fallback_results = spotify_client.search_tracks(query, limit=3) + query = f"artist:{cleaned_artist} album:{album_name} track:{cleaned_title}" + fallback_results = spotify_client.search_tracks(query, limit=5) else: - fallback_results = itunes_client.search_tracks(query, limit=3) + query = f"{cleaned_artist} {album_name} {cleaned_title}" + fallback_results = itunes_client.search_tracks(query, limit=5) if fallback_results: - matched_track = fallback_results[0] - print(f"✅ Strategy 3 ListenBrainz match (album): {matched_track.artists[0]} - {matched_track.name}") + match, confidence, _ = _discovery_score_candidates( + cleaned_title, cleaned_artist, source_duration, fallback_results + ) + if match and confidence >= min_confidence: + matched_track = match + best_confidence = confidence + print(f"✅ Strategy 3 ListenBrainz match (album): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") + + # Strategy 4: Extended search with higher limit (last resort) + if not matched_track: + print(f"🔄 ListenBrainz Strategy 4: Extended search with limit=50") + query = f"{cleaned_artist} {cleaned_title}" + if use_spotify: + extended_results = spotify_client.search_tracks(query, limit=50) + else: + extended_results = itunes_client.search_tracks(query, limit=50) + if extended_results: + match, confidence, _ = _discovery_score_candidates( + cleaned_title, cleaned_artist, source_duration, extended_results + ) + if match and confidence >= min_confidence: + matched_track = match + best_confidence = confidence + print(f"✅ Strategy 4 ListenBrainz match (extended): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") # Create result entry result = { @@ -19347,24 +19311,23 @@ def _run_listenbrainz_discovery_worker(playlist_mbid): 'spotify_artist': matched_track.artists[0] if matched_track else '', 'spotify_album': matched_track.album if matched_track else '', 'duration': f"{duration_ms // 60000}:{(duration_ms % 60000) // 1000:02d}" if duration_ms else '0:00', - 'discovery_source': discovery_source + 'discovery_source': discovery_source, + 'confidence': best_confidence } if matched_track: - state['spotify_matches'] += 1 # Keep key name for compatibility + state['spotify_matches'] += 1 # Build album data based on provider if use_spotify and best_raw_track: album_data = best_raw_track.get('album', {}) else: - # For iTunes or when raw data unavailable album_data = { 'name': matched_track.album, 'album_type': 'album', 'images': [{'url': matched_track.image_url}] if hasattr(matched_track, 'image_url') and matched_track.image_url else [] } - # Store track data with source info result['matched_data'] = { 'id': matched_track.id, 'name': matched_track.name, @@ -19373,10 +19336,9 @@ def _run_listenbrainz_discovery_worker(playlist_mbid): 'duration_ms': matched_track.duration_ms, 'source': discovery_source } - # Keep spotify_data for backward compatibility result['spotify_data'] = result['matched_data'] - # Save to discovery cache (only Strategy 1 high-confidence matches) + # Save to discovery cache (only high-confidence matches) if best_confidence >= 0.7: try: cache_db = get_database() @@ -19394,7 +19356,6 @@ def _run_listenbrainz_discovery_worker(playlist_mbid): except Exception as e: print(f"❌ Error processing track {i}: {e}") - # Add failed result result = { 'index': i, 'lb_track': track['track_name'], @@ -19413,7 +19374,6 @@ def _run_listenbrainz_discovery_worker(playlist_mbid): state['status'] = 'complete' state['discovery_progress'] = 100 - # Add activity for discovery completion playlist_name = playlist['name'] source_label = 'Spotify' if use_spotify else 'iTunes' add_activity_item("✅", f"ListenBrainz Discovery Complete ({source_label})", f"'{playlist_name}' - {state['spotify_matches']}/{len(tracks)} tracks found", "Now") @@ -25707,7 +25667,7 @@ def _run_beatport_discovery_worker(url_hash): try: cache_db = get_database() cached_match = cache_db.get_discovery_cache_match(cache_key[0], cache_key[1], discovery_source) - if cached_match: + if cached_match and _validate_discovery_cache_artist(track_artist, cached_match): print(f"⚡ CACHE HIT [{i+1}/{len(tracks)}]: {track_artist} - {track_title}") # Convert artists from ['str'] to [{'name': 'str'}] for Beatport frontend format beatport_artists = cached_match.get('artists', []) @@ -25730,12 +25690,14 @@ def _run_beatport_discovery_worker(url_hash): except Exception as cache_err: print(f"⚠️ Cache lookup error: {cache_err}") - # Use matching engine for sophisticated track matching (like other discovery processes) + # Use matching engine for track matching found_track = None + best_confidence = 0.0 + best_raw_track = None + min_confidence = 0.75 # Higher threshold for Beatport to avoid bad matches # Generate search queries using matching engine (with fallback) try: - # Create a temporary SpotifyTrack-like object for the matching engine temp_track = type('TempTrack', (), { 'name': track_title, 'artists': [track_artist], @@ -25745,154 +25707,51 @@ def _run_beatport_discovery_worker(url_hash): print(f"🔍 Generated {len(search_queries)} search queries using matching engine") except Exception as e: print(f"⚠️ Matching engine failed for Beatport, falling back to basic queries: {e}") - # Fallback to basic search queries - search_queries = [ - f"{track_artist} {track_title}", - f'artist:"{track_artist}" track:"{track_title}"', - f'"{track_artist}" "{track_title}"' - ] - - # Try each search query until we find a good match - best_match = None - best_raw_track = None # Store raw Spotify data for full album info - best_confidence = 0.0 - min_confidence = 0.75 # Increased threshold to avoid bad matches like "Dolce" for "Fancy $hit" + if use_spotify: + search_queries = [ + f"{track_artist} {track_title}", + f'artist:"{track_artist}" track:"{track_title}"', + f'"{track_artist}" "{track_title}"' + ] + else: + search_queries = [ + f"{track_artist} {track_title}", + f"{track_title} {track_artist}", + track_title + ] for query_idx, search_query in enumerate(search_queries): try: print(f"🔍 Query {query_idx + 1}/{len(search_queries)}: {search_query} ({discovery_source.upper()})") + raw_results = None + search_results = None + if use_spotify: - # SPOTIFY PATH: Get raw Spotify API response to access full album object with images raw_results = spotify_client.sp.search(q=search_query, type='track', limit=10) if not raw_results or 'tracks' not in raw_results or not raw_results['tracks']['items']: continue - search_results = spotify_client.search_tracks(search_query, limit=10) - - if not search_results: - continue - - # Use matching engine to find the best match from search results - for result_idx, result in enumerate(search_results): - raw_track = raw_results['tracks']['items'][result_idx] if result_idx < len(raw_results['tracks']['items']) else None - try: - # Calculate confidence using matching engine's similarity scoring (with fallback) - try: - artist_confidence = 0.0 - if result.artists: - # Get best artist match confidence - result_artist_names = [artist for artist in result.artists] - for result_artist in result_artist_names: - artist_sim = matching_engine.similarity_score( - matching_engine.normalize_string(track_artist), - matching_engine.normalize_string(result_artist) - ) - artist_confidence = max(artist_confidence, artist_sim) - - # Calculate title confidence - title_confidence = matching_engine.similarity_score( - matching_engine.normalize_string(track_title), - matching_engine.normalize_string(result.name) - ) - - # Combined confidence (more balanced to avoid bad matches from same artist) - combined_confidence = (artist_confidence * 0.4 + title_confidence * 0.6) - except Exception as e: - print(f"⚠️ Matching engine scoring failed for Beatport, using basic matching: {e}") - # Fallback to simple string matching - artist_match = any(track_artist.lower() in artist.lower() for artist in result.artists) if result.artists else False - title_match = track_title.lower() in result.name.lower() or result.name.lower() in track_title.lower() - combined_confidence = 0.8 if (artist_match and title_match) else 0.4 if (artist_match or title_match) else 0.1 - - print(f"🔍 Match candidate: '{result.artists[0]}' - '{result.name}'") - print(f" Artist confidence: {artist_confidence:.3f} ('{track_artist}' vs '{result.artists[0]}')") - print(f" Title confidence: {title_confidence:.3f} ('{track_title}' vs '{result.name}')") - print(f" Combined confidence: {combined_confidence:.3f} (threshold: {min_confidence})") - - # Additional check for core title similarity (excluding version keywords) - def remove_version_keywords(title): - keywords = ['extended mix', 'radio mix', 'club mix', 'remix', 'extended', 'version', 'mix', 'original'] - clean_title = title.lower() - for keyword in keywords: - clean_title = clean_title.replace(keyword, '').strip(' -()[]') - return clean_title.strip() - - core_title1 = remove_version_keywords(track_title) - core_title2 = remove_version_keywords(result.name) - core_title_confidence = matching_engine.similarity_score(core_title1, core_title2) - - print(f" Core title confidence: {core_title_confidence:.3f} ('{core_title1}' vs '{core_title2}')") - - # Update best match if this is better AND meets all similarity requirements - min_title_confidence = 0.5 # Require at least 50% title similarity - min_core_title_confidence = 0.4 # Require at least 40% core title similarity - if (combined_confidence > best_confidence and - combined_confidence >= min_confidence and - title_confidence >= min_title_confidence and - core_title_confidence >= min_core_title_confidence): - best_confidence = combined_confidence - best_match = result - best_raw_track = raw_track # Store raw data with full album object - print(f"✅ New best match: {result.artists[0]} - {result.name} (confidence: {combined_confidence:.3f})") - - except Exception as e: - print(f"❌ Error processing search result: {e}") - continue - else: - # ITUNES PATH: Search using iTunes client - simple_query = f"{track_artist} {track_title}" - itunes_results = itunes_client_instance.search_tracks(simple_query, limit=10) + search_results = itunes_client_instance.search_tracks(search_query, limit=10) - if not itunes_results: - continue - - # Score each iTunes result - # Note: iTunes returns Track dataclass objects with 'artists' (list), not 'artist' - for result in itunes_results: - try: - # Calculate confidence using matching engine - try: - artist_confidence = 0.0 - # iTunes Track has 'artists' as a list - result_artists = result.artists if hasattr(result, 'artists') else [] - result_artist = result_artists[0] if result_artists else '' - if result_artist: - artist_sim = matching_engine.similarity_score( - matching_engine.normalize_string(track_artist), - matching_engine.normalize_string(result_artist) - ) - artist_confidence = artist_sim - - # Calculate title confidence - result_name = result.name if hasattr(result, 'name') else '' - title_confidence = matching_engine.similarity_score( - matching_engine.normalize_string(track_title), - matching_engine.normalize_string(result_name) - ) - - combined_confidence = (artist_confidence * 0.4 + title_confidence * 0.6) - except Exception as e: - print(f"⚠️ Matching engine scoring failed for iTunes Beatport, using first match: {e}") - combined_confidence = 1.0 - best_match = result - break - - result_artist_display = result_artists[0] if result_artists else 'Unknown' - result_name_display = result.name if hasattr(result, 'name') else 'Unknown' - print(f"🔍 iTunes Beatport candidate: '{result_artist_display}' - '{result_name_display}' (confidence: {combined_confidence:.3f})") + if not search_results: + continue - if combined_confidence > best_confidence and combined_confidence >= min_confidence: - best_confidence = combined_confidence - best_match = result - print(f"✅ New best iTunes Beatport match: {result_artist_display} - {result_name_display} (confidence: {combined_confidence:.3f})") + # Score all results using the matching engine + match, confidence, match_idx = _discovery_score_candidates( + track_title, track_artist, 0, search_results + ) - except Exception as e: - print(f"❌ Error processing iTunes Beatport search result: {e}") - continue + if match and confidence > best_confidence and confidence >= min_confidence: + best_confidence = confidence + found_track = match + if use_spotify and raw_results: + best_raw_track = raw_results['tracks']['items'][match_idx] if match_idx < len(raw_results['tracks']['items']) else None + else: + best_raw_track = None + print(f"✅ New best Beatport match: {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") - # If we found a very high confidence match, stop searching if best_confidence >= 0.9: print(f"🎯 High confidence match found ({best_confidence:.3f}), stopping search") break @@ -25901,16 +25760,25 @@ def _run_beatport_discovery_worker(url_hash): print(f"❌ Error in {discovery_source.upper()} search for query '{search_query}': {e}") continue - found_track = best_match - if found_track: + # Strategy 4: Extended search with higher limit (last resort) + if not found_track: + print(f"🔄 Beatport Strategy 4: Extended search with limit=50") + query = f"{track_artist} {track_title}" if use_spotify: - print(f"✅ Final Spotify match selected: {found_track.artists[0]} - {found_track.name} (confidence: {best_confidence:.3f})") + extended_results = spotify_client.search_tracks(query, limit=50) else: - # iTunes Track has 'artists' (list), not 'artist' - found_artists = found_track.artists if hasattr(found_track, 'artists') else [] - found_artist = found_artists[0] if found_artists else 'Unknown' - found_name = found_track.name if hasattr(found_track, 'name') else 'Unknown' - print(f"✅ Final iTunes match selected: {found_artist} - {found_name} (confidence: {best_confidence:.3f})") + extended_results = itunes_client_instance.search_tracks(query, limit=50) + if extended_results: + match, confidence, _ = _discovery_score_candidates( + track_title, track_artist, 0, extended_results + ) + if match and confidence >= min_confidence: + found_track = match + best_confidence = confidence + print(f"✅ Strategy 4 Beatport match (extended): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") + + if found_track: + print(f"✅ Final Beatport match: {found_track.artists[0]} - {found_track.name} (confidence: {best_confidence:.3f})") else: print(f"❌ No suitable match found (best confidence was {best_confidence:.3f}, required {min_confidence:.3f})") @@ -25922,8 +25790,9 @@ def _run_beatport_discovery_worker(url_hash): 'artist': track_artist }, 'status': 'found' if found_track else 'not_found', - 'status_class': 'found' if found_track else 'not-found', # Add status class for CSS styling - 'discovery_source': discovery_source + 'status_class': 'found' if found_track else 'not-found', + 'discovery_source': discovery_source, + 'confidence': best_confidence } if found_track: