From a4eccff4a59120fcdfb02721d47bc9da20711816 Mon Sep 17 00:00:00 2001 From: Broque Thomas <26755000+Nezreka@users.noreply.github.com> Date: Thu, 30 Apr 2026 09:08:47 -0700 Subject: [PATCH] Lift discovery scoring + tidal-track search to core/discovery/scoring.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both function bodies (_discovery_score_candidates and _search_spotify_for_tidal_track) are byte-identical to the originals. The shared matching_engine instance is injected via init() right after _init_connection_test; the spotify proxy + _get_metadata_fallback_source shim follow the same pattern used elsewhere. web_server.py: 36019 → 35753 (-266 lines). --- core/discovery/scoring.py | 323 ++++++++++++++++++++++++++++++++++++++ web_server.py | 280 +-------------------------------- 2 files changed, 330 insertions(+), 273 deletions(-) create mode 100644 core/discovery/scoring.py diff --git a/core/discovery/scoring.py b/core/discovery/scoring.py new file mode 100644 index 00000000..873369b9 --- /dev/null +++ b/core/discovery/scoring.py @@ -0,0 +1,323 @@ +"""Discovery scoring + tidal-track search — lifted from web_server.py. + +Both function bodies are byte-identical to the originals. The +``spotify_client`` proxy and ``_get_metadata_fallback_source`` shim +let the bodies resolve their original names without modification. +``matching_engine`` is injected via init() because it is constructed +in web_server.py and referenced by name throughout the bodies. +""" +import logging + +from core.metadata.cache import get_metadata_cache +from core.metadata.registry import get_primary_source, get_spotify_client +from core.spotify_client import _is_globally_rate_limited as _spotify_rate_limited + +logger = logging.getLogger(__name__) + + +def _get_metadata_fallback_source(): + """Mirror of web_server._get_metadata_fallback_source — delegates to registry.""" + return get_primary_source() + + +class _SpotifyClientProxy: + """Resolves the global Spotify client lazily through core.metadata.registry.""" + + def __getattr__(self, name): + client = get_spotify_client() + if client is None: + raise AttributeError(name) + return getattr(client, name) + + def __bool__(self): + return get_spotify_client() is not None + + +spotify_client = _SpotifyClientProxy() + + +# Injected at runtime via init(). +matching_engine = None + + +def init(matching_engine_obj): + """Bind the shared matching engine instance from web_server.""" + global matching_engine + matching_engine = matching_engine_obj + + +def _discovery_score_candidates(source_title, source_artist, source_duration_ms, search_results): + """Score search results against a source track using the matching engine. + + Both artist AND title must independently pass minimum similarity floors. + This prevents weighted scoring from allowing a perfect artist to carry a + garbage title (or vice versa). If either dimension doesn't match, the + candidate is rejected — no match is better than a wrong match. + + Args: + source_title: The source track title (already cleaned for YouTube, raw for others) + source_artist: The source track primary artist + source_duration_ms: The source track duration in ms (0 if unknown) + search_results: List of Track objects (Spotify or iTunes) from search + + Returns: + (best_match, best_confidence, best_index) or (None, 0.0, -1) if no results + """ + best_match = None + best_confidence = 0.0 + best_index = -1 + min_artist_similarity = 0.5 + min_title_similarity = 0.5 + + source_artist_cleaned = matching_engine.clean_artist(source_artist) + source_title_cleaned = matching_engine.clean_title(source_title) + source_core_title = matching_engine.get_core_string(source_title) + + for idx, result in enumerate(search_results): + try: + result_artists = result.artists if hasattr(result, 'artists') and result.artists else [] + result_name = result.name if hasattr(result, 'name') else '' + result_duration = result.duration_ms if hasattr(result, 'duration_ms') else 0 + + # Artist floor — both must match, not just the weighted score + best_artist_sim = 0.0 + for cand_artist in result_artists: + if not cand_artist: + continue + cand_cleaned = matching_engine.clean_artist(cand_artist) + cand_normalized = matching_engine.normalize_string(cand_artist) + if source_artist_cleaned and source_artist_cleaned in cand_normalized: + best_artist_sim = 1.0 + break + sim = matching_engine.similarity_score(source_artist_cleaned, cand_cleaned) + if sim > best_artist_sim: + best_artist_sim = sim + + if best_artist_sim < min_artist_similarity: + continue + + # Title floor — both must match, not just the weighted score + cand_title_cleaned = matching_engine.clean_title(result_name) + cand_core_title = matching_engine.get_core_string(result_name) + + # Core title exact match bypasses the floor (e.g., "edamame" == "edamame") + title_passes = False + if source_core_title and cand_core_title and source_core_title == cand_core_title: + title_passes = True + else: + title_sim = matching_engine.similarity_score(source_title_cleaned, cand_title_cleaned) + if title_sim >= min_title_similarity: + title_passes = True + + if not title_passes: + continue + + # Both floors passed — now do full scoring + confidence, match_type = matching_engine.score_track_match( + source_title=source_title, + source_artists=[source_artist], + source_duration_ms=source_duration_ms, + candidate_title=result_name, + candidate_artists=result_artists, + candidate_duration_ms=result_duration + ) + + if confidence > best_confidence: + best_confidence = confidence + best_match = result + best_index = idx + + except Exception as e: + logger.error(f"Error scoring candidate {idx}: {e}") + continue + + return best_match, best_confidence, best_index + + +def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client=None): + """Search Spotify/fallback for a Tidal track using matching_engine for better accuracy + + Args: + tidal_track: The Tidal track to search for + use_spotify: If True, use Spotify; if False, use fallback source + itunes_client: Fallback client instance (required when use_spotify=False) + + Returns: + For Spotify: (Track, raw_data, confidence) tuple or None + For fallback: dict with track data (includes 'confidence' key) or None + """ + if use_spotify: + if not spotify_client or not spotify_client.is_authenticated(): + return None + else: + if not itunes_client: + return None + + try: + # Get track info + track_name = tidal_track.name + artists = tidal_track.artists or [] + + if not artists: + return None + + artist_name = artists[0] # Use primary artist + source_duration = getattr(tidal_track, 'duration_ms', 0) or 0 + source_name = "Spotify" if use_spotify else _get_metadata_fallback_source().capitalize() + + logger.info(f"Tidal track: '{artist_name}' - '{track_name}' (searching {source_name})") + + # Use matching engine to generate search queries (with fallback) + try: + temp_track = type('TempTrack', (), { + 'name': track_name, + 'artists': [artist_name], + 'album': None + })() + search_queries = matching_engine.generate_download_queries(temp_track) + logger.info(f"Generated {len(search_queries)} search queries for Tidal track") + except Exception as e: + logger.error(f"Matching engine failed for Tidal, falling back to basic queries: {e}") + if use_spotify: + search_queries = [ + f'track:"{track_name}" artist:"{artist_name}"', + f'"{track_name}" "{artist_name}"', + f'{track_name} {artist_name}' + ] + else: + search_queries = [ + f'{artist_name} {track_name}', + f'{track_name} {artist_name}', + track_name + ] + + best_match = None + best_match_raw = None + best_confidence = 0.0 + min_confidence = 0.9 + + for query_idx, search_query in enumerate(search_queries): + try: + logger.debug(f"Tidal query {query_idx + 1}/{len(search_queries)}: {search_query} ({source_name})") + + if use_spotify and not _spotify_rate_limited(): + results = spotify_client.search_tracks(search_query, limit=10) + if not results: + continue + else: + results = itunes_client.search_tracks(search_query, limit=10) + if not results: + continue + + # Score all results using the matching engine + match, confidence, match_idx = _discovery_score_candidates( + track_name, artist_name, source_duration, results + ) + + if match and confidence > best_confidence and confidence >= min_confidence: + best_confidence = confidence + best_match = match + if use_spotify and match.id: + _cache = get_metadata_cache() + best_match_raw = _cache.get_entity('spotify', 'track', match.id) + else: + best_match_raw = None + logger.info(f"New best Tidal match: {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") + + if best_confidence >= 0.9: + logger.info(f"High confidence Tidal match found ({best_confidence:.3f}), stopping search") + break + + except Exception as e: + logger.debug(f"Error in Tidal {source_name} search for query '{search_query}': {e}") + continue + + # Strategy 4: Extended search with higher limit (last resort) + if not best_match: + logger.info("Tidal Strategy 4: Extended search with limit=50") + query = f"{artist_name} {track_name}" + if use_spotify: + extended_results = spotify_client.search_tracks(query, limit=50) + else: + extended_results = itunes_client.search_tracks(query, limit=50) + if extended_results: + match, confidence, match_idx = _discovery_score_candidates( + track_name, artist_name, source_duration, extended_results + ) + if match and confidence >= min_confidence: + best_match = match + best_confidence = confidence + logger.info(f"Strategy 4 Tidal match (extended): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") + + if best_match: + if use_spotify: + logger.info(f"Final Tidal Spotify match: {best_match.artists[0]} - {best_match.name} (confidence: {best_confidence:.3f})") + return (best_match, best_match_raw, best_confidence) + else: + result_artists = best_match.artists if hasattr(best_match, 'artists') else [] + result_artist = result_artists[0] if result_artists else 'Unknown' + result_name = best_match.name if hasattr(best_match, 'name') else 'Unknown' + logger.info(f"Final Tidal {source_name} match: {result_artist} - {result_name} (confidence: {best_confidence:.3f})") + + album_name = best_match.album if hasattr(best_match, 'album') else 'Unknown Album' + image_url = best_match.image_url if hasattr(best_match, 'image_url') else '' + track_id = best_match.id if hasattr(best_match, 'id') else '' + duration_ms = best_match.duration_ms if hasattr(best_match, 'duration_ms') else 0 + + # Fetch full track details to get album ID, track_number, etc. + # The Track dataclass strips this data — the API has it + album_obj = { + 'name': album_name, + 'album_type': 'album', + 'release_date': getattr(best_match, 'release_date', '') or '', + 'images': [{'url': image_url, 'height': 300, 'width': 300}] if image_url else [] + } + track_number = None + disc_number = None + if track_id: + try: + detailed = itunes_client.get_track_details(track_id) + if detailed and isinstance(detailed.get('album'), dict): + dt_album = detailed['album'] + if dt_album.get('id'): + album_obj['id'] = dt_album['id'] + if dt_album.get('total_tracks'): + album_obj['total_tracks'] = dt_album['total_tracks'] + if dt_album.get('release_date') and not album_obj.get('release_date'): + album_obj['release_date'] = dt_album['release_date'] + if dt_album.get('album_type'): + album_obj['album_type'] = dt_album['album_type'] + if dt_album.get('images') and not album_obj.get('images'): + album_obj['images'] = dt_album['images'] + if dt_album.get('artists'): + album_obj['artists'] = dt_album['artists'] + if detailed: + track_number = detailed.get('track_number') + disc_number = detailed.get('disc_number') + logger.info(f"[Discovery Enrich] {result_name}: track_number={track_number}, disc={disc_number}") + else: + logger.info(f"[Discovery Enrich] get_track_details returned None for ID {track_id} ({result_name})") + except Exception as _enrich_err: + logger.error(f"[Discovery Enrich] Failed for {result_name} (ID {track_id}): {_enrich_err}") + + result_data = { + 'id': track_id, + 'name': result_name, + 'artists': [result_artist], + 'album': album_obj, + 'duration_ms': duration_ms, + 'source': _get_metadata_fallback_source(), + 'confidence': best_confidence + } + if track_number: + result_data['track_number'] = track_number + if disc_number: + result_data['disc_number'] = disc_number + return result_data + else: + logger.warning(f"No suitable Tidal match found (best confidence was {best_confidence:.3f}, required {min_confidence:.3f})") + return None + + except Exception as e: + logger.error(f"Error searching Spotify for Tidal track: {e}") + return None diff --git a/web_server.py b/web_server.py index 4814169c..bcc14a90 100644 --- a/web_server.py +++ b/web_server.py @@ -21240,92 +21240,11 @@ def _validate_discovery_cache_artist(source_artist, cached_match): return True -def _discovery_score_candidates(source_title, source_artist, source_duration_ms, search_results): - """Score search results against a source track using the matching engine. - - Both artist AND title must independently pass minimum similarity floors. - This prevents weighted scoring from allowing a perfect artist to carry a - garbage title (or vice versa). If either dimension doesn't match, the - candidate is rejected — no match is better than a wrong match. - - Args: - source_title: The source track title (already cleaned for YouTube, raw for others) - source_artist: The source track primary artist - source_duration_ms: The source track duration in ms (0 if unknown) - search_results: List of Track objects (Spotify or iTunes) from search - - Returns: - (best_match, best_confidence, best_index) or (None, 0.0, -1) if no results - """ - best_match = None - best_confidence = 0.0 - best_index = -1 - min_artist_similarity = 0.5 - min_title_similarity = 0.5 - - source_artist_cleaned = matching_engine.clean_artist(source_artist) - source_title_cleaned = matching_engine.clean_title(source_title) - source_core_title = matching_engine.get_core_string(source_title) - - for idx, result in enumerate(search_results): - try: - result_artists = result.artists if hasattr(result, 'artists') and result.artists else [] - result_name = result.name if hasattr(result, 'name') else '' - result_duration = result.duration_ms if hasattr(result, 'duration_ms') else 0 - - # Artist floor — both must match, not just the weighted score - best_artist_sim = 0.0 - for cand_artist in result_artists: - if not cand_artist: - continue - cand_cleaned = matching_engine.clean_artist(cand_artist) - cand_normalized = matching_engine.normalize_string(cand_artist) - if source_artist_cleaned and source_artist_cleaned in cand_normalized: - best_artist_sim = 1.0 - break - sim = matching_engine.similarity_score(source_artist_cleaned, cand_cleaned) - if sim > best_artist_sim: - best_artist_sim = sim - - if best_artist_sim < min_artist_similarity: - continue - - # Title floor — both must match, not just the weighted score - cand_title_cleaned = matching_engine.clean_title(result_name) - cand_core_title = matching_engine.get_core_string(result_name) - - # Core title exact match bypasses the floor (e.g., "edamame" == "edamame") - title_passes = False - if source_core_title and cand_core_title and source_core_title == cand_core_title: - title_passes = True - else: - title_sim = matching_engine.similarity_score(source_title_cleaned, cand_title_cleaned) - if title_sim >= min_title_similarity: - title_passes = True - - if not title_passes: - continue - - # Both floors passed — now do full scoring - confidence, match_type = matching_engine.score_track_match( - source_title=source_title, - source_artists=[source_artist], - source_duration_ms=source_duration_ms, - candidate_title=result_name, - candidate_artists=result_artists, - candidate_duration_ms=result_duration - ) - - if confidence > best_confidence: - best_confidence = confidence - best_match = result - best_index = idx - - except Exception as e: - logger.error(f"Error scoring candidate {idx}: {e}") - continue - - return best_match, best_confidence, best_index +from core.discovery.scoring import ( + _discovery_score_candidates, + _search_spotify_for_tidal_track, + init as _init_discovery_scoring, +) # Tidal discovery worker logic lives in core/discovery/tidal.py. @@ -21356,193 +21275,6 @@ def _run_tidal_discovery_worker(playlist_id): -def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client=None): - """Search Spotify/fallback for a Tidal track using matching_engine for better accuracy - - Args: - tidal_track: The Tidal track to search for - use_spotify: If True, use Spotify; if False, use fallback source - itunes_client: Fallback client instance (required when use_spotify=False) - - Returns: - For Spotify: (Track, raw_data, confidence) tuple or None - For fallback: dict with track data (includes 'confidence' key) or None - """ - if use_spotify: - if not spotify_client or not spotify_client.is_authenticated(): - return None - else: - if not itunes_client: - return None - - try: - # Get track info - track_name = tidal_track.name - artists = tidal_track.artists or [] - - if not artists: - return None - - artist_name = artists[0] # Use primary artist - source_duration = getattr(tidal_track, 'duration_ms', 0) or 0 - source_name = "Spotify" if use_spotify else _get_metadata_fallback_source().capitalize() - - logger.info(f"Tidal track: '{artist_name}' - '{track_name}' (searching {source_name})") - - # Use matching engine to generate search queries (with fallback) - try: - temp_track = type('TempTrack', (), { - 'name': track_name, - 'artists': [artist_name], - 'album': None - })() - search_queries = matching_engine.generate_download_queries(temp_track) - logger.info(f"Generated {len(search_queries)} search queries for Tidal track") - except Exception as e: - logger.error(f"Matching engine failed for Tidal, falling back to basic queries: {e}") - if use_spotify: - search_queries = [ - f'track:"{track_name}" artist:"{artist_name}"', - f'"{track_name}" "{artist_name}"', - f'{track_name} {artist_name}' - ] - else: - search_queries = [ - f'{artist_name} {track_name}', - f'{track_name} {artist_name}', - track_name - ] - - best_match = None - best_match_raw = None - best_confidence = 0.0 - min_confidence = 0.9 - - for query_idx, search_query in enumerate(search_queries): - try: - logger.debug(f"Tidal query {query_idx + 1}/{len(search_queries)}: {search_query} ({source_name})") - - if use_spotify and not _spotify_rate_limited(): - results = spotify_client.search_tracks(search_query, limit=10) - if not results: - continue - else: - results = itunes_client.search_tracks(search_query, limit=10) - if not results: - continue - - # Score all results using the matching engine - match, confidence, match_idx = _discovery_score_candidates( - track_name, artist_name, source_duration, results - ) - - if match and confidence > best_confidence and confidence >= min_confidence: - best_confidence = confidence - best_match = match - if use_spotify and match.id: - _cache = get_metadata_cache() - best_match_raw = _cache.get_entity('spotify', 'track', match.id) - else: - best_match_raw = None - logger.info(f"New best Tidal match: {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") - - if best_confidence >= 0.9: - logger.info(f"High confidence Tidal match found ({best_confidence:.3f}), stopping search") - break - - except Exception as e: - logger.debug(f"Error in Tidal {source_name} search for query '{search_query}': {e}") - continue - - # Strategy 4: Extended search with higher limit (last resort) - if not best_match: - logger.info("Tidal Strategy 4: Extended search with limit=50") - query = f"{artist_name} {track_name}" - if use_spotify: - extended_results = spotify_client.search_tracks(query, limit=50) - else: - extended_results = itunes_client.search_tracks(query, limit=50) - if extended_results: - match, confidence, match_idx = _discovery_score_candidates( - track_name, artist_name, source_duration, extended_results - ) - if match and confidence >= min_confidence: - best_match = match - best_confidence = confidence - logger.info(f"Strategy 4 Tidal match (extended): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})") - - if best_match: - if use_spotify: - logger.info(f"Final Tidal Spotify match: {best_match.artists[0]} - {best_match.name} (confidence: {best_confidence:.3f})") - return (best_match, best_match_raw, best_confidence) - else: - result_artists = best_match.artists if hasattr(best_match, 'artists') else [] - result_artist = result_artists[0] if result_artists else 'Unknown' - result_name = best_match.name if hasattr(best_match, 'name') else 'Unknown' - logger.info(f"Final Tidal {source_name} match: {result_artist} - {result_name} (confidence: {best_confidence:.3f})") - - album_name = best_match.album if hasattr(best_match, 'album') else 'Unknown Album' - image_url = best_match.image_url if hasattr(best_match, 'image_url') else '' - track_id = best_match.id if hasattr(best_match, 'id') else '' - duration_ms = best_match.duration_ms if hasattr(best_match, 'duration_ms') else 0 - - # Fetch full track details to get album ID, track_number, etc. - # The Track dataclass strips this data — the API has it - album_obj = { - 'name': album_name, - 'album_type': 'album', - 'release_date': getattr(best_match, 'release_date', '') or '', - 'images': [{'url': image_url, 'height': 300, 'width': 300}] if image_url else [] - } - track_number = None - disc_number = None - if track_id: - try: - detailed = itunes_client.get_track_details(track_id) - if detailed and isinstance(detailed.get('album'), dict): - dt_album = detailed['album'] - if dt_album.get('id'): - album_obj['id'] = dt_album['id'] - if dt_album.get('total_tracks'): - album_obj['total_tracks'] = dt_album['total_tracks'] - if dt_album.get('release_date') and not album_obj.get('release_date'): - album_obj['release_date'] = dt_album['release_date'] - if dt_album.get('album_type'): - album_obj['album_type'] = dt_album['album_type'] - if dt_album.get('images') and not album_obj.get('images'): - album_obj['images'] = dt_album['images'] - if dt_album.get('artists'): - album_obj['artists'] = dt_album['artists'] - if detailed: - track_number = detailed.get('track_number') - disc_number = detailed.get('disc_number') - logger.info(f"[Discovery Enrich] {result_name}: track_number={track_number}, disc={disc_number}") - else: - logger.info(f"[Discovery Enrich] get_track_details returned None for ID {track_id} ({result_name})") - except Exception as _enrich_err: - logger.error(f"[Discovery Enrich] Failed for {result_name} (ID {track_id}): {_enrich_err}") - - result_data = { - 'id': track_id, - 'name': result_name, - 'artists': [result_artist], - 'album': album_obj, - 'duration_ms': duration_ms, - 'source': _get_metadata_fallback_source(), - 'confidence': best_confidence - } - if track_number: - result_data['track_number'] = track_number - if disc_number: - result_data['disc_number'] = disc_number - return result_data - else: - logger.warning(f"No suitable Tidal match found (best confidence was {best_confidence:.3f}, required {min_confidence:.3f})") - return None - - except Exception as e: - logger.error(f"Error searching Spotify for Tidal track: {e}") - return None def convert_tidal_results_to_spotify_tracks(discovery_results): @@ -33935,6 +33667,8 @@ _init_connection_test( docker_resolve_path_fn=docker_resolve_path, ) +_init_discovery_scoring(matching_engine_obj=matching_engine) + _init_debug_info( soulsync_version=SOULSYNC_VERSION, direct_run=_DIRECT_RUN,