Lift discovery scoring + tidal-track search to core/discovery/scoring.py

Both function bodies (_discovery_score_candidates and _search_spotify_for_tidal_track) are byte-identical to the originals. The shared matching_engine instance is injected via init() right after _init_connection_test; the spotify proxy + _get_metadata_fallback_source shim follow the same pattern used elsewhere. web_server.py: 36019 → 35753 (-266 lines).
4 weeks ago · a4eccff4a5
parent 7158130d5a
commit a4eccff4a5
2 changed files with 330 additions and 273 deletions
--- a/core/discovery/scoring.py
+++ b/core/discovery/scoring.py
@ -0,0 +1,323 @@
+"""Discovery scoring + tidal-track search — lifted from web_server.py.
+
+Both function bodies are byte-identical to the originals. The
+``spotify_client`` proxy and ``_get_metadata_fallback_source`` shim
+let the bodies resolve their original names without modification.
+``matching_engine`` is injected via init() because it is constructed
+in web_server.py and referenced by name throughout the bodies.
+"""
+import logging
+
+from core.metadata.cache import get_metadata_cache
+from core.metadata.registry import get_primary_source, get_spotify_client
+from core.spotify_client import _is_globally_rate_limited as _spotify_rate_limited
+
+logger = logging.getLogger(__name__)
+
+
+def _get_metadata_fallback_source():
+    """Mirror of web_server._get_metadata_fallback_source — delegates to registry."""
+    return get_primary_source()
+
+
+class _SpotifyClientProxy:
+    """Resolves the global Spotify client lazily through core.metadata.registry."""
+
+    def __getattr__(self, name):
+        client = get_spotify_client()
+        if client is None:
+            raise AttributeError(name)
+        return getattr(client, name)
+
+    def __bool__(self):
+        return get_spotify_client() is not None
+
+
+spotify_client = _SpotifyClientProxy()
+
+
+# Injected at runtime via init().
+matching_engine = None
+
+
+def init(matching_engine_obj):
+    """Bind the shared matching engine instance from web_server."""
+    global matching_engine
+    matching_engine = matching_engine_obj
+
+
+def _discovery_score_candidates(source_title, source_artist, source_duration_ms, search_results):
+    """Score search results against a source track using the matching engine.
+
+    Both artist AND title must independently pass minimum similarity floors.
+    This prevents weighted scoring from allowing a perfect artist to carry a
+    garbage title (or vice versa). If either dimension doesn't match, the
+    candidate is rejected — no match is better than a wrong match.
+
+    Args:
+        source_title: The source track title (already cleaned for YouTube, raw for others)
+        source_artist: The source track primary artist
+        source_duration_ms: The source track duration in ms (0 if unknown)
+        search_results: List of Track objects (Spotify or iTunes) from search
+
+    Returns:
+        (best_match, best_confidence, best_index) or (None, 0.0, -1) if no results
+    """
+    best_match = None
+    best_confidence = 0.0
+    best_index = -1
+    min_artist_similarity = 0.5
+    min_title_similarity = 0.5
+
+    source_artist_cleaned = matching_engine.clean_artist(source_artist)
+    source_title_cleaned = matching_engine.clean_title(source_title)
+    source_core_title = matching_engine.get_core_string(source_title)
+
+    for idx, result in enumerate(search_results):
+        try:
+            result_artists = result.artists if hasattr(result, 'artists') and result.artists else []
+            result_name = result.name if hasattr(result, 'name') else ''
+            result_duration = result.duration_ms if hasattr(result, 'duration_ms') else 0
+
+            # Artist floor — both must match, not just the weighted score
+            best_artist_sim = 0.0
+            for cand_artist in result_artists:
+                if not cand_artist:
+                    continue
+                cand_cleaned = matching_engine.clean_artist(cand_artist)
+                cand_normalized = matching_engine.normalize_string(cand_artist)
+                if source_artist_cleaned and source_artist_cleaned in cand_normalized:
+                    best_artist_sim = 1.0
+                    break
+                sim = matching_engine.similarity_score(source_artist_cleaned, cand_cleaned)
+                if sim > best_artist_sim:
+                    best_artist_sim = sim
+
+            if best_artist_sim < min_artist_similarity:
+                continue
+
+            # Title floor — both must match, not just the weighted score
+            cand_title_cleaned = matching_engine.clean_title(result_name)
+            cand_core_title = matching_engine.get_core_string(result_name)
+
+            # Core title exact match bypasses the floor (e.g., "edamame" == "edamame")
+            title_passes = False
+            if source_core_title and cand_core_title and source_core_title == cand_core_title:
+                title_passes = True
+            else:
+                title_sim = matching_engine.similarity_score(source_title_cleaned, cand_title_cleaned)
+                if title_sim >= min_title_similarity:
+                    title_passes = True
+
+            if not title_passes:
+                continue
+
+            # Both floors passed — now do full scoring
+            confidence, match_type = matching_engine.score_track_match(
+                source_title=source_title,
+                source_artists=[source_artist],
+                source_duration_ms=source_duration_ms,
+                candidate_title=result_name,
+                candidate_artists=result_artists,
+                candidate_duration_ms=result_duration
+            )
+
+            if confidence > best_confidence:
+                best_confidence = confidence
+                best_match = result
+                best_index = idx
+
+        except Exception as e:
+            logger.error(f"Error scoring candidate {idx}: {e}")
+            continue
+
+    return best_match, best_confidence, best_index
+
+
+def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client=None):
+    """Search Spotify/fallback for a Tidal track using matching_engine for better accuracy
+
+    Args:
+        tidal_track: The Tidal track to search for
+        use_spotify: If True, use Spotify; if False, use fallback source
+        itunes_client: Fallback client instance (required when use_spotify=False)
+
+    Returns:
+        For Spotify: (Track, raw_data, confidence) tuple or None
+        For fallback: dict with track data (includes 'confidence' key) or None
+    """
+    if use_spotify:
+        if not spotify_client or not spotify_client.is_authenticated():
+            return None
+    else:
+        if not itunes_client:
+            return None
+
+    try:
+        # Get track info
+        track_name = tidal_track.name
+        artists = tidal_track.artists or []
+
+        if not artists:
+            return None
+
+        artist_name = artists[0]  # Use primary artist
+        source_duration = getattr(tidal_track, 'duration_ms', 0) or 0
+        source_name = "Spotify" if use_spotify else _get_metadata_fallback_source().capitalize()
+
+        logger.info(f"Tidal track: '{artist_name}' - '{track_name}' (searching {source_name})")
+
+        # Use matching engine to generate search queries (with fallback)
+        try:
+            temp_track = type('TempTrack', (), {
+                'name': track_name,
+                'artists': [artist_name],
+                'album': None
+            })()
+            search_queries = matching_engine.generate_download_queries(temp_track)
+            logger.info(f"Generated {len(search_queries)} search queries for Tidal track")
+        except Exception as e:
+            logger.error(f"Matching engine failed for Tidal, falling back to basic queries: {e}")
+            if use_spotify:
+                search_queries = [
+                    f'track:"{track_name}" artist:"{artist_name}"',
+                    f'"{track_name}" "{artist_name}"',
+                    f'{track_name} {artist_name}'
+                ]
+            else:
+                search_queries = [
+                    f'{artist_name} {track_name}',
+                    f'{track_name} {artist_name}',
+                    track_name
+                ]
+
+        best_match = None
+        best_match_raw = None
+        best_confidence = 0.0
+        min_confidence = 0.9
+
+        for query_idx, search_query in enumerate(search_queries):
+            try:
+                logger.debug(f"Tidal query {query_idx + 1}/{len(search_queries)}: {search_query} ({source_name})")
+
+                if use_spotify and not _spotify_rate_limited():
+                    results = spotify_client.search_tracks(search_query, limit=10)
+                    if not results:
+                        continue
+                else:
+                    results = itunes_client.search_tracks(search_query, limit=10)
+                    if not results:
+                        continue
+
+                # Score all results using the matching engine
+                match, confidence, match_idx = _discovery_score_candidates(
+                    track_name, artist_name, source_duration, results
+                )
+
+                if match and confidence > best_confidence and confidence >= min_confidence:
+                    best_confidence = confidence
+                    best_match = match
+                    if use_spotify and match.id:
+                        _cache = get_metadata_cache()
+                        best_match_raw = _cache.get_entity('spotify', 'track', match.id)
+                    else:
+                        best_match_raw = None
+                    logger.info(f"New best Tidal match: {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
+
+                if best_confidence >= 0.9:
+                    logger.info(f"High confidence Tidal match found ({best_confidence:.3f}), stopping search")
+                    break
+
+            except Exception as e:
+                logger.debug(f"Error in Tidal {source_name} search for query '{search_query}': {e}")
+                continue
+
+        # Strategy 4: Extended search with higher limit (last resort)
+        if not best_match:
+            logger.info("Tidal Strategy 4: Extended search with limit=50")
+            query = f"{artist_name} {track_name}"
+            if use_spotify:
+                extended_results = spotify_client.search_tracks(query, limit=50)
+            else:
+                extended_results = itunes_client.search_tracks(query, limit=50)
+            if extended_results:
+                match, confidence, match_idx = _discovery_score_candidates(
+                    track_name, artist_name, source_duration, extended_results
+                )
+                if match and confidence >= min_confidence:
+                    best_match = match
+                    best_confidence = confidence
+                    logger.info(f"Strategy 4 Tidal match (extended): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
+
+        if best_match:
+            if use_spotify:
+                logger.info(f"Final Tidal Spotify match: {best_match.artists[0]} - {best_match.name} (confidence: {best_confidence:.3f})")
+                return (best_match, best_match_raw, best_confidence)
+            else:
+                result_artists = best_match.artists if hasattr(best_match, 'artists') else []
+                result_artist = result_artists[0] if result_artists else 'Unknown'
+                result_name = best_match.name if hasattr(best_match, 'name') else 'Unknown'
+                logger.info(f"Final Tidal {source_name} match: {result_artist} - {result_name} (confidence: {best_confidence:.3f})")
+
+                album_name = best_match.album if hasattr(best_match, 'album') else 'Unknown Album'
+                image_url = best_match.image_url if hasattr(best_match, 'image_url') else ''
+                track_id = best_match.id if hasattr(best_match, 'id') else ''
+                duration_ms = best_match.duration_ms if hasattr(best_match, 'duration_ms') else 0
+
+                # Fetch full track details to get album ID, track_number, etc.
+                # The Track dataclass strips this data — the API has it
+                album_obj = {
+                    'name': album_name,
+                    'album_type': 'album',
+                    'release_date': getattr(best_match, 'release_date', '') or '',
+                    'images': [{'url': image_url, 'height': 300, 'width': 300}] if image_url else []
+                }
+                track_number = None
+                disc_number = None
+                if track_id:
+                    try:
+                        detailed = itunes_client.get_track_details(track_id)
+                        if detailed and isinstance(detailed.get('album'), dict):
+                            dt_album = detailed['album']
+                            if dt_album.get('id'):
+                                album_obj['id'] = dt_album['id']
+                            if dt_album.get('total_tracks'):
+                                album_obj['total_tracks'] = dt_album['total_tracks']
+                            if dt_album.get('release_date') and not album_obj.get('release_date'):
+                                album_obj['release_date'] = dt_album['release_date']
+                            if dt_album.get('album_type'):
+                                album_obj['album_type'] = dt_album['album_type']
+                            if dt_album.get('images') and not album_obj.get('images'):
+                                album_obj['images'] = dt_album['images']
+                            if dt_album.get('artists'):
+                                album_obj['artists'] = dt_album['artists']
+                        if detailed:
+                            track_number = detailed.get('track_number')
+                            disc_number = detailed.get('disc_number')
+                            logger.info(f"[Discovery Enrich] {result_name}: track_number={track_number}, disc={disc_number}")
+                        else:
+                            logger.info(f"[Discovery Enrich] get_track_details returned None for ID {track_id} ({result_name})")
+                    except Exception as _enrich_err:
+                        logger.error(f"[Discovery Enrich] Failed for {result_name} (ID {track_id}): {_enrich_err}")
+
+                result_data = {
+                    'id': track_id,
+                    'name': result_name,
+                    'artists': [result_artist],
+                    'album': album_obj,
+                    'duration_ms': duration_ms,
+                    'source': _get_metadata_fallback_source(),
+                    'confidence': best_confidence
+                }
+                if track_number:
+                    result_data['track_number'] = track_number
+                if disc_number:
+                    result_data['disc_number'] = disc_number
+                return result_data
+        else:
+            logger.warning(f"No suitable Tidal match found (best confidence was {best_confidence:.3f}, required {min_confidence:.3f})")
+            return None
+
+    except Exception as e:
+        logger.error(f"Error searching Spotify for Tidal track: {e}")
+        return None
--- a/web_server.py
+++ b/web_server.py
@ -21240,92 +21240,11 @@ def _validate_discovery_cache_artist(source_artist, cached_match):
    return True


-def _discovery_score_candidates(source_title, source_artist, source_duration_ms, search_results):
-    """Score search results against a source track using the matching engine.
-
-    Both artist AND title must independently pass minimum similarity floors.
-    This prevents weighted scoring from allowing a perfect artist to carry a
-    garbage title (or vice versa). If either dimension doesn't match, the
-    candidate is rejected — no match is better than a wrong match.
-
-    Args:
-        source_title: The source track title (already cleaned for YouTube, raw for others)
-        source_artist: The source track primary artist
-        source_duration_ms: The source track duration in ms (0 if unknown)
-        search_results: List of Track objects (Spotify or iTunes) from search
-
-    Returns:
-        (best_match, best_confidence, best_index) or (None, 0.0, -1) if no results
-    """
-    best_match = None
-    best_confidence = 0.0
-    best_index = -1
-    min_artist_similarity = 0.5
-    min_title_similarity = 0.5
-
-    source_artist_cleaned = matching_engine.clean_artist(source_artist)
-    source_title_cleaned = matching_engine.clean_title(source_title)
-    source_core_title = matching_engine.get_core_string(source_title)
-
-    for idx, result in enumerate(search_results):
-        try:
-            result_artists = result.artists if hasattr(result, 'artists') and result.artists else []
-            result_name = result.name if hasattr(result, 'name') else ''
-            result_duration = result.duration_ms if hasattr(result, 'duration_ms') else 0
-
-            # Artist floor — both must match, not just the weighted score
-            best_artist_sim = 0.0
-            for cand_artist in result_artists:
-                if not cand_artist:
-                    continue
-                cand_cleaned = matching_engine.clean_artist(cand_artist)
-                cand_normalized = matching_engine.normalize_string(cand_artist)
-                if source_artist_cleaned and source_artist_cleaned in cand_normalized:
-                    best_artist_sim = 1.0
-                    break
-                sim = matching_engine.similarity_score(source_artist_cleaned, cand_cleaned)
-                if sim > best_artist_sim:
-                    best_artist_sim = sim
-
-            if best_artist_sim < min_artist_similarity:
-                continue
-
-            # Title floor — both must match, not just the weighted score
-            cand_title_cleaned = matching_engine.clean_title(result_name)
-            cand_core_title = matching_engine.get_core_string(result_name)
-
-            # Core title exact match bypasses the floor (e.g., "edamame" == "edamame")
-            title_passes = False
-            if source_core_title and cand_core_title and source_core_title == cand_core_title:
-                title_passes = True
-            else:
-                title_sim = matching_engine.similarity_score(source_title_cleaned, cand_title_cleaned)
-                if title_sim >= min_title_similarity:
-                    title_passes = True
-
-            if not title_passes:
-                continue
-
-            # Both floors passed — now do full scoring
-            confidence, match_type = matching_engine.score_track_match(
-                source_title=source_title,
-                source_artists=[source_artist],
-                source_duration_ms=source_duration_ms,
-                candidate_title=result_name,
-                candidate_artists=result_artists,
-                candidate_duration_ms=result_duration
-            )
-
-            if confidence > best_confidence:
-                best_confidence = confidence
-                best_match = result
-                best_index = idx
-
-        except Exception as e:
-            logger.error(f"Error scoring candidate {idx}: {e}")
-            continue
-
-    return best_match, best_confidence, best_index
+from core.discovery.scoring import (
+    _discovery_score_candidates,
+    _search_spotify_for_tidal_track,
+    init as _init_discovery_scoring,
+)


 # Tidal discovery worker logic lives in core/discovery/tidal.py.
@ -21356,193 +21275,6 @@ def _run_tidal_discovery_worker(playlist_id):



-def _search_spotify_for_tidal_track(tidal_track, use_spotify=True, itunes_client=None):
-    """Search Spotify/fallback for a Tidal track using matching_engine for better accuracy
-
-    Args:
-        tidal_track: The Tidal track to search for
-        use_spotify: If True, use Spotify; if False, use fallback source
-        itunes_client: Fallback client instance (required when use_spotify=False)
-
-    Returns:
-        For Spotify: (Track, raw_data, confidence) tuple or None
-        For fallback: dict with track data (includes 'confidence' key) or None
-    """
-    if use_spotify:
-        if not spotify_client or not spotify_client.is_authenticated():
-            return None
-    else:
-        if not itunes_client:
-            return None
-
-    try:
-        # Get track info
-        track_name = tidal_track.name
-        artists = tidal_track.artists or []
-
-        if not artists:
-            return None
-
-        artist_name = artists[0]  # Use primary artist
-        source_duration = getattr(tidal_track, 'duration_ms', 0) or 0
-        source_name = "Spotify" if use_spotify else _get_metadata_fallback_source().capitalize()
-
-        logger.info(f"Tidal track: '{artist_name}' - '{track_name}' (searching {source_name})")
-
-        # Use matching engine to generate search queries (with fallback)
-        try:
-            temp_track = type('TempTrack', (), {
-                'name': track_name,
-                'artists': [artist_name],
-                'album': None
-            })()
-            search_queries = matching_engine.generate_download_queries(temp_track)
-            logger.info(f"Generated {len(search_queries)} search queries for Tidal track")
-        except Exception as e:
-            logger.error(f"Matching engine failed for Tidal, falling back to basic queries: {e}")
-            if use_spotify:
-                search_queries = [
-                    f'track:"{track_name}" artist:"{artist_name}"',
-                    f'"{track_name}" "{artist_name}"',
-                    f'{track_name} {artist_name}'
-                ]
-            else:
-                search_queries = [
-                    f'{artist_name} {track_name}',
-                    f'{track_name} {artist_name}',
-                    track_name
-                ]
-
-        best_match = None
-        best_match_raw = None
-        best_confidence = 0.0
-        min_confidence = 0.9
-
-        for query_idx, search_query in enumerate(search_queries):
-            try:
-                logger.debug(f"Tidal query {query_idx + 1}/{len(search_queries)}: {search_query} ({source_name})")
-
-                if use_spotify and not _spotify_rate_limited():
-                    results = spotify_client.search_tracks(search_query, limit=10)
-                    if not results:
-                        continue
-                else:
-                    results = itunes_client.search_tracks(search_query, limit=10)
-                    if not results:
-                        continue
-
-                # Score all results using the matching engine
-                match, confidence, match_idx = _discovery_score_candidates(
-                    track_name, artist_name, source_duration, results
-                )
-
-                if match and confidence > best_confidence and confidence >= min_confidence:
-                    best_confidence = confidence
-                    best_match = match
-                    if use_spotify and match.id:
-                        _cache = get_metadata_cache()
-                        best_match_raw = _cache.get_entity('spotify', 'track', match.id)
-                    else:
-                        best_match_raw = None
-                    logger.info(f"New best Tidal match: {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
-
-                if best_confidence >= 0.9:
-                    logger.info(f"High confidence Tidal match found ({best_confidence:.3f}), stopping search")
-                    break
-
-            except Exception as e:
-                logger.debug(f"Error in Tidal {source_name} search for query '{search_query}': {e}")
-                continue
-
-        # Strategy 4: Extended search with higher limit (last resort)
-        if not best_match:
-            logger.info("Tidal Strategy 4: Extended search with limit=50")
-            query = f"{artist_name} {track_name}"
-            if use_spotify:
-                extended_results = spotify_client.search_tracks(query, limit=50)
-            else:
-                extended_results = itunes_client.search_tracks(query, limit=50)
-            if extended_results:
-                match, confidence, match_idx = _discovery_score_candidates(
-                    track_name, artist_name, source_duration, extended_results
-                )
-                if match and confidence >= min_confidence:
-                    best_match = match
-                    best_confidence = confidence
-                    logger.info(f"Strategy 4 Tidal match (extended): {match.artists[0]} - {match.name} (confidence: {confidence:.3f})")
-
-        if best_match:
-            if use_spotify:
-                logger.info(f"Final Tidal Spotify match: {best_match.artists[0]} - {best_match.name} (confidence: {best_confidence:.3f})")
-                return (best_match, best_match_raw, best_confidence)
-            else:
-                result_artists = best_match.artists if hasattr(best_match, 'artists') else []
-                result_artist = result_artists[0] if result_artists else 'Unknown'
-                result_name = best_match.name if hasattr(best_match, 'name') else 'Unknown'
-                logger.info(f"Final Tidal {source_name} match: {result_artist} - {result_name} (confidence: {best_confidence:.3f})")
-
-                album_name = best_match.album if hasattr(best_match, 'album') else 'Unknown Album'
-                image_url = best_match.image_url if hasattr(best_match, 'image_url') else ''
-                track_id = best_match.id if hasattr(best_match, 'id') else ''
-                duration_ms = best_match.duration_ms if hasattr(best_match, 'duration_ms') else 0
-
-                # Fetch full track details to get album ID, track_number, etc.
-                # The Track dataclass strips this data — the API has it
-                album_obj = {
-                    'name': album_name,
-                    'album_type': 'album',
-                    'release_date': getattr(best_match, 'release_date', '') or '',
-                    'images': [{'url': image_url, 'height': 300, 'width': 300}] if image_url else []
-                }
-                track_number = None
-                disc_number = None
-                if track_id:
-                    try:
-                        detailed = itunes_client.get_track_details(track_id)
-                        if detailed and isinstance(detailed.get('album'), dict):
-                            dt_album = detailed['album']
-                            if dt_album.get('id'):
-                                album_obj['id'] = dt_album['id']
-                            if dt_album.get('total_tracks'):
-                                album_obj['total_tracks'] = dt_album['total_tracks']
-                            if dt_album.get('release_date') and not album_obj.get('release_date'):
-                                album_obj['release_date'] = dt_album['release_date']
-                            if dt_album.get('album_type'):
-                                album_obj['album_type'] = dt_album['album_type']
-                            if dt_album.get('images') and not album_obj.get('images'):
-                                album_obj['images'] = dt_album['images']
-                            if dt_album.get('artists'):
-                                album_obj['artists'] = dt_album['artists']
-                        if detailed:
-                            track_number = detailed.get('track_number')
-                            disc_number = detailed.get('disc_number')
-                            logger.info(f"[Discovery Enrich] {result_name}: track_number={track_number}, disc={disc_number}")
-                        else:
-                            logger.info(f"[Discovery Enrich] get_track_details returned None for ID {track_id} ({result_name})")
-                    except Exception as _enrich_err:
-                        logger.error(f"[Discovery Enrich] Failed for {result_name} (ID {track_id}): {_enrich_err}")
-
-                result_data = {
-                    'id': track_id,
-                    'name': result_name,
-                    'artists': [result_artist],
-                    'album': album_obj,
-                    'duration_ms': duration_ms,
-                    'source': _get_metadata_fallback_source(),
-                    'confidence': best_confidence
-                }
-                if track_number:
-                    result_data['track_number'] = track_number
-                if disc_number:
-                    result_data['disc_number'] = disc_number
-                return result_data
-        else:
-            logger.warning(f"No suitable Tidal match found (best confidence was {best_confidence:.3f}, required {min_confidence:.3f})")
-            return None
-
-    except Exception as e:
-        logger.error(f"Error searching Spotify for Tidal track: {e}")
-        return None


 def convert_tidal_results_to_spotify_tracks(discovery_results):
@ -33935,6 +33667,8 @@ _init_connection_test(
    docker_resolve_path_fn=docker_resolve_path,
 )

+_init_discovery_scoring(matching_engine_obj=matching_engine)
+
 _init_debug_info(
    soulsync_version=SOULSYNC_VERSION,
    direct_run=_DIRECT_RUN,