You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/core/discovery/matching.py

135 lines
4.5 KiB

"""Pure helper for matching raw MusicBrainz-metadata tracks against
Spotify / iTunes.
Used by the PlaylistSource adapters whose ``get_playlist`` returns
tracks with ``needs_discovery=True`` (ListenBrainz, Last.fm radio).
Phase 1b ships Strategy 1 only (matching-engine queries → search →
score → pick best ≥0.9). The richer multi-strategy +
discovery-cache flow stays in
``core.discovery.listenbrainz.run_listenbrainz_discovery_worker``
for the Discover-page state-machine UI; this helper is the slimmer
version used by the auto-refresh pipeline.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional
logger = logging.getLogger(__name__)
@dataclass
class MBMatchDeps:
"""Bundle of primitives the matcher needs.
Wired up at bootstrap. Tests pass stub callables / clients."""
matching_engine: Any
score_candidates: Callable[..., Any]
spotify_client_getter: Callable[[], Any]
itunes_client_getter: Callable[[], Any]
prefer_spotify_getter: Callable[[], bool]
min_confidence: float = 0.9
def match_mb_track(
track: Dict[str, Any], deps: MBMatchDeps
) -> Optional[Dict[str, Any]]:
"""Try to match a single MB-metadata track.
Input shape:
``{'track_name', 'artist_name', 'album_name', 'duration_ms'}``
Returns the matched_data dict (Spotify/iTunes track projection)
or ``None`` when no candidate cleared the confidence threshold.
"""
title = track.get("track_name") or ""
artist = track.get("artist_name") or ""
album = track.get("album_name") or ""
duration_ms = int(track.get("duration_ms") or 0)
if not title or not artist:
return None
spotify_client = deps.spotify_client_getter()
itunes_client = deps.itunes_client_getter()
use_spotify = bool(
deps.prefer_spotify_getter()
and spotify_client is not None
and getattr(spotify_client, "is_spotify_authenticated", lambda: False)()
)
if not use_spotify and itunes_client is None:
return None
# Strategy 1 — matching-engine query generation.
try:
temp_track = type("_TempTrack", (), {
"name": title,
"artists": [artist],
"album": album or None,
})()
queries = deps.matching_engine.generate_download_queries(temp_track)
except Exception as exc:
logger.debug(f"matching_engine query-gen failed: {exc}")
queries = [f"{artist} {title}", title]
best_match: Any = None
best_confidence = 0.0
for query in queries:
try:
if use_spotify:
results = spotify_client.search_tracks(query, limit=10)
else:
results = itunes_client.search_tracks(query, limit=10)
except Exception as exc:
logger.debug(f"search failed for query={query!r}: {exc}")
continue
if not results:
continue
try:
match, confidence, _ = deps.score_candidates(
title, artist, duration_ms, results
)
except Exception as exc:
logger.debug(f"score_candidates failed: {exc}")
continue
if match and confidence > best_confidence and confidence >= deps.min_confidence:
best_match = match
best_confidence = confidence
if best_confidence >= deps.min_confidence:
break
if not best_match:
return None
provider = "spotify" if use_spotify else "itunes"
image_url = getattr(best_match, "image_url", None) or ""
album_data: Dict[str, Any] = {
"name": getattr(best_match, "album", "") or "",
}
if image_url:
album_data["images"] = [{"url": image_url}]
return {
"id": getattr(best_match, "id", "") or "",
"name": getattr(best_match, "name", "") or "",
"artists": list(getattr(best_match, "artists", []) or []),
"album": album_data,
"duration_ms": int(getattr(best_match, "duration_ms", 0) or 0),
"image_url": image_url,
"source": provider,
"_provider": provider,
"_confidence": float(best_confidence),
}
def match_mb_tracks(
tracks: List[Dict[str, Any]], deps: MBMatchDeps
) -> List[Optional[Dict[str, Any]]]:
"""Vectorized variant — runs ``match_mb_track`` per track.
Phase 1b is sequential. If profiling shows it's too slow on big
LB playlists, this becomes the natural spot to thread-pool the
per-track searches."""
return [match_mb_track(t, deps) for t in tracks]