"""Local relevance re-ranking for metadata-source search results. Background ---------- Some metadata sources (Deezer notably) return search results in a relevance order that puts karaoke covers, "originally performed by", re-recorded versions, tribute compilations, and Vocal/Backing-Track variants ABOVE the actual studio recording the user is looking for. Their global popularity ordering means anything that appears across many compilations outranks the canonical track. Issue #534 is the canonical example: searching `Dirty White Boy` + `Foreigner` returned five karaoke / cover variants before the real Foreigner studio cut. This module is a provider-neutral helper. Given a list of typed ``Track`` results plus an expected title + artist, it re-ranks by local heuristics that the source's own ranking ignores: - Hard penalty for known cover/karaoke/tribute patterns (title OR album OR artist field). These rarely belong in import / match results when the user typed the original artist. - Soft penalty for variant types (Live, Acoustic, Remix, Demo, Instrumental) UNLESS the user's expected title also contains the variant tag (so "Track (Live)" search matches Live recordings). - Boost for exact artist match — the strongest signal that this is the canonical recording. - Title similarity via SequenceMatcher on normalised strings (drop parentheticals + punctuation before comparison). - Album-type weight: album > compilation > single (compilations are more likely to be tributes / "best of" repackages). Pure-function design over the canonical ``Track`` dataclass — no Deezer-specific assumptions, applies to iTunes / Spotify / Hydrabase results equally well. Each scoring component is its own small function so tests can pin them independently. Usage ----- >>> from core.metadata.relevance import rerank_tracks >>> tracks = client.search_tracks(query) >>> ranked = rerank_tracks(tracks, expected_title='Dirty White Boy', expected_artist='Foreigner') >>> # ranked[0] is now the most relevant; karaoke variants drop to bottom """ from __future__ import annotations import re from difflib import SequenceMatcher from typing import List, Optional, Sequence from core.metadata.types import Track # --------------------------------------------------------------------------- # Pattern tables — public so tests can introspect, callers can extend # --------------------------------------------------------------------------- # Title / album / artist substrings that strongly indicate a cover, # karaoke, tribute, or "originally performed by" compilation. Multiplier # applied to the final score when matched. 0.05 effectively buries these # unless nothing else matches. COVER_KARAOKE_PATTERNS = ( 'karaoke', 'originally performed by', 'in the style of', 'made famous by', 'tribute', 'vocal version', # karaoke "vocal version" backing tracks 'backing track', 'cover version', 're-recorded', # artist re-recordings (Taylor's Version notwithstanding) 're-record', 'rerecorded', 'cover by', 'as performed by', 'workout mix', # gym-music compilations 'study music', 'music for', # "Music for Studying", "Music for Sleep" etc ) COVER_KARAOKE_PENALTY = 0.05 # Multiplicative; effectively bury # Variant tags — softer penalty since the user MAY want them. Skipped # when the user's expected_title also contains the same tag (so # "Track Name (Live)" search matches the Live version cleanly). VARIANT_TAG_PATTERNS = ( 'live', 'acoustic', 'demo', 'instrumental', 'remix', 'edit', 'extended', 'radio edit', 'club mix', 'a cappella', 'acapella', # Remaster — softer than karaoke (user might want it) but still # demoted vs. the original recording. Verified against live Deezer # API behaviour where "(2008 Remaster)" outranks the Head Games # original on `track:"X" artist:"Y"` advanced queries. 'remaster', 'remastered', 'reissue', ) VARIANT_TAG_PENALTY = 0.4 # Strong boost when the source's artist field exactly matches the # user's expected artist (case-insensitive, normalised). The single # strongest signal that this is the canonical recording. EXACT_ARTIST_BOOST = 1.5 # Album-type weights. Compilations are more likely to be tributes / # karaoke repackages; albums are most likely to be the canonical # studio source. ALBUM_TYPE_WEIGHT = { 'album': 1.0, 'single': 0.85, 'ep': 0.85, 'compilation': 0.7, } DEFAULT_ALBUM_TYPE_WEIGHT = 0.85 # --------------------------------------------------------------------------- # Normalisation # --------------------------------------------------------------------------- _PARENTHETICAL_RE = re.compile(r'[\(\[].*?[\)\]]') _PUNCT_RE = re.compile(r'[^\w\s]') def _normalise(text: str) -> str: """Lowercase, strip parentheticals + punctuation, collapse spaces. Used for similarity scoring AND for variant-tag detection (since we want to know if the user typed the variant tag inside their own search input).""" if not text: return '' t = text.lower().strip() t = _PARENTHETICAL_RE.sub('', t) t = _PUNCT_RE.sub('', t) return ' '.join(t.split()) def _contains_pattern(haystack: str, patterns: Sequence[str]) -> bool: """Case-insensitive substring match across patterns. Read raw `haystack` (NOT the parenthetical-stripped version) — patterns like "karaoke" most often live INSIDE the parentheticals on Deezer's titles.""" if not haystack: return False lowered = haystack.lower() return any(p in lowered for p in patterns) # --------------------------------------------------------------------------- # Scoring components # --------------------------------------------------------------------------- def title_similarity(track: Track, expected_title: str) -> float: """Normalised SequenceMatcher ratio against the expected title.""" if not expected_title: return 0.0 return SequenceMatcher( None, _normalise(track.name), _normalise(expected_title), ).ratio() def primary_artist(track: Track) -> str: """First entry from track.artists — that's the lead/primary credit. Empty when the track has no artist info.""" if not track.artists: return '' first = track.artists[0] if isinstance(first, dict): # Some sources still surface raw dicts during migration; fall # back to .get() rather than assume the dataclass is fully # normalised. return str(first.get('name', '') or '') return str(first) def artist_similarity(track: Track, expected_artist: str) -> float: """Normalised SequenceMatcher ratio against the expected artist.""" if not expected_artist: return 0.0 return SequenceMatcher( None, _normalise(primary_artist(track)), _normalise(expected_artist), ).ratio() def has_exact_artist(track: Track, expected_artist: str) -> bool: """True when the primary artist matches expected_artist after normalisation. Strict equality on the normalised form (so "Foreigner" matches "Foreigner" but not "Foreigner Tribute Band").""" if not expected_artist: return False return _normalise(primary_artist(track)) == _normalise(expected_artist) def has_cover_pattern(track: Track) -> bool: """Any cover/karaoke/tribute pattern in the track title, album title, or artist credits.""" if _contains_pattern(track.name, COVER_KARAOKE_PATTERNS): return True if _contains_pattern(track.album, COVER_KARAOKE_PATTERNS): return True if _contains_pattern(primary_artist(track), COVER_KARAOKE_PATTERNS): return True return False def has_variant_tag(track: Track) -> bool: """Track title contains a variant-version tag (Live, Acoustic, Remix, Demo, Instrumental, etc.). Album field is intentionally NOT checked — albums named "MTV Unplugged" shouldn't penalise every track on them.""" return _contains_pattern(track.name, VARIANT_TAG_PATTERNS) def album_type_weight(track: Track) -> float: """Weight from track.album_type. Compilations ranked lower since they're frequently tribute / karaoke repackages.""" if not track.album_type: return DEFAULT_ALBUM_TYPE_WEIGHT return ALBUM_TYPE_WEIGHT.get(track.album_type.lower(), DEFAULT_ALBUM_TYPE_WEIGHT) # --------------------------------------------------------------------------- # Combined score # --------------------------------------------------------------------------- def score_track( track: Track, *, expected_title: str, expected_artist: str, ) -> float: """Combined relevance score for a single track. Higher = more relevant. Roughly 0.0 - 2.5 in practice (boosts can push above 1.0; penalties can push below 0.1). Composition: 1. Base = title_sim * 0.6 + artist_sim * 0.4 2. Multiply by album_type_weight 3. If exact artist match: multiply by EXACT_ARTIST_BOOST 4. If cover/karaoke pattern: multiply by COVER_KARAOKE_PENALTY (effectively buries unless nothing else matched) 5. If variant tag (Live, Remix, etc.) AND user did NOT type a variant tag in their input: multiply by VARIANT_TAG_PENALTY Each rule is its own component above so tests can pin them individually without standing up the full pipeline. """ title_sim = title_similarity(track, expected_title) artist_sim = artist_similarity(track, expected_artist) score = title_sim * 0.6 + artist_sim * 0.4 score *= album_type_weight(track) if has_exact_artist(track, expected_artist): score *= EXACT_ARTIST_BOOST if has_cover_pattern(track): score *= COVER_KARAOKE_PENALTY # Variant tag penalty — only when the user didn't ask for a # variant. Their input "Track (Live)" should rank Live versions # higher, not lower. user_wanted_variant = _contains_pattern(expected_title, VARIANT_TAG_PATTERNS) if has_variant_tag(track) and not user_wanted_variant: score *= VARIANT_TAG_PENALTY return score def rerank_tracks( tracks: List[Track], *, expected_title: str, expected_artist: str, ) -> List[Track]: """Return a copy of ``tracks`` sorted by descending relevance score against the expected title + artist. Caller's input list is left untouched. Stable sort preserves the source's original ordering as a tiebreaker (which is the right fallback when two candidates score identically — the source's popularity signal is still useful as a tiebreak). No-op when both ``expected_title`` and ``expected_artist`` are empty (no signal to rank against — return input order).""" if not expected_title and not expected_artist: return list(tracks) scored = [ (score_track(t, expected_title=expected_title, expected_artist=expected_artist), idx, t) for idx, t in enumerate(tracks) ] # Sort by score desc; idx asc as tiebreaker preserves stable order. scored.sort(key=lambda x: (-x[0], x[1])) return [t for _score, _idx, t in scored] def filter_and_rerank( tracks: List[Track], *, expected_title: str, expected_artist: str, min_score: Optional[float] = None, ) -> List[Track]: """Convenience: rerank then optionally drop everything below a score floor. Useful when callers want to hide low-confidence matches entirely instead of demoting them. Returns reranked-only list when ``min_score`` is None — same as ``rerank_tracks``.""" ranked = rerank_tracks( tracks, expected_title=expected_title, expected_artist=expected_artist, ) if min_score is None: return ranked return [ t for t in ranked if score_track(t, expected_title=expected_title, expected_artist=expected_artist) >= min_score ]