You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/core/metadata/relevance.py

372 lines
13 KiB

"""Local relevance re-ranking for metadata-source search results.
Background
----------
Some metadata sources (Deezer notably) return search results in a
relevance order that puts karaoke covers, "originally performed by",
re-recorded versions, tribute compilations, and Vocal/Backing-Track
variants ABOVE the actual studio recording the user is looking for.
Their global popularity ordering means anything that appears across
many compilations outranks the canonical track. Issue #534 is the
canonical example: searching `Dirty White Boy` + `Foreigner` returned
five karaoke / cover variants before the real Foreigner studio cut.
This module is a provider-neutral helper. Given a list of typed
``Track`` results plus an expected title + artist, it re-ranks by
local heuristics that the source's own ranking ignores:
- Hard penalty for known cover/karaoke/tribute patterns (title OR
album OR artist field). These rarely belong in import / match
results when the user typed the original artist.
- Soft penalty for variant types (Live, Acoustic, Remix, Demo,
Instrumental) UNLESS the user's expected title also contains the
variant tag (so "Track (Live)" search matches Live recordings).
- Boost for exact artist match — the strongest signal that this is
the canonical recording.
- Title similarity via SequenceMatcher on normalised strings (drop
parentheticals + punctuation before comparison).
- Album-type weight: album > single / EP > compilation (compilations
are more likely to be tributes / "best of" repackages).
Pure-function design over the canonical ``Track`` dataclass —
no Deezer-specific assumptions, applies to iTunes / Spotify /
Hydrabase results equally well. Each scoring component is its own
small function so tests can pin them independently.
Usage
-----
>>> from core.metadata.relevance import rerank_tracks
>>> tracks = client.search_tracks(query)
>>> ranked = rerank_tracks(tracks, expected_title='Dirty White Boy', expected_artist='Foreigner')
>>> # ranked[0] is now the most relevant; karaoke variants drop to bottom
"""
from __future__ import annotations
import re
from difflib import SequenceMatcher
from typing import List, Optional, Sequence
from core.metadata.types import Track
# ---------------------------------------------------------------------------
# Pattern tables — public so tests can introspect, callers can extend
# ---------------------------------------------------------------------------
# Title / album / artist substrings that strongly indicate a cover,
# karaoke, tribute, or "originally performed by" compilation. Entries
# are written in lowercase because the matcher lowercases the text it
# scans before testing each pattern. has_cover_pattern() checks these
# against the track title, the album title, and the primary artist.
COVER_KARAOKE_PATTERNS = (
    'karaoke',
    'originally performed by',
    'in the style of',
    'made famous by',
    'tribute',
    'vocal version',  # karaoke "vocal version" backing tracks
    'backing track',
    'cover version',
    're-recorded',  # artist re-recordings (Taylor's Version notwithstanding)
    're-record',
    'rerecorded',
    'cover by',
    'as performed by',
    'workout mix',  # gym-music compilations
    'study music',
    'music for',  # "Music for Studying", "Music for Sleep" etc
)
# Multiplier applied to the final score when any pattern above matches.
# 0.05 effectively buries these results unless nothing else matches.
COVER_KARAOKE_PENALTY = 0.05  # Multiplicative; effectively bury
# Variant tags — softer penalty since the user MAY want them. The
# penalty is skipped when the user's expected_title itself carries a
# variant tag (so a "Track Name (Live)" search matches the Live version
# cleanly); score_track() holds the exact rule. Only the track title is
# scanned for these — see has_variant_tag(). Entries are lowercase
# because the matcher lowercases the text it scans.
VARIANT_TAG_PATTERNS = (
    'live',
    'acoustic',
    'demo',
    'instrumental',
    'remix',
    'edit',
    'extended',
    'radio edit',
    'club mix',
    'a cappella',
    'acapella',
    # Remaster — softer than karaoke (user might want it) but still
    # demoted vs. the original recording. Verified against live Deezer
    # API behaviour where "(2008 Remaster)" outranks the Head Games
    # original on `track:"X" artist:"Y"` advanced queries.
    'remaster',
    'remastered',
    'reissue',
)
# Multiplier applied to the score of a variant the user did not ask for.
VARIANT_TAG_PENALTY = 0.4
# Strong boost when the source's artist field exactly matches the
# user's expected artist (case-insensitive, normalised). The single
# strongest signal that this is the canonical recording. Applied as a
# multiplier on the combined score.
EXACT_ARTIST_BOOST = 1.5
# Album-type weights. Compilations are more likely to be tributes /
# karaoke repackages; albums are most likely to be the canonical
# studio source. Keys are lowercase: album_type_weight() lowercases
# track.album_type before the lookup.
ALBUM_TYPE_WEIGHT = {
    'album': 1.0,
    'single': 0.85,
    'ep': 0.85,
    'compilation': 0.7,
}
# Weight used when album_type is missing or is not a key above.
DEFAULT_ALBUM_TYPE_WEIGHT = 0.85
# ---------------------------------------------------------------------------
# Normalisation
# ---------------------------------------------------------------------------
# Shortest "(...)" / "[...]" span; opening and closing bracket kinds are
# not required to pair up.
_PARENTHETICAL_RE = re.compile(r'[\(\[].*?[\)\]]')
# Anything that is neither a word character nor whitespace.
_PUNCT_RE = re.compile(r'[^\w\s]')


def _normalise(text: str) -> str:
    """Lowercase, strip parentheticals + punctuation, collapse spaces.

    Used for title / artist similarity scoring and for the exact-artist
    comparison. Pattern detection (karaoke, live, ...) deliberately
    does NOT go through this function, because those tags usually live
    inside the parentheticals that are removed here.

    Names that consist solely of brackets and/or punctuation
    ("[spunge]", "(Intro)", "!!!") would otherwise collapse to the
    empty string. Two empty strings compare equal and score 1.0 with
    ``SequenceMatcher``, so unrelated (or missing) names would look
    like perfect matches. To avoid that, fall back to progressively
    less aggressive forms until something non-empty remains:

    1. parentheticals and punctuation removed (the normal case),
    2. punctuation removed but bracketed words kept,
    3. the lowercased text with only whitespace collapsed.

    Returns ``''`` only for empty / ``None`` / whitespace-only input.
    """
    if not text:
        return ''
    lowered = text.lower().strip()

    # Normal path: drop "(...)" / "[...]" first, then punctuation.
    reduced = _PUNCT_RE.sub('', _PARENTHETICAL_RE.sub('', lowered))
    collapsed = ' '.join(reduced.split())
    if collapsed:
        return collapsed

    # Everything meaningful was inside brackets — keep those words.
    collapsed = ' '.join(_PUNCT_RE.sub('', lowered).split())
    if collapsed:
        return collapsed

    # Pure-punctuation names such as "!!!": keep them verbatim.
    return ' '.join(lowered.split())
def _contains_pattern(haystack: str, patterns: Sequence[str]) -> bool:
    """Case-insensitive search for any of ``patterns`` in ``haystack``.

    Reads the raw ``haystack`` (NOT the parenthetical-stripped version)
    — patterns like "karaoke" most often live INSIDE the parentheticals
    on Deezer's titles.

    A pattern only counts when it begins at the start of a word, i.e.
    it is not immediately preceded by a letter. Plain substring
    matching produced false positives on ordinary titles: 'live' inside
    "Alive", 'edit' inside "Credit", 'tribute' inside "Attributes".
    The right-hand side is left open on purpose so suffixed / plural
    forms still match ("Remastered", "Remixes", "Backing Tracks").
    Digits, underscores and punctuation before the pattern do not block
    a match ("2008remaster", "song_live", "Unplugged/Live").

    ``patterns`` are expected to be lowercase; the haystack is
    lowercased before searching. Returns ``False`` for an empty or
    ``None`` haystack and for an empty pattern list.
    """
    if not haystack or not patterns:
        return False
    lowered = haystack.lower()
    alternatives = '|'.join(re.escape(p) for p in patterns)
    # [^\W\d_] is "a letter"; the negative lookbehind therefore reads
    # "not directly preceded by a letter".
    return re.search(r'(?<![^\W\d_])(?:' + alternatives + r')', lowered) is not None
# ---------------------------------------------------------------------------
# Scoring components
# ---------------------------------------------------------------------------
def title_similarity(track: Track, expected_title: str) -> float:
    """Similarity between the track's title and ``expected_title``.

    Both strings are normalised first, then compared with
    ``SequenceMatcher``; the resulting ratio is returned. An empty
    ``expected_title`` gives no signal, so the result is 0.0.
    """
    if expected_title:
        candidate = _normalise(track.name)
        wanted = _normalise(expected_title)
        return SequenceMatcher(None, candidate, wanted).ratio()
    return 0.0
def primary_artist(track: Track) -> str:
    """Return the lead credit: the first entry of ``track.artists``.

    Yields ``''`` when the track carries no artist entries. A dict
    entry contributes its ``'name'`` value (``''`` if absent or
    falsy); any other entry is passed through ``str()``.
    """
    credits = track.artists
    if not credits:
        return ''
    lead = credits[0]
    if not isinstance(lead, dict):
        return str(lead)
    # Raw dicts can still show up from sources that are mid-migration,
    # so read the name defensively instead of assuming a plain string.
    return str(lead.get('name', '') or '')
def artist_similarity(track: Track, expected_artist: str) -> float:
    """Similarity between the track's primary artist and ``expected_artist``.

    Both strings are normalised first, then compared with
    ``SequenceMatcher``; the resulting ratio is returned. An empty
    ``expected_artist`` gives no signal, so the result is 0.0.
    """
    if expected_artist:
        credited = _normalise(primary_artist(track))
        wanted = _normalise(expected_artist)
        return SequenceMatcher(None, credited, wanted).ratio()
    return 0.0
def has_exact_artist(track: Track, expected_artist: str) -> bool:
    """Whether the primary artist equals ``expected_artist`` once normalised.

    The comparison is strict equality of the two normalised strings, so
    "Foreigner" matches "Foreigner" but not "Foreigner Tribute Band".
    Always ``False`` when ``expected_artist`` is empty.
    """
    if expected_artist:
        credited = _normalise(primary_artist(track))
        return credited == _normalise(expected_artist)
    return False
def has_cover_pattern(track: Track) -> bool:
    """Whether the track looks like a cover / karaoke / tribute release.

    Checks, in order, the track title, the album title, and the primary
    artist against ``COVER_KARAOKE_PATTERNS`` and stops at the first
    field that matches.
    """
    return (
        _contains_pattern(track.name, COVER_KARAOKE_PATTERNS)
        or _contains_pattern(track.album, COVER_KARAOKE_PATTERNS)
        or _contains_pattern(primary_artist(track), COVER_KARAOKE_PATTERNS)
    )
def has_variant_tag(track: Track) -> bool:
    """Whether the track *title* carries a variant tag.

    Variant tags are the entries of ``VARIANT_TAG_PATTERNS`` (Live,
    Acoustic, Remix, Demo, Instrumental, ...). Only the title is
    inspected: the album name is left out on purpose, so an album
    called "MTV Unplugged" does not penalise every track on it.
    """
    title = track.name
    return _contains_pattern(title, VARIANT_TAG_PATTERNS)
def album_type_weight(track: Track) -> float:
    """Score multiplier derived from ``track.album_type``.

    The type is lowercased and looked up in ``ALBUM_TYPE_WEIGHT``.
    A missing / empty type, or one that is not in the table, yields
    ``DEFAULT_ALBUM_TYPE_WEIGHT``. Compilations carry the lowest weight
    because they are frequently tribute / karaoke repackages.
    """
    kind = track.album_type
    if kind:
        return ALBUM_TYPE_WEIGHT.get(kind.lower(), DEFAULT_ALBUM_TYPE_WEIGHT)
    return DEFAULT_ALBUM_TYPE_WEIGHT
# ---------------------------------------------------------------------------
# Combined score
# ---------------------------------------------------------------------------
def score_track(
    track: Track,
    *,
    expected_title: str,
    expected_artist: str,
) -> float:
    """Combined relevance score for a single track. Higher = more
    relevant.

    The score ranges from 0.0 up to 1.5: the base and the album-type
    weight are each at most 1.0, and the exact-artist boost is the only
    multiplier above 1.0. Penalties can push the result below 0.1.

    Composition:

    1. Base = title_sim * 0.6 + artist_sim * 0.4
    2. Multiply by album_type_weight
    3. If exact artist match: multiply by EXACT_ARTIST_BOOST
    4. If cover/karaoke pattern: multiply by COVER_KARAOKE_PENALTY
       (effectively buries unless nothing else matched)
    5. If the track title carries a variant tag (Live, Remix, etc.)
       and the user's expected title does not carry one of the SAME
       tags: multiply by VARIANT_TAG_PENALTY

    Rule 5 compares tags rather than merely checking that the user
    typed *some* variant tag: a search for "Track (Live)" must not lift
    the penalty from "Track (Remix)". Because title similarity ignores
    parentheticals, that penalty is the only thing separating the two.

    Each rule is its own component function so tests can pin them
    individually without standing up the full pipeline.

    Args:
        track: Candidate result to score.
        expected_title: Title the user is looking for (may be empty).
        expected_artist: Artist the user is looking for (may be empty).

    Returns:
        The relevance score as a float.
    """
    title_sim = title_similarity(track, expected_title)
    artist_sim = artist_similarity(track, expected_artist)
    score = title_sim * 0.6 + artist_sim * 0.4
    score *= album_type_weight(track)

    if has_exact_artist(track, expected_artist):
        score *= EXACT_ARTIST_BOOST
    if has_cover_pattern(track):
        score *= COVER_KARAOKE_PENALTY

    # Variant tag penalty — only when the user didn't ask for this kind
    # of variant. Their input "Track (Live)" should keep Live versions
    # unpenalised, while still demoting remixes, demos, etc.
    if has_variant_tag(track):
        user_asked_for_it = any(
            _contains_pattern(track.name, (tag,))
            and _contains_pattern(expected_title, (tag,))
            for tag in VARIANT_TAG_PATTERNS
        )
        if not user_asked_for_it:
            score *= VARIANT_TAG_PENALTY
    return score
def rerank_tracks(
    tracks: List[Track],
    *,
    expected_title: str,
    expected_artist: str,
    prefer_known_duration: bool = False,
) -> List[Track]:
    """Return a new list of ``tracks`` ordered from most to least
    relevant for the expected title + artist.

    The input list is never modified. Candidates with identical scores
    keep the order the source returned them in, so the source's own
    popularity ranking still acts as the tiebreaker.

    ``prefer_known_duration``: when True, every recording whose
    ``duration_ms`` is non-zero has its score multiplied by 1.25.
    Intended for MusicBrainz, which often lists several recordings of
    one song (single edition, album edition, compilations, remasters)
    where only some carry length data. The multiplier is larger than
    the gap between album-type weights so a length-known recording can
    beat a length-less sibling on a higher-weighted album type — real
    case: Zeds Dead "Coffee Break", whose canonical recording sits on
    the Single release (weight 0.85) while a length-less sibling sits
    on an Album release (weight 1.0). Without the boost the length-less
    album edition wins and the user sees 0:00 instead of 3:04. The
    cover / karaoke penalty (0.05) still dominates the boost, so a
    length-known tribute loses to a length-less canonical match.

    When both ``expected_title`` and ``expected_artist`` are empty
    there is nothing to rank against: a plain copy in input order is
    returned and ``prefer_known_duration`` has no effect.
    """
    if not (expected_title or expected_artist):
        return list(tracks)

    ranked = []
    for position, candidate in enumerate(tracks):
        relevance = score_track(
            candidate,
            expected_title=expected_title,
            expected_artist=expected_artist,
        )
        # 1.25 exceeds the album-vs-single weight spread (1.0 vs 0.85,
        # roughly 18%), so known-length recordings overcome that gap
        # when title + artist would otherwise tie. The penalty
        # multipliers (COVER_KARAOKE_PENALTY, VARIANT_TAG_PENALTY) are
        # far smaller than 1 / 1.25, so this only reorders
        # close-relevance siblings — the MB-duplicate case.
        if prefer_known_duration and (candidate.duration_ms or 0) > 0:
            relevance = relevance * 1.25
        ranked.append((relevance, position, candidate))

    # Highest score first; original position breaks ties.
    ranked.sort(key=lambda entry: (-entry[0], entry[1]))
    return [candidate for _relevance, _position, candidate in ranked]
def filter_and_rerank(
    tracks: List[Track],
    *,
    expected_title: str,
    expected_artist: str,
    min_score: Optional[float] = None,
) -> List[Track]:
    """Convenience: rerank, then optionally drop everything below a
    score floor. Useful when callers want to hide low-confidence
    matches entirely instead of demoting them.

    Args:
        tracks: Candidate results; the list itself is not modified.
        expected_title: Title the user is looking for.
        expected_artist: Artist the user is looking for.
        min_score: Inclusive floor on the ``score_track`` value. When
            ``None`` nothing is dropped and the result is exactly what
            ``rerank_tracks`` returns.

    Returns:
        A new list ordered by descending ``score_track`` value (source
        order breaks ties), containing only tracks whose score is at
        least ``min_score``.
    """
    if min_score is None:
        return rerank_tracks(
            tracks,
            expected_title=expected_title,
            expected_artist=expected_artist,
        )

    # Score each candidate exactly once and reuse that value for both
    # the floor check and the ordering, rather than ranking first and
    # re-scoring every track a second time to filter.
    kept = []
    for candidate in tracks:
        relevance = score_track(
            candidate,
            expected_title=expected_title,
            expected_artist=expected_artist,
        )
        if relevance >= min_score:
            kept.append((relevance, candidate))

    # list.sort is stable, so equal scores stay in source order — the
    # same tiebreak rerank_tracks applies. With no title and no artist
    # every score is 0.0 and the input order is preserved.
    kept.sort(key=lambda pair: -pair[0])
    return [candidate for _relevance, candidate in kept]