mirror of https://github.com/Nezreka/SoulSync.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
344 lines
12 KiB
344 lines
12 KiB
"""Local relevance re-ranking for metadata-source search results.
|
|
|
|
Background
|
|
----------
|
|
|
|
Some metadata sources (Deezer notably) return search results in a
|
|
relevance order that puts karaoke covers, "originally performed by",
|
|
re-recorded versions, tribute compilations, and Vocal/Backing-Track
|
|
variants ABOVE the actual studio recording the user is looking for.
|
|
Their global popularity ordering means anything that appears across
|
|
many compilations outranks the canonical track. Issue #534 is the
|
|
canonical example: searching `Dirty White Boy` + `Foreigner` returned
|
|
five karaoke / cover variants before the real Foreigner studio cut.
|
|
|
|
This module is a provider-neutral helper. Given a list of typed
|
|
``Track`` results plus an expected title + artist, it re-ranks by
|
|
local heuristics that the source's own ranking ignores:
|
|
|
|
- Hard penalty for known cover/karaoke/tribute patterns (title OR
|
|
album OR artist field). These rarely belong in import / match
|
|
results when the user typed the original artist.
|
|
- Soft penalty for variant types (Live, Acoustic, Remix, Demo,
  Instrumental) UNLESS the user's expected title itself contains a
  variant tag (so "Track (Live)" search matches Live recordings).
|
|
- Boost for exact artist match — the strongest signal that this is
|
|
the canonical recording.
|
|
- Title similarity via SequenceMatcher on normalised strings (drop
|
|
parentheticals + punctuation before comparison).
|
|
- Album-type weight: album > single / EP > compilation (compilations
  are more likely to be tributes / "best of" repackages).
|
|
|
|
Pure-function design over the canonical ``Track`` dataclass —
|
|
no Deezer-specific assumptions, applies to iTunes / Spotify /
|
|
Hydrabase results equally well. Each scoring component is its own
|
|
small function so tests can pin them independently.
|
|
|
|
Usage
|
|
-----
|
|
|
|
>>> from core.metadata.relevance import rerank_tracks
|
|
>>> tracks = client.search_tracks(query)
|
|
>>> ranked = rerank_tracks(tracks, expected_title='Dirty White Boy', expected_artist='Foreigner')
|
|
>>> # ranked[0] is now the most relevant; karaoke variants drop to bottom
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from difflib import SequenceMatcher
|
|
from typing import List, Optional, Sequence
|
|
|
|
from core.metadata.types import Track
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Pattern tables — public so tests can introspect, callers can extend
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Lower-case markers of covers, karaoke renditions, tributes and
# "originally performed by" style compilations. A hit in the track
# title, album title or primary artist multiplies the final score by
# COVER_KARAOKE_PENALTY.
COVER_KARAOKE_PATTERNS = (
    'karaoke',
    'originally performed by',
    'in the style of',
    'made famous by',
    'tribute',
    'vocal version',  # karaoke "vocal version" backing tracks
    'backing track',
    'cover version',
    're-recorded',  # artist re-recordings (Taylor's Version notwithstanding)
    're-record',
    'rerecorded',
    'cover by',
    'as performed by',
    'workout mix',  # gym-music compilations
    'study music',
    'music for',  # "Music for Studying", "Music for Sleep" etc
)

# Multiplicative, so 0.05 effectively buries a matching result unless
# nothing else scored at all.
COVER_KARAOKE_PENALTY = 0.05
|
|
|
|
|
|
# Lower-case tags for alternate versions of a recording. The penalty is
# softer than the cover/karaoke one because the user MAY genuinely want
# a variant. The scorer skips the penalty when the user's expected
# title itself carries a variant tag, so a "Track Name (Live)" search
# still matches the Live version cleanly.
VARIANT_TAG_PATTERNS = (
    'live',
    'acoustic',
    'demo',
    'instrumental',
    'remix',
    'edit',
    'extended',
    'radio edit',
    'club mix',
    'a cappella',
    'acapella',
    # Remasters are less objectionable than karaoke (the user might
    # want one) but are still demoted below the original recording.
    # Verified against live Deezer API behaviour, where
    # "(2008 Remaster)" outranks the Head Games original on
    # `track:"X" artist:"Y"` advanced queries.
    'remaster',
    'remastered',
    'reissue',
)

# Multiplicative demotion applied to variant recordings.
VARIANT_TAG_PENALTY = 0.4
|
|
|
|
|
|
# Multiplier applied when the source's primary artist equals the
# user's expected artist after normalisation (case-insensitive). An
# exact artist match is the single strongest signal that a result is
# the canonical recording.
EXACT_ARTIST_BOOST = 1.5
|
|
|
|
|
|
# Per-release-type multipliers, keyed by lower-case album type. Albums
# are the most likely home of the canonical studio recording;
# compilations are the most likely to be tribute / karaoke repackages,
# so they weigh the least.
ALBUM_TYPE_WEIGHT = dict(
    album=1.0,
    single=0.85,
    ep=0.85,
    compilation=0.7,
)

# Used when the album type is missing or not listed above.
DEFAULT_ALBUM_TYPE_WEIGHT = 0.85
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Normalisation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
_PARENTHETICAL_RE = re.compile(r'[\(\[].*?[\)\]]')
|
|
_PUNCT_RE = re.compile(r'[^\w\s]')
|
|
|
|
|
|
def _normalise(text: str) -> str:
|
|
"""Lowercase, strip parentheticals + punctuation, collapse spaces.
|
|
|
|
Used for similarity scoring AND for variant-tag detection (since
|
|
we want to know if the user typed the variant tag inside their
|
|
own search input)."""
|
|
if not text:
|
|
return ''
|
|
t = text.lower().strip()
|
|
t = _PARENTHETICAL_RE.sub('', t)
|
|
t = _PUNCT_RE.sub('', t)
|
|
return ' '.join(t.split())
|
|
|
|
|
|
def _contains_pattern(haystack: str, patterns: Sequence[str]) -> bool:
|
|
"""Case-insensitive substring match across patterns. Read raw
|
|
`haystack` (NOT the parenthetical-stripped version) — patterns
|
|
like "karaoke" most often live INSIDE the parentheticals on
|
|
Deezer's titles."""
|
|
if not haystack:
|
|
return False
|
|
lowered = haystack.lower()
|
|
return any(p in lowered for p in patterns)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Scoring components
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def title_similarity(track: Track, expected_title: str) -> float:
    """SequenceMatcher ratio between the track's normalised name and
    the normalised expected title.

    Returns 0.0 when no expected title was supplied."""
    if expected_title:
        candidate = _normalise(track.name)
        wanted = _normalise(expected_title)
        # Argument order is kept (track first, expectation second):
        # ratio() is not guaranteed to be symmetric.
        matcher = SequenceMatcher(None, candidate, wanted)
        return matcher.ratio()
    return 0.0
|
|
|
|
|
|
def primary_artist(track: Track) -> str:
    """Return the lead credit: the first entry of ``track.artists``
    as a string, or '' when the track carries no artist info."""
    credits = track.artists
    if credits:
        lead = credits[0]
        if not isinstance(lead, dict):
            return str(lead)
        # Some sources still surface raw dicts during migration, so
        # read the name defensively; a missing or falsy name becomes ''.
        return str(lead.get('name', '') or '')
    return ''
|
|
|
|
|
|
def artist_similarity(track: Track, expected_artist: str) -> float:
    """SequenceMatcher ratio between the track's normalised primary
    artist and the normalised expected artist.

    Returns 0.0 when no expected artist was supplied."""
    if expected_artist:
        candidate = _normalise(primary_artist(track))
        wanted = _normalise(expected_artist)
        # Argument order is kept (track first, expectation second):
        # ratio() is not guaranteed to be symmetric.
        matcher = SequenceMatcher(None, candidate, wanted)
        return matcher.ratio()
    return 0.0
|
|
|
|
|
|
def has_exact_artist(track: Track, expected_artist: str) -> bool:
    """Whether the primary artist equals ``expected_artist`` once both
    are normalised.

    The comparison is strict equality on the normalised form, so
    "Foreigner" matches "Foreigner" but not "Foreigner Tribute Band".
    Always False when no expected artist was supplied."""
    if expected_artist:
        wanted = _normalise(expected_artist)
        actual = _normalise(primary_artist(track))
        return actual == wanted
    return False
|
|
|
|
|
|
def has_cover_pattern(track: Track) -> bool:
    """Whether a cover/karaoke/tribute pattern appears in the track
    title, the album title, or the primary artist credit.

    Fields are checked in that order and evaluation stops at the first
    hit. Only the first artist credit is inspected."""
    return (
        _contains_pattern(track.name, COVER_KARAOKE_PATTERNS)
        or _contains_pattern(track.album, COVER_KARAOKE_PATTERNS)
        or _contains_pattern(primary_artist(track), COVER_KARAOKE_PATTERNS)
    )
|
|
|
|
|
|
def has_variant_tag(track: Track) -> bool:
    """Whether the track title carries a variant-version tag (Live,
    Acoustic, Remix, Demo, Instrumental, etc.).

    Only the title is inspected. The album field is deliberately left
    out: an album named "MTV Unplugged" shouldn't penalise every track
    on it."""
    title = track.name
    tagged = _contains_pattern(title, VARIANT_TAG_PATTERNS)
    return tagged
|
|
|
|
|
|
def album_type_weight(track: Track) -> float:
    """Look up the multiplier for ``track.album_type``.

    The lookup is case-insensitive. A missing, empty or unlisted album
    type yields DEFAULT_ALBUM_TYPE_WEIGHT. Compilations weigh least
    since they're frequently tribute / karaoke repackages."""
    kind = track.album_type
    if kind:
        return ALBUM_TYPE_WEIGHT.get(kind.lower(), DEFAULT_ALBUM_TYPE_WEIGHT)
    return DEFAULT_ALBUM_TYPE_WEIGHT
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Combined score
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def score_track(
    track: Track,
    *,
    expected_title: str,
    expected_artist: str,
) -> float:
    """Combined relevance score for a single track. Higher = more
    relevant.

    With the module's default constants the result lies between 0.0
    and 1.5: the base blend tops out at 1.0, the album weight never
    exceeds 1.0, and the only boost is EXACT_ARTIST_BOOST.

    Composition, applied in this order:

    1. Base = title_sim * 0.6 + artist_sim * 0.4
    2. Multiply by album_type_weight
    3. If exact artist match: multiply by EXACT_ARTIST_BOOST
    4. If cover/karaoke pattern: multiply by COVER_KARAOKE_PENALTY
       (effectively buries unless nothing else matched)
    5. If the track title has a variant tag (Live, Remix, etc.) AND
       the expected title has none: multiply by VARIANT_TAG_PENALTY

    Note on step 5: ANY variant tag in the expected title lifts the
    penalty for EVERY variant, not only for the matching kind.

    Each rule is its own component function so tests can pin them
    individually without standing up the full pipeline."""
    # Weighted blend of the two similarity ratios.
    blend = (
        0.6 * title_similarity(track, expected_title)
        + 0.4 * artist_similarity(track, expected_artist)
    )
    relevance = blend * album_type_weight(track)

    if has_exact_artist(track, expected_artist):
        relevance = relevance * EXACT_ARTIST_BOOST

    if has_cover_pattern(track):
        relevance = relevance * COVER_KARAOKE_PENALTY

    # A search for "Track (Live)" should surface Live versions, so the
    # variant penalty only applies when the user did not ask for one.
    asked_for_variant = _contains_pattern(expected_title, VARIANT_TAG_PATTERNS)
    if not asked_for_variant and has_variant_tag(track):
        relevance = relevance * VARIANT_TAG_PENALTY

    return relevance
|
|
|
|
|
|
def rerank_tracks(
    tracks: List[Track],
    *,
    expected_title: str,
    expected_artist: str,
) -> List[Track]:
    """Return a new list holding ``tracks`` ordered from most to least
    relevant for the expected title + artist.

    The caller's list is never modified. Tracks with equal scores keep
    their relative input order, so the source's own popularity ranking
    still acts as the tiebreaker.

    When both ``expected_title`` and ``expected_artist`` are empty
    there is nothing to rank against, and a plain copy in input order
    is returned."""
    pool = list(tracks)
    if not (expected_title or expected_artist):
        return pool

    scores = [
        score_track(
            candidate,
            expected_title=expected_title,
            expected_artist=expected_artist,
        )
        for candidate in pool
    ]
    # sorted() is stable, so equal scores stay in ascending input
    # position without an explicit secondary key.
    order = sorted(range(len(pool)), key=lambda position: -scores[position])
    return [pool[position] for position in order]
|
|
|
|
|
|
def filter_and_rerank(
    tracks: List[Track],
    *,
    expected_title: str,
    expected_artist: str,
    min_score: Optional[float] = None,
) -> List[Track]:
    """Convenience: rerank, then optionally drop everything below a
    score floor. Useful when callers want to hide low-confidence
    matches entirely instead of demoting them.

    When ``min_score`` is None the result is exactly that of
    ``rerank_tracks``. Otherwise only tracks whose score is
    ``>= min_score`` are returned, most relevant first, with equal
    scores kept in input order. The caller's list is never modified.

    Each track is scored exactly once. Scoring runs two
    SequenceMatcher comparisons per track, so filtering on the scores
    already computed avoids doubling that work."""
    if min_score is None:
        return rerank_tracks(
            tracks,
            expected_title=expected_title,
            expected_artist=expected_artist,
        )

    # Score once and keep only the survivors, still in input order.
    survivors = []
    for candidate in tracks:
        relevance = score_track(
            candidate,
            expected_title=expected_title,
            expected_artist=expected_artist,
        )
        if relevance >= min_score:
            survivors.append((relevance, candidate))

    # list.sort() is stable, so ties keep their input order. That also
    # covers the "no expected title or artist" case, where every score
    # is 0.0 and the input order is returned unchanged.
    survivors.sort(key=lambda pair: -pair[0])
    return [candidate for _relevance, candidate in survivors]
|