# SoulSync/core/metadata/relevance.py

"""Local relevance re-ranking for metadata-source search results.

Background
----------

Some metadata sources (Deezer notably) return search results in a
relevance order that puts karaoke covers, "originally performed by",
re-recorded versions, tribute compilations, and Vocal/Backing-Track
variants ABOVE the actual studio recording the user is looking for.
Their global popularity ordering means anything that appears across
many compilations outranks the canonical track. Issue #534 is the
canonical example: searching `Dirty White Boy` + `Foreigner` returned
five karaoke / cover variants before the real Foreigner studio cut.

This module is a provider-neutral helper. Given a list of typed
``Track`` results plus an expected title + artist, it re-ranks by
local heuristics that the source's own ranking ignores:

- Hard penalty for known cover/karaoke/tribute patterns (title OR
  album OR artist field). These rarely belong in import / match
  results when the user typed the original artist.
- Soft penalty for variant types (Live, Acoustic, Remix, Demo,
  Instrumental) UNLESS the user's expected title itself contains a
  variant tag (so "Track (Live)" search matches Live recordings).
- Boost for exact artist match — the strongest signal that this is
  the canonical recording.
- Title similarity via SequenceMatcher on normalised strings (drop
  parentheticals + punctuation before comparison).
- Album-type weight: album > single / EP > compilation (compilations
  are more likely to be tributes / "best of" repackages).

Pure-function design over the canonical ``Track`` dataclass —
no Deezer-specific assumptions, applies to iTunes / Spotify /
Hydrabase results equally well. Each scoring component is its own
small function so tests can pin them independently.

Usage
-----

>>> from core.metadata.relevance import rerank_tracks
>>> tracks = client.search_tracks(query)
>>> ranked = rerank_tracks(tracks, expected_title='Dirty White Boy', expected_artist='Foreigner')
>>> # ranked[0] is now the most relevant; karaoke variants drop to bottom
"""

from __future__ import annotations

import re
from difflib import SequenceMatcher
from typing import List, Optional, Sequence

from core.metadata.types import Track


# ---------------------------------------------------------------------------
# Pattern tables — public so tests can introspect, callers can extend
# ---------------------------------------------------------------------------


# Phrases that strongly indicate a cover, karaoke, tribute, or
# "originally performed by" compilation. ``has_cover_pattern`` looks for
# them in the track title, the album title, and the primary artist.
# Keep entries lowercase: they are compared against a lowercased copy of
# each field.
#
# When any entry matches, ``score_track`` multiplies the score by
# COVER_KARAOKE_PENALTY. 0.05 effectively buries these results unless
# nothing else matches.
COVER_KARAOKE_PATTERNS = (
    'karaoke',
    'originally performed by',
    'in the style of',
    'made famous by',
    'tribute',
    'vocal version',           # karaoke "vocal version" backing tracks
    'backing track',
    'cover version',
    're-recorded',             # artist re-recordings (Taylor's Version notwithstanding)
    're-record',
    'rerecorded',
    'cover by',
    'as performed by',
    'workout mix',             # gym-music compilations
    'study music',
    'music for',               # "Music for Studying", "Music for Sleep" etc
)

COVER_KARAOKE_PENALTY = 0.05  # Multiplicative factor (< 1 demotes); effectively buries the result


# Variant tags — softer penalty since the user MAY want them. Only the
# track title is checked (see ``has_variant_tag``). The penalty is
# skipped when the user's expected_title itself contains ANY variant tag
# (not necessarily the same one), so a "Track Name (Live)" search
# matches the Live version cleanly.
# Keep entries lowercase: they are compared against a lowercased copy of
# the title.
VARIANT_TAG_PATTERNS = (
    'live',
    'acoustic',
    'demo',
    'instrumental',
    'remix',
    'edit',
    'extended',
    'radio edit',
    'club mix',
    'a cappella',
    'acapella',
    # Remaster — softer than karaoke (user might want it) but still
    # demoted vs. the original recording. Verified against live Deezer
    # API behaviour where "(2008 Remaster)" outranks the Head Games
    # original on `track:"X" artist:"Y"` advanced queries.
    'remaster',
    'remastered',
    'reissue',
)

# Multiplicative factor applied by ``score_track`` when a variant tag is
# found in the track title and the user did not ask for a variant.
VARIANT_TAG_PENALTY = 0.4


# Strong boost when the source's primary (first-listed) artist exactly
# matches the user's expected artist (case-insensitive, normalised).
# The single strongest signal that this is the canonical recording.
# Multiplicative: ``score_track`` multiplies the score by this value
# when ``has_exact_artist`` is true.
EXACT_ARTIST_BOOST = 1.5


# Album-type weights. Compilations are more likely to be tributes /
# karaoke repackages; albums are most likely to be the canonical
# studio source. Resulting order: album > single / EP > compilation.
# Keys are lowercase because ``album_type_weight`` lowercases
# ``track.album_type`` before the lookup.
ALBUM_TYPE_WEIGHT = {
    'album': 1.0,
    'single': 0.85,
    'ep': 0.85,
    'compilation': 0.7,
}
# Used when ``track.album_type`` is missing/empty or not a key above.
DEFAULT_ALBUM_TYPE_WEIGHT = 0.85


# ---------------------------------------------------------------------------
# Normalisation
# ---------------------------------------------------------------------------


# Shortest run that opens with "(" or "[" and closes with ")" or "]".
_PARENTHETICAL_RE = re.compile(r'[\(\[].*?[\)\]]')
# Anything that is neither a word character nor whitespace.
_PUNCT_RE = re.compile(r'[^\w\s]')


def _normalise(text: str) -> str:
    """Reduce ``text`` to a canonical form for comparison.

    Steps, in order: lowercase and trim, delete bracketed /
    parenthesised segments, delete remaining punctuation, then collapse
    every whitespace run to a single space. Falsy input (``''`` or
    ``None``) yields ``''``.

    An unclosed bracket is not a parenthetical: only the bracket
    character itself is dropped (as punctuation) and its contents stay.
    """
    if not text:
        return ''
    without_brackets = _PARENTHETICAL_RE.sub('', text.lower().strip())
    words = _PUNCT_RE.sub('', without_brackets).split()
    return ' '.join(words)


def _contains_pattern(haystack: str, patterns: Sequence[str]) -> bool:
    """Return True when any of ``patterns`` occurs in ``haystack`` as a
    whole word or phrase (case-insensitive).

    The match runs against the raw ``haystack`` (NOT the
    parenthetical-stripped version) — patterns like "karaoke" most
    often live INSIDE the parentheticals on Deezer's titles.

    Matching is word-aware rather than a plain substring test, so
    'live' no longer fires on "Alive", 'demo' on "Demons", 'edit' on
    "Credit" / "Edition", or 'tribute' on "Attributes". A pattern may
    still be followed by a simple inflection suffix (``s``, ``es``,
    ``d``, ``ed``, ``ing``) so stem-style entries keep working:
    're-record' matches "Re-Recording", 'remix' matches "Remixes".

    Falsy ``haystack`` (``''`` / ``None``) or an empty pattern list
    returns False. Empty pattern strings are ignored.
    """
    if not haystack or not patterns:
        return False
    alternation = '|'.join(re.escape(p.lower()) for p in patterns if p)
    if not alternation:
        return False
    # (?<!\w) / (?!\w) are used instead of \b so patterns that begin or
    # end with a non-word character are still delimited correctly.
    word_aware = rf'(?<!\w)(?:{alternation})(?:s|es|ed|d|ing)?(?!\w)'
    return re.search(word_aware, haystack.lower()) is not None


# ---------------------------------------------------------------------------
# Scoring components
# ---------------------------------------------------------------------------


def title_similarity(track: Track, expected_title: str) -> float:
    """Similarity between the track's title and ``expected_title``.

    Both strings go through ``_normalise`` first, then are compared
    with ``SequenceMatcher.ratio()`` (track title as the first
    sequence). Returns 0.0 when ``expected_title`` is empty, since
    there is nothing to compare against.
    """
    if expected_title:
        matcher = SequenceMatcher(
            None,
            _normalise(track.name),
            _normalise(expected_title),
        )
        return matcher.ratio()
    return 0.0


def primary_artist(track: Track) -> str:
    """Return the lead credit: the first entry of ``track.artists``.

    Returns ``''`` when the track carries no artist info. A dict entry
    contributes its ``'name'`` value (missing or falsy name becomes
    ``''``); any other entry is converted with ``str()``.
    """
    artists = track.artists
    if not artists:
        return ''
    lead = artists[0]
    if not isinstance(lead, dict):
        return str(lead)
    # Some sources still surface raw dicts during migration, so read
    # the name defensively instead of assuming a normalised entry.
    return str(lead.get('name', '') or '')


def artist_similarity(track: Track, expected_artist: str) -> float:
    """Similarity between the track's primary artist and
    ``expected_artist``.

    Both names go through ``_normalise`` first, then are compared with
    ``SequenceMatcher.ratio()`` (track artist as the first sequence).
    Returns 0.0 when ``expected_artist`` is empty.
    """
    if not expected_artist:
        return 0.0
    candidate = _normalise(primary_artist(track))
    wanted = _normalise(expected_artist)
    return SequenceMatcher(None, candidate, wanted).ratio()


def has_exact_artist(track: Track, expected_artist: str) -> bool:
    """True when the primary artist matches ``expected_artist`` after
    normalisation.

    Strict equality on the normalised form (so "Foreigner" matches
    "Foreigner" but not "Foreigner Tribute Band"). Returns False when
    ``expected_artist`` is empty.

    Names made only of punctuation or brackets ("!!!", "+/-")
    normalise to the empty string. Comparing those normalised forms
    would make every such name "equal" to every other one — and to a
    track with no artist at all — so in that case the raw names are
    compared instead (trimmed, case-insensitive).
    """
    if not expected_artist:
        return False
    actual = primary_artist(track)
    expected_norm = _normalise(expected_artist)
    if expected_norm:
        return _normalise(actual) == expected_norm
    # Normalisation erased the whole expected name; fall back to the
    # raw text so '' == '' cannot produce a false exact match.
    return actual.strip().lower() == expected_artist.strip().lower()


def has_cover_pattern(track: Track) -> bool:
    """True when a cover/karaoke/tribute pattern shows up in the track
    title, the album title, or the primary artist.

    Fields are checked in that order and evaluation stops at the first
    hit. Only the primary (first) artist is inspected, not every
    credited artist.
    """
    return (
        _contains_pattern(track.name, COVER_KARAOKE_PATTERNS)
        or _contains_pattern(track.album, COVER_KARAOKE_PATTERNS)
        or _contains_pattern(primary_artist(track), COVER_KARAOKE_PATTERNS)
    )


def has_variant_tag(track: Track) -> bool:
    """True when the track title carries a variant-version tag (Live,
    Acoustic, Remix, Demo, Instrumental, etc.).

    Only the title is inspected. The album field is deliberately left
    out — an album named "MTV Unplugged" shouldn't penalise every
    track on it.
    """
    title = track.name
    return _contains_pattern(title, VARIANT_TAG_PATTERNS)


def album_type_weight(track: Track) -> float:
    """Look up the score weight for ``track.album_type``.

    The lookup is case-insensitive. A missing/empty album type, or one
    that is not in ``ALBUM_TYPE_WEIGHT``, gets
    ``DEFAULT_ALBUM_TYPE_WEIGHT``. Compilations weigh least since
    they're frequently tribute / karaoke repackages.
    """
    kind = track.album_type
    if kind:
        return ALBUM_TYPE_WEIGHT.get(kind.lower(), DEFAULT_ALBUM_TYPE_WEIGHT)
    return DEFAULT_ALBUM_TYPE_WEIGHT


# ---------------------------------------------------------------------------
# Combined score
# ---------------------------------------------------------------------------


def score_track(
    track: Track,
    *,
    expected_title: str,
    expected_artist: str,
) -> float:
    """Combined relevance score for a single track. Higher = more
    relevant.

    With the default constant tables the result lies between 0.0 and
    1.5: the base is at most 1.0, no album-type weight exceeds 1.0,
    and the only boost is EXACT_ARTIST_BOOST. Penalties can push a
    score well below 0.1.

    Composition, applied in this order:

    1. Base = title_sim * 0.6 + artist_sim * 0.4
    2. Multiply by album_type_weight
    3. If exact artist match: multiply by EXACT_ARTIST_BOOST
    4. If cover/karaoke pattern: multiply by COVER_KARAOKE_PENALTY
       (effectively buries unless nothing else matched)
    5. If the track title has a variant tag (Live, Remix, etc.) AND
       ``expected_title`` contains no variant tag at all: multiply by
       VARIANT_TAG_PENALTY

    Each rule is its own component function so tests can pin them
    individually without standing up the full pipeline.
    """
    relevance = (
        title_similarity(track, expected_title) * 0.6
        + artist_similarity(track, expected_artist) * 0.4
    )

    # Collect the multipliers in application order; they are applied
    # one at a time below so the floating-point result does not depend
    # on how they are grouped.
    factors = [album_type_weight(track)]
    if has_exact_artist(track, expected_artist):
        factors.append(EXACT_ARTIST_BOOST)
    if has_cover_pattern(track):
        factors.append(COVER_KARAOKE_PENALTY)
    # A user who typed "Track (Live)" wants variants ranked normally,
    # so the variant penalty only applies when their input has no tag.
    if has_variant_tag(track) and not _contains_pattern(expected_title, VARIANT_TAG_PATTERNS):
        factors.append(VARIANT_TAG_PENALTY)

    for factor in factors:
        relevance *= factor
    return relevance


# Multiplier ``rerank_tracks`` applies to recordings with a non-zero
# ``duration_ms`` when ``prefer_known_duration`` is set. Sized above the
# album-type weight spread (album 1.0 vs single 0.85 = ~18%) so a
# length-known single (0.85 * 1.25 = 1.0625) can beat a length-less
# album sibling (1.0). The penalty multipliers (cover/karaoke = 0.05,
# variant = 0.4) still dominate it, so it only flips order among
# close-relevance siblings — exactly the MB-duplicate case.
KNOWN_DURATION_BOOST = 1.25


def rerank_tracks(
    tracks: List[Track],
    *,
    expected_title: str,
    expected_artist: str,
    prefer_known_duration: bool = False,
) -> List[Track]:
    """Return a copy of ``tracks`` sorted by descending relevance
    score against the expected title + artist.

    Caller's input list is left untouched. Ties keep the source's
    original ordering (the right fallback when two candidates score
    identically — the source's popularity signal is still useful as a
    tiebreak).

    ``prefer_known_duration``: when True, recordings with non-zero
    ``duration_ms`` get their score multiplied by
    ``KNOWN_DURATION_BOOST``. Used for MusicBrainz, which often has
    several recordings per song (single edition, album edition,
    compilations, remasters) where some carry length data and some
    don't. Real case: Zeds Dead "Coffee Break" — the canonical
    recording lives on the Single release (album_type='single',
    weight 0.85) while a length-less sibling lives on an Album release
    (weight 1.0). Without the boost, the length-less album edition
    wins and the user sees 0:00 instead of 3:04. Cover / karaoke
    penalties dominate the boost (their penalty is 0.05) so a
    length-known tribute still loses to a length-less canonical
    match.

    No-op when both ``expected_title`` and ``expected_artist`` are
    empty (no signal to rank against — input order is returned and
    ``prefer_known_duration`` has no effect)."""
    if not expected_title and not expected_artist:
        return list(tracks)

    scored = []
    for idx, track in enumerate(tracks):
        score = score_track(
            track,
            expected_title=expected_title,
            expected_artist=expected_artist,
        )
        # A missing duration_ms (None) counts as "unknown", same as 0.
        if prefer_known_duration and (track.duration_ms or 0) > 0:
            score *= KNOWN_DURATION_BOOST
        scored.append((score, idx, track))

    # Score descending; original index ascending breaks ties.
    scored.sort(key=lambda entry: (-entry[0], entry[1]))
    return [track for _score, _idx, track in scored]


def filter_and_rerank(
    tracks: List[Track],
    *,
    expected_title: str,
    expected_artist: str,
    min_score: Optional[float] = None,
) -> List[Track]:
    """Convenience: rerank then optionally drop everything below a
    score floor. Useful when callers want to hide low-confidence
    matches entirely instead of demoting them.

    When ``min_score`` is None this is exactly ``rerank_tracks`` with
    its default options. Otherwise each track is scored once with
    ``score_track``; that single score drives both the ordering
    (descending, ties keep input order — the same order
    ``rerank_tracks`` produces) and the ``score >= min_score`` cut.

    As in ``rerank_tracks``, input order is kept when both
    ``expected_title`` and ``expected_artist`` are empty; the floor
    still applies to the resulting scores in that case.
    """
    if min_score is None:
        return rerank_tracks(
            tracks,
            expected_title=expected_title,
            expected_artist=expected_artist,
        )

    scored = [
        (score_track(t, expected_title=expected_title, expected_artist=expected_artist), t)
        for t in tracks
    ]
    if expected_title or expected_artist:
        # list.sort is stable, so equal scores stay in source order.
        scored.sort(key=lambda pair: -pair[0])
    return [t for score, t in scored if score >= min_score]