You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/core/matching/artist_aliases.py

243 lines
9.4 KiB

"""Pure-function artist-name comparison with alias awareness.
Issue #442 — cross-script artist quarantines
-----------------------------------------------------
A file tagged with one spelling of an artist's name (e.g. the
Japanese kanji `澤野弘之`) was being quarantined when SoulSync's
expected-artist metadata used the romanized spelling
(`Hiroyuki Sawano`). Raw similarity comparison scores 0% across
scripts even though MusicBrainz already knows both names belong to
the same artist (its alias list).
This module is the shared resolution helper. Given an expected
artist name, an actual artist name, and an iterable of known
aliases, it returns whether they should be treated as the same
artist, plus the similarity score the decision was based on (the
first candidate score to reach the threshold, otherwise the highest
score seen across the candidate set).
Pure function design:
- No I/O, no DB access, no network
- Caller supplies aliases (looked up from library DB or live MB)
- Caller may supply its own similarity function (which applies
whatever normalisation it needs) to keep the helper
provider-neutral (the verifier and the matching engine use
slightly different normalizers — let each pass its own)
- Returns ``(matched: bool, score: float)`` so callers can log
the score they made the decision on
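Backward compat: when ``aliases`` is empty (or the looking-up
caller hasn't been wired yet), no alias comparisons are made and the
helper degrades to the direct similarity comparison, followed by a
per-contributor comparison when ``actual`` is a multi-artist credit
string. For single-artist names this is identical to the pre-fix
behaviour.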
"""
from __future__ import annotations
import re
from difflib import SequenceMatcher
from typing import Callable, Iterable, List, Optional, Tuple
# Similarity score at or above which two artist names count as the same
# artist. Mirrors ARTIST_MATCH_THRESHOLD in core/acoustid_verification.py
# so that callers relying on the default keep the verifier's current
# behaviour; callers may still pass their own threshold.
DEFAULT_ARTIST_MATCH_THRESHOLD = 0.6

# Separators used to break a multi-artist credit string apart.
#
# AcoustID reports the FULL credit ("Okayracer, aldrch & poptropicaslutz!")
# whereas the library DB stores only the primary artist ("Okayracer").
# Comparing the raw strings scores roughly 40% even though the primary
# artist is right there in the credit. Splitting on the separators below
# lets every contributor be compared on its own, so the primary-artist
# comparison scores near 100%.
#
# There are two families of separator:
#   * punctuation (comma, ampersand, semicolon, slash, plus) — surrounding
#     whitespace is optional;
#   * keywords ("feat", "ft", "featuring", "with", "vs", "x") — these MUST
#     have whitespace on both sides, otherwise names that merely contain
#     those letters would be torn apart.
_CREDIT_PUNCT_SPLITTER = r'\s*[,&;/+]\s*'
_CREDIT_KEYWORD_SPLITTER = r'\s+(?:feat\.?|ft\.?|featuring|with|vs\.?|x)\s+'

# Either family, matched case-insensitively ("Feat.", "VS", "X", ...).
_CREDIT_SPLITTER = re.compile(
    '(?:' + _CREDIT_PUNCT_SPLITTER + '|' + _CREDIT_KEYWORD_SPLITTER + ')',
    re.IGNORECASE,
)
def _default_normalize(text: str) -> str:
    """Return ``text`` trimmed of surrounding whitespace and lowercased.

    Deliberately minimal: it is only the fallback used when the caller
    does not provide its own comparison, and callers usually want
    something stricter (parenthetical stripping, punctuation removal).

    Any falsy input (``None``, ``''``, ``0`` ...) yields ``''``; other
    non-string values are passed through ``str()`` first.
    """
    return str(text).strip().lower() if text else ''
def _default_similarity(a: str, b: str) -> float:
    """Score how alike two names are, in ``[0.0, 1.0]``.

    Both inputs go through ``_default_normalize`` and are then compared
    with ``difflib.SequenceMatcher``. This is the comparison used when
    the caller does not pass a custom similarity callable.

    Returns:
        ``0.0`` if either side normalises to an empty string, ``1.0``
        if the normalised strings are equal, otherwise the
        ``SequenceMatcher`` ratio of the two normalised strings.
    """
    left, right = _default_normalize(a), _default_normalize(b)
    if not (left and right):
        # Nothing meaningful to compare against.
        return 0.0
    if left == right:
        # Exact match after normalisation — skip the matcher entirely.
        return 1.0
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
def split_artist_credit(credit: str) -> List[str]:
    """Break a multi-artist credit string into its individual names.

    The string is split on the module's credit separators (punctuation
    such as ``,`` ``&`` ``;`` ``/`` ``+`` and whitespace-bounded keywords
    such as ``feat.`` / ``ft.`` / ``featuring`` / ``with`` / ``vs.`` /
    ``x``, case-insensitively). Each piece is trimmed, and empty pieces
    are discarded.

    Examples:
        - ``"Okayracer, aldrch & poptropicaslutz!"`` → ``["Okayracer", "aldrch", "poptropicaslutz!"]``
        - ``"Daft Punk feat. Pharrell"`` → ``["Daft Punk", "Pharrell"]``
        - ``"Artist1 / Artist2 / Artist3"`` → ``["Artist1", "Artist2", "Artist3"]``
        - ``"Solo Artist"`` → ``["Solo Artist"]`` (no separator → one entry)

    Returns:
        The contributor names in their original order. The list is
        empty when ``credit`` is falsy or when nothing but separators /
        whitespace remains after splitting (e.g. ``","``).
    """
    if not credit:
        return []
    pieces = _CREDIT_SPLITTER.split(str(credit))
    trimmed = (piece.strip() for piece in pieces if piece)
    return [name for name in trimmed if name]
def _coerce_aliases(aliases: Optional[Iterable[str]]) -> Tuple[str, ...]:
    """Normalise the ``aliases`` input to a tuple of clean strings.

    Accepts ``None``, empty iterables, lists, tuples, sets, generators,
    or a single bare string (treated as one alias, not as a sequence of
    characters).

    Entries that are ``None``, not strings (e.g. raw MusicBrainz alias
    dicts, numbers), or empty / whitespace-only are dropped silently so
    callers do not have to pre-clean their data. Surviving entries are
    whitespace-trimmed and keep their original order.

    Args:
        aliases: The caller-supplied alias collection, or ``None``.

    Returns:
        A tuple of non-empty, trimmed alias strings (possibly empty).
    """
    if not aliases:
        return ()
    if isinstance(aliases, str):
        # A lone string is itself iterable; without this guard every
        # character would be treated as a separate alias.
        aliases = (aliases,)
    # Only genuine strings are kept — stringifying arbitrary objects
    # (dicts, ints, ...) would produce bogus alias names.
    trimmed = (value.strip() for value in aliases if isinstance(value, str))
    return tuple(text for text in trimmed if text)
def artist_names_match(
    expected: str,
    actual: str,
    *,
    aliases: Optional[Iterable[str]] = None,
    threshold: float = DEFAULT_ARTIST_MATCH_THRESHOLD,
    similarity: Optional[Callable[[str, str], float]] = None,
) -> Tuple[bool, float]:
    """Decide whether ``actual`` names the same artist as ``expected``,
    taking known aliases and multi-artist credits into account.

    Candidate comparisons are tried in this order, stopping at the first
    one whose score reaches ``threshold``:

    1. ``expected`` vs ``actual`` (the direct comparison);
    2. ``expected`` vs each contributor of ``actual``, when ``actual``
       splits into more than one name (e.g. ``"A, B & C"``);
    3. for every alias in turn: alias vs ``actual``, then alias vs each
       contributor of ``actual`` (again only for multi-name credits).

    Args:
        expected: The artist name the caller expected (typically from
            metadata-source data — Spotify / iTunes / Deezer track
            payload).
        actual: The artist name the caller observed (typically from an
            AcoustID recording or a downloaded file's tag). May be a
            full multi-artist credit string.
        aliases: Known alternate spellings of ``expected``. Empty or
            omitted means no alias comparisons are made.
        threshold: Score at or above which the names count as a match.
            Defaults to ``DEFAULT_ARTIST_MATCH_THRESHOLD``.
        similarity: Optional ``(a, b) -> float`` scoring function, so a
            caller can apply its own normalisation. Defaults to the
            module's lowercase + SequenceMatcher comparison.

    Returns:
        ``(matched, score)``. On a match, ``score`` is the score of the
        first candidate that reached ``threshold`` (later candidates are
        not evaluated, so it is not necessarily the global maximum). On
        a miss, ``score`` is the highest score seen across all
        candidates — useful for logging how close the names were.
    """
    compare = similarity if similarity else _default_similarity

    # Fast path: the plain expected-vs-actual comparison. Its score also
    # seeds the running maximum reported when nothing matches.
    baseline = compare(expected, actual)
    if baseline >= threshold:
        return True, baseline

    def fallback_pairs():
        """Lazily yield the remaining (reference, observed) pairs."""
        # AcoustID and media-server clients often surface the FULL
        # credit while the library DB holds only the primary artist, so
        # each contributor is compared individually. When ``actual`` is
        # a single name, split_artist_credit returns just that name —
        # the comparison already done above — so it is not repeated.
        contributors = split_artist_credit(actual)
        is_multi_credit = len(contributors) > 1
        if is_multi_credit:
            for contributor in contributors:
                yield expected, contributor
        # Each alias is an alternate spelling of the EXPECTED artist and
        # is matched against the ACTUAL name, then against every
        # contributor so cross-script primary-in-collab cases (e.g.
        # expected='Hiroyuki Sawano', actual='澤野弘之, FeaturedJp')
        # still bridge.
        for known_name in _coerce_aliases(aliases):
            yield known_name, actual
            if is_multi_credit:
                for contributor in contributors:
                    yield known_name, contributor

    highest = baseline
    for reference, observed in fallback_pairs():
        current = compare(reference, observed)
        if current > highest:
            highest = current
        if current >= threshold:
            return True, current
    return False, highest
def best_alias_match(
    expected: str,
    actual: str,
    aliases: Optional[Iterable[str]] = None,
    *,
    similarity: Optional[Callable[[str, str], float]] = None,
) -> Tuple[Optional[str], float]:
    """Report which name — ``expected`` itself or one of its aliases —
    best matches ``actual``, together with the winning score.

    Companion to ``artist_names_match`` for callers that want to surface
    which alias triggered a match (debug logging, UI explanations). No
    threshold is applied — the result is purely informative.

    To stay consistent with ``artist_names_match``, a multi-artist
    ``actual`` (e.g. ``"澤野弘之, FeaturedJp"``) is also compared
    contributor by contributor, so an alias that matches only one
    contributor of the credit is still reported as the winner. For a
    single-name ``actual`` only the whole string is compared.

    Args:
        expected: The artist name the caller expected.
        actual: The artist name (or full credit string) observed.
        aliases: Known alternate spellings of ``expected``.
        similarity: Optional ``(a, b) -> float`` scoring function.
            Defaults to the module's lowercase + SequenceMatcher
            comparison.

    Returns:
        ``(winner, score)`` where ``winner`` is the alias string when an
        alias strictly outscored every ``expected``-based comparison,
        and ``None`` when ``expected`` itself scored best (or everything
        tied, including at zero). ``score`` is the highest score seen.
    """
    sim = similarity or _default_similarity

    # Observed-side candidates: the whole string first, then — only for
    # genuine multi-artist credits — each individual contributor.
    observed = [actual]
    contributors = split_artist_credit(actual)
    if len(contributors) > 1:
        observed.extend(contributors)

    # Baseline: how well ``expected`` itself does. ``winner`` stays None
    # for as long as no alias strictly beats it.
    winner: Optional[str] = None
    best = sim(expected, actual)
    for candidate in observed[1:]:
        score = sim(expected, candidate)
        if score > best:
            best = score

    for alias in _coerce_aliases(aliases):
        for candidate in observed:
            score = sim(alias, candidate)
            if score > best:
                best = score
                winner = alias
    return winner, best