# Source: SoulSync/core/acoustid_verification.py
"""
AcoustID Verification Service
Verifies downloaded audio files match expected track metadata by comparing
title/artist from AcoustID fingerprint results against the expected track info.
If the audio fingerprint confidently identifies a DIFFERENT song than expected,
the file is flagged as incorrect.
"""
import re
import threading
from difflib import SequenceMatcher
from typing import Optional, Dict, Any, Tuple, List
from enum import Enum
from utils.logging_config import get_logger
from core.acoustid_client import AcoustIDClient
from core.matching_engine import MusicMatchingEngine
from core.matching.version_mismatch import is_acceptable_version_mismatch
from core.musicbrainz_client import MusicBrainzClient
# Module-level logger; every message in this file goes through it.
logger = get_logger("acoustid.verification")

# Thresholds (all on a 0.0-1.0 scale).
MIN_ACOUSTID_SCORE = 0.80  # Minimum AcoustID fingerprint score to trust; below this, verification is skipped
TITLE_MATCH_THRESHOLD = 0.70  # Normalized title similarity needed to consider a match
ARTIST_MATCH_THRESHOLD = 0.60  # Normalized artist similarity needed to consider a match

# Single matching-engine instance so version detection reuses the same patterns
# used by the pre-download Soulseek matcher (remix / live / acoustic /
# instrumental / etc). detect_version_type doesn't use self state, so one
# shared instance is fine.
# NOTE(review): the "no self state" claim comes from the original comment and
# cannot be confirmed from this file — verify against MusicMatchingEngine.
_match_engine_for_version = MusicMatchingEngine()
def _detect_title_version(title: str) -> str:
    """Classify a track title by the version marker it carries.

    A falsy title (empty string or ``None``) is reported as ``'original'``
    without consulting the matching engine. Otherwise the label is the first
    element of the pair returned by the shared engine's
    ``detect_version_type``; the second element is discarded.

    Args:
        title: Raw (un-normalized) track title.

    Returns:
        The version label for ``title``.
    """
    if title:
        label, _ignored = _match_engine_for_version.detect_version_type(title)
        return label
    return 'original'
class VerificationResult(Enum):
    """Outcome categories reported by audio verification."""

    # Title/artist agree with what was expected — the file is correct.
    PASS = "pass"
    # Title/artist disagree — the wrong file was downloaded.
    FAIL = "fail"
    # Verification could not be performed (error or service unavailable);
    # callers continue as normal.
    SKIP = "skip"
    # Verification is not enabled.
    DISABLED = "disabled"
def _normalize(text: str) -> str:
    """Reduce a title/artist string to a bare, comparable form.

    The input is lowercased and trimmed, then a fixed sequence of regex
    rewrites is applied in order. Order matters: annotation stripping must
    happen before punctuation is removed, because the patterns rely on the
    brackets, dots and dashes still being present.

    Args:
        text: Raw string; a falsy value (``None`` or ``""``) yields ``""``.

    Returns:
        The normalized string, possibly empty.
    """
    if not text:
        return ""

    # (pattern, replacement, flags) — applied top to bottom.
    rewrite_steps = (
        # Every parenthetical: (Live), (Remastered), (feat. ...), (from "..." Soundtrack), ...
        (r'\s*\([^)]*\)', '', 0),
        # Every square-bracket annotation: [Live], [Remastered], [Deluxe], ...
        (r'\s*\[[^\]]*\]', '', 0),
        # Trailing featuring credit outside brackets: "feat. ...", "ft. ...", "featuring ..."
        (r'\s+(?:feat\.?|ft\.?|featuring)\s+.*$', '', re.IGNORECASE),
        # Trailing dash-separated version tag: "- Vocal", "- Instrumental", "- Live", ...
        (r'\s*-\s*(?:vocal|instrumental|acoustic|live|remix|cover|clean|explicit|radio\s*edit|original\s*mix|extended\s*mix|club\s*mix)\s*$', '', re.IGNORECASE),
        # Soundtrack/source subtitle: ' - From "..." Soundtrack', ' - from the film ...'
        (r'\s*-\s*from\s+.+$', '', re.IGNORECASE),
        # Anything that is neither a word character nor whitespace
        (r'[^\w\s]', '', 0),
        # Runs of whitespace collapse to a single space
        (r'\s+', ' ', 0),
    )

    cleaned = text.lower().strip()
    for pattern, replacement, flags in rewrite_steps:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()
def _similarity(a: str, b: str) -> float:
    """Score how alike two strings are once both have been normalized.

    Args:
        a: First raw string.
        b: Second raw string.

    Returns:
        ``0.0`` when either side normalizes to an empty string, ``1.0`` when
        the normalized forms are identical, otherwise the
        ``difflib.SequenceMatcher`` ratio of the two normalized forms.
    """
    left, right = _normalize(a), _normalize(b)
    if not (left and right):
        return 0.0
    return 1.0 if left == right else SequenceMatcher(None, left, right).ratio()
def _alias_aware_artist_sim(
    expected_artist: str,
    actual_artist: str,
    aliases: Optional[Any] = None,
) -> float:
    """Score ``actual_artist`` against the expected artist and its aliases.

    Issue #442 — artists credited in different scripts (e.g.
    `Hiroyuki Sawano` vs `澤野弘之`) score near zero under plain
    `_similarity` even though MusicBrainz aliases connect them. When the
    direct comparison is too weak, the comparison is delegated to
    `core.matching.artist_aliases.artist_names_match` with the alias list.

    Args:
        expected_artist: Artist name the caller asked for.
        actual_artist: Artist name taken from the fingerprint result.
        aliases: Either an iterable of alias strings, or a zero-argument
            callable that returns one. A callable is invoked only after the
            direct comparison has fallen below `ARTIST_MATCH_THRESHOLD`, so
            a memoizing provider costs nothing on the happy path. ``None``
            disables alias handling.

    Returns:
        The direct similarity when `aliases` is ``None``, when the direct
        similarity already meets `ARTIST_MATCH_THRESHOLD`, or when the
        aliases resolve to something falsy. Otherwise the score reported by
        `artist_names_match`.

    Side effects:
        Logs one INFO line whenever the alias-based score reaches the
        threshold after the direct score did not, naming the winning alias
        so a later bug report can trace which alias produced the PASS.
    """
    from core.matching.artist_aliases import artist_names_match

    baseline = _similarity(expected_artist, actual_artist)

    # Nothing to resolve: no alias handle supplied, or the plain comparison
    # is already good enough.
    if aliases is None or baseline >= ARTIST_MATCH_THRESHOLD:
        return baseline

    # A callable provider is materialized here, at the last possible moment.
    alias_names = aliases() if callable(aliases) else aliases
    if not alias_names:
        return baseline

    _, alias_score = artist_names_match(
        expected_artist,
        actual_artist,
        aliases=alias_names,
        threshold=ARTIST_MATCH_THRESHOLD,
        similarity=_similarity,
    )

    # Reaching this point already implies baseline < ARTIST_MATCH_THRESHOLD,
    # so any passing alias score is a rescue worth recording.
    if alias_score >= ARTIST_MATCH_THRESHOLD:
        from core.matching.artist_aliases import best_alias_match
        winning_alias, _ = best_alias_match(
            expected_artist, actual_artist, alias_names, similarity=_similarity,
        )
        logger.info(
            "Artist alias rescued comparison: expected=%r vs actual=%r "
            "(direct sim=%.2f, alias %r → score=%.2f)",
            expected_artist, actual_artist, baseline, winning_alias, alias_score,
        )
    return alias_score
def _find_best_title_artist_match(
    recordings: List[Dict[str, Any]],
    expected_title: str,
    expected_artist: str,
    expected_artist_aliases: Optional[Any] = None,
) -> Tuple[Optional[Dict], float, float]:
    """Pick the recording whose title and artist best fit the expectation.

    Every recording is scored as ``0.6 * title_similarity +
    0.4 * artist_similarity`` (title dominates because it is the primary
    identifier). Artist similarity is alias-aware (issue #442): the
    `expected_artist_aliases` handle is forwarded unchanged to
    `_alias_aware_artist_sim`.

    Args:
        recordings: Recording dicts; the ``'title'`` and ``'artist'`` keys
            are read, with missing or ``None`` values treated as ``''``.
        expected_title: Title the caller asked for.
        expected_artist: Artist the caller asked for.
        expected_artist_aliases: Iterable of alias strings, a callable that
            returns one, or ``None``.

    Returns:
        ``(best_recording, title_similarity, artist_similarity)``. The first
        recording with the highest combined score wins. If no recording
        achieves a combined score above zero the result is
        ``(None, 0.0, 0.0)``.
    """
    winner: Optional[Dict] = None
    winner_title_score = 0.0
    winner_artist_score = 0.0
    top_weighted = 0.0

    for candidate in recordings:
        candidate_title = candidate.get('title') or ''
        candidate_artist = candidate.get('artist') or ''

        title_score = _similarity(expected_title, candidate_title)
        artist_score = _alias_aware_artist_sim(
            expected_artist, candidate_artist, expected_artist_aliases,
        )

        weighted = (title_score * 0.6) + (artist_score * 0.4)
        # Strictly greater: ties keep the earlier recording.
        if weighted > top_weighted:
            top_weighted = weighted
            winner = candidate
            winner_title_score, winner_artist_score = title_score, artist_score

    return winner, winner_title_score, winner_artist_score
# Shared MusicBrainz client for enrichment lookups. Starts as None and is
# created on first use by _get_mb_client(); the lock guards that creation.
_mb_client = None
_mb_client_lock = threading.Lock()
# Shared MusicBrainzService for alias lookups (issue #442). Service
# layer wraps the raw client + adds caching + DB access — all of which
# the alias resolution chain (library DB → cache → live MB) needs.
# Starts as None and is created on first use by _get_mb_service().
_mb_service = None
_mb_service_lock = threading.Lock()
# Upper bound on MusicBrainz recording lookups made by a single call to
# _enrich_recordings_from_musicbrainz().
MAX_MB_ENRICHMENT_LOOKUPS = 3
def _get_mb_client() -> MusicBrainzClient:
    """Return the module-wide MusicBrainz client, building it on first use.

    The instance is stored in the module global ``_mb_client``. Creation is
    guarded by ``_mb_client_lock`` and re-checked inside the lock, so
    concurrent first callers end up sharing a single client.
    """
    global _mb_client
    if _mb_client is not None:
        return _mb_client
    with _mb_client_lock:
        # Another thread may have finished construction while we waited.
        if _mb_client is None:
            _mb_client = MusicBrainzClient()
    return _mb_client
def _get_mb_service():
    """Return the module-wide MusicBrainzService, building it on first use.

    Used by the alias-resolution path of `verify_audio_file`. Both the
    service class and the database accessor are imported here rather than at
    module import time, so simply importing this module does not pull them
    in; the service is constructed from ``get_database()`` only when first
    requested. Creation is guarded by ``_mb_service_lock`` and re-checked
    inside the lock.
    """
    global _mb_service
    if _mb_service is not None:
        return _mb_service
    with _mb_service_lock:
        # Another thread may have finished construction while we waited.
        if _mb_service is None:
            from core.musicbrainz_service import MusicBrainzService
            from database.music_database import get_database
            _mb_service = MusicBrainzService(get_database())
    return _mb_service
def _resolve_expected_artist_aliases(expected_artist_name: str) -> List[str]:
    """Fetch alternate spellings for the expected artist, best-effort.

    Issue #442 — lets the verifier bridge cross-script artist comparisons
    (Japanese kanji ↔ romanized, Cyrillic ↔ Latin, etc.) without knowing
    how aliases are resolved. The lookup is delegated to the shared
    MusicBrainz service's ``lookup_artist_aliases``.

    Args:
        expected_artist_name: Artist to look up; a falsy value short-circuits.

    Returns:
        Whatever the service returns for the artist, or ``[]`` when the name
        is falsy or the lookup raises for any reason (the failure is logged
        at DEBUG level so verification can fall back to direct similarity).
    """
    if expected_artist_name:
        try:
            return _get_mb_service().lookup_artist_aliases(expected_artist_name)
        except Exception as exc:
            logger.debug("alias lookup failed for %r: %s", expected_artist_name, exc)
    return []
def _enrich_recordings_from_musicbrainz(
    recordings: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Enrich recordings that are missing title/artist by looking up their
    MBIDs via MusicBrainz.

    AcoustID often returns recordings with title=None, artist=None even though
    the MBIDs are valid. This resolves the metadata so verification can compare
    title/artist instead of skipping.

    At most ``MAX_MB_ENRICHMENT_LOOKUPS`` MusicBrainz requests are made per
    call. Recordings without an MBID are skipped and do not use up that
    budget, so a leading run of MBID-less entries cannot starve later
    recordings that could be resolved.

    Args:
        recordings: List of recording dicts from fingerprint_and_lookup().
            The ``'mbid'``, ``'title'`` and ``'artist'`` keys are read.

    Returns:
        The same list object, with ``'title'``/``'artist'`` filled in (in
        place) where MusicBrainz supplied them. Lookup or parsing failures
        are logged at DEBUG level and never raised.
    """
    # Fast path: if any recording already has title AND artist, no enrichment needed
    if any(rec.get('title') and rec.get('artist') for rec in recordings):
        return recordings

    logger.info(f"Enriching {len(recordings)} recordings via MusicBrainz (all missing title/artist)...")
    mb = _get_mb_client()
    enriched_count = 0
    lookups_attempted = 0

    for rec in recordings:
        if lookups_attempted >= MAX_MB_ENRICHMENT_LOOKUPS:
            break

        mbid = rec.get('mbid')
        if not mbid:
            # Nothing to look up; this entry must not count against the cap.
            continue

        lookups_attempted += 1
        try:
            data = mb.get_recording(mbid, includes=['artist-credits'])
            if not data:
                logger.debug(f"MusicBrainz returned no data for recording {mbid}")
                continue

            title = data.get('title')

            # Build artist string from artist-credit array
            # Each entry has {"artist": {"name": "..."}, "joinphrase": "..."}
            # Parsed defensively (missing/None array, non-dict entries, None
            # joinphrase) so a malformed credit cannot discard a valid title.
            artist_parts = []
            for credit in data.get('artist-credit') or []:
                if not isinstance(credit, dict):
                    continue
                name = (credit.get('artist') or {}).get('name', '')
                if name:
                    artist_parts.append(name + (credit.get('joinphrase') or ''))
            artist = ''.join(artist_parts).strip() if artist_parts else None

            if title:
                rec['title'] = title
                logger.debug(f"Enriched {mbid}: title='{title}'")
            if artist:
                rec['artist'] = artist
                logger.debug(f"Enriched {mbid}: artist='{artist}'")
            if title or artist:
                enriched_count += 1
        except Exception as e:
            # Best-effort: one failed lookup must not abort the others.
            logger.debug(f"Failed to enrich recording {mbid}: {e}")
            continue

    logger.info(f"Enriched {enriched_count}/{lookups_attempted} recordings from MusicBrainz")
    return recordings
class AcoustIDVerification:
    """
    Verification service that compares audio fingerprint identity
    against expected track metadata using title/artist matching.

    Design Principle: FAIL OPEN
    - Only returns FAIL when we are CONFIDENT the file is wrong
    - Any error or uncertainty results in SKIP (continue normally)
    - Never blocks downloads due to verification infrastructure issues

    Usage:
        verifier = AcoustIDVerification()
        result, message = verifier.verify_audio_file(
            "/path/to/downloaded.mp3",
            "Expected Song Title",
            "Expected Artist"
        )
        if result == VerificationResult.FAIL:
            # Move to quarantine
        else:
            # Continue with normal processing (PASS, SKIP, or DISABLED)
    """

    def __init__(self):
        """Initialize verification service."""
        self.acoustid_client = AcoustIDClient()

    def verify_audio_file(
        self,
        audio_file_path: str,
        expected_track_name: str,
        expected_artist_name: str,
        context: Optional[Dict[str, Any]] = None
    ) -> Tuple[VerificationResult, str]:
        """
        Verify that an audio file matches expected track metadata.

        Compares title/artist from AcoustID fingerprint results against
        the expected track info. MusicBrainz is consulted only as a
        fallback: to fill in title/artist when every AcoustID recording
        lacks them, and to resolve artist aliases when a direct artist
        comparison falls below threshold.

        Decision order:
          1. SKIP if AcoustID is unavailable, returns nothing, or the
             fingerprint score is below ``MIN_ACOUSTID_SCORE``.
          2. FAIL on an unacceptable version mismatch (e.g. original vs
             instrumental) between the expected title and the best match.
          3. PASS when title and artist both meet their thresholds.
          4. Title matches, artist does not: PASS if the expected artist
             appears in any recording, FAIL on a clear artist mismatch,
             otherwise SKIP.
          5. Title does not match: PASS if any same-version recording
             matches both title and artist, SKIP on the high-confidence
             language/script escape hatch, otherwise FAIL.

        Args:
            audio_file_path: Path to the downloaded audio file
            expected_track_name: Track name we expected to download
            expected_artist_name: Artist name we expected
            context: Optional download context for logging/debugging
                (accepted for callers; not read by this method)

        Returns:
            Tuple of (VerificationResult, reason_message). Never raises:
            any unexpected exception is logged and reported as SKIP.
        """
        try:
            # Step 1: Check availability
            available, reason = self.acoustid_client.is_available()
            if not available:
                logger.debug(f"AcoustID verification skipped: {reason}")
                return VerificationResult.SKIP, reason

            # Step 2: Fingerprint and lookup in AcoustID
            logger.info(f"Fingerprinting and looking up: {audio_file_path}")
            acoustid_result = self.acoustid_client.fingerprint_and_lookup(audio_file_path)
            if not acoustid_result:
                return VerificationResult.SKIP, "Track not found in AcoustID database"

            recordings = acoustid_result.get('recordings', [])
            best_score = acoustid_result.get('best_score', 0)
            if not recordings:
                return VerificationResult.SKIP, "AcoustID returned no recordings"

            logger.debug(
                f"AcoustID returned {len(recordings)} recording(s) "
                f"(best fingerprint score: {best_score:.2f})"
            )

            # Step 3: Check fingerprint confidence
            if best_score < MIN_ACOUSTID_SCORE:
                msg = f"AcoustID fingerprint score too low ({best_score:.2f}) to verify"
                logger.info(msg)
                return VerificationResult.SKIP, msg

            # Enrich recordings that are missing title/artist via MusicBrainz lookup
            recordings = _enrich_recordings_from_musicbrainz(recordings)

            # Issue #442 — alias resolution is LAZY. We pass a memoising
            # thunk to the artist-comparison sites; it only fires the
            # multi-tier lookup (library DB → cache → live MB) when
            # direct artist similarity falls below threshold. Verifications
            # where the direct match already passes (the common case for
            # same-script artist names) never trigger any lookup work,
            # so the fix doesn't add a per-verification DB query for the
            # happy path. When the thunk DOES fire, the result is cached
            # in the closure so the 3 comparison sites within one
            # verification share a single resolution pass.
            _alias_cache: Dict[str, Any] = {}

            def _aliases_provider() -> List[str]:
                if 'value' not in _alias_cache:
                    resolved = _resolve_expected_artist_aliases(expected_artist_name)
                    _alias_cache['value'] = resolved
                    if resolved:
                        logger.debug(
                            "Resolved %d aliases for expected artist '%s'",
                            len(resolved), expected_artist_name,
                        )
                return _alias_cache['value']

            # Step 4: Find best title/artist match among AcoustID results
            best_rec, title_sim, artist_sim = _find_best_title_artist_match(
                recordings, expected_track_name, expected_artist_name,
                expected_artist_aliases=_aliases_provider,
            )
            if not best_rec:
                return VerificationResult.SKIP, "No recordings with title/artist info"

            # `or '?'` (rather than a .get default) so a key that is present
            # but None — which AcoustID does return — is also shown as '?'
            # instead of leaking 'None' into log and result messages.
            matched_title = best_rec.get('title') or '?'
            matched_artist = best_rec.get('artist') or '?'
            logger.info(
                f"Best match: '{matched_title}' by '{matched_artist}' "
                f"(title_sim={title_sim:.2f}, artist_sim={artist_sim:.2f})"
            )

            # Step 4b: Version-mismatch gate.
            #
            # The ``_normalize`` step deliberately strips parentheticals and
            # version tags ("(Instrumental)", "- Live", etc) so that legit
            # name variations don't fail the title-similarity comparison.
            # That same stripping made it impossible to tell a vocal track
            # apart from its instrumental: "In My Feelings" and "In My
            # Feelings (Instrumental)" both normalize to "in my feelings",
            # the title sim ends up 1.0, and the file passes verification
            # even though it's the wrong cut.
            #
            # Detect the version on each side BEFORE normalization runs.
            # If the expected track and the AcoustID-matched recording
            # disagree on version (one is original, the other is
            # instrumental / live / remix / acoustic / etc), reject — the
            # fingerprint identified a real song but it's not the one the
            # caller asked for.
            expected_version = _detect_title_version(expected_track_name)
            # Use the raw recording title here (not the '?' display
            # placeholder) so a missing title is classified as 'original'.
            matched_version = _detect_title_version(best_rec.get('title') or '')
            if expected_version != matched_version:
                # Issue #607 (AfonsoG6): MusicBrainz often stores live
                # recordings with bare titles ("Clarity") while the
                # release entry carries the venue annotation ("Clarity
                # (Live at Blossom Music Center, ...)"). The fingerprint
                # correctly identifies the LIVE recording; only the
                # title text is bare. Helper accepts the one-sided bare
                # case when fingerprint + bare-title + artist all agree.
                # Two-sided version mismatches (live vs remix etc) stay
                # strict — those are genuinely different recordings.
                if is_acceptable_version_mismatch(
                    expected_version, matched_version,
                    fingerprint_score=best_score,
                    title_similarity=title_sim,
                    artist_similarity=artist_sim,
                ):
                    logger.info(
                        f"AcoustID version annotation differs (expected={expected_version}, "
                        f"matched={matched_version}) but fingerprint+title+artist all match — "
                        f"accepting (likely MB metadata gap on a live/version-annotated recording)"
                    )
                else:
                    msg = (
                        f"Version mismatch: expected '{expected_track_name}' ({expected_version}) "
                        f"but file is '{matched_title}' ({matched_version})"
                    )
                    logger.warning(f"AcoustID verification FAILED (version mismatch) - {msg}")
                    return VerificationResult.FAIL, msg

            # Step 5: Decide pass/fail based on similarity
            if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim >= ARTIST_MATCH_THRESHOLD:
                msg = (
                    f"Audio verified: '{matched_title}' by '{matched_artist}' "
                    f"matches expected '{expected_track_name}' by '{expected_artist_name}' "
                    f"(title={title_sim:.0%}, artist={artist_sim:.0%})"
                )
                logger.info(f"AcoustID verification PASSED - {msg}")
                return VerificationResult.PASS, msg

            # Title matches but artist doesn't — could be a cover/collab OR a
            # genuinely different track with the same name. Distinguish the
            # two by checking whether the expected artist appears anywhere in
            # AcoustID's returned recordings.
            if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim < ARTIST_MATCH_THRESHOLD:
                # First: if the expected artist is present in ANY recording's
                # metadata for this fingerprint, it's likely the right track
                # (AcoustID's "best" match just picked the wrong variant).
                for rec in recordings:
                    # `or ''` keeps None artists consistent with the other
                    # comparison sites in this method.
                    rec_artist = rec.get('artist') or ''
                    if _alias_aware_artist_sim(
                        expected_artist_name, rec_artist, _aliases_provider,
                    ) >= ARTIST_MATCH_THRESHOLD:
                        msg = (
                            f"Audio verified: found '{expected_track_name}' by '{expected_artist_name}' "
                            f"in AcoustID results"
                        )
                        logger.info(f"AcoustID verification PASSED (secondary match) - {msg}")
                        return VerificationResult.PASS, msg

                # Expected artist wasn't found anywhere. Decide between:
                #   - FAIL: clear mismatch, e.g. "Tom Walker" (sim ~0.2) when
                #     expecting "Maduk" — different song with same name
                #   - SKIP: ambiguous, e.g. collab / alt credit / formatting
                #     difference (sim 0.3-0.6)
                #
                # The 0.3 cutoff catches hard mismatches while preserving the
                # benefit of the doubt for borderline artist formatting.
                CLEAR_MISMATCH_THRESHOLD = 0.3
                if artist_sim < CLEAR_MISMATCH_THRESHOLD:
                    msg = (
                        f"Audio mismatch: file identified as '{matched_title}' by '{matched_artist}', "
                        f"expected '{expected_track_name}' by '{expected_artist_name}' "
                        f"(title={title_sim:.0%}, artist={artist_sim:.0%}) — "
                        f"expected artist not found in any AcoustID recording"
                    )
                    logger.warning(f"AcoustID verification FAILED (clear artist mismatch) - {msg}")
                    return VerificationResult.FAIL, msg

                msg = (
                    f"Title matches but artist unclear: "
                    f"AcoustID='{matched_title}' by '{matched_artist}', "
                    f"expected '{expected_track_name}' by '{expected_artist_name}' "
                    f"(artist_sim={artist_sim:.0%} — ambiguous, could be cover/collab)"
                )
                logger.info(f"AcoustID verification SKIPPED - {msg}")
                return VerificationResult.SKIP, msg

            # Title doesn't match — check ALL recordings for any title/artist match
            # (the best combined match might not be the right one if there are many results)
            # Skip recordings whose version (instrumental/live/etc) disagrees with
            # what the caller asked for — the version mismatch above checked
            # only the best recording, but a wrong-version variant could still
            # win this fallback scan if its bare title matched.
            for rec in recordings:
                t = rec.get('title') or ''
                a = rec.get('artist') or ''
                if _detect_title_version(t) != expected_version:
                    continue
                if (_similarity(expected_track_name, t) >= TITLE_MATCH_THRESHOLD and
                        _alias_aware_artist_sim(
                            expected_artist_name, a, _aliases_provider,
                        ) >= ARTIST_MATCH_THRESHOLD):
                    msg = (
                        f"Audio verified: found '{t}' by '{a}' in AcoustID results "
                        f"matching expected '{expected_track_name}' by '{expected_artist_name}'"
                    )
                    logger.info(f"AcoustID verification PASSED (scan match) - {msg}")
                    return VerificationResult.PASS, msg

            # No match found — but if fingerprint score is very high (≥0.95)
            # AND we have evidence the mismatch is a language/script case
            # (rather than two genuinely different songs by the same artist),
            # skip rather than quarantine a correct file. Two routes:
            #
            #   (a) Either title (expected or matched) contains non-ASCII
            #       characters — strong signal of transliteration /
            #       kanji↔roman cases. Artist must still be a strong match
            #       to use this path.
            #   (b) Both title AND artist similarity are very high (the song
            #       is recognizably the same with minor punctuation / casing
            #       differences that fell below the strict match thresholds).
            #
            # The OLD logic was ``title_sim >= 0.55 OR artist_sim >= match``.
            # That fired for English-vs-English songs by the same artist that
            # share NO actual content — e.g. "R.O.T.C (Interlude)" by
            # Kendrick Lamar getting accepted as "Rich (Interlude)" by
            # Kendrick Lamar because the artist matched perfectly and
            # "interlude" was shared in both titles. Reported by user when
            # downloading Mr. Morale: three tracks (Rich Interlude, Savior
            # Interlude, Savior) all received the wrong R.O.T.C audio file
            # because of this leak.

            # Use the BEST matching recording's strings here (not
            # `recordings[0]`) so the failure message reports the same
            # candidate the title/artist similarity scores came from.
            # Issue #607 (AfonsoG6) example 1: the prior code mixed
            # `recordings[0]`'s strings (which can be empty) with
            # `best_rec`'s scores, producing nonsense reasons like
            # "file identified as '' by '' (artist=100%)" when a later
            # recording in the list scored well on artist.
            display_title = matched_title or '?'
            display_artist = matched_artist or '?'

            has_non_ascii = (
                any(ord(c) > 127 for c in (expected_track_name or ''))
                or any(ord(c) > 127 for c in display_title)
            )
            language_script_skip = (
                best_score >= 0.95
                and has_non_ascii
                and artist_sim >= ARTIST_MATCH_THRESHOLD
            )
            # NOTE(review): every path that reaches this point has
            # title_sim < TITLE_MATCH_THRESHOLD (0.70) — both title-matching
            # branches above return — so the `title_sim >= 0.80` condition
            # below can never hold and route (b) is currently unreachable.
            # Left as-is pending a decision on the intended bounds; relaxing
            # it blindly risks reintroducing the leak described above.
            high_confidence_strong_match_skip = (
                best_score >= 0.95
                and title_sim >= 0.80
                and artist_sim >= ARTIST_MATCH_THRESHOLD
            )
            if language_script_skip or high_confidence_strong_match_skip:
                reason = (
                    "likely same song in different language/script"
                    if language_script_skip
                    else "title/artist match within tolerance"
                )
                # The explicit " — " separator keeps the reason from being
                # glued onto the closing quote of the expected artist.
                msg = (
                    f"Title/artist mismatch but fingerprint confidence very high ({best_score:.2f}): "
                    f"AcoustID='{display_title}' by '{display_artist}', "
                    f"expected '{expected_track_name}' by '{expected_artist_name}' "
                    f"— {reason}"
                )
                logger.info(f"AcoustID verification SKIPPED (high confidence) - {msg}")
                return VerificationResult.SKIP, msg

            # Trusted fingerprint, no metadata match, and no escape hatch
            # applied — the file is likely wrong.
            msg = (
                f"Audio mismatch: file identified as '{display_title}' by '{display_artist}', "
                f"expected '{expected_track_name}' by '{expected_artist_name}' "
                f"(title={title_sim:.0%}, artist={artist_sim:.0%})"
            )
            logger.warning(f"AcoustID verification FAILED - {msg}")
            return VerificationResult.FAIL, msg

        except Exception as e:
            # Any unexpected error -> SKIP (fail open)
            logger.error(f"Unexpected error during AcoustID verification: {e}")
            return VerificationResult.SKIP, f"Verification error: {str(e)}"

    def quick_check_available(self) -> Tuple[bool, str]:
        """
        Quick check if verification is available without doing a full verification.

        Returns:
            Tuple of (is_available, reason), exactly as reported by the
            AcoustID client's ``is_available()``.
        """
        return self.acoustid_client.is_available()