"""
AcoustID Verification Service

Verifies downloaded audio files match expected track metadata by comparing
title/artist from AcoustID fingerprint results against the expected track info.

If the audio fingerprint confidently identifies a DIFFERENT song than expected,
the file is flagged as incorrect.
"""

import re
import threading
from difflib import SequenceMatcher
from typing import Optional, Dict, Any, Tuple, List
from enum import Enum

from utils.logging_config import get_logger
from core.acoustid_client import AcoustIDClient
from core.matching_engine import MusicMatchingEngine
from core.matching.version_mismatch import is_acceptable_version_mismatch
from core.musicbrainz_client import MusicBrainzClient

logger = get_logger("acoustid.verification")

# Thresholds
MIN_ACOUSTID_SCORE = 0.80       # Minimum AcoustID fingerprint score to trust
TITLE_MATCH_THRESHOLD = 0.70    # Title similarity needed to consider a match
ARTIST_MATCH_THRESHOLD = 0.60   # Artist similarity needed to consider a match

# Single matching-engine instance so version detection reuses the same patterns
# used by the pre-download Soulseek matcher (remix / live / acoustic /
# instrumental / etc). detect_version_type doesn't use self state, so one
# shared instance is fine.
_match_engine_for_version = MusicMatchingEngine()


def _detect_title_version(title: str) -> str:
    """Return version label for a track title.

    Returns ``'original'`` when no version marker is detected, otherwise
    one of the labels produced by ``MusicMatchingEngine.detect_version_type``
    (``'instrumental'``, ``'live'``, ``'acoustic'``, ``'remix'``, etc).
    An empty or ``None`` title is treated as ``'original'``.
    """
    if not title:
        return 'original'
    version_type, _ = _match_engine_for_version.detect_version_type(title)
    return version_type


class VerificationResult(Enum):
    """Possible outcomes of audio verification."""
    PASS = "pass"          # Title/artist match - file is correct
    FAIL = "fail"          # Title/artist mismatch - wrong file downloaded
    SKIP = "skip"          # Could not verify (error or unavailable) - continue normally
    DISABLED = "disabled"  # Verification not enabled


def _normalize(text: str) -> str:
    """Normalize a string for comparison: lowercase, strip parentheticals, punctuation.

    Returns an empty string for empty/``None`` input. Note that a string made
    up only of parentheticals/brackets/punctuation also normalizes to ``""``.
    """
    if not text:
        return ""
    s = text.lower().strip()
    # Remove ALL parenthetical suffixes — these are metadata annotations, not core title
    # Covers: (Live), (Remastered), (Parody of ...), (from "..." Soundtrack), (feat. ...), etc.
    s = re.sub(r'\s*\([^)]*\)', '', s)
    # Remove ALL square bracket suffixes: [Live], [Remastered], [Deluxe], etc.
    s = re.sub(r'\s*\[[^\]]*\]', '', s)
    # Remove trailing featuring info not in parentheses: "feat. ...", "ft. ...", "featuring ..."
    s = re.sub(r'\s+(?:feat\.?|ft\.?|featuring)\s+.*$', '', s, flags=re.IGNORECASE)
    # Remove dash-separated version tags: "- Vocal", "- Instrumental", "- Acoustic", etc.
    s = re.sub(r'\s*-\s*(?:vocal|instrumental|acoustic|live|remix|cover|clean|explicit|radio\s*edit|original\s*mix|extended\s*mix|club\s*mix)\s*$', '', s, flags=re.IGNORECASE)
    # Remove soundtrack/source subtitles: ' - From "..." Soundtrack', ' - from the film ...'
    s = re.sub(r'\s*-\s*from\s+.+$', '', s, flags=re.IGNORECASE)
    # Remove non-alphanumeric except spaces
    s = re.sub(r'[^\w\s]', '', s)
    # Collapse whitespace
    s = re.sub(r'\s+', ' ', s).strip()
    return s


def _similarity(a: str, b: str) -> float:
    """Calculate similarity between two strings (0.0-1.0) after normalization.

    Returns 0.0 when either side normalizes to an empty string, 1.0 when the
    normalized forms are equal, otherwise the ``SequenceMatcher`` ratio.
    """
    na = _normalize(a)
    nb = _normalize(b)
    if not na or not nb:
        return 0.0
    if na == nb:
        return 1.0
    return SequenceMatcher(None, na, nb).ratio()


def _alias_aware_artist_sim(
    expected_artist: str,
    actual_artist: str,
    aliases: Optional[Any] = None,
) -> float:
    """Best artist-similarity across (expected, *aliases) vs actual.

    Issue #442 — when expected and actual are in different scripts
    (e.g. `Hiroyuki Sawano` vs `澤野弘之`), raw `_similarity` scores
    near 0% even though MusicBrainz aliases bridge them. Routes
    through the pure helper so the verifier inherits one shared
    contract.

    Returns the highest score across all candidates so existing
    threshold checks (>= ARTIST_MATCH_THRESHOLD) keep their semantics.
    When `aliases` is None or empty, behaves identically to the
    prior raw `_similarity(expected, actual)` call.

    `aliases` accepts two shapes:
    - **Iterable** (list/tuple/set of strings): used directly. Used
      by tests that already know the aliases.
    - **Callable**: invoked LAZILY only when direct similarity falls
      below the threshold. Lets the verifier pass a memoizing thunk
      that resolves aliases (DB / cache / live MB) only when needed.
      Verifications where the direct match already passes never
      trigger the lookup chain — no wasted DB query for the happy
      path.

    An empty/``None`` `actual_artist` short-circuits to the direct score
    (0.0) without resolving aliases: `_similarity` returns 0.0 whenever one
    side is empty, so no alias could rescue it, and resolving would only
    burn a lookup for recordings that carry no artist metadata.

    Diagnostic logging: emits an INFO line whenever an alias rescues
    a comparison that direct similarity would have failed. Lets
    future bug reports trace which alias triggered which PASS
    decision (e.g. "this file passed because alias `澤野弘之` matched
    the file's artist tag").
    """
    direct = _similarity(expected_artist, actual_artist)

    # Fast path — direct match already passes the threshold OR caller
    # supplied no aliases handle. Avoids any lookup work.
    if aliases is None:
        return direct
    if direct >= ARTIST_MATCH_THRESHOLD:
        return direct

    # Nothing to compare against: skip alias resolution entirely so a
    # recording without artist metadata never triggers the lookup chain.
    if not actual_artist:
        return direct

    # Resolve the iterable. Callable provider invoked NOW (lazily —
    # the caller can memoize the result across multiple invocations
    # within one verify_audio_file call).
    resolved = aliases() if callable(aliases) else aliases
    if not resolved:
        return direct

    # Imported lazily (function scope), only once alias matching is needed.
    from core.matching.artist_aliases import artist_names_match, best_alias_match

    _matched, score = artist_names_match(
        expected_artist,
        actual_artist,
        aliases=resolved,
        threshold=ARTIST_MATCH_THRESHOLD,
        similarity=_similarity,
    )

    # Diagnostic — alias rescued a comparison that direct would
    # have failed. Worth logging at INFO since it's a user-visible
    # decision (file PASS instead of FAIL). One line per rescue
    # within a single verify call.
    if score >= ARTIST_MATCH_THRESHOLD and direct < ARTIST_MATCH_THRESHOLD:
        winner, _ = best_alias_match(
            expected_artist, actual_artist, resolved,
            similarity=_similarity,
        )
        logger.info(
            "Artist alias rescued comparison: expected=%r vs actual=%r "
            "(direct sim=%.2f, alias %r → score=%.2f)",
            expected_artist, actual_artist, direct, winner, score,
        )

    return score


def _find_best_title_artist_match(
    recordings: List[Dict[str, Any]],
    expected_title: str,
    expected_artist: str,
    expected_artist_aliases: Optional[Any] = None,
) -> Tuple[Optional[Dict], float, float]:
    """
    Find the AcoustID recording that best matches expected title/artist.

    Issue #442 — `expected_artist_aliases` (when supplied) is the
    list of alternate spellings for `expected_artist` (Japanese kanji,
    Cyrillic, etc.). Accepts either:
    - An iterable of alias strings (used eagerly), or
    - A callable returning the list (resolved lazily — only fires
      when at least one recording fails direct artist similarity).

    Each recording's artist is scored against (expected, *aliases) and
    the best score wins. When the list is empty/omitted/None, behavior
    is identical to the prior raw similarity comparison.

    Returns:
        (best_recording, title_similarity, artist_similarity).
        ``(None, 0.0, 0.0)`` when no recording scores above zero (e.g. every
        recording lacks both title and artist).
    """
    best_rec = None
    best_title_sim = 0.0
    best_artist_sim = 0.0
    best_combined = 0.0

    for rec in recordings:
        title = rec.get('title') or ''
        artist = rec.get('artist') or ''

        title_sim = _similarity(expected_title, title)
        artist_sim = _alias_aware_artist_sim(
            expected_artist, artist, expected_artist_aliases,
        )

        # Weight title higher since that's the primary identifier
        combined = (title_sim * 0.6) + (artist_sim * 0.4)

        # Strict '>' keeps the first recording on ties and leaves best_rec
        # as None when nothing scores above zero.
        if combined > best_combined:
            best_combined = combined
            best_rec = rec
            best_title_sim = title_sim
            best_artist_sim = artist_sim

    return best_rec, best_title_sim, best_artist_sim


# Shared MusicBrainz client for enrichment lookups
_mb_client = None
_mb_client_lock = threading.Lock()

# Shared MusicBrainzService for alias lookups (issue #442). Service
# layer wraps the raw client + adds caching + DB access — all of which
# the alias resolution chain (library DB → cache → live MB) needs.
_mb_service = None
_mb_service_lock = threading.Lock()

MAX_MB_ENRICHMENT_LOOKUPS = 3


def _get_mb_client() -> MusicBrainzClient:
    """Get or create a shared MusicBrainz client instance."""
    global _mb_client
    if _mb_client is None:
        with _mb_client_lock:
            # Re-check under the lock so only one thread constructs the client.
            if _mb_client is None:
                _mb_client = MusicBrainzClient()
    return _mb_client


def _get_mb_service():
    """Get or create a shared MusicBrainzService instance.

    Used by the alias-resolution chain in `verify_audio_file`. Lazy
    init so importing this module doesn't trigger a DB connection on
    paths that never run AcoustID verification (test runs, dry runs).
    """
    global _mb_service
    if _mb_service is None:
        with _mb_service_lock:
            # Re-check under the lock so only one thread constructs the service.
            if _mb_service is None:
                from core.musicbrainz_service import MusicBrainzService
                from database.music_database import get_database
                _mb_service = MusicBrainzService(get_database())
    return _mb_service


def _resolve_expected_artist_aliases(expected_artist_name: str) -> List[str]:
    """Look up alternate-spelling aliases for the expected artist.

    Issue #442 — bridges cross-script artist comparisons (Japanese
    kanji ↔ romanized, Cyrillic ↔ Latin, etc.) without forcing the
    verifier to know about the resolution chain. Best-effort: any
    failure (no MB service, network down, no library DB) returns
    empty list so verification falls back to the prior direct
    similarity check.
    """
    if not expected_artist_name:
        return []
    try:
        return _get_mb_service().lookup_artist_aliases(expected_artist_name)
    except Exception as e:
        # Deliberate best-effort: alias lookup must never break verification.
        logger.debug("alias lookup failed for %r: %s", expected_artist_name, e)
        return []


def _enrich_recordings_from_musicbrainz(
    recordings: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Enrich recordings that are missing title/artist by looking up their
    MBIDs via MusicBrainz.

    AcoustID often returns recordings with title=None, artist=None even
    though the MBIDs are valid. This resolves the metadata so verification
    can compare title/artist instead of skipping.

    Only the first ``MAX_MB_ENRICHMENT_LOOKUPS`` recordings are looked up,
    and the recording dicts are mutated in place. Per-recording lookup
    failures are logged at DEBUG and skipped.

    Args:
        recordings: List of recording dicts from fingerprint_and_lookup()

    Returns:
        The same list, with title/artist filled in where possible.
    """
    # Fast path: if any recording already has title AND artist, no enrichment needed
    if any(rec.get('title') and rec.get('artist') for rec in recordings):
        return recordings

    logger.info(f"Enriching {len(recordings)} recordings via MusicBrainz (all missing title/artist)...")

    mb = _get_mb_client()
    enriched_count = 0

    for rec in recordings[:MAX_MB_ENRICHMENT_LOOKUPS]:
        mbid = rec.get('mbid')
        if not mbid:
            continue

        try:
            data = mb.get_recording(mbid, includes=['artist-credits'])
            if not data:
                logger.debug(f"MusicBrainz returned no data for recording {mbid}")
                continue

            title = data.get('title')
            artist_credit = data.get('artist-credit', [])

            # Build artist string from artist-credit array
            # Each entry has {"artist": {"name": "..."}, "joinphrase": "..."}
            artist_parts = []
            for credit in artist_credit:
                name = credit.get('artist', {}).get('name', '')
                joinphrase = credit.get('joinphrase', '')
                if name:
                    artist_parts.append(name + joinphrase)
            artist = ''.join(artist_parts).strip() if artist_parts else None

            if title:
                rec['title'] = title
                logger.debug(f"Enriched {mbid}: title='{title}'")
            if artist:
                rec['artist'] = artist
                logger.debug(f"Enriched {mbid}: artist='{artist}'")

            if title or artist:
                enriched_count += 1

        except Exception as e:
            # Best-effort enrichment: a failed lookup leaves the recording as-is.
            logger.debug(f"Failed to enrich recording {mbid}: {e}")
            continue

    logger.info(f"Enriched {enriched_count}/{min(len(recordings), MAX_MB_ENRICHMENT_LOOKUPS)} recordings from MusicBrainz")
    return recordings


class AcoustIDVerification:
    """
    Verification service that compares audio fingerprint identity against
    expected track metadata using title/artist matching.

    Design Principle: FAIL OPEN
    - Only returns FAIL when we are CONFIDENT the file is wrong
    - Any error or uncertainty results in SKIP (continue normally)
    - Never blocks downloads due to verification infrastructure issues

    Usage:
        verifier = AcoustIDVerification()
        result, message = verifier.verify_audio_file(
            "/path/to/downloaded.mp3",
            "Expected Song Title",
            "Expected Artist"
        )

        if result == VerificationResult.FAIL:
            # Move to quarantine
        else:
            # Continue with normal processing (PASS, SKIP, or DISABLED)
    """

    def __init__(self):
        """Initialize verification service."""
        self.acoustid_client = AcoustIDClient()

    def verify_audio_file(
        self,
        audio_file_path: str,
        expected_track_name: str,
        expected_artist_name: str,
        context: Optional[Dict[str, Any]] = None
    ) -> Tuple[VerificationResult, str]:
        """
        Verify that an audio file matches expected track metadata.

        Compares title/artist from AcoustID fingerprint results against
        the expected track info. MusicBrainz is consulted only as a
        fallback: to fill in title/artist when AcoustID returns none, and
        (lazily) to resolve artist aliases when direct artist similarity
        falls below the threshold.

        Args:
            audio_file_path: Path to the downloaded audio file
            expected_track_name: Track name we expected to download
            expected_artist_name: Artist name we expected
            context: Optional download context for logging/debugging
                (accepted for interface compatibility; not read here)

        Returns:
            Tuple of (VerificationResult, reason_message). Any unexpected
            exception is caught and reported as SKIP (fail open).
        """
        try:
            # Step 1: Check availability
            available, reason = self.acoustid_client.is_available()
            if not available:
                logger.debug(f"AcoustID verification skipped: {reason}")
                return VerificationResult.SKIP, reason

            # Step 2: Fingerprint and lookup in AcoustID
            logger.info(f"Fingerprinting and looking up: {audio_file_path}")
            acoustid_result = self.acoustid_client.fingerprint_and_lookup(audio_file_path)

            if not acoustid_result:
                return VerificationResult.SKIP, "Track not found in AcoustID database"

            recordings = acoustid_result.get('recordings', [])
            best_score = acoustid_result.get('best_score', 0)

            if not recordings:
                return VerificationResult.SKIP, "AcoustID returned no recordings"

            logger.debug(
                f"AcoustID returned {len(recordings)} recording(s) "
                f"(best fingerprint score: {best_score:.2f})"
            )

            # Step 3: Check fingerprint confidence
            if best_score < MIN_ACOUSTID_SCORE:
                msg = f"AcoustID fingerprint score too low ({best_score:.2f}) to verify"
                logger.info(msg)
                return VerificationResult.SKIP, msg

            # Enrich recordings that are missing title/artist via MusicBrainz lookup
            recordings = _enrich_recordings_from_musicbrainz(recordings)

            # Issue #442 — alias resolution is LAZY. We pass a memoising
            # thunk to the artist-comparison sites; it only fires the
            # multi-tier lookup (library DB → cache → live MB) when
            # direct artist similarity falls below threshold. Verifications
            # where the direct match already passes (the common case for
            # same-script artist names) never trigger any lookup work,
            # so the fix doesn't add a per-verification DB query for the
            # happy path. When the thunk DOES fire, the result is cached
            # in the closure so the 3 comparison sites within one
            # verification share a single resolution pass.
            _alias_cache: Dict[str, Any] = {}

            def _aliases_provider() -> List[str]:
                if 'value' not in _alias_cache:
                    resolved = _resolve_expected_artist_aliases(expected_artist_name)
                    _alias_cache['value'] = resolved
                    if resolved:
                        logger.debug(
                            "Resolved %d aliases for expected artist '%s'",
                            len(resolved), expected_artist_name,
                        )
                return _alias_cache['value']

            # Step 4: Find best title/artist match among AcoustID results
            best_rec, title_sim, artist_sim = _find_best_title_artist_match(
                recordings, expected_track_name, expected_artist_name,
                expected_artist_aliases=_aliases_provider,
            )

            if not best_rec:
                return VerificationResult.SKIP, "No recordings with title/artist info"

            # Raw strings drive matching/version detection; the display_*
            # variants are what goes into log lines and reason messages so a
            # missing/None field is reported as '?' instead of 'None' or ''.
            # Use the BEST matching recording's strings (not
            # `recordings[0]`) so messages report the same candidate the
            # title/artist similarity scores came from.
            # Issue #607 (AfonsoG6) example 1: the prior code mixed
            # `recordings[0]`'s strings (which can be empty) with
            # `best_rec`'s scores, producing nonsense reasons like
            # "file identified as '' by '' (artist=100%)" when a later
            # recording in the list scored well on artist.
            matched_title = best_rec.get('title') or ''
            matched_artist = best_rec.get('artist') or ''
            display_title = matched_title or '?'
            display_artist = matched_artist or '?'

            logger.info(
                f"Best match: '{display_title}' by '{display_artist}' "
                f"(title_sim={title_sim:.2f}, artist_sim={artist_sim:.2f})"
            )

            # Step 4b: Version-mismatch gate.
            #
            # The ``_normalize`` step deliberately strips parentheticals and
            # version tags ("(Instrumental)", "- Live", etc) so that legit
            # name variations don't fail the title-similarity comparison.
            # That same stripping made it impossible to tell a vocal track
            # apart from its instrumental: "In My Feelings" and "In My
            # Feelings (Instrumental)" both normalize to "in my feelings",
            # the title sim ends up 1.0, and the file passes verification
            # even though it's the wrong cut.
            #
            # Detect the version on each side BEFORE normalization runs.
            # If the expected track and the AcoustID-matched recording
            # disagree on version (one is original, the other is
            # instrumental / live / remix / acoustic / etc), reject — the
            # fingerprint identified a real song but it's not the one the
            # caller asked for.
            expected_version = _detect_title_version(expected_track_name)
            matched_version = _detect_title_version(matched_title)
            if expected_version != matched_version:
                # Issue #607 (AfonsoG6): MusicBrainz often stores live
                # recordings with bare titles ("Clarity") while the
                # release entry carries the venue annotation ("Clarity
                # (Live at Blossom Music Center, ...)"). The fingerprint
                # correctly identifies the LIVE recording; only the
                # title text is bare. Helper accepts the one-sided bare
                # case when fingerprint + bare-title + artist all agree.
                # Two-sided version mismatches (live vs remix etc) stay
                # strict — those are genuinely different recordings.
                if is_acceptable_version_mismatch(
                    expected_version,
                    matched_version,
                    fingerprint_score=best_score,
                    title_similarity=title_sim,
                    artist_similarity=artist_sim,
                ):
                    logger.info(
                        f"AcoustID version annotation differs (expected={expected_version}, "
                        f"matched={matched_version}) but fingerprint+title+artist all match — "
                        f"accepting (likely MB metadata gap on a live/version-annotated recording)"
                    )
                else:
                    msg = (
                        f"Version mismatch: expected '{expected_track_name}' ({expected_version}) "
                        f"but file is '{display_title}' ({matched_version})"
                    )
                    logger.warning(f"AcoustID verification FAILED (version mismatch) - {msg}")
                    return VerificationResult.FAIL, msg

            # Step 5: Decide pass/fail based on similarity
            if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim >= ARTIST_MATCH_THRESHOLD:
                msg = (
                    f"Audio verified: '{display_title}' by '{display_artist}' "
                    f"matches expected '{expected_track_name}' by '{expected_artist_name}' "
                    f"(title={title_sim:.0%}, artist={artist_sim:.0%})"
                )
                logger.info(f"AcoustID verification PASSED - {msg}")
                return VerificationResult.PASS, msg

            # Title matches but artist doesn't — could be a cover/collab OR a
            # genuinely different track with the same name. Distinguish the
            # two by checking whether the expected artist appears anywhere in
            # AcoustID's returned recordings.
            if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim < ARTIST_MATCH_THRESHOLD:
                # First: if the expected artist is present in ANY recording's
                # metadata for this fingerprint, it's likely the right track
                # (AcoustID's "best" match just picked the wrong variant).
                for rec in recordings:
                    # `or ''` (not a .get default) so an explicit None artist
                    # is normalized the same way as at the other call sites.
                    rec_artist = rec.get('artist') or ''
                    if _alias_aware_artist_sim(
                        expected_artist_name, rec_artist, _aliases_provider,
                    ) >= ARTIST_MATCH_THRESHOLD:
                        msg = (
                            f"Audio verified: found '{expected_track_name}' by '{expected_artist_name}' "
                            f"in AcoustID results"
                        )
                        logger.info(f"AcoustID verification PASSED (secondary match) - {msg}")
                        return VerificationResult.PASS, msg

                # Expected artist wasn't found anywhere. Decide between:
                # - FAIL: clear mismatch, e.g. "Tom Walker" (sim ~0.2) when
                #   expecting "Maduk" — different song with same name
                # - SKIP: ambiguous, e.g. collab / alt credit / formatting
                #   difference (sim 0.3-0.6)
                #
                # The 0.3 cutoff catches hard mismatches while preserving the
                # benefit of the doubt for borderline artist formatting.
                CLEAR_MISMATCH_THRESHOLD = 0.3
                if artist_sim < CLEAR_MISMATCH_THRESHOLD:
                    msg = (
                        f"Audio mismatch: file identified as '{display_title}' by '{display_artist}', "
                        f"expected '{expected_track_name}' by '{expected_artist_name}' "
                        f"(title={title_sim:.0%}, artist={artist_sim:.0%}) — "
                        f"expected artist not found in any AcoustID recording"
                    )
                    logger.warning(f"AcoustID verification FAILED (clear artist mismatch) - {msg}")
                    return VerificationResult.FAIL, msg

                msg = (
                    f"Title matches but artist unclear: "
                    f"AcoustID='{display_title}' by '{display_artist}', "
                    f"expected '{expected_track_name}' by '{expected_artist_name}' "
                    f"(artist_sim={artist_sim:.0%} — ambiguous, could be cover/collab)"
                )
                logger.info(f"AcoustID verification SKIPPED - {msg}")
                return VerificationResult.SKIP, msg

            # Title doesn't match — check ALL recordings for any title/artist match
            # (the best combined match might not be the right one if there are many results)
            # Skip recordings whose version (instrumental/live/etc) disagrees with
            # what the caller asked for — the version mismatch above checked
            # only the best recording, but a wrong-version variant could still
            # win this fallback scan if its bare title matched.
            for rec in recordings:
                t = rec.get('title') or ''
                a = rec.get('artist') or ''
                if _detect_title_version(t) != expected_version:
                    continue
                if (_similarity(expected_track_name, t) >= TITLE_MATCH_THRESHOLD and
                        _alias_aware_artist_sim(
                            expected_artist_name, a, _aliases_provider,
                        ) >= ARTIST_MATCH_THRESHOLD):
                    msg = (
                        f"Audio verified: found '{t}' by '{a}' in AcoustID results "
                        f"matching expected '{expected_track_name}' by '{expected_artist_name}'"
                    )
                    logger.info(f"AcoustID verification PASSED (scan match) - {msg}")
                    return VerificationResult.PASS, msg

            # No match found — but if fingerprint score is very high (≥0.95)
            # AND we have evidence the mismatch is a language/script case
            # (rather than two genuinely different songs by the same artist),
            # skip rather than quarantine a correct file. Two routes:
            #
            # (a) Either side of the title comparison contains non-ASCII
            #     characters — strong signal of transliteration /
            #     kanji↔roman cases. Artist must still be a strong match
            #     to use this path.
            # (b) Both title AND artist similarity are very high (the song
            #     is recognizably the same with minor punctuation / casing
            #     differences that fell below the strict match thresholds).
            #
            # The OLD logic was ``title_sim >= 0.55 OR artist_sim >= match``.
            # That fired for English-vs-English songs by the same artist that
            # share NO actual content — e.g. "R.O.T.C (Interlude)" by
            # Kendrick Lamar getting accepted as "Rich (Interlude)" by
            # Kendrick Lamar because the artist matched perfectly and
            # "interlude" was shared in both titles. Reported by user when
            # downloading Mr. Morale: three tracks (Rich Interlude, Savior
            # Interlude, Savior) all received the wrong R.O.T.C audio file
            # because of this leak.
            has_non_ascii = (
                any(ord(c) > 127 for c in (expected_track_name or ''))
                or any(ord(c) > 127 for c in display_title)
            )
            language_script_skip = (
                best_score >= 0.95
                and has_non_ascii
                and artist_sim >= ARTIST_MATCH_THRESHOLD
            )
            # NOTE(review): route (b) cannot currently fire. Every path with
            # title_sim >= TITLE_MATCH_THRESHOLD (0.70) has already returned
            # above, so title_sim >= 0.80 is never true here. Left unchanged
            # because the intended cutoff is not clear from this file —
            # confirm whether it should be lowered or the route removed.
            high_confidence_strong_match_skip = (
                best_score >= 0.95
                and title_sim >= 0.80
                and artist_sim >= ARTIST_MATCH_THRESHOLD
            )
            if language_script_skip or high_confidence_strong_match_skip:
                reason = (
                    "likely same song in different language/script"
                    if language_script_skip
                    else "title/artist match within tolerance"
                )
                msg = (
                    f"Title/artist mismatch but fingerprint confidence very high ({best_score:.2f}): "
                    f"AcoustID='{display_title}' by '{display_artist}', "
                    f"expected '{expected_track_name}' by '{expected_artist_name}' — "
                    f"{reason}"
                )
                logger.info(f"AcoustID verification SKIPPED (high confidence) - {msg}")
                return VerificationResult.SKIP, msg

            # Trusted fingerprint (>= MIN_ACOUSTID_SCORE), no metadata match,
            # and no high-confidence exemption applied — file is likely wrong.
            msg = (
                f"Audio mismatch: file identified as '{display_title}' by '{display_artist}', "
                f"expected '{expected_track_name}' by '{expected_artist_name}' "
                f"(title={title_sim:.0%}, artist={artist_sim:.0%})"
            )
            logger.warning(f"AcoustID verification FAILED - {msg}")
            return VerificationResult.FAIL, msg

        except Exception as e:
            # Any unexpected error -> SKIP (fail open)
            logger.error(f"Unexpected error during AcoustID verification: {e}")
            return VerificationResult.SKIP, f"Verification error: {str(e)}"

    def quick_check_available(self) -> Tuple[bool, str]:
        """
        Quick check if verification is available without doing a full verification.

        Returns:
            Tuple of (is_available, reason)
        """
        return self.acoustid_client.is_available()