""" AcoustID Verification Service Verifies downloaded audio files match expected track metadata by comparing title/artist from AcoustID fingerprint results against the expected track info. If the audio fingerprint confidently identifies a DIFFERENT song than expected, the file is flagged as incorrect. """ import re import threading from difflib import SequenceMatcher from typing import Optional, Dict, Any, Tuple, List from enum import Enum from utils.logging_config import get_logger from core.acoustid_client import AcoustIDClient from core.musicbrainz_client import MusicBrainzClient logger = get_logger("acoustid.verification") # Thresholds MIN_ACOUSTID_SCORE = 0.80 # Minimum AcoustID fingerprint score to trust TITLE_MATCH_THRESHOLD = 0.70 # Title similarity needed to consider a match ARTIST_MATCH_THRESHOLD = 0.60 # Artist similarity needed to consider a match class VerificationResult(Enum): """Possible outcomes of audio verification.""" PASS = "pass" # Title/artist match - file is correct FAIL = "fail" # Title/artist mismatch - wrong file downloaded SKIP = "skip" # Could not verify (error or unavailable) - continue normally DISABLED = "disabled" # Verification not enabled def _normalize(text: str) -> str: """Normalize a string for comparison: lowercase, strip parentheticals, punctuation.""" if not text: return "" s = text.lower().strip() # Remove ALL parenthetical suffixes — these are metadata annotations, not core title # Covers: (Live), (Remastered), (Parody of ...), (from "..." Soundtrack), (feat. ...), etc. s = re.sub(r'\s*\([^)]*\)', '', s) # Remove ALL square bracket suffixes: [Live], [Remastered], [Deluxe], etc. s = re.sub(r'\s*\[[^\]]*\]', '', s) # Remove trailing featuring info not in parentheses: "feat. ...", "ft. ...", "featuring ..." s = re.sub(r'\s+(?:feat\.?|ft\.?|featuring)\s+.*$', '', s, flags=re.IGNORECASE) # Remove dash-separated version tags: "- Vocal", "- Instrumental", "- Acoustic", etc. s = re.sub(r'\s*-\s*(?:vocal|instrumental|acoustic|live|remix|cover|clean|explicit|radio\s*edit|original\s*mix|extended\s*mix|club\s*mix)\s*$', '', s, flags=re.IGNORECASE) # Remove soundtrack/source subtitles: ' - From "..." Soundtrack', ' - from the film ...' s = re.sub(r'\s*-\s*from\s+.+$', '', s, flags=re.IGNORECASE) # Remove non-alphanumeric except spaces s = re.sub(r'[^\w\s]', '', s) # Collapse whitespace s = re.sub(r'\s+', ' ', s).strip() return s def _similarity(a: str, b: str) -> float: """Calculate similarity between two strings (0.0-1.0) after normalization.""" na = _normalize(a) nb = _normalize(b) if not na or not nb: return 0.0 if na == nb: return 1.0 return SequenceMatcher(None, na, nb).ratio() def _find_best_title_artist_match( recordings: List[Dict[str, Any]], expected_title: str, expected_artist: str, ) -> Tuple[Optional[Dict], float, float]: """ Find the AcoustID recording that best matches expected title/artist. Returns: (best_recording, title_similarity, artist_similarity) """ best_rec = None best_title_sim = 0.0 best_artist_sim = 0.0 best_combined = 0.0 for rec in recordings: title = rec.get('title') or '' artist = rec.get('artist') or '' title_sim = _similarity(expected_title, title) artist_sim = _similarity(expected_artist, artist) # Weight title higher since that's the primary identifier combined = (title_sim * 0.6) + (artist_sim * 0.4) if combined > best_combined: best_combined = combined best_rec = rec best_title_sim = title_sim best_artist_sim = artist_sim return best_rec, best_title_sim, best_artist_sim # Shared MusicBrainz client for enrichment lookups _mb_client = None _mb_client_lock = threading.Lock() MAX_MB_ENRICHMENT_LOOKUPS = 3 def _get_mb_client() -> MusicBrainzClient: """Get or create a shared MusicBrainz client instance.""" global _mb_client if _mb_client is None: with _mb_client_lock: if _mb_client is None: _mb_client = MusicBrainzClient() return _mb_client def _enrich_recordings_from_musicbrainz( recordings: List[Dict[str, Any]], ) -> List[Dict[str, Any]]: """ Enrich recordings that are missing title/artist by looking up their MBIDs via MusicBrainz. AcoustID often returns recordings with title=None, artist=None even though the MBIDs are valid. This resolves the metadata so verification can compare title/artist instead of skipping. Args: recordings: List of recording dicts from fingerprint_and_lookup() Returns: The same list, with title/artist filled in where possible. """ # Fast path: if any recording already has title AND artist, no enrichment needed if any(rec.get('title') and rec.get('artist') for rec in recordings): return recordings logger.info(f"Enriching {len(recordings)} recordings via MusicBrainz (all missing title/artist)...") mb = _get_mb_client() enriched_count = 0 for rec in recordings[:MAX_MB_ENRICHMENT_LOOKUPS]: mbid = rec.get('mbid') if not mbid: continue try: data = mb.get_recording(mbid, includes=['artist-credits']) if not data: logger.debug(f"MusicBrainz returned no data for recording {mbid}") continue title = data.get('title') artist_credit = data.get('artist-credit', []) # Build artist string from artist-credit array # Each entry has {"artist": {"name": "..."}, "joinphrase": "..."} artist_parts = [] for credit in artist_credit: name = credit.get('artist', {}).get('name', '') joinphrase = credit.get('joinphrase', '') if name: artist_parts.append(name + joinphrase) artist = ''.join(artist_parts).strip() if artist_parts else None if title: rec['title'] = title logger.debug(f"Enriched {mbid}: title='{title}'") if artist: rec['artist'] = artist logger.debug(f"Enriched {mbid}: artist='{artist}'") if title or artist: enriched_count += 1 except Exception as e: logger.debug(f"Failed to enrich recording {mbid}: {e}") continue logger.info(f"Enriched {enriched_count}/{min(len(recordings), MAX_MB_ENRICHMENT_LOOKUPS)} recordings from MusicBrainz") return recordings class AcoustIDVerification: """ Verification service that compares audio fingerprint identity against expected track metadata using title/artist matching. Design Principle: FAIL OPEN - Only returns FAIL when we are CONFIDENT the file is wrong - Any error or uncertainty results in SKIP (continue normally) - Never blocks downloads due to verification infrastructure issues Usage: verifier = AcoustIDVerification() result, message = verifier.verify_audio_file( "/path/to/downloaded.mp3", "Expected Song Title", "Expected Artist" ) if result == VerificationResult.FAIL: # Move to quarantine else: # Continue with normal processing (PASS, SKIP, or DISABLED) """ def __init__(self): """Initialize verification service.""" self.acoustid_client = AcoustIDClient() def verify_audio_file( self, audio_file_path: str, expected_track_name: str, expected_artist_name: str, context: Optional[Dict[str, Any]] = None ) -> Tuple[VerificationResult, str]: """ Verify that an audio file matches expected track metadata. Compares title/artist from AcoustID fingerprint results against the expected track info. No MusicBrainz lookup needed. Args: audio_file_path: Path to the downloaded audio file expected_track_name: Track name we expected to download expected_artist_name: Artist name we expected context: Optional download context for logging/debugging Returns: Tuple of (VerificationResult, reason_message) """ try: # Step 1: Check availability available, reason = self.acoustid_client.is_available() if not available: logger.debug(f"AcoustID verification skipped: {reason}") return VerificationResult.SKIP, reason # Step 2: Fingerprint and lookup in AcoustID logger.info(f"Fingerprinting and looking up: {audio_file_path}") acoustid_result = self.acoustid_client.fingerprint_and_lookup(audio_file_path) if not acoustid_result: return VerificationResult.SKIP, "Track not found in AcoustID database" recordings = acoustid_result.get('recordings', []) best_score = acoustid_result.get('best_score', 0) if not recordings: return VerificationResult.SKIP, "AcoustID returned no recordings" logger.debug( f"AcoustID returned {len(recordings)} recording(s) " f"(best fingerprint score: {best_score:.2f})" ) # Step 3: Check fingerprint confidence if best_score < MIN_ACOUSTID_SCORE: msg = f"AcoustID fingerprint score too low ({best_score:.2f}) to verify" logger.info(msg) return VerificationResult.SKIP, msg # Enrich recordings that are missing title/artist via MusicBrainz lookup recordings = _enrich_recordings_from_musicbrainz(recordings) # Step 4: Find best title/artist match among AcoustID results best_rec, title_sim, artist_sim = _find_best_title_artist_match( recordings, expected_track_name, expected_artist_name ) if not best_rec: return VerificationResult.SKIP, "No recordings with title/artist info" matched_title = best_rec.get('title', '?') matched_artist = best_rec.get('artist', '?') logger.info( f"Best match: '{matched_title}' by '{matched_artist}' " f"(title_sim={title_sim:.2f}, artist_sim={artist_sim:.2f})" ) # Step 5: Decide pass/fail based on similarity if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim >= ARTIST_MATCH_THRESHOLD: msg = ( f"Audio verified: '{matched_title}' by '{matched_artist}' " f"matches expected '{expected_track_name}' by '{expected_artist_name}' " f"(title={title_sim:.0%}, artist={artist_sim:.0%})" ) logger.info(f"AcoustID verification PASSED - {msg}") return VerificationResult.PASS, msg # Title matches but artist doesn't — could be a cover or collab, skip if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim < ARTIST_MATCH_THRESHOLD: # Check if the expected artist appears anywhere in the AcoustID results for rec in recordings: if _similarity(expected_artist_name, rec.get('artist', '')) >= ARTIST_MATCH_THRESHOLD: msg = ( f"Audio verified: found '{expected_track_name}' by '{expected_artist_name}' " f"in AcoustID results" ) logger.info(f"AcoustID verification PASSED (secondary match) - {msg}") return VerificationResult.PASS, msg msg = ( f"Title matches but artist unclear: " f"AcoustID='{matched_title}' by '{matched_artist}', " f"expected '{expected_track_name}' by '{expected_artist_name}'" ) logger.info(f"AcoustID verification SKIPPED - {msg}") return VerificationResult.SKIP, msg # Title doesn't match — check ALL recordings for any title/artist match # (the best combined match might not be the right one if there are many results) for rec in recordings: t = rec.get('title') or '' a = rec.get('artist') or '' if (_similarity(expected_track_name, t) >= TITLE_MATCH_THRESHOLD and _similarity(expected_artist_name, a) >= ARTIST_MATCH_THRESHOLD): msg = ( f"Audio verified: found '{t}' by '{a}' in AcoustID results " f"matching expected '{expected_track_name}' by '{expected_artist_name}'" ) logger.info(f"AcoustID verification PASSED (scan match) - {msg}") return VerificationResult.PASS, msg # No match found — but if fingerprint score is very high (≥0.95) # AND there's partial similarity in title or artist, the mismatch is # likely a language/script difference (e.g. Japanese kanji vs English). # Skip rather than quarantine a correct file. # But if both title AND artist similarity are very low, the download # source gave us a completely wrong file — fail it. if best_score >= 0.95 and (title_sim >= 0.55 or artist_sim >= ARTIST_MATCH_THRESHOLD): top = recordings[0] msg = ( f"Title/artist mismatch but fingerprint confidence very high ({best_score:.2f}): " f"AcoustID='{top.get('title', '?')}' by '{top.get('artist', '?')}', " f"expected '{expected_track_name}' by '{expected_artist_name}' — " f"likely same song in different language/script" ) logger.info(f"AcoustID verification SKIPPED (high confidence) - {msg}") return VerificationResult.SKIP, msg # Low fingerprint score + no metadata match — file is likely wrong top = recordings[0] top_title = top.get('title', '?') top_artist = top.get('artist', '?') msg = ( f"Audio mismatch: file identified as '{top_title}' by '{top_artist}', " f"expected '{expected_track_name}' by '{expected_artist_name}' " f"(title={title_sim:.0%}, artist={artist_sim:.0%})" ) logger.warning(f"AcoustID verification FAILED - {msg}") return VerificationResult.FAIL, msg except Exception as e: # Any unexpected error -> SKIP (fail open) logger.error(f"Unexpected error during AcoustID verification: {e}") return VerificationResult.SKIP, f"Verification error: {str(e)}" def quick_check_available(self) -> Tuple[bool, str]: """ Quick check if verification is available without doing a full verification. Returns: Tuple of (is_available, reason) """ return self.acoustid_client.is_available()