You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
SoulSync/core/acoustid_verification.py

355 lines
14 KiB

"""
AcoustID Verification Service
Verifies downloaded audio files match expected track metadata by comparing
title/artist from AcoustID fingerprint results against the expected track info.
If the audio fingerprint confidently identifies a DIFFERENT song than expected,
the file is flagged as incorrect.
"""
import re
import threading
from difflib import SequenceMatcher
from typing import Optional, Dict, Any, Tuple, List
from enum import Enum
from utils.logging_config import get_logger
from core.acoustid_client import AcoustIDClient
from core.musicbrainz_client import MusicBrainzClient
logger = get_logger("acoustid_verification")
# Thresholds used by the verification decision below.
# The two *_MATCH_THRESHOLD values are compared against _similarity() ratios (0.0-1.0).
MIN_ACOUSTID_SCORE = 0.80 # Minimum AcoustID fingerprint score to trust; lower scores lead to SKIP
TITLE_MATCH_THRESHOLD = 0.70 # Title similarity needed to consider a match
ARTIST_MATCH_THRESHOLD = 0.60 # Artist similarity needed to consider a match
class VerificationResult(Enum):
    """Outcome reported for one audio verification attempt."""

    # The identified title/artist agree with what was expected.
    PASS = "pass"

    # The identified title/artist disagree: the wrong file was downloaded.
    FAIL = "fail"

    # Verification could not be completed (error or service unavailable);
    # the caller should carry on as if no check had been made.
    SKIP = "skip"

    # Verification is switched off.
    DISABLED = "disabled"
def _normalize(text: str) -> str:
"""Normalize a string for comparison: lowercase, strip parentheticals, punctuation."""
if not text:
return ""
s = text.lower().strip()
# Remove common parenthetical suffixes like (Live), (Remastered), (Radio Edit)
s = re.sub(r'\s*\((?:live|remaster(?:ed)?|deluxe|bonus|radio\s*edit|single\s*version|visualize.*?)\)', '', s, flags=re.IGNORECASE)
# Remove featuring info: "(feat. ...)", "(ft. ...)", "(featuring ...)"
s = re.sub(r'\s*\((?:feat\.?|ft\.?|featuring)\s+[^)]*\)', '', s, flags=re.IGNORECASE)
# Remove trailing featuring info: "feat. ...", "ft. ...", "featuring ..."
s = re.sub(r'\s+(?:feat\.?|ft\.?|featuring)\s+.*$', '', s, flags=re.IGNORECASE)
# Remove non-alphanumeric except spaces
s = re.sub(r'[^\w\s]', '', s)
# Collapse whitespace
s = re.sub(r'\s+', ' ', s).strip()
return s
def _similarity(a: str, b: str) -> float:
    """
    Score how alike two strings are once both have been normalized.

    Returns:
        0.0 when either normalized string is empty, 1.0 when they are
        identical, otherwise the SequenceMatcher ratio of the two.
    """
    left, right = _normalize(a), _normalize(b)
    if not (left and right):
        # Nothing to compare on at least one side
        return 0.0
    return 1.0 if left == right else SequenceMatcher(None, left, right).ratio()
def _find_best_title_artist_match(
recordings: List[Dict[str, Any]],
expected_title: str,
expected_artist: str,
) -> Tuple[Optional[Dict], float, float]:
"""
Find the AcoustID recording that best matches expected title/artist.
Returns:
(best_recording, title_similarity, artist_similarity)
"""
best_rec = None
best_title_sim = 0.0
best_artist_sim = 0.0
best_combined = 0.0
for rec in recordings:
title = rec.get('title') or ''
artist = rec.get('artist') or ''
title_sim = _similarity(expected_title, title)
artist_sim = _similarity(expected_artist, artist)
# Weight title higher since that's the primary identifier
combined = (title_sim * 0.6) + (artist_sim * 0.4)
if combined > best_combined:
best_combined = combined
best_rec = rec
best_title_sim = title_sim
best_artist_sim = artist_sim
return best_rec, best_title_sim, best_artist_sim
# Shared MusicBrainz client for enrichment lookups.
# Starts as None and is created on first use by _get_mb_client().
_mb_client = None
# Held while _mb_client is being created so it is only constructed once
_mb_client_lock = threading.Lock()
# Cap on MusicBrainz enrichment work done per call to _enrich_recordings_from_musicbrainz()
MAX_MB_ENRICHMENT_LOOKUPS = 3
def _get_mb_client() -> MusicBrainzClient:
    """Return the module-wide MusicBrainz client, building it on first call."""
    global _mb_client
    if _mb_client is not None:
        # Already built: no need to take the lock
        return _mb_client
    with _mb_client_lock:
        # Check again under the lock; another thread may have built it meanwhile
        if _mb_client is None:
            _mb_client = MusicBrainzClient()
        return _mb_client
def _enrich_recordings_from_musicbrainz(
recordings: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""
Enrich recordings that are missing title/artist by looking up their
MBIDs via MusicBrainz.
AcoustID often returns recordings with title=None, artist=None even though
the MBIDs are valid. This resolves the metadata so verification can compare
title/artist instead of skipping.
Args:
recordings: List of recording dicts from fingerprint_and_lookup()
Returns:
The same list, with title/artist filled in where possible.
"""
# Fast path: if any recording already has title AND artist, no enrichment needed
if any(rec.get('title') and rec.get('artist') for rec in recordings):
return recordings
logger.info(f"Enriching {len(recordings)} recordings via MusicBrainz (all missing title/artist)...")
mb = _get_mb_client()
enriched_count = 0
for rec in recordings[:MAX_MB_ENRICHMENT_LOOKUPS]:
mbid = rec.get('mbid')
if not mbid:
continue
try:
data = mb.get_recording(mbid, includes=['artist-credits'])
if not data:
logger.debug(f"MusicBrainz returned no data for recording {mbid}")
continue
title = data.get('title')
artist_credit = data.get('artist-credit', [])
# Build artist string from artist-credit array
# Each entry has {"artist": {"name": "..."}, "joinphrase": "..."}
artist_parts = []
for credit in artist_credit:
name = credit.get('artist', {}).get('name', '')
joinphrase = credit.get('joinphrase', '')
if name:
artist_parts.append(name + joinphrase)
artist = ''.join(artist_parts).strip() if artist_parts else None
if title:
rec['title'] = title
logger.debug(f"Enriched {mbid}: title='{title}'")
if artist:
rec['artist'] = artist
logger.debug(f"Enriched {mbid}: artist='{artist}'")
if title or artist:
enriched_count += 1
except Exception as e:
logger.debug(f"Failed to enrich recording {mbid}: {e}")
continue
logger.info(f"Enriched {enriched_count}/{min(len(recordings), MAX_MB_ENRICHMENT_LOOKUPS)} recordings from MusicBrainz")
return recordings
class AcoustIDVerification:
    """
    Verification service that compares audio fingerprint identity
    against expected track metadata using title/artist matching.

    Design Principle: FAIL OPEN
    - Only returns FAIL when we are CONFIDENT the file is wrong
    - Any error or uncertainty results in SKIP (continue normally)
    - Never blocks downloads due to verification infrastructure issues

    Usage:
        verifier = AcoustIDVerification()
        result, message = verifier.verify_audio_file(
            "/path/to/downloaded.mp3",
            "Expected Song Title",
            "Expected Artist"
        )
        if result == VerificationResult.FAIL:
            # Move to quarantine
        else:
            # Continue with normal processing (PASS, SKIP, or DISABLED)
    """

    def __init__(self):
        """Initialize verification service."""
        self.acoustid_client = AcoustIDClient()

    def verify_audio_file(
        self,
        audio_file_path: str,
        expected_track_name: str,
        expected_artist_name: str,
        context: Optional[Dict[str, Any]] = None
    ) -> Tuple[VerificationResult, str]:
        """
        Verify that an audio file matches expected track metadata.

        Fingerprints the file, looks it up in AcoustID and compares the
        title/artist of the returned recordings against the expected track
        info. When no returned recording carries both a title and an artist,
        the recordings are first enriched through a MusicBrainz lookup.

        Args:
            audio_file_path: Path to the downloaded audio file
            expected_track_name: Track name we expected to download
            expected_artist_name: Artist name we expected
            context: Optional download context for logging/debugging
                (not read by this method)

        Returns:
            Tuple of (VerificationResult, reason_message). Any exception
            raised along the way is converted into a SKIP result.
        """
        try:
            # Without an expected title there is nothing to compare against, and
            # every comparison below would score 0 and end in FAIL. Fail open.
            if not (expected_track_name or '').strip():
                msg = "No expected track title to verify against"
                logger.debug(f"AcoustID verification skipped: {msg}")
                return VerificationResult.SKIP, msg

            # Step 1: Check availability
            available, reason = self.acoustid_client.is_available()
            if not available:
                logger.debug(f"AcoustID verification skipped: {reason}")
                return VerificationResult.SKIP, reason

            # Step 2: Fingerprint and lookup in AcoustID
            logger.info(f"Fingerprinting and looking up: {audio_file_path}")
            acoustid_result = self.acoustid_client.fingerprint_and_lookup(audio_file_path)
            if not acoustid_result:
                return VerificationResult.SKIP, "Track not found in AcoustID database"
            recordings = acoustid_result.get('recordings', [])
            # A missing or null score is treated as 0 so it lands in the low-score SKIP
            best_score = acoustid_result.get('best_score') or 0
            if not recordings:
                return VerificationResult.SKIP, "AcoustID returned no recordings"
            logger.debug(
                f"AcoustID returned {len(recordings)} recording(s) "
                f"(best fingerprint score: {best_score:.2f})"
            )

            # Step 3: Check fingerprint confidence
            if best_score < MIN_ACOUSTID_SCORE:
                msg = f"AcoustID fingerprint score too low ({best_score:.2f}) to verify"
                logger.info(msg)
                return VerificationResult.SKIP, msg

            # Enrich recordings that are missing title/artist via MusicBrainz lookup
            recordings = _enrich_recordings_from_musicbrainz(recordings)

            # A title mismatch can only be claimed when there is a title to compare.
            # Recordings that carry just an artist are not evidence of a wrong file.
            if not any(rec.get('title') for rec in recordings):
                return VerificationResult.SKIP, "No recordings with title/artist info"

            # Step 4: Find best title/artist match among AcoustID results
            best_rec, title_sim, artist_sim = _find_best_title_artist_match(
                recordings, expected_track_name, expected_artist_name
            )
            if not best_rec:
                return VerificationResult.SKIP, "No recordings with title/artist info"
            # `or '?'` also covers keys that are present but hold None
            matched_title = best_rec.get('title') or '?'
            matched_artist = best_rec.get('artist') or '?'
            logger.info(
                f"Best match: '{matched_title}' by '{matched_artist}' "
                f"(title_sim={title_sim:.2f}, artist_sim={artist_sim:.2f})"
            )

            # Step 5: Decide pass/fail based on similarity
            if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim >= ARTIST_MATCH_THRESHOLD:
                msg = (
                    f"Audio verified: '{matched_title}' by '{matched_artist}' "
                    f"matches expected '{expected_track_name}' by '{expected_artist_name}' "
                    f"(title={title_sim:.0%}, artist={artist_sim:.0%})"
                )
                logger.info(f"AcoustID verification PASSED - {msg}")
                return VerificationResult.PASS, msg

            # Title matches but artist doesn't — could be a cover or collab, skip
            if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim < ARTIST_MATCH_THRESHOLD:
                # Check if the expected artist appears anywhere in the AcoustID results
                for rec in recordings:
                    if _similarity(expected_artist_name, rec.get('artist') or '') >= ARTIST_MATCH_THRESHOLD:
                        msg = (
                            f"Audio verified: found '{expected_track_name}' by '{expected_artist_name}' "
                            f"in AcoustID results"
                        )
                        logger.info(f"AcoustID verification PASSED (secondary match) - {msg}")
                        return VerificationResult.PASS, msg
                msg = (
                    f"Title matches but artist unclear: "
                    f"AcoustID='{matched_title}' by '{matched_artist}', "
                    f"expected '{expected_track_name}' by '{expected_artist_name}'"
                )
                logger.info(f"AcoustID verification SKIPPED - {msg}")
                return VerificationResult.SKIP, msg

            # Title doesn't match — check ALL recordings for any title/artist match
            # (the best combined match might not be the right one if there are many results)
            for rec in recordings:
                t = rec.get('title') or ''
                a = rec.get('artist') or ''
                if (_similarity(expected_track_name, t) >= TITLE_MATCH_THRESHOLD and
                        _similarity(expected_artist_name, a) >= ARTIST_MATCH_THRESHOLD):
                    msg = (
                        f"Audio verified: found '{t}' by '{a}' in AcoustID results "
                        f"matching expected '{expected_track_name}' by '{expected_artist_name}'"
                    )
                    logger.info(f"AcoustID verification PASSED (scan match) - {msg}")
                    return VerificationResult.PASS, msg

            # No match found — this file is likely wrong.
            # Report what AcoustID thinks the file actually is: the first recording
            # that has a title (presumably the list is ordered by score — TODO confirm).
            top = next((rec for rec in recordings if rec.get('title')), recordings[0])
            top_title = top.get('title') or '?'
            top_artist = top.get('artist') or '?'
            msg = (
                f"Audio mismatch: file identified as '{top_title}' by '{top_artist}', "
                f"expected '{expected_track_name}' by '{expected_artist_name}' "
                f"(title={title_sim:.0%}, artist={artist_sim:.0%})"
            )
            logger.warning(f"AcoustID verification FAILED - {msg}")
            return VerificationResult.FAIL, msg
        except Exception as e:
            # Any unexpected error -> SKIP (fail open)
            logger.error(f"Unexpected error during AcoustID verification: {e}")
            return VerificationResult.SKIP, f"Verification error: {str(e)}"

    def quick_check_available(self) -> Tuple[bool, str]:
        """
        Quick check if verification is available without doing a full verification.

        Returns:
            Tuple of (is_available, reason), as reported by the AcoustID client.
        """
        return self.acoustid_client.is_available()