mirror of https://github.com/Nezreka/SoulSync.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
679 lines
30 KiB
679 lines
30 KiB
"""
|
|
AcoustID Verification Service
|
|
|
|
Verifies downloaded audio files match expected track metadata by comparing
|
|
title/artist from AcoustID fingerprint results against the expected track info.
|
|
|
|
If the audio fingerprint confidently identifies a DIFFERENT song than expected,
|
|
the file is flagged as incorrect.
|
|
"""
|
|
|
|
import re
|
|
import threading
|
|
from difflib import SequenceMatcher
|
|
from typing import Optional, Dict, Any, Tuple, List
|
|
from enum import Enum
|
|
from utils.logging_config import get_logger
|
|
from core.acoustid_client import AcoustIDClient
|
|
from core.matching_engine import MusicMatchingEngine
|
|
from core.matching.version_mismatch import is_acceptable_version_mismatch
|
|
from core.musicbrainz_client import MusicBrainzClient
|
|
|
|
logger = get_logger("acoustid.verification")
|
|
|
|
# Verification thresholds.

# AcoustID fingerprint scores below this value are not trusted.
MIN_ACOUSTID_SCORE = 0.80

# Title similarity required before two titles count as a match.
TITLE_MATCH_THRESHOLD = 0.70

# Artist similarity required before two artist names count as a match.
ARTIST_MATCH_THRESHOLD = 0.60
|
|
|
|
# Version detection goes through one module-wide MusicMatchingEngine so the
# patterns (remix / live / acoustic / instrumental / etc.) are the same ones
# the pre-download Soulseek matcher uses. Per the original author's note,
# detect_version_type does not rely on instance state, so sharing a single
# instance is fine.
_match_engine_for_version = MusicMatchingEngine()
|
|
|
|
|
|
def _detect_title_version(title: str) -> str:
|
|
"""Return version label for a track title.
|
|
|
|
Returns ``'original'`` when no version marker is detected, otherwise one
|
|
of the labels produced by ``MusicMatchingEngine.detect_version_type``
|
|
(``'instrumental'``, ``'live'``, ``'acoustic'``, ``'remix'``, etc).
|
|
"""
|
|
if not title:
|
|
return 'original'
|
|
version_type, _ = _match_engine_for_version.detect_version_type(title)
|
|
return version_type
|
|
|
|
|
|
class VerificationResult(Enum):
    """Outcome of verifying one downloaded audio file."""

    # Title/artist match - the file is correct.
    PASS = "pass"

    # Title/artist mismatch - the wrong file was downloaded.
    FAIL = "fail"

    # Verification could not be carried out (error or unavailable);
    # the caller should continue normally.
    SKIP = "skip"

    # Verification is not enabled.
    DISABLED = "disabled"
|
|
|
|
|
|
def _normalize(text: str) -> str:
|
|
"""Normalize a string for comparison: lowercase, strip parentheticals, punctuation."""
|
|
if not text:
|
|
return ""
|
|
s = text.lower().strip()
|
|
# Remove ALL parenthetical suffixes — these are metadata annotations, not core title
|
|
# Covers: (Live), (Remastered), (Parody of ...), (from "..." Soundtrack), (feat. ...), etc.
|
|
s = re.sub(r'\s*\([^)]*\)', '', s)
|
|
# Remove ALL square bracket suffixes: [Live], [Remastered], [Deluxe], etc.
|
|
s = re.sub(r'\s*\[[^\]]*\]', '', s)
|
|
# Remove trailing featuring info not in parentheses: "feat. ...", "ft. ...", "featuring ..."
|
|
s = re.sub(r'\s+(?:feat\.?|ft\.?|featuring)\s+.*$', '', s, flags=re.IGNORECASE)
|
|
# Remove dash-separated version tags: "- Vocal", "- Instrumental", "- Acoustic", etc.
|
|
s = re.sub(r'\s*-\s*(?:vocal|instrumental|acoustic|live|remix|cover|clean|explicit|radio\s*edit|original\s*mix|extended\s*mix|club\s*mix)\s*$', '', s, flags=re.IGNORECASE)
|
|
# Remove soundtrack/source subtitles: ' - From "..." Soundtrack', ' - from the film ...'
|
|
s = re.sub(r'\s*-\s*from\s+.+$', '', s, flags=re.IGNORECASE)
|
|
# Remove non-alphanumeric except spaces
|
|
s = re.sub(r'[^\w\s]', '', s)
|
|
# Collapse whitespace
|
|
s = re.sub(r'\s+', ' ', s).strip()
|
|
return s
|
|
|
|
|
|
def _similarity(a: str, b: str) -> float:
    """Score how alike two strings are on a 0.0-1.0 scale.

    Both inputs are passed through ``_normalize`` first. If either one
    normalizes to an empty string the score is 0.0; identical normalized
    strings score 1.0; anything else is the ``SequenceMatcher`` ratio of
    the two normalized strings.
    """
    left, right = _normalize(a), _normalize(b)
    if not (left and right):
        return 0.0
    return 1.0 if left == right else SequenceMatcher(None, left, right).ratio()
|
|
|
|
|
|
def _alias_aware_artist_sim(
    expected_artist: str,
    actual_artist: str,
    aliases: Optional[Any] = None,
) -> float:
    """Artist similarity that can fall back to known aliases of the expected artist.

    Issue #442 — when the expected and actual names are written in different
    scripts (e.g. `Hiroyuki Sawano` vs `澤野弘之`), plain `_similarity` is
    close to 0 even though MusicBrainz aliases connect the two. Scoring is
    delegated to ``artist_names_match`` from ``core.matching.artist_aliases``
    so the verifier shares that helper's contract.

    `aliases` may be:

    - ``None``: the direct `_similarity(expected, actual)` score is returned.
    - An **iterable** of alias strings: used as given.
    - A **callable** returning the aliases: invoked LAZILY, only when the
      direct score is below ``ARTIST_MATCH_THRESHOLD``. This lets the
      caller hand in a memoizing thunk so that comparisons which already
      pass never pay for an alias lookup.

    When the aliases resolve to something empty the direct score is
    returned; otherwise the score reported by ``artist_names_match`` is
    returned, so existing ``>= ARTIST_MATCH_THRESHOLD`` checks keep working.

    An INFO line is logged whenever an alias lifts a comparison over the
    threshold that the direct score would have failed, naming the alias
    that won, so a later PASS decision can be traced back to it.
    """
    from core.matching.artist_aliases import artist_names_match

    direct_score = _similarity(expected_artist, actual_artist)

    # Nothing to look up, or the plain comparison is already good enough.
    if aliases is None or direct_score >= ARTIST_MATCH_THRESHOLD:
        return direct_score

    # A callable provider is only invoked at this point (lazily); the caller
    # may memoize it across several comparisons of one verification.
    if callable(aliases):
        alias_names = aliases()
    else:
        alias_names = aliases
    if not alias_names:
        return direct_score

    _is_match, alias_score = artist_names_match(
        expected_artist,
        actual_artist,
        aliases=alias_names,
        threshold=ARTIST_MATCH_THRESHOLD,
        similarity=_similarity,
    )

    # An alias turned a would-be failure into a match. That is a
    # user-visible decision (PASS instead of FAIL), so record which alias
    # was responsible.
    rescued = (
        alias_score >= ARTIST_MATCH_THRESHOLD
        and direct_score < ARTIST_MATCH_THRESHOLD
    )
    if rescued:
        from core.matching.artist_aliases import best_alias_match
        winning_alias, _ = best_alias_match(
            expected_artist, actual_artist, alias_names, similarity=_similarity,
        )
        logger.info(
            "Artist alias rescued comparison: expected=%r vs actual=%r "
            "(direct sim=%.2f, alias %r → score=%.2f)",
            expected_artist, actual_artist, direct_score, winning_alias, alias_score,
        )

    return alias_score
|
|
|
|
|
|
def _find_best_title_artist_match(
|
|
recordings: List[Dict[str, Any]],
|
|
expected_title: str,
|
|
expected_artist: str,
|
|
expected_artist_aliases: Optional[Any] = None,
|
|
) -> Tuple[Optional[Dict], float, float]:
|
|
"""
|
|
Find the AcoustID recording that best matches expected title/artist.
|
|
|
|
Issue #442 — `expected_artist_aliases` (when supplied) is the
|
|
list of alternate spellings for `expected_artist` (Japanese
|
|
kanji, Cyrillic, etc.). Accepts either:
|
|
|
|
- An iterable of alias strings (used eagerly), or
|
|
- A callable returning the list (resolved lazily — only fires
|
|
when at least one recording fails direct artist similarity).
|
|
|
|
Each recording's artist is scored against (expected, *aliases)
|
|
and the best score wins. When the list is empty/omitted/None,
|
|
behavior is identical to the prior raw similarity comparison.
|
|
|
|
Returns:
|
|
(best_recording, title_similarity, artist_similarity)
|
|
"""
|
|
best_rec = None
|
|
best_title_sim = 0.0
|
|
best_artist_sim = 0.0
|
|
best_combined = 0.0
|
|
|
|
for rec in recordings:
|
|
title = rec.get('title') or ''
|
|
artist = rec.get('artist') or ''
|
|
|
|
title_sim = _similarity(expected_title, title)
|
|
artist_sim = _alias_aware_artist_sim(
|
|
expected_artist, artist, expected_artist_aliases,
|
|
)
|
|
# Weight title higher since that's the primary identifier
|
|
combined = (title_sim * 0.6) + (artist_sim * 0.4)
|
|
|
|
if combined > best_combined:
|
|
best_combined = combined
|
|
best_rec = rec
|
|
best_title_sim = title_sim
|
|
best_artist_sim = artist_sim
|
|
|
|
return best_rec, best_title_sim, best_artist_sim
|
|
|
|
|
|
# Lazily created MusicBrainz client shared by the enrichment lookups,
# together with the lock that guards its creation.
_mb_client = None
_mb_client_lock = threading.Lock()

# Lazily created MusicBrainzService shared by the alias lookups (issue #442),
# together with the lock that guards its creation. The service layer wraps
# the raw client and adds the caching + DB access that the alias resolution
# chain (library DB → cache → live MB) needs.
_mb_service = None
_mb_service_lock = threading.Lock()

# Only this many recordings from the head of the AcoustID result list are
# considered for MusicBrainz enrichment in one pass.
MAX_MB_ENRICHMENT_LOOKUPS = 3
|
|
|
|
|
|
def _get_mb_client() -> MusicBrainzClient:
    """Return the module-wide MusicBrainz client, creating it on first use.

    Creation happens under ``_mb_client_lock`` and the global is re-checked
    once the lock is held, so only one ``MusicBrainzClient`` is constructed.
    """
    global _mb_client
    if _mb_client is not None:
        return _mb_client
    with _mb_client_lock:
        # Another caller may have created it while we waited for the lock.
        if _mb_client is None:
            _mb_client = MusicBrainzClient()
    return _mb_client
|
|
|
|
|
|
def _get_mb_service():
    """Return the module-wide MusicBrainzService, creating it on first use.

    The alias-resolution chain in `verify_audio_file` goes through this
    service. Both the service class and the database accessor are imported
    only at creation time, so importing this module does not open a DB
    connection on paths that never run AcoustID verification (test runs,
    dry runs).
    """
    global _mb_service
    if _mb_service is not None:
        return _mb_service
    with _mb_service_lock:
        # Another caller may have created it while we waited for the lock.
        if _mb_service is None:
            from core.musicbrainz_service import MusicBrainzService
            from database.music_database import get_database
            _mb_service = MusicBrainzService(get_database())
    return _mb_service
|
|
|
|
|
|
def _resolve_expected_artist_aliases(expected_artist_name: str) -> List[str]:
|
|
"""Look up alternate-spelling aliases for the expected artist.
|
|
|
|
Issue #442 — bridges cross-script artist comparisons (Japanese
|
|
kanji ↔ romanized, Cyrillic ↔ Latin, etc.) without forcing the
|
|
verifier to know about the resolution chain. Best-effort: any
|
|
failure (no MB service, network down, no library DB) returns
|
|
empty list so verification falls back to the prior direct
|
|
similarity check.
|
|
"""
|
|
if not expected_artist_name:
|
|
return []
|
|
try:
|
|
return _get_mb_service().lookup_artist_aliases(expected_artist_name)
|
|
except Exception as e:
|
|
logger.debug("alias lookup failed for %r: %s", expected_artist_name, e)
|
|
return []
|
|
|
|
|
|
def _enrich_recordings_from_musicbrainz(
|
|
recordings: List[Dict[str, Any]],
|
|
) -> List[Dict[str, Any]]:
|
|
"""
|
|
Enrich recordings that are missing title/artist by looking up their
|
|
MBIDs via MusicBrainz.
|
|
|
|
AcoustID often returns recordings with title=None, artist=None even though
|
|
the MBIDs are valid. This resolves the metadata so verification can compare
|
|
title/artist instead of skipping.
|
|
|
|
Args:
|
|
recordings: List of recording dicts from fingerprint_and_lookup()
|
|
|
|
Returns:
|
|
The same list, with title/artist filled in where possible.
|
|
"""
|
|
# Fast path: if any recording already has title AND artist, no enrichment needed
|
|
if any(rec.get('title') and rec.get('artist') for rec in recordings):
|
|
return recordings
|
|
|
|
logger.info(f"Enriching {len(recordings)} recordings via MusicBrainz (all missing title/artist)...")
|
|
|
|
mb = _get_mb_client()
|
|
enriched_count = 0
|
|
|
|
for rec in recordings[:MAX_MB_ENRICHMENT_LOOKUPS]:
|
|
mbid = rec.get('mbid')
|
|
if not mbid:
|
|
continue
|
|
|
|
try:
|
|
data = mb.get_recording(mbid, includes=['artist-credits'])
|
|
if not data:
|
|
logger.debug(f"MusicBrainz returned no data for recording {mbid}")
|
|
continue
|
|
|
|
title = data.get('title')
|
|
artist_credit = data.get('artist-credit', [])
|
|
|
|
# Build artist string from artist-credit array
|
|
# Each entry has {"artist": {"name": "..."}, "joinphrase": "..."}
|
|
artist_parts = []
|
|
for credit in artist_credit:
|
|
name = credit.get('artist', {}).get('name', '')
|
|
joinphrase = credit.get('joinphrase', '')
|
|
if name:
|
|
artist_parts.append(name + joinphrase)
|
|
artist = ''.join(artist_parts).strip() if artist_parts else None
|
|
|
|
if title:
|
|
rec['title'] = title
|
|
logger.debug(f"Enriched {mbid}: title='{title}'")
|
|
if artist:
|
|
rec['artist'] = artist
|
|
logger.debug(f"Enriched {mbid}: artist='{artist}'")
|
|
|
|
if title or artist:
|
|
enriched_count += 1
|
|
|
|
except Exception as e:
|
|
logger.debug(f"Failed to enrich recording {mbid}: {e}")
|
|
continue
|
|
|
|
logger.info(f"Enriched {enriched_count}/{min(len(recordings), MAX_MB_ENRICHMENT_LOOKUPS)} recordings from MusicBrainz")
|
|
return recordings
|
|
|
|
|
|
class AcoustIDVerification:
    """
    Verification service that compares audio fingerprint identity
    against expected track metadata using title/artist matching.

    Design Principle: FAIL OPEN
    - Only returns FAIL when we are CONFIDENT the file is wrong
    - Any error or uncertainty results in SKIP (continue normally)
    - Never blocks downloads due to verification infrastructure issues

    Usage:
        verifier = AcoustIDVerification()
        result, message = verifier.verify_audio_file(
            "/path/to/downloaded.mp3",
            "Expected Song Title",
            "Expected Artist"
        )

        if result == VerificationResult.FAIL:
            # Move to quarantine
        else:
            # Continue with normal processing (PASS, SKIP, or DISABLED)
    """

    def __init__(self):
        """Initialize verification service."""
        self.acoustid_client = AcoustIDClient()

    def verify_audio_file(
        self,
        audio_file_path: str,
        expected_track_name: str,
        expected_artist_name: str,
        context: Optional[Dict[str, Any]] = None
    ) -> Tuple[VerificationResult, str]:
        """
        Verify that an audio file matches expected track metadata.

        The file is fingerprinted and looked up in AcoustID, and the
        title/artist of the returned recordings are compared against the
        expected track info. When none of the recordings carries both a
        title and an artist, the leading recordings are first enriched via
        a MusicBrainz lookup. Artist aliases for the expected artist are
        resolved lazily, only if a direct artist comparison falls below
        ``ARTIST_MATCH_THRESHOLD``.

        Decision order:
          1. SKIP if AcoustID is unavailable, returns nothing, or the best
             fingerprint score is below ``MIN_ACOUSTID_SCORE``.
          2. FAIL on a version mismatch (e.g. original vs instrumental)
             between the expected title and the best-matching recording,
             unless ``is_acceptable_version_mismatch`` accepts it.
          3. PASS when title and artist both meet their thresholds.
          4. Title matches but artist does not: PASS if any recording's
             artist matches, FAIL on a clear artist mismatch, else SKIP.
          5. Title does not match: PASS if any same-version recording
             matches on both title and artist, SKIP for the
             high-confidence language/script case, else FAIL.

        Args:
            audio_file_path: Path to the downloaded audio file
            expected_track_name: Track name we expected to download
            expected_artist_name: Artist name we expected
            context: Optional download context. Accepted for interface
                compatibility; this method does not currently read it.

        Returns:
            Tuple of (VerificationResult, reason_message). Any unexpected
            exception is logged and reported as SKIP (fail open); this
            method does not raise.
        """
        try:
            # Step 1: Check availability
            available, reason = self.acoustid_client.is_available()
            if not available:
                logger.debug(f"AcoustID verification skipped: {reason}")
                return VerificationResult.SKIP, reason

            # Step 2: Fingerprint and lookup in AcoustID
            logger.info(f"Fingerprinting and looking up: {audio_file_path}")
            acoustid_result = self.acoustid_client.fingerprint_and_lookup(audio_file_path)

            if not acoustid_result:
                return VerificationResult.SKIP, "Track not found in AcoustID database"

            recordings = acoustid_result.get('recordings', [])
            best_score = acoustid_result.get('best_score', 0)

            if not recordings:
                return VerificationResult.SKIP, "AcoustID returned no recordings"

            logger.debug(
                f"AcoustID returned {len(recordings)} recording(s) "
                f"(best fingerprint score: {best_score:.2f})"
            )

            # Step 3: Check fingerprint confidence
            if best_score < MIN_ACOUSTID_SCORE:
                msg = f"AcoustID fingerprint score too low ({best_score:.2f}) to verify"
                logger.info(msg)
                return VerificationResult.SKIP, msg

            # Enrich recordings that are missing title/artist via MusicBrainz lookup
            recordings = _enrich_recordings_from_musicbrainz(recordings)

            # Issue #442 — alias resolution is LAZY. A memoising thunk is
            # passed to the artist-comparison sites; it only fires the
            # multi-tier lookup (library DB → cache → live MB) when direct
            # artist similarity falls below threshold. Verifications where
            # the direct match already passes (the common case for
            # same-script artist names) never trigger any lookup work.
            # When the thunk DOES fire, the result is cached in the closure
            # so the comparison sites within one verification share a
            # single resolution pass.
            _alias_cache: Dict[str, Any] = {}

            def _aliases_provider() -> List[str]:
                if 'value' not in _alias_cache:
                    resolved = _resolve_expected_artist_aliases(expected_artist_name)
                    _alias_cache['value'] = resolved
                    if resolved:
                        logger.debug(
                            "Resolved %d aliases for expected artist '%s'",
                            len(resolved), expected_artist_name,
                        )
                return _alias_cache['value']

            # Step 4: Find best title/artist match among AcoustID results
            best_rec, title_sim, artist_sim = _find_best_title_artist_match(
                recordings, expected_track_name, expected_artist_name,
                expected_artist_aliases=_aliases_provider,
            )

            if not best_rec:
                return VerificationResult.SKIP, "No recordings with title/artist info"

            matched_title = best_rec.get('title', '?')
            matched_artist = best_rec.get('artist', '?')

            logger.info(
                f"Best match: '{matched_title}' by '{matched_artist}' "
                f"(title_sim={title_sim:.2f}, artist_sim={artist_sim:.2f})"
            )

            # Step 4b: Version-mismatch gate.
            #
            # The ``_normalize`` step deliberately strips parentheticals and
            # version tags ("(Instrumental)", "- Live", etc) so that legit
            # name variations don't fail the title-similarity comparison.
            # That same stripping makes it impossible to tell a vocal track
            # apart from its instrumental: "In My Feelings" and "In My
            # Feelings (Instrumental)" both normalize to "in my feelings",
            # the title sim ends up 1.0, and the file would pass even though
            # it's the wrong cut.
            #
            # So the version is detected on each side from the RAW titles.
            # If the expected track and the AcoustID-matched recording
            # disagree on version (one is original, the other is
            # instrumental / live / remix / acoustic / etc), reject — the
            # fingerprint identified a real song but it's not the one the
            # caller asked for.
            expected_version = _detect_title_version(expected_track_name)
            matched_version = _detect_title_version(matched_title)
            if expected_version != matched_version:
                # Issue #607 (AfonsoG6): MusicBrainz often stores live
                # recordings with bare titles ("Clarity") while the release
                # entry carries the venue annotation ("Clarity (Live at
                # Blossom Music Center, ...)"). The fingerprint correctly
                # identifies the LIVE recording; only the title text is
                # bare. The helper accepts the one-sided bare case when
                # fingerprint + bare-title + artist all agree. Two-sided
                # version mismatches (live vs remix etc) stay strict —
                # those are genuinely different recordings.
                if is_acceptable_version_mismatch(
                    expected_version, matched_version,
                    fingerprint_score=best_score,
                    title_similarity=title_sim,
                    artist_similarity=artist_sim,
                ):
                    logger.info(
                        f"AcoustID version annotation differs (expected={expected_version}, "
                        f"matched={matched_version}) but fingerprint+title+artist all match — "
                        f"accepting (likely MB metadata gap on a live/version-annotated recording)"
                    )
                else:
                    msg = (
                        f"Version mismatch: expected '{expected_track_name}' ({expected_version}) "
                        f"but file is '{matched_title}' ({matched_version})"
                    )
                    logger.warning(f"AcoustID verification FAILED (version mismatch) - {msg}")
                    return VerificationResult.FAIL, msg

            # Step 5: Decide pass/fail based on similarity
            if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim >= ARTIST_MATCH_THRESHOLD:
                msg = (
                    f"Audio verified: '{matched_title}' by '{matched_artist}' "
                    f"matches expected '{expected_track_name}' by '{expected_artist_name}' "
                    f"(title={title_sim:.0%}, artist={artist_sim:.0%})"
                )
                logger.info(f"AcoustID verification PASSED - {msg}")
                return VerificationResult.PASS, msg

            # Title matches but artist doesn't — could be a cover/collab OR a
            # genuinely different track with the same name. Distinguish the
            # two by checking whether the expected artist appears anywhere in
            # AcoustID's returned recordings.
            if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim < ARTIST_MATCH_THRESHOLD:
                # First: if the expected artist is present in ANY recording's
                # metadata for this fingerprint, it's likely the right track
                # (AcoustID's "best" match just picked the wrong variant).
                for rec in recordings:
                    # `or ''` (not a .get default) so an explicit
                    # artist=None never reaches the alias helper as None.
                    rec_artist = rec.get('artist') or ''
                    if _alias_aware_artist_sim(
                        expected_artist_name, rec_artist, _aliases_provider,
                    ) >= ARTIST_MATCH_THRESHOLD:
                        msg = (
                            f"Audio verified: found '{expected_track_name}' by '{expected_artist_name}' "
                            f"in AcoustID results"
                        )
                        logger.info(f"AcoustID verification PASSED (secondary match) - {msg}")
                        return VerificationResult.PASS, msg

                # Expected artist wasn't found anywhere. Decide between:
                #   - FAIL: clear mismatch, e.g. "Tom Walker" (sim ~0.2) when
                #     expecting "Maduk" — different song with same name
                #   - SKIP: ambiguous, e.g. collab / alt credit / formatting
                #     difference (sim 0.3-0.6)
                #
                # The 0.3 cutoff catches hard mismatches while preserving the
                # benefit of the doubt for borderline artist formatting.
                CLEAR_MISMATCH_THRESHOLD = 0.3
                if artist_sim < CLEAR_MISMATCH_THRESHOLD:
                    msg = (
                        f"Audio mismatch: file identified as '{matched_title}' by '{matched_artist}', "
                        f"expected '{expected_track_name}' by '{expected_artist_name}' "
                        f"(title={title_sim:.0%}, artist={artist_sim:.0%}) — "
                        f"expected artist not found in any AcoustID recording"
                    )
                    logger.warning(f"AcoustID verification FAILED (clear artist mismatch) - {msg}")
                    return VerificationResult.FAIL, msg

                msg = (
                    f"Title matches but artist unclear: "
                    f"AcoustID='{matched_title}' by '{matched_artist}', "
                    f"expected '{expected_track_name}' by '{expected_artist_name}' "
                    f"(artist_sim={artist_sim:.0%} — ambiguous, could be cover/collab)"
                )
                logger.info(f"AcoustID verification SKIPPED - {msg}")
                return VerificationResult.SKIP, msg

            # From here on the best recording's title did NOT reach
            # TITLE_MATCH_THRESHOLD (both title-matching cases returned above).
            #
            # Check ALL recordings for any title/artist match — the best
            # combined match might not be the right one if there are many
            # results. Recordings whose version (instrumental/live/etc)
            # disagrees with what the caller asked for are skipped: the
            # version gate above only looked at the best recording, and a
            # wrong-version variant could otherwise win this fallback scan
            # on its bare title.
            for rec in recordings:
                t = rec.get('title') or ''
                a = rec.get('artist') or ''
                if _detect_title_version(t) != expected_version:
                    continue
                if (_similarity(expected_track_name, t) >= TITLE_MATCH_THRESHOLD and
                        _alias_aware_artist_sim(
                            expected_artist_name, a, _aliases_provider,
                        ) >= ARTIST_MATCH_THRESHOLD):
                    msg = (
                        f"Audio verified: found '{t}' by '{a}' in AcoustID results "
                        f"matching expected '{expected_track_name}' by '{expected_artist_name}'"
                    )
                    logger.info(f"AcoustID verification PASSED (scan match) - {msg}")
                    return VerificationResult.PASS, msg

            # No match found — but if fingerprint score is very high (≥0.95)
            # AND we have evidence the mismatch is a language/script case
            # (rather than two genuinely different songs by the same artist),
            # skip rather than quarantine a correct file. Two routes:
            #
            #   (a) Either title contains non-ASCII characters — strong
            #       signal of transliteration / kanji↔roman cases. Artist
            #       must still be a strong match to use this path.
            #   (b) Both title AND artist similarity are very high.
            #
            # The OLD logic was ``title_sim >= 0.55 OR artist_sim >= match``.
            # That fired for English-vs-English songs by the same artist that
            # share NO actual content — e.g. "R.O.T.C (Interlude)" by
            # Kendrick Lamar getting accepted as "Rich (Interlude)" by
            # Kendrick Lamar because the artist matched perfectly and
            # "interlude" was shared in both titles. Reported by a user when
            # downloading Mr. Morale: three tracks (Rich Interlude, Savior
            # Interlude, Savior) all received the wrong R.O.T.C audio file
            # because of this leak.
            #
            # The BEST matching recording's strings are used here (not
            # `recordings[0]`) so the failure message reports the same
            # candidate the title/artist similarity scores came from.
            # Issue #607 (AfonsoG6) example 1: the prior code mixed
            # `recordings[0]`'s strings (which can be empty) with
            # `best_rec`'s scores, producing nonsense reasons like
            # "file identified as '' by '' (artist=100%)" when a later
            # recording in the list scored well on artist.
            display_title = matched_title or '?'
            display_artist = matched_artist or '?'
            has_non_ascii = (
                any(ord(c) > 127 for c in (expected_track_name or ''))
                or any(ord(c) > 127 for c in display_title)
            )
            language_script_skip = (
                best_score >= 0.95
                and has_non_ascii
                and artist_sim >= ARTIST_MATCH_THRESHOLD
            )
            # NOTE(review): with the current TITLE_MATCH_THRESHOLD (0.70)
            # this route cannot fire — execution only reaches this point
            # when title_sim is below that threshold, so title_sim >= 0.80
            # is never true here. Left in place unchanged; confirm whether
            # the 0.80 bound is intended before relying on route (b).
            high_confidence_strong_match_skip = (
                best_score >= 0.95
                and title_sim >= 0.80
                and artist_sim >= ARTIST_MATCH_THRESHOLD
            )
            if language_script_skip or high_confidence_strong_match_skip:
                reason = (
                    "likely same song in different language/script"
                    if language_script_skip
                    else "title/artist match within tolerance"
                )
                msg = (
                    f"Title/artist mismatch but fingerprint confidence very high ({best_score:.2f}): "
                    f"AcoustID='{display_title}' by '{display_artist}', "
                    f"expected '{expected_track_name}' by '{expected_artist_name}' — "
                    f"{reason}"
                )
                logger.info(f"AcoustID verification SKIPPED (high confidence) - {msg}")
                return VerificationResult.SKIP, msg

            # Trusted fingerprint (>= MIN_ACOUSTID_SCORE), no recording
            # matched the expected metadata, and neither skip route applied
            # — the file is most likely the wrong track.
            msg = (
                f"Audio mismatch: file identified as '{display_title}' by '{display_artist}', "
                f"expected '{expected_track_name}' by '{expected_artist_name}' "
                f"(title={title_sim:.0%}, artist={artist_sim:.0%})"
            )
            logger.warning(f"AcoustID verification FAILED - {msg}")
            return VerificationResult.FAIL, msg

        except Exception as e:
            # Any unexpected error -> SKIP (fail open). The traceback is
            # logged so the swallowed failure can still be diagnosed.
            logger.error(
                f"Unexpected error during AcoustID verification: {e}",
                exc_info=True,
            )
            return VerificationResult.SKIP, f"Verification error: {str(e)}"

    def quick_check_available(self) -> Tuple[bool, str]:
        """
        Quick check if verification is available without doing a full verification.

        Delegates to the AcoustID client's ``is_available()``.

        Returns:
            Tuple of (is_available, reason)
        """
        return self.acoustid_client.is_available()
|