You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/core/matching/audio_verification.py

268 lines
11 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""Shared audio-verification decision core (pure; no file/DB I/O).
Single source of truth for normalization + the PASS/SKIP/FAIL decision used by
BOTH import-time verification (``core/acoustid_verification.py``) and the library
scan (``core/repair_jobs/acoustid_scanner.py``). Historically each path had its
own ``_normalize`` and decision branches that drifted apart and produced
inconsistent results (a correct cross-script anime-OST track passed at import but
was false-flagged by the scan). Centralising the decision here means the
thresholds, normalization, alias-aware comparison, cross-script handling, version
gate and duration guard are defined exactly once.
"""
import re
from dataclasses import dataclass
from difflib import SequenceMatcher
from enum import Enum
from typing import Any, List, Optional
from utils.logging_config import get_logger
logger = get_logger("audio_verification")
# Thresholds — the single definition both paths share.
MIN_ACOUSTID_SCORE = 0.80 # Minimum fingerprint score to trust a match.
TITLE_MATCH_THRESHOLD = 0.70 # Title similarity to consider a match.
ARTIST_MATCH_THRESHOLD = 0.60 # Artist similarity to consider a match.
CLEAR_MISMATCH_THRESHOLD = 0.30 # Below this artist sim = clear wrong song.
class Decision(Enum):
PASS = "pass"
SKIP = "skip"
FAIL = "fail"
@dataclass
class Outcome:
decision: Decision
title_sim: float = 0.0
artist_sim: float = 0.0
matched_title: str = ""
matched_artist: str = ""
reason: str = ""
def normalize(text: str) -> str:
"""Normalize a title/artist for comparison.
lowercase; strip ``()`` / ``[]`` / ``<>`` annotations (version tags,
performer credits like ``<Vocal: MIKA KOBAYASHI>``); strip trailing
version / featuring tags; KEEP CJK characters (``\\w`` is unicode-aware) so
Japanese/Chinese/Korean titles produce a comparable form instead of an empty
string; collapse whitespace.
"""
if not text:
return ""
s = text.lower().strip()
# Annotations that are metadata, not core identity.
s = re.sub(r'\s*\([^)]*\)', '', s)
s = re.sub(r'\s*\[[^\]]*\]', '', s)
s = re.sub(r'\s*<[^>]*>', '', s)
# Trailing featuring / version tags.
s = re.sub(r'\s+(?:feat\.?|ft\.?|featuring)\s+.*$', '', s, flags=re.IGNORECASE)
s = re.sub(
r'\s*-\s*(?:vocal|instrumental|acoustic|live|remix|cover|clean|explicit|'
r'radio\s*edit|original\s*mix|extended\s*mix|club\s*mix)\s*$',
'', s, flags=re.IGNORECASE,
)
s = re.sub(r'\s*-\s*from\s+.+$', '', s, flags=re.IGNORECASE)
# Path/separator punctuation -> space so a title keeps matching a source
# filename that substituted '_' for an illegal '/' or ':' (#851): the on-disk
# "You See Big Girl _ T_T" must normalize the same as "You See Big Girl / T:T".
# Done before the strip below so they become word boundaries, not joins.
s = re.sub(r'[\\/:_]+', ' ', s)
# Drop remaining punctuation but keep word chars (incl. CJK) + spaces.
s = re.sub(r'[^\w\s]', '', s)
s = re.sub(r'\s+', ' ', s).strip()
return s
def similarity(a: str, b: str) -> float:
"""Similarity (0.01.0) between two strings after normalization."""
na, nb = normalize(a), normalize(b)
if not na or not nb:
return 0.0
if na == nb:
return 1.0
return SequenceMatcher(None, na, nb).ratio()
_match_engine = None
def _detect_title_version(title: str) -> str:
"""Version label ('original'/'instrumental'/'live'/'remix'/...) for a title."""
global _match_engine
if not title:
return 'original'
if _match_engine is None:
from core.matching_engine import MusicMatchingEngine
_match_engine = MusicMatchingEngine()
version_type, _ = _match_engine.detect_version_type(title)
return version_type
def _alias_aware_artist_sim(expected_artist: str, actual_artist: str,
aliases: Optional[Any] = None) -> float:
"""Best artist similarity across (expected, *aliases) vs actual.
Bridges cross-script artist comparisons (kanji↔romaji etc) when MusicBrainz
aliases are available. ``aliases`` is an iterable of alias strings, or a
callable resolving them lazily (only invoked when direct similarity falls
below threshold — keeps the happy path lookup-free).
"""
from core.matching.artist_aliases import artist_names_match
direct = similarity(expected_artist, actual_artist)
if aliases is None:
return direct
if direct >= ARTIST_MATCH_THRESHOLD:
return direct
resolved = aliases() if callable(aliases) else aliases
if not resolved:
return direct
_matched, score = artist_names_match(
expected_artist, actual_artist, aliases=resolved,
threshold=ARTIST_MATCH_THRESHOLD, similarity=similarity,
)
# Diagnostic: an alias rescued a comparison direct similarity would have
# failed. INFO since it's a user-visible decision (PASS instead of FAIL).
if score >= ARTIST_MATCH_THRESHOLD and direct < ARTIST_MATCH_THRESHOLD:
from core.matching.artist_aliases import best_alias_match
winner, _ = best_alias_match(
expected_artist, actual_artist, resolved, similarity=similarity,
)
logger.info(
"Artist alias rescued comparison: expected=%r vs actual=%r "
"(direct sim=%.2f, alias %r → score=%.2f)",
expected_artist, actual_artist, direct, winner, score,
)
return score
def _find_best_title_artist_match(recordings, expected_title, expected_artist,
aliases=None):
"""Return (best_recording, title_sim, artist_sim) — title weighted higher."""
best_rec = None
best_title_sim = 0.0
best_artist_sim = 0.0
best_combined = 0.0
for rec in recordings:
title = rec.get('title') or ''
artist = rec.get('artist') or ''
title_sim = similarity(expected_title, title)
artist_sim = _alias_aware_artist_sim(expected_artist, artist, aliases)
combined = (title_sim * 0.6) + (artist_sim * 0.4)
if combined > best_combined:
best_combined = combined
best_rec = rec
best_title_sim = title_sim
best_artist_sim = artist_sim
return best_rec, best_title_sim, best_artist_sim
def evaluate(expected_title: str, expected_artist: str,
recordings: List[dict], *, fingerprint_score: float,
aliases_provider: Optional[Any] = None) -> Outcome:
"""Decide PASS / SKIP / FAIL for a fingerprinted file against expected
title/artist. Pure: no I/O. Shared by import verification and library scan.
``aliases_provider``: iterable or callable of expected-artist aliases
(kanji/cyrillic/etc) used to bridge cross-script comparisons.
Note: fingerprint-collision duration checks are the caller's responsibility
(the library scan pre-checks the top recording's length before calling this)
so the decision here stays purely about title/artist/version identity.
"""
from core.matching.script_compat import is_cross_script_mismatch
from core.matching.version_mismatch import is_acceptable_version_mismatch
# No expected artist on record (legacy/compilation rows): compare on title
# only — the old scanner treated this as artist-match=1.0 and a missing DB
# value is no evidence the file is wrong.
no_expected_artist = not normalize(expected_artist or '')
best_rec, title_sim, artist_sim = _find_best_title_artist_match(
recordings, expected_title, expected_artist, aliases_provider,
)
if no_expected_artist:
artist_sim = 1.0
if not best_rec:
return Outcome(Decision.SKIP, reason="No recordings with title/artist info")
matched_title = best_rec.get('title', '?') or '?'
matched_artist = best_rec.get('artist', '?') or '?'
def out(dec, reason):
return Outcome(dec, title_sim, artist_sim, matched_title, matched_artist, reason)
# Version gate: original vs instrumental/live/remix is a real difference.
expected_version = _detect_title_version(expected_title)
matched_version = _detect_title_version(matched_title)
if expected_version != matched_version:
if not is_acceptable_version_mismatch(
expected_version, matched_version,
fingerprint_score=fingerprint_score,
title_similarity=title_sim, artist_similarity=artist_sim,
):
return out(Decision.FAIL,
f"Version mismatch: expected ({expected_version}) "
f"but file is ({matched_version})")
# Clean match.
if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim >= ARTIST_MATCH_THRESHOLD:
return out(Decision.PASS, "Audio verified")
# Title matches, artist doesn't — cover/collab vs genuinely wrong.
if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim < ARTIST_MATCH_THRESHOLD:
for rec in recordings:
if _alias_aware_artist_sim(
expected_artist, rec.get('artist', ''), aliases_provider,
) >= ARTIST_MATCH_THRESHOLD:
return out(Decision.PASS, "Expected artist found in AcoustID results")
if artist_sim < CLEAR_MISMATCH_THRESHOLD:
return out(Decision.FAIL,
f"Audio mismatch: '{matched_title}' by '{matched_artist}' "
f"— expected artist not found")
return out(Decision.SKIP, "Title matches but artist ambiguous (cover/collab?)")
# Title doesn't match — scan all recordings for a version-matched hit.
def _title_sim(a, b):
return similarity(a, b)
def _artist_sim(ea, aa):
return _alias_aware_artist_sim(ea, aa, aliases_provider)
candidate = None
for rec in recordings:
if _detect_title_version(rec.get('title') or '') != expected_version:
continue
if (similarity(expected_title, rec.get('title') or '') >= TITLE_MATCH_THRESHOLD
and _alias_aware_artist_sim(
expected_artist, rec.get('artist', ''), aliases_provider,
) >= ARTIST_MATCH_THRESHOLD):
candidate = rec
break
if candidate is not None:
return out(Decision.PASS, "Scan match found in AcoustID results")
# High-confidence / cross-script skips (don't quarantine a correct file).
has_non_ascii = (any(ord(c) > 127 for c in (expected_title or ''))
or any(ord(c) > 127 for c in matched_title))
language_script_skip = (fingerprint_score >= 0.95 and has_non_ascii
and artist_sim >= ARTIST_MATCH_THRESHOLD)
high_confidence_strong_match_skip = (fingerprint_score >= 0.95
and title_sim >= 0.80
and artist_sim >= ARTIST_MATCH_THRESHOLD)
cross_script_artist_skip = (fingerprint_score >= MIN_ACOUSTID_SCORE
and artist_sim >= ARTIST_MATCH_THRESHOLD
and is_cross_script_mismatch(expected_artist, matched_artist))
if (language_script_skip or high_confidence_strong_match_skip
or cross_script_artist_skip):
return out(Decision.SKIP, "Likely same song in different language/script")
return out(Decision.FAIL,
f"Audio mismatch: file identified as '{matched_title}' by "
f"'{matched_artist}', expected '{expected_title}' by '{expected_artist}'")