|
|
"""Shared audio-verification decision core (pure; no file/DB I/O).
|
|
|
|
|
|
Single source of truth for normalization + the PASS/SKIP/FAIL decision used by
|
|
|
BOTH import-time verification (``core/acoustid_verification.py``) and the library
|
|
|
scan (``core/repair_jobs/acoustid_scanner.py``). Historically each path had its
|
|
|
own ``_normalize`` and decision branches that drifted apart and produced
|
|
|
inconsistent results (a correct cross-script anime-OST track passed at import but
|
|
|
was false-flagged by the scan). Centralising the decision here means the
|
|
|
thresholds, normalization, alias-aware comparison, cross-script handling, version
|
|
|
gate and duration guard are defined exactly once.
|
|
|
"""
|
|
|
|
|
|
import re
|
|
|
from dataclasses import dataclass
|
|
|
from difflib import SequenceMatcher
|
|
|
from enum import Enum
|
|
|
from typing import Any, List, Optional
|
|
|
|
|
|
from utils.logging_config import get_logger
|
|
|
|
|
|
logger = get_logger("audio_verification")
|
|
|
|
|
|
# Thresholds — the single definition both paths share.
|
|
|
MIN_ACOUSTID_SCORE = 0.80 # Minimum fingerprint score to trust a match.
|
|
|
TITLE_MATCH_THRESHOLD = 0.70 # Title similarity to consider a match.
|
|
|
ARTIST_MATCH_THRESHOLD = 0.60 # Artist similarity to consider a match.
|
|
|
CLEAR_MISMATCH_THRESHOLD = 0.30 # Below this artist sim = clear wrong song.
|
|
|
|
|
|
|
|
|
class Decision(Enum):
|
|
|
PASS = "pass"
|
|
|
SKIP = "skip"
|
|
|
FAIL = "fail"
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
class Outcome:
|
|
|
decision: Decision
|
|
|
title_sim: float = 0.0
|
|
|
artist_sim: float = 0.0
|
|
|
matched_title: str = ""
|
|
|
matched_artist: str = ""
|
|
|
reason: str = ""
|
|
|
|
|
|
|
|
|
def normalize(text: str) -> str:
|
|
|
"""Normalize a title/artist for comparison.
|
|
|
|
|
|
lowercase; strip ``()`` / ``[]`` / ``<>`` annotations (version tags,
|
|
|
performer credits like ``<Vocal: MIKA KOBAYASHI>``); strip trailing
|
|
|
version / featuring tags; KEEP CJK characters (``\\w`` is unicode-aware) so
|
|
|
Japanese/Chinese/Korean titles produce a comparable form instead of an empty
|
|
|
string; collapse whitespace.
|
|
|
"""
|
|
|
if not text:
|
|
|
return ""
|
|
|
s = text.lower().strip()
|
|
|
# Annotations that are metadata, not core identity.
|
|
|
s = re.sub(r'\s*\([^)]*\)', '', s)
|
|
|
s = re.sub(r'\s*\[[^\]]*\]', '', s)
|
|
|
s = re.sub(r'\s*<[^>]*>', '', s)
|
|
|
# Trailing featuring / version tags.
|
|
|
s = re.sub(r'\s+(?:feat\.?|ft\.?|featuring)\s+.*$', '', s, flags=re.IGNORECASE)
|
|
|
s = re.sub(
|
|
|
r'\s*-\s*(?:vocal|instrumental|acoustic|live|remix|cover|clean|explicit|'
|
|
|
r'radio\s*edit|original\s*mix|extended\s*mix|club\s*mix)\s*$',
|
|
|
'', s, flags=re.IGNORECASE,
|
|
|
)
|
|
|
s = re.sub(r'\s*-\s*from\s+.+$', '', s, flags=re.IGNORECASE)
|
|
|
# Path/separator punctuation -> space so a title keeps matching a source
|
|
|
# filename that substituted '_' for an illegal '/' or ':' (#851): the on-disk
|
|
|
# "You See Big Girl _ T_T" must normalize the same as "You See Big Girl / T:T".
|
|
|
# Done before the strip below so they become word boundaries, not joins.
|
|
|
s = re.sub(r'[\\/:_]+', ' ', s)
|
|
|
# Drop remaining punctuation but keep word chars (incl. CJK) + spaces.
|
|
|
s = re.sub(r'[^\w\s]', '', s)
|
|
|
s = re.sub(r'\s+', ' ', s).strip()
|
|
|
return s
|
|
|
|
|
|
|
|
|
def similarity(a: str, b: str) -> float:
|
|
|
"""Similarity (0.0–1.0) between two strings after normalization."""
|
|
|
na, nb = normalize(a), normalize(b)
|
|
|
if not na or not nb:
|
|
|
return 0.0
|
|
|
if na == nb:
|
|
|
return 1.0
|
|
|
return SequenceMatcher(None, na, nb).ratio()
|
|
|
|
|
|
|
|
|
_match_engine = None
|
|
|
|
|
|
|
|
|
def _detect_title_version(title: str) -> str:
|
|
|
"""Version label ('original'/'instrumental'/'live'/'remix'/...) for a title."""
|
|
|
global _match_engine
|
|
|
if not title:
|
|
|
return 'original'
|
|
|
if _match_engine is None:
|
|
|
from core.matching_engine import MusicMatchingEngine
|
|
|
_match_engine = MusicMatchingEngine()
|
|
|
version_type, _ = _match_engine.detect_version_type(title)
|
|
|
return version_type
|
|
|
|
|
|
|
|
|
def _alias_aware_artist_sim(expected_artist: str, actual_artist: str,
|
|
|
aliases: Optional[Any] = None) -> float:
|
|
|
"""Best artist similarity across (expected, *aliases) vs actual.
|
|
|
|
|
|
Bridges cross-script artist comparisons (kanji↔romaji etc) when MusicBrainz
|
|
|
aliases are available. ``aliases`` is an iterable of alias strings, or a
|
|
|
callable resolving them lazily (only invoked when direct similarity falls
|
|
|
below threshold — keeps the happy path lookup-free).
|
|
|
"""
|
|
|
from core.matching.artist_aliases import artist_names_match
|
|
|
|
|
|
direct = similarity(expected_artist, actual_artist)
|
|
|
if aliases is None:
|
|
|
return direct
|
|
|
if direct >= ARTIST_MATCH_THRESHOLD:
|
|
|
return direct
|
|
|
resolved = aliases() if callable(aliases) else aliases
|
|
|
if not resolved:
|
|
|
return direct
|
|
|
_matched, score = artist_names_match(
|
|
|
expected_artist, actual_artist, aliases=resolved,
|
|
|
threshold=ARTIST_MATCH_THRESHOLD, similarity=similarity,
|
|
|
)
|
|
|
# Diagnostic: an alias rescued a comparison direct similarity would have
|
|
|
# failed. INFO since it's a user-visible decision (PASS instead of FAIL).
|
|
|
if score >= ARTIST_MATCH_THRESHOLD and direct < ARTIST_MATCH_THRESHOLD:
|
|
|
from core.matching.artist_aliases import best_alias_match
|
|
|
winner, _ = best_alias_match(
|
|
|
expected_artist, actual_artist, resolved, similarity=similarity,
|
|
|
)
|
|
|
logger.info(
|
|
|
"Artist alias rescued comparison: expected=%r vs actual=%r "
|
|
|
"(direct sim=%.2f, alias %r → score=%.2f)",
|
|
|
expected_artist, actual_artist, direct, winner, score,
|
|
|
)
|
|
|
return score
|
|
|
|
|
|
|
|
|
def _find_best_title_artist_match(recordings, expected_title, expected_artist,
|
|
|
aliases=None):
|
|
|
"""Return (best_recording, title_sim, artist_sim) — title weighted higher."""
|
|
|
best_rec = None
|
|
|
best_title_sim = 0.0
|
|
|
best_artist_sim = 0.0
|
|
|
best_combined = 0.0
|
|
|
for rec in recordings:
|
|
|
title = rec.get('title') or ''
|
|
|
artist = rec.get('artist') or ''
|
|
|
title_sim = similarity(expected_title, title)
|
|
|
artist_sim = _alias_aware_artist_sim(expected_artist, artist, aliases)
|
|
|
combined = (title_sim * 0.6) + (artist_sim * 0.4)
|
|
|
if combined > best_combined:
|
|
|
best_combined = combined
|
|
|
best_rec = rec
|
|
|
best_title_sim = title_sim
|
|
|
best_artist_sim = artist_sim
|
|
|
return best_rec, best_title_sim, best_artist_sim
|
|
|
|
|
|
|
|
|
def evaluate(expected_title: str, expected_artist: str,
|
|
|
recordings: List[dict], *, fingerprint_score: float,
|
|
|
aliases_provider: Optional[Any] = None) -> Outcome:
|
|
|
"""Decide PASS / SKIP / FAIL for a fingerprinted file against expected
|
|
|
title/artist. Pure: no I/O. Shared by import verification and library scan.
|
|
|
|
|
|
``aliases_provider``: iterable or callable of expected-artist aliases
|
|
|
(kanji/cyrillic/etc) used to bridge cross-script comparisons.
|
|
|
|
|
|
Note: fingerprint-collision duration checks are the caller's responsibility
|
|
|
(the library scan pre-checks the top recording's length before calling this)
|
|
|
so the decision here stays purely about title/artist/version identity.
|
|
|
"""
|
|
|
from core.matching.script_compat import is_cross_script_mismatch
|
|
|
from core.matching.version_mismatch import is_acceptable_version_mismatch
|
|
|
|
|
|
# No expected artist on record (legacy/compilation rows): compare on title
|
|
|
# only — the old scanner treated this as artist-match=1.0 and a missing DB
|
|
|
# value is no evidence the file is wrong.
|
|
|
no_expected_artist = not normalize(expected_artist or '')
|
|
|
|
|
|
best_rec, title_sim, artist_sim = _find_best_title_artist_match(
|
|
|
recordings, expected_title, expected_artist, aliases_provider,
|
|
|
)
|
|
|
if no_expected_artist:
|
|
|
artist_sim = 1.0
|
|
|
if not best_rec:
|
|
|
return Outcome(Decision.SKIP, reason="No recordings with title/artist info")
|
|
|
|
|
|
matched_title = best_rec.get('title', '?') or '?'
|
|
|
matched_artist = best_rec.get('artist', '?') or '?'
|
|
|
|
|
|
def out(dec, reason):
|
|
|
return Outcome(dec, title_sim, artist_sim, matched_title, matched_artist, reason)
|
|
|
|
|
|
# Version gate: original vs instrumental/live/remix is a real difference.
|
|
|
expected_version = _detect_title_version(expected_title)
|
|
|
matched_version = _detect_title_version(matched_title)
|
|
|
if expected_version != matched_version:
|
|
|
if not is_acceptable_version_mismatch(
|
|
|
expected_version, matched_version,
|
|
|
fingerprint_score=fingerprint_score,
|
|
|
title_similarity=title_sim, artist_similarity=artist_sim,
|
|
|
):
|
|
|
return out(Decision.FAIL,
|
|
|
f"Version mismatch: expected ({expected_version}) "
|
|
|
f"but file is ({matched_version})")
|
|
|
|
|
|
# Clean match.
|
|
|
if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim >= ARTIST_MATCH_THRESHOLD:
|
|
|
return out(Decision.PASS, "Audio verified")
|
|
|
|
|
|
# Title matches, artist doesn't — cover/collab vs genuinely wrong.
|
|
|
if title_sim >= TITLE_MATCH_THRESHOLD and artist_sim < ARTIST_MATCH_THRESHOLD:
|
|
|
for rec in recordings:
|
|
|
if _alias_aware_artist_sim(
|
|
|
expected_artist, rec.get('artist', ''), aliases_provider,
|
|
|
) >= ARTIST_MATCH_THRESHOLD:
|
|
|
return out(Decision.PASS, "Expected artist found in AcoustID results")
|
|
|
if artist_sim < CLEAR_MISMATCH_THRESHOLD:
|
|
|
return out(Decision.FAIL,
|
|
|
f"Audio mismatch: '{matched_title}' by '{matched_artist}' "
|
|
|
f"— expected artist not found")
|
|
|
return out(Decision.SKIP, "Title matches but artist ambiguous (cover/collab?)")
|
|
|
|
|
|
# Title doesn't match — scan all recordings for a version-matched hit.
|
|
|
def _title_sim(a, b):
|
|
|
return similarity(a, b)
|
|
|
|
|
|
def _artist_sim(ea, aa):
|
|
|
return _alias_aware_artist_sim(ea, aa, aliases_provider)
|
|
|
|
|
|
candidate = None
|
|
|
for rec in recordings:
|
|
|
if _detect_title_version(rec.get('title') or '') != expected_version:
|
|
|
continue
|
|
|
if (similarity(expected_title, rec.get('title') or '') >= TITLE_MATCH_THRESHOLD
|
|
|
and _alias_aware_artist_sim(
|
|
|
expected_artist, rec.get('artist', ''), aliases_provider,
|
|
|
) >= ARTIST_MATCH_THRESHOLD):
|
|
|
candidate = rec
|
|
|
break
|
|
|
if candidate is not None:
|
|
|
return out(Decision.PASS, "Scan match found in AcoustID results")
|
|
|
|
|
|
# High-confidence / cross-script skips (don't quarantine a correct file).
|
|
|
has_non_ascii = (any(ord(c) > 127 for c in (expected_title or ''))
|
|
|
or any(ord(c) > 127 for c in matched_title))
|
|
|
language_script_skip = (fingerprint_score >= 0.95 and has_non_ascii
|
|
|
and artist_sim >= ARTIST_MATCH_THRESHOLD)
|
|
|
high_confidence_strong_match_skip = (fingerprint_score >= 0.95
|
|
|
and title_sim >= 0.80
|
|
|
and artist_sim >= ARTIST_MATCH_THRESHOLD)
|
|
|
cross_script_artist_skip = (fingerprint_score >= MIN_ACOUSTID_SCORE
|
|
|
and artist_sim >= ARTIST_MATCH_THRESHOLD
|
|
|
and is_cross_script_mismatch(expected_artist, matched_artist))
|
|
|
if (language_script_skip or high_confidence_strong_match_skip
|
|
|
or cross_script_artist_skip):
|
|
|
return out(Decision.SKIP, "Likely same song in different language/script")
|
|
|
|
|
|
return out(Decision.FAIL,
|
|
|
f"Audio mismatch: file identified as '{matched_title}' by "
|
|
|
f"'{matched_artist}', expected '{expected_title}' by '{expected_artist}'")
|