mirror of https://github.com/Nezreka/SoulSync.git
Add minimum 60% title similarity gate to match_recording() — prevents artist bonus + MB score from pushing unrelated titles past the confidence threshold (e.g. "Sweet Surrender" matching "Answers" by same artist). New MBID Mismatch Detector repair job reads embedded MusicBrainz recording IDs from audio files, verifies them against the MusicBrainz API, and flags tracks where the MBID points to a different song. Fix action strips the bad MBID tag so media servers like Navidrome fall back to correct file tags.
pull/253/head
parent
7871f4581c
commit
87b39634a0
@ -0,0 +1,406 @@
|
||||
"""MBID Mismatch Detector — finds tracks with embedded MusicBrainz IDs that
|
||||
don't match the track's actual title/artist.
|
||||
|
||||
When a wrong MBID is embedded, media servers like Navidrome use it to look up
|
||||
metadata from MusicBrainz, overriding the file's correct title/artist tags.
|
||||
This causes tracks to display with wrong names in the media server even though
|
||||
SoulSync shows them correctly.
|
||||
"""
|
||||
|
||||
import os
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
from core.repair_jobs import register_job
|
||||
from core.repair_jobs.base import JobContext, JobResult, RepairJob
|
||||
from utils.logging_config import get_logger
|
||||
|
||||
logger = get_logger("repair_job.mbid_mismatch")
|
||||
|
||||
# Tag name → format mappings (must match web_server.py write logic)
|
||||
_MBID_TAG_KEYS = {
|
||||
# MP3 (ID3): UFID frame with owner 'http://musicbrainz.org'
|
||||
'mp3_ufid_owner': 'http://musicbrainz.org',
|
||||
# FLAC/OGG: Vorbis comment key
|
||||
'vorbis': 'MUSICBRAINZ_TRACKID',
|
||||
# MP4/M4A: freeform key
|
||||
'mp4': '----:com.apple.iTunes:MusicBrainz Track Id',
|
||||
}
|
||||
|
||||
TITLE_SIMILARITY_THRESHOLD = 0.55
|
||||
|
||||
|
||||
def _normalize(s):
|
||||
"""Lowercase, strip whitespace and common suffixes for comparison."""
|
||||
if not s:
|
||||
return ''
|
||||
import re
|
||||
s = s.lower().strip()
|
||||
# Strip parentheticals like (Live), (Remastered), (feat. X)
|
||||
s = re.sub(r'\s*\(.*?\)\s*', ' ', s)
|
||||
# Strip brackets like [Deluxe Edition]
|
||||
s = re.sub(r'\s*\[.*?\]\s*', ' ', s)
|
||||
return s.strip()
|
||||
|
||||
|
||||
def _title_matches(file_title, mb_title, threshold=None):
    """Check if two titles are similar enough to be the same track.

    Both titles are passed through ``_normalize()`` and compared with
    ``difflib.SequenceMatcher``.

    Args:
        file_title: Title the library has for the track.
        mb_title: Title of the recording the embedded MBID points to.
        threshold: Minimum similarity ratio (0.0-1.0) to count as a match.
            Defaults to ``TITLE_SIMILARITY_THRESHOLD`` when ``None``.

    Returns:
        ``True`` when the titles match, or when either title is empty after
        normalization (nothing to compare, so the MBID is assumed OK);
        ``False`` otherwise.
    """
    if threshold is None:
        threshold = TITLE_SIMILARITY_THRESHOLD

    a = _normalize(file_title)
    b = _normalize(mb_title)
    if not a or not b:
        return True  # Can't compare, assume OK
    if a == b:
        return True
    return SequenceMatcher(None, a, b).ratio() >= threshold
|
||||
|
||||
|
||||
def _read_mbid_from_file(file_path):
    """Read the MusicBrainz recording MBID from an audio file's tags.

    Lookup order per container:
      * ID3 tags: the ``UFID`` frame owned by ``http://musicbrainz.org``,
        then the ``TXXX`` fallbacks some taggers write.
      * FLAC / Ogg Vorbis: the ``MUSICBRAINZ_TRACKID`` comment.
      * MP4 / M4A: the iTunes freeform ``MusicBrainz Track Id`` key.

    Surrounding whitespace and NUL bytes are trimmed from the value, since
    neither can be part of an MBID and would break the MusicBrainz lookup.
    A value that is empty after trimming is treated as absent.

    Args:
        file_path: Path of the audio file to inspect.

    Returns:
        ``(mbid_string, format_name)`` where ``format_name`` is one of
        ``'mp3'``, ``'flac'``, ``'ogg'`` or ``'mp4'``; ``(None, None)`` if no
        MBID is present, the file type is unsupported, or reading fails
        (failures are logged at debug level).
    """
    try:
        from mutagen import File as MutagenFile
        from mutagen.id3 import ID3
        from mutagen.flac import FLAC
        from mutagen.oggvorbis import OggVorbis
        from mutagen.mp4 import MP4

        audio = MutagenFile(file_path)
        if audio is None:
            return None, None

        # Raw values in priority order, plus the format label for this file.
        candidates = []
        fmt = None

        if isinstance(audio.tags, ID3):
            fmt = 'mp3'
            # MP3: UFID frame
            ufid = audio.tags.get(f'UFID:{_MBID_TAG_KEYS["mp3_ufid_owner"]}')
            if ufid and ufid.data:
                candidates.append(ufid.data.decode('ascii', errors='ignore'))
            # Also check TXXX fallback (some taggers use this)
            for key in ('TXXX:MusicBrainz Track Id', 'TXXX:MUSICBRAINZ_TRACKID'):
                txxx = audio.tags.get(key)
                if txxx and txxx.text:
                    candidates.append(txxx.text[0])

        elif isinstance(audio, (FLAC, OggVorbis)):
            fmt = 'flac' if isinstance(audio, FLAC) else 'ogg'
            vals = audio.get(_MBID_TAG_KEYS['vorbis'], [])
            if not vals:
                vals = audio.get('musicbrainz_trackid', [])
            if vals:
                candidates.append(vals[0])

        elif isinstance(audio, MP4):
            fmt = 'mp4'
            vals = audio.get(_MBID_TAG_KEYS['mp4'], [])
            if vals:
                raw = vals[0]
                if isinstance(raw, bytes):
                    raw = raw.decode('utf-8', errors='ignore')
                candidates.append(raw)

        for value in candidates:
            # Drop NUL terminators / padding that some taggers leave behind.
            mbid = str(value).strip(' \t\r\n\x00')
            if mbid:
                return mbid, fmt

        return None, None
    except Exception as e:
        logger.debug("Error reading MBID from %s: %s", file_path, e)
        return None, None
|
||||
|
||||
|
||||
def _remove_mbid_from_file(file_path):
    """Remove the MusicBrainz recording MBID tag from an audio file.

    Deletes every key this module knows to hold the recording MBID for the
    file's tag format (ID3 UFID/TXXX, Vorbis comment, MP4 freeform key) and
    saves the file only if at least one key was actually present.

    Args:
        file_path: Path of the audio file to modify.

    Returns:
        True if tag was removed and file saved, False otherwise (including
        unsupported files and any error, which is logged).
    """
    try:
        from mutagen import File as MutagenFile
        from mutagen.id3 import ID3
        from mutagen.flac import FLAC
        from mutagen.oggvorbis import OggVorbis
        from mutagen.mp4 import MP4

        audio = MutagenFile(file_path)
        if audio is None:
            return False

        # Choose the mapping to delete from and the keys to look for.
        if isinstance(audio.tags, ID3):
            container = audio.tags
            doomed_keys = [
                f'UFID:{_MBID_TAG_KEYS["mp3_ufid_owner"]}',
                'TXXX:MusicBrainz Track Id',
                'TXXX:MUSICBRAINZ_TRACKID',
            ]
        elif isinstance(audio, (FLAC, OggVorbis)):
            container = audio
            doomed_keys = [_MBID_TAG_KEYS['vorbis'], 'musicbrainz_trackid']
        elif isinstance(audio, MP4):
            container = audio
            doomed_keys = [_MBID_TAG_KEYS['mp4']]
        else:
            container = None
            doomed_keys = []

        removed = False
        for tag_key in doomed_keys:
            if tag_key in container:
                del container[tag_key]
                removed = True

        # Only rewrite the file when something changed.
        if removed:
            audio.save()
        return removed

    except Exception as e:
        logger.error("Error removing MBID from %s: %s", file_path, e)
        return False
|
||||
|
||||
|
||||
def _resolve_file_path(file_path, transfer_folder, download_folder=None):
|
||||
"""Resolve a stored DB path to an actual file on disk."""
|
||||
if not file_path:
|
||||
return None
|
||||
if os.path.exists(file_path):
|
||||
return file_path
|
||||
|
||||
path_parts = file_path.replace('\\', '/').split('/')
|
||||
for base_dir in [transfer_folder, download_folder]:
|
||||
if not base_dir or not os.path.isdir(base_dir):
|
||||
continue
|
||||
for i in range(1, len(path_parts)):
|
||||
candidate = os.path.join(base_dir, *path_parts[i:])
|
||||
if os.path.exists(candidate):
|
||||
return candidate
|
||||
return None
|
||||
|
||||
|
||||
@register_job
class MbidMismatchDetectorJob(RepairJob):
    """Repair job that flags tracks whose embedded MusicBrainz recording ID
    points to a recording with a different title.

    ``scan()`` reads the MBID from each track's audio file, looks the
    recording up through the MusicBrainz client and creates an
    ``mbid_mismatch`` finding when the titles do not match or the lookup
    returns nothing.
    """

    job_id = 'mbid_mismatch_detector'
    display_name = 'MBID Mismatch Detector'
    description = 'Finds tracks with wrong MusicBrainz IDs that cause media server mismatches'
    help_text = (
        'Scans your library for tracks that have an embedded MusicBrainz recording ID '
        '(MBID) that doesn\'t match the track\'s actual title.\n\n'
        'When a wrong MBID is embedded in an audio file, media servers like Navidrome '
        'use it to look up metadata from MusicBrainz, overriding the file\'s correct '
        'title and artist tags. This causes tracks to display with wrong names in the '
        'media server even though SoulSync shows them correctly.\n\n'
        'The fix action removes the bad MBID tag from the audio file, allowing the media '
        'server to fall back to the file\'s actual title/artist tags.\n\n'
        'This job reads each audio file\'s tags and queries MusicBrainz to verify the '
        'embedded MBID points to the correct recording. Rate-limited to avoid overloading '
        'the MusicBrainz API.'
    )
    icon = 'repair-icon-mbid'
    default_enabled = False
    default_interval_hours = 168  # weekly
    # NOTE(review): scan() compares titles via _title_matches(), which falls
    # back to the module-level TITLE_SIMILARITY_THRESHOLD; nothing in this
    # class reads 'similarity_threshold'. Confirm whether it should be wired in.
    default_settings = {
        'similarity_threshold': 0.55,
    }
    auto_fix = False

    def scan(self, context: JobContext) -> JobResult:
        """Verify the embedded MBID of every track that has a file path.

        For each track row the stored path is resolved on disk, the MBID is
        read from the file's tags and the recording is fetched from
        MusicBrainz (one request per track, preceded by a 1.1 s sleep). A
        finding is created when the lookup returns nothing or the recording
        title does not match the track title.

        Args:
            context: Job context supplying the database, config, progress
                callbacks, stop/pause checks and finding creation.

        Returns:
            A ``JobResult`` with ``scanned``, ``findings_created`` and
            ``errors`` updated. Returns early (with a partial result) when
            the track query fails, no MusicBrainz client is available, or a
            stop is requested.
        """
        import time

        result = JobResult()

        # Get all tracks with file paths
        tracks = []
        conn = None
        try:
            conn = context.db._get_connection()
            cursor = conn.cursor()
            cursor.execute("""
                SELECT t.id, t.title, ar.name, al.title, t.file_path,
                       al.thumb_url, ar.thumb_url
                FROM tracks t
                LEFT JOIN artists ar ON ar.id = t.artist_id
                LEFT JOIN albums al ON al.id = t.album_id
                WHERE t.file_path IS NOT NULL AND t.file_path != ''
            """)
            tracks = cursor.fetchall()
        except Exception as e:
            logger.error("Error fetching tracks: %s", e, exc_info=True)
            result.errors += 1
            return result
        finally:
            if conn:
                conn.close()

        total = len(tracks)
        if context.update_progress:
            context.update_progress(0, total)
        if context.report_progress:
            context.report_progress(phase=f'Scanning {total} tracks for MBID mismatches...', total=total)

        download_folder = None
        if context.config_manager:
            download_folder = context.config_manager.get('soulseek.download_path', '')

        # We need a MusicBrainz client for MBID lookups
        mb_client = self._get_mb_client(context)
        if not mb_client:
            logger.warning("MusicBrainz client not available, skipping MBID mismatch scan")
            if context.report_progress:
                context.report_progress(
                    log_line='MusicBrainz client not available — cannot verify MBIDs',
                    log_type='error'
                )
            return result

        checked = 0

        for i, row in enumerate(tracks):
            if context.check_stop():
                return result
            if i % 100 == 0 and context.wait_if_paused():
                return result

            track_id, title, artist_name, album_title, file_path, album_thumb, artist_thumb = row

            if context.update_progress and (i + 1) % 50 == 0:
                context.update_progress(i + 1, total)

            # Resolve the file path
            resolved = _resolve_file_path(file_path, context.transfer_folder, download_folder)
            if not resolved:
                result.scanned += 1
                continue

            # Read MBID from file tags
            mbid, _fmt = _read_mbid_from_file(resolved)
            if not mbid:
                result.scanned += 1
                continue

            # Validate the MBID against MusicBrainz
            checked += 1

            if context.report_progress and checked % 10 == 0:
                context.report_progress(
                    scanned=i + 1, total=total,
                    phase=f'Verifying MBIDs ({checked} checked, {i + 1}/{total} files)',
                    log_line=f'Checking: {title or "Unknown"} — {artist_name or "Unknown"}',
                    log_type='info'
                )

            try:
                # Rate limit: MusicBrainz allows ~1 req/sec
                time.sleep(1.1)

                recording = mb_client.get_recording(mbid, includes=['artist-credits'])
                if not recording:
                    # MBID doesn't exist — definitely wrong
                    # NOTE(review): any falsy result is treated as "not found".
                    # If get_recording() can also return None on transient
                    # failures, this produces false positives — confirm
                    # against the client implementation.
                    self._create_mismatch_finding(
                        context, result, track_id, title, artist_name, album_title,
                        resolved, album_thumb, artist_thumb, mbid,
                        mb_title='[MBID not found]', mb_artist='[Unknown]',
                        reason='MBID does not exist in MusicBrainz'
                    )
                    result.scanned += 1
                    continue

                mb_title = recording.get('title', '')
                mb_artist = self._first_credited_artist(recording)

                # Compare: does the MBID's title match the file's title?
                if not _title_matches(title, mb_title):
                    self._create_mismatch_finding(
                        context, result, track_id, title, artist_name, album_title,
                        resolved, album_thumb, artist_thumb, mbid,
                        mb_title=mb_title, mb_artist=mb_artist,
                        reason=f'MBID points to "{mb_title}" by {mb_artist}, expected "{title}"'
                    )

            except Exception as e:
                logger.debug("Error verifying MBID %s for track %s: %s", mbid, track_id, e)
                # Don't count as error — could be transient network issue

            result.scanned += 1

        if context.update_progress:
            context.update_progress(total, total)

        logger.info("MBID mismatch scan: %d files scanned, %d with MBIDs verified, %d mismatches found",
                    total, checked, result.findings_created)

        if context.report_progress:
            context.report_progress(
                scanned=total, total=total,
                phase='Complete',
                log_line=f'Verified {checked} MBIDs — {result.findings_created} mismatches found',
                log_type='success' if result.findings_created == 0 else 'warning'
            )

        return result

    def _get_mb_client(self, context):
        """Return the MusicBrainz client to use, or ``None`` if unavailable.

        Prefers ``context.mb_client``; otherwise tries to construct a
        ``MusicBrainzClient``. A construction failure is logged instead of
        being silently swallowed.
        """
        if context.mb_client:
            return context.mb_client
        try:
            from core.musicbrainz_client import MusicBrainzClient
            return MusicBrainzClient()
        except Exception as e:
            logger.warning("Could not create MusicBrainz client: %s", e)
            return None

    @staticmethod
    def _first_credited_artist(recording):
        """Return the name of the first artist in the recording's
        ``artist-credit`` list, or ``''`` when there is none."""
        for credit in recording.get('artist-credit', []) or []:
            if isinstance(credit, dict) and 'artist' in credit:
                return credit['artist'].get('name', '')
        return ''

    def _create_mismatch_finding(self, context, result, track_id, title, artist_name,
                                 album_title, file_path, album_thumb, artist_thumb,
                                 mbid, mb_title, mb_artist, reason):
        """Create a finding for a mismatched MBID.

        Emits a progress log line, then records an ``mbid_mismatch`` finding
        through ``context.create_finding``. ``result.findings_created`` is
        incremented on success; on failure ``result.errors`` is incremented
        and the exception is logged.

        Args:
            context: Job context providing ``report_progress`` and
                ``create_finding`` (either may be falsy and is then skipped).
            result: ``JobResult`` whose counters are updated in place.
            track_id, title, artist_name, album_title: Library metadata of
                the affected track.
            file_path: Resolved on-disk path of the audio file.
            album_thumb, artist_thumb: Thumbnail URLs; stored as ``None``
                when empty.
            mbid: The embedded MBID that was checked.
            mb_title, mb_artist: What MusicBrainz reports for that MBID.
            reason: Human-readable explanation stored in the details.
        """
        if context.report_progress:
            context.report_progress(
                log_line=f'Mismatch: "{title}" has MBID for "{mb_title}"',
                log_type='error'
            )
        if context.create_finding:
            try:
                context.create_finding(
                    job_id=self.job_id,
                    finding_type='mbid_mismatch',
                    severity='warning',
                    entity_type='track',
                    entity_id=str(track_id),
                    file_path=file_path,
                    title=f'MBID mismatch: {title or "Unknown"}',
                    description=(
                        f'Track "{title}" by {artist_name or "Unknown"} has an embedded '
                        f'MusicBrainz ID that points to "{mb_title}" by {mb_artist}. '
                        f'This causes media servers like Navidrome to display the wrong track name.'
                    ),
                    details={
                        'track_id': track_id,
                        'title': title,
                        'artist': artist_name,
                        'album': album_title,
                        'file_path': file_path,
                        'mbid': mbid,
                        'mb_title': mb_title,
                        'mb_artist': mb_artist,
                        'reason': reason,
                        'album_thumb_url': album_thumb or None,
                        'artist_thumb_url': artist_thumb or None,
                    }
                )
                result.findings_created += 1
            except Exception as e:
                # Counted as an error, so make it visible at default log levels.
                logger.warning("Error creating MBID mismatch finding for track %s: %s", track_id, e)
                result.errors += 1

    def estimate_scope(self, context: JobContext) -> int:
        """Return the number of tracks that have a non-empty file path.

        Returns ``0`` if the count query fails.
        """
        conn = None
        try:
            conn = context.db._get_connection()
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM tracks WHERE file_path IS NOT NULL AND file_path != ''")
            row = cursor.fetchone()
            return row[0] if row else 0
        except Exception:
            return 0
        finally:
            if conn:
                conn.close()
|
||||
Loading…
Reference in new issue