diff --git a/core/musicbrainz_service.py b/core/musicbrainz_service.py index 331a0598..78ab995c 100644 --- a/core/musicbrainz_service.py +++ b/core/musicbrainz_service.py @@ -302,10 +302,17 @@ class MusicBrainzService: for result in results: mb_title = result.get('title', '') mb_score = result.get('score', 0) - + # Calculate title similarity title_similarity = self._calculate_similarity(track_name, mb_title) - + + # Hard gate: title must be at least 60% similar. + # Without this, artist bonus + MB score can push totally + # different titles (e.g. "Sweet Surrender" → "Answers") + # past the confidence threshold. + if title_similarity < 0.6: + continue + # If we have artist info, check artist match too artist_bonus = 0 if artist_name and 'artist-credit' in result: @@ -317,14 +324,14 @@ class MusicBrainzService: if artist_similarity > 0.7: artist_bonus = 20 break - + # Combine scores - cap at 100 confidence = min(100, int((title_similarity * 50) + (mb_score / 100 * 30) + artist_bonus)) - + if confidence > best_confidence: best_confidence = confidence best_match = result - + # Only return matches with confidence >= 70% if best_match and best_confidence >= 70: mbid = best_match.get('id') diff --git a/core/repair_jobs/__init__.py b/core/repair_jobs/__init__.py index 79d023de..0ae7bb48 100644 --- a/core/repair_jobs/__init__.py +++ b/core/repair_jobs/__init__.py @@ -37,6 +37,7 @@ _JOB_MODULES = [ 'core.repair_jobs.album_completeness', 'core.repair_jobs.fake_lossless_detector', 'core.repair_jobs.library_reorganize', + 'core.repair_jobs.mbid_mismatch_detector', ] diff --git a/core/repair_jobs/mbid_mismatch_detector.py b/core/repair_jobs/mbid_mismatch_detector.py new file mode 100644 index 00000000..465de1a7 --- /dev/null +++ b/core/repair_jobs/mbid_mismatch_detector.py @@ -0,0 +1,406 @@ +"""MBID Mismatch Detector — finds tracks with embedded MusicBrainz IDs that +don't match the track's actual title/artist. + +When a wrong MBID is embedded, media servers like Navidrome use it to look up +metadata from MusicBrainz, overriding the file's correct title/artist tags. +This causes tracks to display with wrong names in the media server even though +SoulSync shows them correctly. +""" + +import os +from difflib import SequenceMatcher + +from core.repair_jobs import register_job +from core.repair_jobs.base import JobContext, JobResult, RepairJob +from utils.logging_config import get_logger + +logger = get_logger("repair_job.mbid_mismatch") + +# Tag name → format mappings (must match web_server.py write logic) +_MBID_TAG_KEYS = { + # MP3 (ID3): UFID frame with owner 'http://musicbrainz.org' + 'mp3_ufid_owner': 'http://musicbrainz.org', + # FLAC/OGG: Vorbis comment key + 'vorbis': 'MUSICBRAINZ_TRACKID', + # MP4/M4A: freeform key + 'mp4': '----:com.apple.iTunes:MusicBrainz Track Id', +} + +TITLE_SIMILARITY_THRESHOLD = 0.55 + + +def _normalize(s): + """Lowercase, strip whitespace and common suffixes for comparison.""" + if not s: + return '' + import re + s = s.lower().strip() + # Strip parentheticals like (Live), (Remastered), (feat. X) + s = re.sub(r'\s*\(.*?\)\s*', ' ', s) + # Strip brackets like [Deluxe Edition] + s = re.sub(r'\s*\[.*?\]\s*', ' ', s) + return s.strip() + + +def _title_matches(file_title, mb_title): + """Check if two titles are similar enough to be the same track.""" + a = _normalize(file_title) + b = _normalize(mb_title) + if not a or not b: + return True # Can't compare, assume OK + if a == b: + return True + ratio = SequenceMatcher(None, a, b).ratio() + return ratio >= TITLE_SIMILARITY_THRESHOLD + + +def _read_mbid_from_file(file_path): + """Read the MusicBrainz recording MBID from an audio file's tags. + + Returns (mbid_string, format_name) or (None, None) if not present. + """ + try: + from mutagen import File as MutagenFile + from mutagen.id3 import ID3 + from mutagen.flac import FLAC + from mutagen.oggvorbis import OggVorbis + from mutagen.mp4 import MP4 + + audio = MutagenFile(file_path) + if audio is None: + return None, None + + if isinstance(audio.tags, ID3): + # MP3: UFID frame + ufid_key = f'UFID:{_MBID_TAG_KEYS["mp3_ufid_owner"]}' + ufid = audio.tags.get(ufid_key) + if ufid and ufid.data: + return ufid.data.decode('ascii', errors='ignore'), 'mp3' + # Also check TXXX fallback (some taggers use this) + for key in ['TXXX:MusicBrainz Track Id', 'TXXX:MUSICBRAINZ_TRACKID']: + txxx = audio.tags.get(key) + if txxx and txxx.text: + return txxx.text[0], 'mp3' + return None, None + + elif isinstance(audio, (FLAC, OggVorbis)): + vals = audio.get(_MBID_TAG_KEYS['vorbis'], []) + if not vals: + vals = audio.get('musicbrainz_trackid', []) + if vals: + return vals[0], 'flac' if isinstance(audio, FLAC) else 'ogg' + return None, None + + elif isinstance(audio, MP4): + vals = audio.get(_MBID_TAG_KEYS['mp4'], []) + if vals: + raw = vals[0] + if isinstance(raw, bytes): + return raw.decode('utf-8', errors='ignore'), 'mp4' + return str(raw), 'mp4' + return None, None + + return None, None + except Exception as e: + logger.debug("Error reading MBID from %s: %s", file_path, e) + return None, None + + +def _remove_mbid_from_file(file_path): + """Remove the MusicBrainz recording MBID tag from an audio file. + + Returns True if tag was removed and file saved, False otherwise. + """ + try: + from mutagen import File as MutagenFile + from mutagen.id3 import ID3 + from mutagen.flac import FLAC + from mutagen.oggvorbis import OggVorbis + from mutagen.mp4 import MP4 + + audio = MutagenFile(file_path) + if audio is None: + return False + + removed = False + + if isinstance(audio.tags, ID3): + ufid_key = f'UFID:{_MBID_TAG_KEYS["mp3_ufid_owner"]}' + if ufid_key in audio.tags: + del audio.tags[ufid_key] + removed = True + for key in ['TXXX:MusicBrainz Track Id', 'TXXX:MUSICBRAINZ_TRACKID']: + if key in audio.tags: + del audio.tags[key] + removed = True + + elif isinstance(audio, (FLAC, OggVorbis)): + for key in [_MBID_TAG_KEYS['vorbis'], 'musicbrainz_trackid']: + if key in audio: + del audio[key] + removed = True + + elif isinstance(audio, MP4): + mp4_key = _MBID_TAG_KEYS['mp4'] + if mp4_key in audio: + del audio[mp4_key] + removed = True + + if removed: + audio.save() + return removed + + except Exception as e: + logger.error("Error removing MBID from %s: %s", file_path, e) + return False + + +def _resolve_file_path(file_path, transfer_folder, download_folder=None): + """Resolve a stored DB path to an actual file on disk.""" + if not file_path: + return None + if os.path.exists(file_path): + return file_path + + path_parts = file_path.replace('\\', '/').split('/') + for base_dir in [transfer_folder, download_folder]: + if not base_dir or not os.path.isdir(base_dir): + continue + for i in range(1, len(path_parts)): + candidate = os.path.join(base_dir, *path_parts[i:]) + if os.path.exists(candidate): + return candidate + return None + + +@register_job +class MbidMismatchDetectorJob(RepairJob): + job_id = 'mbid_mismatch_detector' + display_name = 'MBID Mismatch Detector' + description = 'Finds tracks with wrong MusicBrainz IDs that cause media server mismatches' + help_text = ( + 'Scans your library for tracks that have an embedded MusicBrainz recording ID ' + '(MBID) that doesn\'t match the track\'s actual title.\n\n' + 'When a wrong MBID is embedded in an audio file, media servers like Navidrome ' + 'use it to look up metadata from MusicBrainz, overriding the file\'s correct ' + 'title and artist tags. This causes tracks to display with wrong names in the ' + 'media server even though SoulSync shows them correctly.\n\n' + 'The fix action removes the bad MBID tag from the audio file, allowing the media ' + 'server to fall back to the file\'s actual title/artist tags.\n\n' + 'This job reads each audio file\'s tags and queries MusicBrainz to verify the ' + 'embedded MBID points to the correct recording. Rate-limited to avoid overloading ' + 'the MusicBrainz API.' + ) + icon = 'repair-icon-mbid' + default_enabled = False + default_interval_hours = 168 # weekly + default_settings = { + 'similarity_threshold': 0.55, + } + auto_fix = False + + def scan(self, context: JobContext) -> JobResult: + result = JobResult() + + # Get all tracks with file paths + tracks = [] + conn = None + try: + conn = context.db._get_connection() + cursor = conn.cursor() + cursor.execute(""" + SELECT t.id, t.title, ar.name, al.title, t.file_path, + al.thumb_url, ar.thumb_url + FROM tracks t + LEFT JOIN artists ar ON ar.id = t.artist_id + LEFT JOIN albums al ON al.id = t.album_id + WHERE t.file_path IS NOT NULL AND t.file_path != '' + """) + tracks = cursor.fetchall() + except Exception as e: + logger.error("Error fetching tracks: %s", e, exc_info=True) + result.errors += 1 + return result + finally: + if conn: + conn.close() + + total = len(tracks) + if context.update_progress: + context.update_progress(0, total) + if context.report_progress: + context.report_progress(phase=f'Scanning {total} tracks for MBID mismatches...', total=total) + + download_folder = None + if context.config_manager: + download_folder = context.config_manager.get('soulseek.download_path', '') + + # We need a MusicBrainz client for MBID lookups + mb_client = None + if context.mb_client: + mb_client = context.mb_client + else: + try: + from core.musicbrainz_client import MusicBrainzClient + mb_client = MusicBrainzClient() + except Exception: + pass + + if not mb_client: + logger.warning("MusicBrainz client not available, skipping MBID mismatch scan") + if context.report_progress: + context.report_progress( + log_line='MusicBrainz client not available — cannot verify MBIDs', + log_type='error' + ) + return result + + checked = 0 + import time + + for i, row in enumerate(tracks): + if context.check_stop(): + return result + if i % 100 == 0 and context.wait_if_paused(): + return result + + track_id, title, artist_name, album_title, file_path, album_thumb, artist_thumb = row + + if context.update_progress and (i + 1) % 50 == 0: + context.update_progress(i + 1, total) + + # Resolve the file path + resolved = _resolve_file_path(file_path, context.transfer_folder, download_folder) + if not resolved: + result.scanned += 1 + continue + + # Read MBID from file tags + mbid, fmt = _read_mbid_from_file(resolved) + if not mbid: + result.scanned += 1 + continue + + # Validate the MBID against MusicBrainz + checked += 1 + + if context.report_progress and checked % 10 == 0: + context.report_progress( + scanned=i + 1, total=total, + phase=f'Verifying MBIDs ({checked} checked, {i + 1}/{total} files)', + log_line=f'Checking: {title or "Unknown"} — {artist_name or "Unknown"}', + log_type='info' + ) + + try: + # Rate limit: MusicBrainz allows ~1 req/sec + time.sleep(1.1) + + recording = mb_client.get_recording(mbid, includes=['artist-credits']) + if not recording: + # MBID doesn't exist — definitely wrong + self._create_mismatch_finding( + context, result, track_id, title, artist_name, album_title, + resolved, album_thumb, artist_thumb, mbid, + mb_title='[MBID not found]', mb_artist='[Unknown]', + reason='MBID does not exist in MusicBrainz' + ) + result.scanned += 1 + continue + + mb_title = recording.get('title', '') + mb_artists = recording.get('artist-credit', []) + mb_artist = '' + if mb_artists: + for credit in mb_artists: + if isinstance(credit, dict) and 'artist' in credit: + mb_artist = credit['artist'].get('name', '') + break + + # Compare: does the MBID's title match the file's title? + if not _title_matches(title, mb_title): + self._create_mismatch_finding( + context, result, track_id, title, artist_name, album_title, + resolved, album_thumb, artist_thumb, mbid, + mb_title=mb_title, mb_artist=mb_artist, + reason=f'MBID points to "{mb_title}" by {mb_artist}, expected "{title}"' + ) + + except Exception as e: + logger.debug("Error verifying MBID %s for track %s: %s", mbid, track_id, e) + # Don't count as error — could be transient network issue + + result.scanned += 1 + + if context.update_progress: + context.update_progress(total, total) + + logger.info("MBID mismatch scan: %d files scanned, %d with MBIDs verified, %d mismatches found", + total, checked, result.findings_created) + + if context.report_progress: + context.report_progress( + scanned=total, total=total, + phase='Complete', + log_line=f'Verified {checked} MBIDs — {result.findings_created} mismatches found', + log_type='success' if result.findings_created == 0 else 'warning' + ) + + return result + + def _create_mismatch_finding(self, context, result, track_id, title, artist_name, + album_title, file_path, album_thumb, artist_thumb, + mbid, mb_title, mb_artist, reason): + """Create a finding for a mismatched MBID.""" + if context.report_progress: + context.report_progress( + log_line=f'Mismatch: "{title}" has MBID for "{mb_title}"', + log_type='error' + ) + if context.create_finding: + try: + context.create_finding( + job_id=self.job_id, + finding_type='mbid_mismatch', + severity='warning', + entity_type='track', + entity_id=str(track_id), + file_path=file_path, + title=f'MBID mismatch: {title or "Unknown"}', + description=( + f'Track "{title}" by {artist_name or "Unknown"} has an embedded ' + f'MusicBrainz ID that points to "{mb_title}" by {mb_artist}. ' + f'This causes media servers like Navidrome to display the wrong track name.' + ), + details={ + 'track_id': track_id, + 'title': title, + 'artist': artist_name, + 'album': album_title, + 'file_path': file_path, + 'mbid': mbid, + 'mb_title': mb_title, + 'mb_artist': mb_artist, + 'reason': reason, + 'album_thumb_url': album_thumb or None, + 'artist_thumb_url': artist_thumb or None, + } + ) + result.findings_created += 1 + except Exception as e: + logger.debug("Error creating MBID mismatch finding for track %s: %s", track_id, e) + result.errors += 1 + + def estimate_scope(self, context: JobContext) -> int: + conn = None + try: + conn = context.db._get_connection() + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM tracks WHERE file_path IS NOT NULL AND file_path != ''") + row = cursor.fetchone() + return row[0] if row else 0 + except Exception: + return 0 + finally: + if conn: + conn.close() diff --git a/core/repair_worker.py b/core/repair_worker.py index c49044d4..d444459a 100644 --- a/core/repair_worker.py +++ b/core/repair_worker.py @@ -776,6 +776,7 @@ class RepairWorker: 'missing_cover_art': self._fix_missing_cover_art, 'metadata_gap': self._fix_metadata_gap, 'duplicate_tracks': self._fix_duplicates, + 'mbid_mismatch': self._fix_mbid_mismatch, } handler = handlers.get(finding_type) if not handler: @@ -991,6 +992,36 @@ class RepairWorker: msg += f' and {files_deleted} file(s) from disk' return {'success': True, 'action': 'removed_duplicates', 'message': msg} + def _fix_mbid_mismatch(self, entity_type, entity_id, file_path, details): + """Remove the mismatched MusicBrainz recording ID from the audio file.""" + if not file_path: + return {'success': False, 'error': 'No file path associated with this finding'} + + # Resolve path + download_folder = None + if self._config_manager: + download_folder = self._config_manager.get('soulseek.download_path', '') + resolved = _resolve_file_path(file_path, self.transfer_folder, download_folder) + if not resolved or not os.path.exists(resolved): + return {'success': False, 'error': f'File not found: {file_path}'} + + try: + from core.repair_jobs.mbid_mismatch_detector import _remove_mbid_from_file + removed = _remove_mbid_from_file(resolved) + if removed: + mbid = details.get('mbid', 'unknown') + mb_title = details.get('mb_title', 'unknown') + title = details.get('title', 'unknown') + return { + 'success': True, + 'action': 'removed_mbid', + 'message': f'Removed wrong MBID ({mbid[:8]}...) from "{title}" — was pointing to "{mb_title}"' + } + else: + return {'success': False, 'error': 'MBID tag not found in file (may have been removed already)'} + except Exception as e: + return {'success': False, 'error': f'Failed to remove MBID: {str(e)}'} + def dismiss_finding(self, finding_id: int) -> bool: """Dismiss a finding.""" conn = None