From 223522ce99a6a483d3a3dc47a6263be4bd35c665 Mon Sep 17 00:00:00 2001 From: Broque Thomas <26755000+Nezreka@users.noreply.github.com> Date: Thu, 16 Apr 2026 10:26:39 -0700 Subject: [PATCH] Upgrade AcoustID scanner to scan full library with actionable fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrote the AcoustID scanner job to scan all library tracks (via DB file paths resolved to disk) instead of only the Transfer folder. Checkpoints by track ID for robust resume across restarts. Defaults changed to enabled, 24h interval, batch size 200. Added _fix_acoustid_mismatch handler with three actions: - retag: update DB title/artist to match actual audio content - redownload: add expected track to wishlist and delete wrong file - delete: remove wrong file and DB record This catches cases like a file tagged as "Dinosaur Bones" that is actually "Helicopters" — the scanner fingerprints the audio, detects the mismatch, and the user can fix it from Library Maintenance findings. --- core/repair_jobs/acoustid_scanner.py | 227 ++++++++++++++------------- core/repair_worker.py | 103 ++++++++++++ 2 files changed, 217 insertions(+), 113 deletions(-) diff --git a/core/repair_jobs/acoustid_scanner.py b/core/repair_jobs/acoustid_scanner.py index 560cdd64..e3002430 100644 --- a/core/repair_jobs/acoustid_scanner.py +++ b/core/repair_jobs/acoustid_scanner.py @@ -1,8 +1,14 @@ -"""AcoustID Background Scanner Job — fingerprints tracks to detect wrong downloads.""" +"""AcoustID Scanner Job — fingerprints library tracks to detect wrong downloads. + +Scans the entire library (not just Transfer) by resolving DB file paths to +actual files on disk. Creates actionable findings that can be fixed: + - 'retag': Update DB metadata to match what the file actually is + - 'redownload': Add the expected track to wishlist and delete the wrong file + - 'delete': Remove the wrong file and its DB record +""" import os import re -import time from difflib import SequenceMatcher from typing import Optional @@ -19,29 +25,33 @@ AUDIO_EXTENSIONS = {'.mp3', '.flac', '.ogg', '.opus', '.m4a', '.aac', '.wav', '. class AcoustIDScannerJob(RepairJob): job_id = 'acoustid_scanner' display_name = 'AcoustID Scanner' - description = 'Fingerprints tracks to detect wrong downloads' + description = 'Fingerprints library tracks to detect wrong downloads' help_text = ( - 'Generates audio fingerprints using the AcoustID/Chromaprint service and compares ' - 'the identified recording against what you expected to download. This catches cases ' - 'where the wrong song was served — even if the filename looks correct.\n\n' - 'The job processes tracks in batches and saves a checkpoint so it can resume where ' - 'it left off across runs. Requires an AcoustID API key (set in Settings).\n\n' + 'Scans your music library by fingerprinting audio files and comparing ' + 'them against the AcoustID database. Detects cases where the wrong song ' + 'was downloaded — even if the filename and tags look correct.\n\n' + 'When a mismatch is found, you can:\n' + '• Retag — update the DB record to match the actual audio content\n' + '• Redownload — add the correct track to your wishlist and remove the wrong file\n' + '• Delete — remove the wrong file entirely\n\n' + 'The job processes tracks in batches with checkpointing so it resumes ' + 'where it left off across runs. Requires an AcoustID API key (Settings).\n\n' 'Settings:\n' - '- Fingerprint Threshold: Minimum AcoustID match confidence (0.0 - 1.0)\n' - '- Title Similarity: How closely the identified title must match your expected title\n' + '- Fingerprint Threshold: Minimum AcoustID match confidence (0.0–1.0)\n' + '- Title Similarity: How closely the identified title must match\n' '- Artist Similarity: How closely the identified artist must match\n' - '- Batch Size: Number of tracks to process per scan run' + '- Batch Size: Tracks per scan run (checkpoint saved between batches)' ) icon = 'repair-icon-acoustid' - default_enabled = False - default_interval_hours = 168 + default_enabled = True + default_interval_hours = 24 default_settings = { 'fingerprint_threshold': 0.80, 'title_similarity': 0.70, 'artist_similarity': 0.60, - 'batch_size': 50, + 'batch_size': 200, } - auto_fix = False + auto_fix = False # User chooses fix action per finding def scan(self, context: JobContext) -> JobResult: result = JobResult() @@ -50,7 +60,7 @@ class AcoustIDScannerJob(RepairJob): fp_threshold = settings.get('fingerprint_threshold', 0.80) title_threshold = settings.get('title_similarity', 0.70) artist_threshold = settings.get('artist_similarity', 0.60) - batch_size = settings.get('batch_size', 50) + batch_size = settings.get('batch_size', 200) # Get AcoustID client acoustid_client = context.acoustid_client @@ -62,64 +72,56 @@ class AcoustIDScannerJob(RepairJob): logger.warning("AcoustID client not available: %s", e) return result - transfer = context.transfer_folder - if not os.path.isdir(transfer): - logger.warning("Transfer folder does not exist: %s", transfer) + # Load all library tracks from DB with their file paths + db_tracks = self._load_db_tracks(context) + if not db_tracks: + logger.info("No library tracks with file paths found") return result - # Read checkpoint (last processed file path) to resume from - checkpoint = None + # Read checkpoint (last processed track ID) to resume from + checkpoint_id = None if context.config_manager: - checkpoint = context.config_manager.get( - f'repair.jobs.{self.job_id}.checkpoint', None + checkpoint_id = context.config_manager.get( + f'repair.jobs.{self.job_id}.checkpoint_id', None ) - # Collect all audio files - audio_files = [] - for root, _dirs, files in os.walk(transfer): - if context.check_stop(): - return result - for fname in sorted(files): - ext = os.path.splitext(fname)[1].lower() - if ext in AUDIO_EXTENSIONS: - audio_files.append(os.path.join(root, fname)) - - # Sort for deterministic order (important for checkpoint) - audio_files.sort() + # Build ordered list of (track_id, info) sorted by ID for deterministic order + track_list = sorted(db_tracks.items(), key=lambda x: x[0]) # Skip past checkpoint if resuming - if checkpoint: - try: - idx = audio_files.index(checkpoint) - audio_files = audio_files[idx + 1:] - logger.info("Resuming AcoustID scan from checkpoint (%d files remaining)", len(audio_files)) - except ValueError: - logger.debug("Checkpoint file not found, starting from beginning") - - total = len(audio_files) + if checkpoint_id is not None: + original_len = len(track_list) + track_list = [(tid, info) for tid, info in track_list if tid > checkpoint_id] + if len(track_list) < original_len: + logger.info("Resuming AcoustID scan from checkpoint ID %s (%d tracks remaining)", + checkpoint_id, len(track_list)) + + total = len(track_list) + if context.report_progress: + context.report_progress(phase=f'Scanning {total} library tracks...', total=total) if context.update_progress: context.update_progress(0, total) - # Build a lookup of known tracks from DB for comparison - db_tracks = self._load_db_tracks(context) - - if context.report_progress: - context.report_progress(phase=f'Fingerprinting {total} files...', total=total) - batch_count = 0 - for i, fpath in enumerate(audio_files): + for i, (track_id, track_info) in enumerate(track_list): if context.check_stop(): - # Save checkpoint before stopping - self._save_checkpoint(context, fpath) + self._save_checkpoint_id(context, track_id) return result if i % 10 == 0 and context.wait_if_paused(): - self._save_checkpoint(context, fpath) + self._save_checkpoint_id(context, track_id) return result + # Resolve the DB path to an actual file on disk + file_path = track_info.get('file_path', '') + resolved = self._resolve_path(file_path, context) + if not resolved: + result.skipped += 1 + continue + result.scanned += 1 batch_count += 1 - fname = os.path.basename(fpath) + fname = os.path.basename(resolved) if context.report_progress: context.report_progress( scanned=i + 1, total=total, @@ -130,47 +132,38 @@ class AcoustIDScannerJob(RepairJob): try: self._scan_file( - fpath, acoustid_client, db_tracks, context, result, + resolved, track_id, track_info, acoustid_client, context, result, fp_threshold, title_threshold, artist_threshold ) except Exception as e: logger.debug("Error scanning %s: %s", fname, e) result.errors += 1 - # Rate limit: pause between batches + # Rate limit: pause between batches to avoid hammering AcoustID API if batch_count >= batch_size: batch_count = 0 - self._save_checkpoint(context, fpath) + self._save_checkpoint_id(context, track_id) if context.sleep_or_stop(2): return result if context.update_progress and (i + 1) % 10 == 0: context.update_progress(i + 1, total) - # Clear checkpoint on completion - self._save_checkpoint(context, None) + # Clear checkpoint on full completion + self._save_checkpoint_id(context, None) if context.update_progress: context.update_progress(total, total) - logger.info("AcoustID scan: %d files scanned, %d mismatches found, %d errors", - result.scanned, result.findings_created, result.errors) + logger.info("AcoustID scan: %d scanned, %d skipped, %d mismatches, %d errors", + result.scanned, result.skipped, result.findings_created, result.errors) return result - def _scan_file(self, fpath, acoustid_client, db_tracks, context, result, + def _scan_file(self, fpath, track_id, expected, acoustid_client, context, result, fp_threshold, title_threshold, artist_threshold): """Fingerprint a single file and check for mismatches.""" fname = os.path.basename(fpath) - # Get expected title/artist from DB or filename - expected = db_tracks.get(os.path.normpath(fpath)) - if not expected: - # Try to extract from filename: "01 - Artist - Title.flac" or "01 Title.flac" - base = os.path.splitext(fname)[0] - # Strip leading track number - base = re.sub(r'^\d{1,3}[\s.\-_]*', '', base) - expected = {'title': base, 'artist': '', 'track_id': None} - # Fingerprint the file try: fp_result = acoustid_client.fingerprint_and_lookup(fpath) @@ -178,29 +171,18 @@ class AcoustIDScannerJob(RepairJob): logger.debug("Fingerprint failed for %s: %s", fname, e) result.errors += 1 if context.report_progress: - context.report_progress( - log_line=f'Error: {fname} — {e}', - log_type='error' - ) + context.report_progress(log_line=f'Error: {fname} — {e}', log_type='error') return if not fp_result or not fp_result.get('recordings'): - # No match — could be API error, rare track, or invalid key - # Don't create findings for "no match" — these flood the list - # and are usually not actionable. Only log for visibility. if context.report_progress: - context.report_progress( - log_line=f'No match: {fname}', - log_type='skip' - ) + context.report_progress(log_line=f'No match: {fname}', log_type='skip') return - # Check best recording match best_score = fp_result.get('best_score', 0) if best_score < fp_threshold: - return # Low confidence fingerprint, skip + return - # Compare best AcoustID result against expected best_recording = fp_result['recordings'][0] aid_title = best_recording.get('title', '') aid_artist = best_recording.get('artist', '') @@ -217,7 +199,6 @@ class AcoustIDScannerJob(RepairJob): title_sim = SequenceMatcher(None, norm_expected_title, norm_aid_title).ratio() artist_sim = SequenceMatcher(None, norm_expected_artist, norm_aid_artist).ratio() if norm_expected_artist else 1.0 - # If both title AND artist match well, no issue if title_sim >= title_threshold and artist_sim >= artist_threshold: return @@ -234,13 +215,14 @@ class AcoustIDScannerJob(RepairJob): finding_type='acoustid_mismatch', severity=severity, entity_type='track', - entity_id=str(expected.get('track_id') or ''), + entity_id=str(track_id), file_path=fpath, - title=f'Possible wrong download: {fname}', + title=f'Wrong download: "{expected["title"]}" is actually "{aid_title}"', description=( f'Expected "{expected["title"]}" by {expected["artist"]}, ' - f'but fingerprint matches "{aid_title}" by {aid_artist} ' - f'(fp: {best_score:.0%}, title: {title_sim:.0%}, artist: {artist_sim:.0%})' + f'but audio fingerprint matches "{aid_title}" by {aid_artist} ' + f'(fingerprint: {best_score:.0%}, title match: {title_sim:.0%}, ' + f'artist match: {artist_sim:.0%})' ), details={ 'expected_title': expected['title'], @@ -252,20 +234,22 @@ class AcoustIDScannerJob(RepairJob): 'artist_similarity': round(artist_sim, 3), 'album_thumb_url': expected.get('album_thumb_url'), 'artist_thumb_url': expected.get('artist_thumb_url'), + 'album_title': expected.get('album_title', ''), + 'track_number': expected.get('track_number'), } ) result.findings_created += 1 def _load_db_tracks(self, context: JobContext) -> dict: - """Load all tracks from DB keyed by normalized file_path.""" + """Load all tracks from DB keyed by track ID.""" tracks = {} conn = None try: conn = context.db._get_connection() cursor = conn.cursor() cursor.execute(""" - SELECT t.id, t.title, ar.name, t.file_path, - al.thumb_url, ar.thumb_url + SELECT t.id, t.title, ar.name, t.file_path, t.track_number, + al.title AS album_title, al.thumb_url, ar.thumb_url FROM tracks t LEFT JOIN artists ar ON ar.id = t.artist_id LEFT JOIN albums al ON al.id = t.album_id @@ -273,13 +257,15 @@ class AcoustIDScannerJob(RepairJob): AND t.title IS NOT NULL AND t.title != '' """) for row in cursor.fetchall(): - track_id, title, artist_name, file_path, album_thumb, artist_thumb = row - tracks[os.path.normpath(file_path)] = { - 'track_id': track_id, - 'title': title or '', - 'artist': artist_name or '', - 'album_thumb_url': album_thumb or None, - 'artist_thumb_url': artist_thumb or None, + track_id = row[0] + tracks[track_id] = { + 'title': row[1] or '', + 'artist': row[2] or '', + 'file_path': row[3] or '', + 'track_number': row[4], + 'album_title': row[5] or '', + 'album_thumb_url': row[6] or None, + 'artist_thumb_url': row[7] or None, } except Exception as e: logger.error("Error loading tracks from DB: %s", e) @@ -288,12 +274,21 @@ class AcoustIDScannerJob(RepairJob): conn.close() return tracks - def _save_checkpoint(self, context: JobContext, fpath): - """Save or clear the scan checkpoint.""" + def _resolve_path(self, file_path, context): + """Resolve a DB file path to an actual file on disk.""" + if not file_path: + return None + if os.path.exists(file_path): + return file_path + # Try the repair_worker's resolver + from core.repair_worker import _resolve_file_path + return _resolve_file_path(file_path, context.transfer_folder) + + def _save_checkpoint_id(self, context: JobContext, track_id): + """Save or clear the scan checkpoint by track ID.""" if context.config_manager: context.config_manager.set( - f'repair.jobs.{self.job_id}.checkpoint', - fpath + f'repair.jobs.{self.job_id}.checkpoint_id', track_id ) def _get_settings(self, context: JobContext) -> dict: @@ -305,15 +300,21 @@ class AcoustIDScannerJob(RepairJob): return merged def estimate_scope(self, context: JobContext) -> int: - transfer = context.transfer_folder - if not os.path.isdir(transfer): + conn = None + try: + conn = context.db._get_connection() + cursor = conn.cursor() + cursor.execute(""" + SELECT COUNT(*) FROM tracks + WHERE file_path IS NOT NULL AND file_path != '' + AND title IS NOT NULL AND title != '' + """) + return cursor.fetchone()[0] + except Exception: return 0 - count = 0 - for _root, _dirs, files in os.walk(transfer): - for fname in files: - if os.path.splitext(fname)[1].lower() in AUDIO_EXTENSIONS: - count += 1 - return count + finally: + if conn: + conn.close() def _normalize(text: str) -> str: diff --git a/core/repair_worker.py b/core/repair_worker.py index 91656ae5..230bc2a4 100644 --- a/core/repair_worker.py +++ b/core/repair_worker.py @@ -840,6 +840,7 @@ class RepairWorker: 'missing_lossy_copy': self._fix_missing_lossy_copy, 'unwanted_content': self._fix_unwanted_content, 'unknown_artist': self._fix_unknown_artist, + 'acoustid_mismatch': self._fix_acoustid_mismatch, } handler = handlers.get(finding_type) if not handler: @@ -1509,6 +1510,108 @@ class RepairWorker: return {'success': True, 'action': 'fixed_unknown_artist', 'message': f'Fixed: {corrected_artist} - {corrected_title}'} + def _fix_acoustid_mismatch(self, entity_type, entity_id, file_path, details): + """Fix an AcoustID mismatch. Actions: + 'retag' (default): Update DB title/artist to match the actual audio content + 'redownload': Add the expected (correct) track to wishlist and delete the wrong file + 'delete': Just delete the wrong file and DB record + """ + fix_action = details.get('_fix_action', 'retag') + track_id = entity_id + + if fix_action == 'delete': + # Delete file + DB record + if file_path: + resolved = _resolve_file_path(file_path, self.transfer_folder) + if resolved and os.path.exists(resolved): + try: + os.remove(resolved) + except Exception as e: + logger.warning("Could not delete file %s: %s", resolved, e) + if track_id: + try: + conn = self.db._get_connection() + cursor = conn.cursor() + cursor.execute("DELETE FROM tracks WHERE id = ?", (track_id,)) + conn.commit() + conn.close() + except Exception as e: + return {'success': False, 'error': f'DB delete failed: {e}'} + return {'success': True, 'action': 'deleted', + 'message': f'Deleted wrong file: {os.path.basename(file_path or "")}'} + + if fix_action == 'redownload': + # Add expected track to wishlist, then delete the wrong file + expected_title = details.get('expected_title', '') + expected_artist = details.get('expected_artist', '') + album_title = details.get('album_title', '') + if expected_title and expected_artist: + try: + import json, uuid + track_data = { + 'id': f'acoustid_fix_{uuid.uuid4().hex[:8]}', + 'name': expected_title, + 'artists': [{'name': expected_artist}], + 'album': {'name': album_title} if album_title else {'name': expected_title}, + } + self.db.add_to_wishlist( + spotify_track_data=track_data, + failure_reason='AcoustID mismatch — re-downloading correct track', + source_type='repair', + ) + logger.info("Added '%s' by '%s' to wishlist for re-download", + expected_title, expected_artist) + except Exception as e: + logger.warning("Could not add to wishlist: %s", e) + # Delete wrong file + if file_path: + resolved = _resolve_file_path(file_path, self.transfer_folder) + if resolved and os.path.exists(resolved): + try: + os.remove(resolved) + except Exception: + pass + if track_id: + try: + conn = self.db._get_connection() + cursor = conn.cursor() + cursor.execute("DELETE FROM tracks WHERE id = ?", (track_id,)) + conn.commit() + conn.close() + except Exception: + pass + return {'success': True, 'action': 'redownload', + 'message': f'Added "{expected_title}" to wishlist, removed wrong file'} + + # Default: retag — update DB record to match the actual audio content + aid_title = details.get('acoustid_title', '') + aid_artist = details.get('acoustid_artist', '') + if not aid_title: + return {'success': False, 'error': 'No AcoustID title available to retag'} + + try: + conn = self.db._get_connection() + cursor = conn.cursor() + # Update track title + cursor.execute("UPDATE tracks SET title = ? WHERE id = ?", (aid_title, track_id)) + # Update artist if we have one and it differs + if aid_artist: + cursor.execute("SELECT id FROM artists WHERE LOWER(name) = LOWER(?)", (aid_artist,)) + row = cursor.fetchone() + if row: + cursor.execute("UPDATE tracks SET artist_id = ? WHERE id = ?", (row[0], track_id)) + else: + cursor.execute("INSERT INTO artists (name) VALUES (?)", (aid_artist,)) + cursor.execute("UPDATE tracks SET artist_id = ? WHERE id = ?", + (cursor.lastrowid, track_id)) + conn.commit() + conn.close() + except Exception as e: + return {'success': False, 'error': f'DB update failed: {e}'} + + return {'success': True, 'action': 'retagged', + 'message': f'Updated to: "{aid_title}" by {aid_artist}'} + def _fix_mbid_mismatch(self, entity_type, entity_id, file_path, details): """Remove the mismatched MusicBrainz recording ID from the audio file.""" if not file_path: