diff --git a/core/repair_jobs/acoustid_scanner.py b/core/repair_jobs/acoustid_scanner.py index 560cdd64..e3002430 100644 --- a/core/repair_jobs/acoustid_scanner.py +++ b/core/repair_jobs/acoustid_scanner.py @@ -1,8 +1,14 @@ -"""AcoustID Background Scanner Job — fingerprints tracks to detect wrong downloads.""" +"""AcoustID Scanner Job — fingerprints library tracks to detect wrong downloads. + +Scans the entire library (not just Transfer) by resolving DB file paths to +actual files on disk. Creates actionable findings that can be fixed: + - 'retag': Update DB metadata to match what the file actually is + - 'redownload': Add the expected track to wishlist and delete the wrong file + - 'delete': Remove the wrong file and its DB record +""" import os import re -import time from difflib import SequenceMatcher from typing import Optional @@ -19,29 +25,33 @@ AUDIO_EXTENSIONS = {'.mp3', '.flac', '.ogg', '.opus', '.m4a', '.aac', '.wav', '. class AcoustIDScannerJob(RepairJob): job_id = 'acoustid_scanner' display_name = 'AcoustID Scanner' - description = 'Fingerprints tracks to detect wrong downloads' + description = 'Fingerprints library tracks to detect wrong downloads' help_text = ( - 'Generates audio fingerprints using the AcoustID/Chromaprint service and compares ' - 'the identified recording against what you expected to download. This catches cases ' - 'where the wrong song was served — even if the filename looks correct.\n\n' - 'The job processes tracks in batches and saves a checkpoint so it can resume where ' - 'it left off across runs. Requires an AcoustID API key (set in Settings).\n\n' + 'Scans your music library by fingerprinting audio files and comparing ' + 'them against the AcoustID database. Detects cases where the wrong song ' + 'was downloaded — even if the filename and tags look correct.\n\n' + 'When a mismatch is found, you can:\n' + '• Retag — update the DB record to match the actual audio content\n' + '• Redownload — add the correct track to your wishlist and remove the wrong file\n' + '• Delete — remove the wrong file entirely\n\n' + 'The job processes tracks in batches with checkpointing so it resumes ' + 'where it left off across runs. Requires an AcoustID API key (Settings).\n\n' 'Settings:\n' - '- Fingerprint Threshold: Minimum AcoustID match confidence (0.0 - 1.0)\n' - '- Title Similarity: How closely the identified title must match your expected title\n' + '- Fingerprint Threshold: Minimum AcoustID match confidence (0.0–1.0)\n' + '- Title Similarity: How closely the identified title must match\n' '- Artist Similarity: How closely the identified artist must match\n' - '- Batch Size: Number of tracks to process per scan run' + '- Batch Size: Tracks per scan run (checkpoint saved between batches)' ) icon = 'repair-icon-acoustid' - default_enabled = False - default_interval_hours = 168 + default_enabled = True + default_interval_hours = 24 default_settings = { 'fingerprint_threshold': 0.80, 'title_similarity': 0.70, 'artist_similarity': 0.60, - 'batch_size': 50, + 'batch_size': 200, } - auto_fix = False + auto_fix = False # User chooses fix action per finding def scan(self, context: JobContext) -> JobResult: result = JobResult() @@ -50,7 +60,7 @@ class AcoustIDScannerJob(RepairJob): fp_threshold = settings.get('fingerprint_threshold', 0.80) title_threshold = settings.get('title_similarity', 0.70) artist_threshold = settings.get('artist_similarity', 0.60) - batch_size = settings.get('batch_size', 50) + batch_size = settings.get('batch_size', 200) # Get AcoustID client acoustid_client = context.acoustid_client @@ -62,64 +72,56 @@ class AcoustIDScannerJob(RepairJob): logger.warning("AcoustID client not available: %s", e) return result - transfer = context.transfer_folder - if not os.path.isdir(transfer): - logger.warning("Transfer folder does not exist: %s", transfer) + # Load all library tracks from DB with their file paths + db_tracks = self._load_db_tracks(context) + if not db_tracks: + logger.info("No library tracks with file paths found") return result - # Read checkpoint (last processed file path) to resume from - checkpoint = None + # Read checkpoint (last processed track ID) to resume from + checkpoint_id = None if context.config_manager: - checkpoint = context.config_manager.get( - f'repair.jobs.{self.job_id}.checkpoint', None + checkpoint_id = context.config_manager.get( + f'repair.jobs.{self.job_id}.checkpoint_id', None ) - # Collect all audio files - audio_files = [] - for root, _dirs, files in os.walk(transfer): - if context.check_stop(): - return result - for fname in sorted(files): - ext = os.path.splitext(fname)[1].lower() - if ext in AUDIO_EXTENSIONS: - audio_files.append(os.path.join(root, fname)) - - # Sort for deterministic order (important for checkpoint) - audio_files.sort() + # Build ordered list of (track_id, info) sorted by ID for deterministic order + track_list = sorted(db_tracks.items(), key=lambda x: x[0]) # Skip past checkpoint if resuming - if checkpoint: - try: - idx = audio_files.index(checkpoint) - audio_files = audio_files[idx + 1:] - logger.info("Resuming AcoustID scan from checkpoint (%d files remaining)", len(audio_files)) - except ValueError: - logger.debug("Checkpoint file not found, starting from beginning") - - total = len(audio_files) + if checkpoint_id is not None: + original_len = len(track_list) + track_list = [(tid, info) for tid, info in track_list if tid > checkpoint_id] + if len(track_list) < original_len: + logger.info("Resuming AcoustID scan from checkpoint ID %s (%d tracks remaining)", + checkpoint_id, len(track_list)) + + total = len(track_list) + if context.report_progress: + context.report_progress(phase=f'Scanning {total} library tracks...', total=total) if context.update_progress: context.update_progress(0, total) - # Build a lookup of known tracks from DB for comparison - db_tracks = self._load_db_tracks(context) - - if context.report_progress: - context.report_progress(phase=f'Fingerprinting {total} files...', total=total) - batch_count = 0 - for i, fpath in enumerate(audio_files): + for i, (track_id, track_info) in enumerate(track_list): if context.check_stop(): - # Save checkpoint before stopping - self._save_checkpoint(context, fpath) + self._save_checkpoint_id(context, track_id) return result if i % 10 == 0 and context.wait_if_paused(): - self._save_checkpoint(context, fpath) + self._save_checkpoint_id(context, track_id) return result + # Resolve the DB path to an actual file on disk + file_path = track_info.get('file_path', '') + resolved = self._resolve_path(file_path, context) + if not resolved: + result.skipped += 1 + continue + result.scanned += 1 batch_count += 1 - fname = os.path.basename(fpath) + fname = os.path.basename(resolved) if context.report_progress: context.report_progress( scanned=i + 1, total=total, @@ -130,47 +132,38 @@ class AcoustIDScannerJob(RepairJob): try: self._scan_file( - fpath, acoustid_client, db_tracks, context, result, + resolved, track_id, track_info, acoustid_client, context, result, fp_threshold, title_threshold, artist_threshold ) except Exception as e: logger.debug("Error scanning %s: %s", fname, e) result.errors += 1 - # Rate limit: pause between batches + # Rate limit: pause between batches to avoid hammering AcoustID API if batch_count >= batch_size: batch_count = 0 - self._save_checkpoint(context, fpath) + self._save_checkpoint_id(context, track_id) if context.sleep_or_stop(2): return result if context.update_progress and (i + 1) % 10 == 0: context.update_progress(i + 1, total) - # Clear checkpoint on completion - self._save_checkpoint(context, None) + # Clear checkpoint on full completion + self._save_checkpoint_id(context, None) if context.update_progress: context.update_progress(total, total) - logger.info("AcoustID scan: %d files scanned, %d mismatches found, %d errors", - result.scanned, result.findings_created, result.errors) + logger.info("AcoustID scan: %d scanned, %d skipped, %d mismatches, %d errors", + result.scanned, result.skipped, result.findings_created, result.errors) return result - def _scan_file(self, fpath, acoustid_client, db_tracks, context, result, + def _scan_file(self, fpath, track_id, expected, acoustid_client, context, result, fp_threshold, title_threshold, artist_threshold): """Fingerprint a single file and check for mismatches.""" fname = os.path.basename(fpath) - # Get expected title/artist from DB or filename - expected = db_tracks.get(os.path.normpath(fpath)) - if not expected: - # Try to extract from filename: "01 - Artist - Title.flac" or "01 Title.flac" - base = os.path.splitext(fname)[0] - # Strip leading track number - base = re.sub(r'^\d{1,3}[\s.\-_]*', '', base) - expected = {'title': base, 'artist': '', 'track_id': None} - # Fingerprint the file try: fp_result = acoustid_client.fingerprint_and_lookup(fpath) @@ -178,29 +171,18 @@ class AcoustIDScannerJob(RepairJob): logger.debug("Fingerprint failed for %s: %s", fname, e) result.errors += 1 if context.report_progress: - context.report_progress( - log_line=f'Error: {fname} — {e}', - log_type='error' - ) + context.report_progress(log_line=f'Error: {fname} — {e}', log_type='error') return if not fp_result or not fp_result.get('recordings'): - # No match — could be API error, rare track, or invalid key - # Don't create findings for "no match" — these flood the list - # and are usually not actionable. Only log for visibility. if context.report_progress: - context.report_progress( - log_line=f'No match: {fname}', - log_type='skip' - ) + context.report_progress(log_line=f'No match: {fname}', log_type='skip') return - # Check best recording match best_score = fp_result.get('best_score', 0) if best_score < fp_threshold: - return # Low confidence fingerprint, skip + return - # Compare best AcoustID result against expected best_recording = fp_result['recordings'][0] aid_title = best_recording.get('title', '') aid_artist = best_recording.get('artist', '') @@ -217,7 +199,6 @@ class AcoustIDScannerJob(RepairJob): title_sim = SequenceMatcher(None, norm_expected_title, norm_aid_title).ratio() artist_sim = SequenceMatcher(None, norm_expected_artist, norm_aid_artist).ratio() if norm_expected_artist else 1.0 - # If both title AND artist match well, no issue if title_sim >= title_threshold and artist_sim >= artist_threshold: return @@ -234,13 +215,14 @@ class AcoustIDScannerJob(RepairJob): finding_type='acoustid_mismatch', severity=severity, entity_type='track', - entity_id=str(expected.get('track_id') or ''), + entity_id=str(track_id), file_path=fpath, - title=f'Possible wrong download: {fname}', + title=f'Wrong download: "{expected["title"]}" is actually "{aid_title}"', description=( f'Expected "{expected["title"]}" by {expected["artist"]}, ' - f'but fingerprint matches "{aid_title}" by {aid_artist} ' - f'(fp: {best_score:.0%}, title: {title_sim:.0%}, artist: {artist_sim:.0%})' + f'but audio fingerprint matches "{aid_title}" by {aid_artist} ' + f'(fingerprint: {best_score:.0%}, title match: {title_sim:.0%}, ' + f'artist match: {artist_sim:.0%})' ), details={ 'expected_title': expected['title'], @@ -252,20 +234,22 @@ class AcoustIDScannerJob(RepairJob): 'artist_similarity': round(artist_sim, 3), 'album_thumb_url': expected.get('album_thumb_url'), 'artist_thumb_url': expected.get('artist_thumb_url'), + 'album_title': expected.get('album_title', ''), + 'track_number': expected.get('track_number'), } ) result.findings_created += 1 def _load_db_tracks(self, context: JobContext) -> dict: - """Load all tracks from DB keyed by normalized file_path.""" + """Load all tracks from DB keyed by track ID.""" tracks = {} conn = None try: conn = context.db._get_connection() cursor = conn.cursor() cursor.execute(""" - SELECT t.id, t.title, ar.name, t.file_path, - al.thumb_url, ar.thumb_url + SELECT t.id, t.title, ar.name, t.file_path, t.track_number, + al.title AS album_title, al.thumb_url, ar.thumb_url FROM tracks t LEFT JOIN artists ar ON ar.id = t.artist_id LEFT JOIN albums al ON al.id = t.album_id @@ -273,13 +257,15 @@ class AcoustIDScannerJob(RepairJob): AND t.title IS NOT NULL AND t.title != '' """) for row in cursor.fetchall(): - track_id, title, artist_name, file_path, album_thumb, artist_thumb = row - tracks[os.path.normpath(file_path)] = { - 'track_id': track_id, - 'title': title or '', - 'artist': artist_name or '', - 'album_thumb_url': album_thumb or None, - 'artist_thumb_url': artist_thumb or None, + track_id = row[0] + tracks[track_id] = { + 'title': row[1] or '', + 'artist': row[2] or '', + 'file_path': row[3] or '', + 'track_number': row[4], + 'album_title': row[5] or '', + 'album_thumb_url': row[6] or None, + 'artist_thumb_url': row[7] or None, } except Exception as e: logger.error("Error loading tracks from DB: %s", e) @@ -288,12 +274,21 @@ class AcoustIDScannerJob(RepairJob): conn.close() return tracks - def _save_checkpoint(self, context: JobContext, fpath): - """Save or clear the scan checkpoint.""" + def _resolve_path(self, file_path, context): + """Resolve a DB file path to an actual file on disk.""" + if not file_path: + return None + if os.path.exists(file_path): + return file_path + # Try the repair_worker's resolver + from core.repair_worker import _resolve_file_path + return _resolve_file_path(file_path, context.transfer_folder) + + def _save_checkpoint_id(self, context: JobContext, track_id): + """Save or clear the scan checkpoint by track ID.""" if context.config_manager: context.config_manager.set( - f'repair.jobs.{self.job_id}.checkpoint', - fpath + f'repair.jobs.{self.job_id}.checkpoint_id', track_id ) def _get_settings(self, context: JobContext) -> dict: @@ -305,15 +300,21 @@ class AcoustIDScannerJob(RepairJob): return merged def estimate_scope(self, context: JobContext) -> int: - transfer = context.transfer_folder - if not os.path.isdir(transfer): + conn = None + try: + conn = context.db._get_connection() + cursor = conn.cursor() + cursor.execute(""" + SELECT COUNT(*) FROM tracks + WHERE file_path IS NOT NULL AND file_path != '' + AND title IS NOT NULL AND title != '' + """) + return cursor.fetchone()[0] + except Exception: return 0 - count = 0 - for _root, _dirs, files in os.walk(transfer): - for fname in files: - if os.path.splitext(fname)[1].lower() in AUDIO_EXTENSIONS: - count += 1 - return count + finally: + if conn: + conn.close() def _normalize(text: str) -> str: diff --git a/core/repair_worker.py b/core/repair_worker.py index 91656ae5..230bc2a4 100644 --- a/core/repair_worker.py +++ b/core/repair_worker.py @@ -840,6 +840,7 @@ class RepairWorker: 'missing_lossy_copy': self._fix_missing_lossy_copy, 'unwanted_content': self._fix_unwanted_content, 'unknown_artist': self._fix_unknown_artist, + 'acoustid_mismatch': self._fix_acoustid_mismatch, } handler = handlers.get(finding_type) if not handler: @@ -1509,6 +1510,108 @@ class RepairWorker: return {'success': True, 'action': 'fixed_unknown_artist', 'message': f'Fixed: {corrected_artist} - {corrected_title}'} + def _fix_acoustid_mismatch(self, entity_type, entity_id, file_path, details): + """Fix an AcoustID mismatch. Actions: + 'retag' (default): Update DB title/artist to match the actual audio content + 'redownload': Add the expected (correct) track to wishlist and delete the wrong file + 'delete': Just delete the wrong file and DB record + """ + fix_action = details.get('_fix_action', 'retag') + track_id = entity_id + + if fix_action == 'delete': + # Delete file + DB record + if file_path: + resolved = _resolve_file_path(file_path, self.transfer_folder) + if resolved and os.path.exists(resolved): + try: + os.remove(resolved) + except Exception as e: + logger.warning("Could not delete file %s: %s", resolved, e) + if track_id: + try: + conn = self.db._get_connection() + cursor = conn.cursor() + cursor.execute("DELETE FROM tracks WHERE id = ?", (track_id,)) + conn.commit() + conn.close() + except Exception as e: + return {'success': False, 'error': f'DB delete failed: {e}'} + return {'success': True, 'action': 'deleted', + 'message': f'Deleted wrong file: {os.path.basename(file_path or "")}'} + + if fix_action == 'redownload': + # Add expected track to wishlist, then delete the wrong file + expected_title = details.get('expected_title', '') + expected_artist = details.get('expected_artist', '') + album_title = details.get('album_title', '') + if expected_title and expected_artist: + try: + import json, uuid + track_data = { + 'id': f'acoustid_fix_{uuid.uuid4().hex[:8]}', + 'name': expected_title, + 'artists': [{'name': expected_artist}], + 'album': {'name': album_title} if album_title else {'name': expected_title}, + } + self.db.add_to_wishlist( + spotify_track_data=track_data, + failure_reason='AcoustID mismatch — re-downloading correct track', + source_type='repair', + ) + logger.info("Added '%s' by '%s' to wishlist for re-download", + expected_title, expected_artist) + except Exception as e: + logger.warning("Could not add to wishlist: %s", e) + # Delete wrong file + if file_path: + resolved = _resolve_file_path(file_path, self.transfer_folder) + if resolved and os.path.exists(resolved): + try: + os.remove(resolved) + except Exception: + pass + if track_id: + try: + conn = self.db._get_connection() + cursor = conn.cursor() + cursor.execute("DELETE FROM tracks WHERE id = ?", (track_id,)) + conn.commit() + conn.close() + except Exception: + pass + return {'success': True, 'action': 'redownload', + 'message': f'Added "{expected_title}" to wishlist, removed wrong file'} + + # Default: retag — update DB record to match the actual audio content + aid_title = details.get('acoustid_title', '') + aid_artist = details.get('acoustid_artist', '') + if not aid_title: + return {'success': False, 'error': 'No AcoustID title available to retag'} + + try: + conn = self.db._get_connection() + cursor = conn.cursor() + # Update track title + cursor.execute("UPDATE tracks SET title = ? WHERE id = ?", (aid_title, track_id)) + # Update artist if we have one and it differs + if aid_artist: + cursor.execute("SELECT id FROM artists WHERE LOWER(name) = LOWER(?)", (aid_artist,)) + row = cursor.fetchone() + if row: + cursor.execute("UPDATE tracks SET artist_id = ? WHERE id = ?", (row[0], track_id)) + else: + cursor.execute("INSERT INTO artists (name) VALUES (?)", (aid_artist,)) + cursor.execute("UPDATE tracks SET artist_id = ? WHERE id = ?", + (cursor.lastrowid, track_id)) + conn.commit() + conn.close() + except Exception as e: + return {'success': False, 'error': f'DB update failed: {e}'} + + return {'success': True, 'action': 'retagged', + 'message': f'Updated to: "{aid_title}" by {aid_artist}'} + def _fix_mbid_mismatch(self, entity_type, entity_id, file_path, details): """Remove the mismatched MusicBrainz recording ID from the audio file.""" if not file_path: