Upgrade AcoustID scanner to scan full library with actionable fixes

Rewrote the AcoustID scanner job to scan all library tracks (via DB file paths resolved to disk) instead of only the Transfer folder. Checkpoints by track ID for robust resume across restarts. Defaults changed to enabled, 24h interval, batch size 200. Added _fix_acoustid_mismatch handler with three actions: - retag: update DB title/artist to match actual audio content - redownload: add expected track to wishlist and delete wrong file - delete: remove wrong file and DB record This catches cases like a file tagged as "Dinosaur Bones" that is actually "Helicopters" — the scanner fingerprints the audio, detects the mismatch, and the user can fix it from Library Maintenance findings.
1 month ago · 223522ce99
parent 0b60986f44
commit 223522ce99
2 changed files with 217 additions and 113 deletions
--- a/core/repair_jobs/acoustid_scanner.py
+++ b/core/repair_jobs/acoustid_scanner.py
@ -1,8 +1,14 @@
-"""AcoustID Background Scanner Job — fingerprints tracks to detect wrong downloads."""
+"""AcoustID Scanner Job — fingerprints library tracks to detect wrong downloads.
+
+Scans the entire library (not just Transfer) by resolving DB file paths to
+actual files on disk. Creates actionable findings that can be fixed:
+  - 'retag': Update DB metadata to match what the file actually is
+  - 'redownload': Add the expected track to wishlist and delete the wrong file
+  - 'delete': Remove the wrong file and its DB record
+"""

 import os
 import re
-import time
 from difflib import SequenceMatcher
 from typing import Optional

@ -19,29 +25,33 @@ AUDIO_EXTENSIONS = {'.mp3', '.flac', '.ogg', '.opus', '.m4a', '.aac', '.wav', '.
 class AcoustIDScannerJob(RepairJob):
    job_id = 'acoustid_scanner'
    display_name = 'AcoustID Scanner'
-    description = 'Fingerprints tracks to detect wrong downloads'
+    description = 'Fingerprints library tracks to detect wrong downloads'
    help_text = (
-        'Generates audio fingerprints using the AcoustID/Chromaprint service and compares '
-        'the identified recording against what you expected to download. This catches cases '
-        'where the wrong song was served — even if the filename looks correct.\n\n'
-        'The job processes tracks in batches and saves a checkpoint so it can resume where '
-        'it left off across runs. Requires an AcoustID API key (set in Settings).\n\n'
+        'Scans your music library by fingerprinting audio files and comparing '
+        'them against the AcoustID database. Detects cases where the wrong song '
+        'was downloaded — even if the filename and tags look correct.\n\n'
+        'When a mismatch is found, you can:\n'
+        '• Retag — update the DB record to match the actual audio content\n'
+        '• Redownload — add the correct track to your wishlist and remove the wrong file\n'
+        '• Delete — remove the wrong file entirely\n\n'
+        'The job processes tracks in batches with checkpointing so it resumes '
+        'where it left off across runs. Requires an AcoustID API key (Settings).\n\n'
        'Settings:\n'
-        '- Fingerprint Threshold: Minimum AcoustID match confidence (0.0 - 1.0)\n'
-        '- Title Similarity: How closely the identified title must match your expected title\n'
+        '- Fingerprint Threshold: Minimum AcoustID match confidence (0.0–1.0)\n'
+        '- Title Similarity: How closely the identified title must match\n'
        '- Artist Similarity: How closely the identified artist must match\n'
-        '- Batch Size: Number of tracks to process per scan run'
+        '- Batch Size: Tracks per scan run (checkpoint saved between batches)'
    )
    icon = 'repair-icon-acoustid'
-    default_enabled = False
-    default_interval_hours = 168
+    default_enabled = True
+    default_interval_hours = 24
    default_settings = {
        'fingerprint_threshold': 0.80,
        'title_similarity': 0.70,
        'artist_similarity': 0.60,
-        'batch_size': 50,
+        'batch_size': 200,
    }
-    auto_fix = False
+    auto_fix = False  # User chooses fix action per finding

    def scan(self, context: JobContext) -> JobResult:
        result = JobResult()
@ -50,7 +60,7 @@ class AcoustIDScannerJob(RepairJob):
        fp_threshold = settings.get('fingerprint_threshold', 0.80)
        title_threshold = settings.get('title_similarity', 0.70)
        artist_threshold = settings.get('artist_similarity', 0.60)
-        batch_size = settings.get('batch_size', 50)
+        batch_size = settings.get('batch_size', 200)

        # Get AcoustID client
        acoustid_client = context.acoustid_client
@ -62,64 +72,56 @@ class AcoustIDScannerJob(RepairJob):
                logger.warning("AcoustID client not available: %s", e)
                return result

-        transfer = context.transfer_folder
-        if not os.path.isdir(transfer):
-            logger.warning("Transfer folder does not exist: %s", transfer)
+        # Load all library tracks from DB with their file paths
+        db_tracks = self._load_db_tracks(context)
+        if not db_tracks:
+            logger.info("No library tracks with file paths found")
            return result

-        # Read checkpoint (last processed file path) to resume from
-        checkpoint = None
+        # Read checkpoint (last processed track ID) to resume from
+        checkpoint_id = None
        if context.config_manager:
-            checkpoint = context.config_manager.get(
-                f'repair.jobs.{self.job_id}.checkpoint', None
+            checkpoint_id = context.config_manager.get(
+                f'repair.jobs.{self.job_id}.checkpoint_id', None
            )

-        # Collect all audio files
-        audio_files = []
-        for root, _dirs, files in os.walk(transfer):
-            if context.check_stop():
-                return result
-            for fname in sorted(files):
-                ext = os.path.splitext(fname)[1].lower()
-                if ext in AUDIO_EXTENSIONS:
-                    audio_files.append(os.path.join(root, fname))
-
-        # Sort for deterministic order (important for checkpoint)
-        audio_files.sort()
+        # Build ordered list of (track_id, info) sorted by ID for deterministic order
+        track_list = sorted(db_tracks.items(), key=lambda x: x[0])

        # Skip past checkpoint if resuming
-        if checkpoint:
-            try:
-                idx = audio_files.index(checkpoint)
-                audio_files = audio_files[idx + 1:]
-                logger.info("Resuming AcoustID scan from checkpoint (%d files remaining)", len(audio_files))
-            except ValueError:
-                logger.debug("Checkpoint file not found, starting from beginning")
-
-        total = len(audio_files)
+        if checkpoint_id is not None:
+            original_len = len(track_list)
+            track_list = [(tid, info) for tid, info in track_list if tid > checkpoint_id]
+            if len(track_list) < original_len:
+                logger.info("Resuming AcoustID scan from checkpoint ID %s (%d tracks remaining)",
+                            checkpoint_id, len(track_list))
+
+        total = len(track_list)
+        if context.report_progress:
+            context.report_progress(phase=f'Scanning {total} library tracks...', total=total)
        if context.update_progress:
            context.update_progress(0, total)

-        # Build a lookup of known tracks from DB for comparison
-        db_tracks = self._load_db_tracks(context)
-
-        if context.report_progress:
-            context.report_progress(phase=f'Fingerprinting {total} files...', total=total)
-
        batch_count = 0
-        for i, fpath in enumerate(audio_files):
+        for i, (track_id, track_info) in enumerate(track_list):
            if context.check_stop():
-                # Save checkpoint before stopping
-                self._save_checkpoint(context, fpath)
+                self._save_checkpoint_id(context, track_id)
                return result
            if i % 10 == 0 and context.wait_if_paused():
-                self._save_checkpoint(context, fpath)
+                self._save_checkpoint_id(context, track_id)
                return result

+            # Resolve the DB path to an actual file on disk
+            file_path = track_info.get('file_path', '')
+            resolved = self._resolve_path(file_path, context)
+            if not resolved:
+                result.skipped += 1
+                continue
+
            result.scanned += 1
            batch_count += 1

-            fname = os.path.basename(fpath)
+            fname = os.path.basename(resolved)
            if context.report_progress:
                context.report_progress(
                    scanned=i + 1, total=total,
@ -130,47 +132,38 @@ class AcoustIDScannerJob(RepairJob):

            try:
                self._scan_file(
-                    fpath, acoustid_client, db_tracks, context, result,
+                    resolved, track_id, track_info, acoustid_client, context, result,
                    fp_threshold, title_threshold, artist_threshold
                )
            except Exception as e:
                logger.debug("Error scanning %s: %s", fname, e)
                result.errors += 1

-            # Rate limit: pause between batches
+            # Rate limit: pause between batches to avoid hammering AcoustID API
            if batch_count >= batch_size:
                batch_count = 0
-                self._save_checkpoint(context, fpath)
+                self._save_checkpoint_id(context, track_id)
                if context.sleep_or_stop(2):
                    return result

            if context.update_progress and (i + 1) % 10 == 0:
                context.update_progress(i + 1, total)

-        # Clear checkpoint on completion
-        self._save_checkpoint(context, None)
+        # Clear checkpoint on full completion
+        self._save_checkpoint_id(context, None)

        if context.update_progress:
            context.update_progress(total, total)

-        logger.info("AcoustID scan: %d files scanned, %d mismatches found, %d errors",
-                     result.scanned, result.findings_created, result.errors)
+        logger.info("AcoustID scan: %d scanned, %d skipped, %d mismatches, %d errors",
+                     result.scanned, result.skipped, result.findings_created, result.errors)
        return result

-    def _scan_file(self, fpath, acoustid_client, db_tracks, context, result,
+    def _scan_file(self, fpath, track_id, expected, acoustid_client, context, result,
                   fp_threshold, title_threshold, artist_threshold):
        """Fingerprint a single file and check for mismatches."""
        fname = os.path.basename(fpath)

-        # Get expected title/artist from DB or filename
-        expected = db_tracks.get(os.path.normpath(fpath))
-        if not expected:
-            # Try to extract from filename: "01 - Artist - Title.flac" or "01 Title.flac"
-            base = os.path.splitext(fname)[0]
-            # Strip leading track number
-            base = re.sub(r'^\d{1,3}[\s.\-_]*', '', base)
-            expected = {'title': base, 'artist': '', 'track_id': None}
-
        # Fingerprint the file
        try:
            fp_result = acoustid_client.fingerprint_and_lookup(fpath)
@ -178,29 +171,18 @@ class AcoustIDScannerJob(RepairJob):
            logger.debug("Fingerprint failed for %s: %s", fname, e)
            result.errors += 1
            if context.report_progress:
-                context.report_progress(
-                    log_line=f'Error: {fname} — {e}',
-                    log_type='error'
-                )
+                context.report_progress(log_line=f'Error: {fname} — {e}', log_type='error')
            return

        if not fp_result or not fp_result.get('recordings'):
-            # No match — could be API error, rare track, or invalid key
-            # Don't create findings for "no match" — these flood the list
-            # and are usually not actionable. Only log for visibility.
            if context.report_progress:
-                context.report_progress(
-                    log_line=f'No match: {fname}',
-                    log_type='skip'
-                )
+                context.report_progress(log_line=f'No match: {fname}', log_type='skip')
            return

-        # Check best recording match
        best_score = fp_result.get('best_score', 0)
        if best_score < fp_threshold:
-            return  # Low confidence fingerprint, skip
+            return

-        # Compare best AcoustID result against expected
        best_recording = fp_result['recordings'][0]
        aid_title = best_recording.get('title', '')
        aid_artist = best_recording.get('artist', '')
@ -217,7 +199,6 @@ class AcoustIDScannerJob(RepairJob):
        title_sim = SequenceMatcher(None, norm_expected_title, norm_aid_title).ratio()
        artist_sim = SequenceMatcher(None, norm_expected_artist, norm_aid_artist).ratio() if norm_expected_artist else 1.0

-        # If both title AND artist match well, no issue
        if title_sim >= title_threshold and artist_sim >= artist_threshold:
            return

@ -234,13 +215,14 @@ class AcoustIDScannerJob(RepairJob):
                finding_type='acoustid_mismatch',
                severity=severity,
                entity_type='track',
-                entity_id=str(expected.get('track_id') or ''),
+                entity_id=str(track_id),
                file_path=fpath,
-                title=f'Possible wrong download: {fname}',
+                title=f'Wrong download: "{expected["title"]}" is actually "{aid_title}"',
                description=(
                    f'Expected "{expected["title"]}" by {expected["artist"]}, '
-                    f'but fingerprint matches "{aid_title}" by {aid_artist} '
-                    f'(fp: {best_score:.0%}, title: {title_sim:.0%}, artist: {artist_sim:.0%})'
+                    f'but audio fingerprint matches "{aid_title}" by {aid_artist} '
+                    f'(fingerprint: {best_score:.0%}, title match: {title_sim:.0%}, '
+                    f'artist match: {artist_sim:.0%})'
                ),
                details={
                    'expected_title': expected['title'],
@ -252,20 +234,22 @@ class AcoustIDScannerJob(RepairJob):
                    'artist_similarity': round(artist_sim, 3),
                    'album_thumb_url': expected.get('album_thumb_url'),
                    'artist_thumb_url': expected.get('artist_thumb_url'),
+                    'album_title': expected.get('album_title', ''),
+                    'track_number': expected.get('track_number'),
                }
            )
            result.findings_created += 1

    def _load_db_tracks(self, context: JobContext) -> dict:
-        """Load all tracks from DB keyed by normalized file_path."""
+        """Load all tracks from DB keyed by track ID."""
        tracks = {}
        conn = None
        try:
            conn = context.db._get_connection()
            cursor = conn.cursor()
            cursor.execute("""
-                SELECT t.id, t.title, ar.name, t.file_path,
-                       al.thumb_url, ar.thumb_url
+                SELECT t.id, t.title, ar.name, t.file_path, t.track_number,
+                       al.title AS album_title, al.thumb_url, ar.thumb_url
                FROM tracks t
                LEFT JOIN artists ar ON ar.id = t.artist_id
                LEFT JOIN albums al ON al.id = t.album_id
@ -273,13 +257,15 @@ class AcoustIDScannerJob(RepairJob):
                  AND t.title IS NOT NULL AND t.title != ''
            """)
            for row in cursor.fetchall():
-                track_id, title, artist_name, file_path, album_thumb, artist_thumb = row
-                tracks[os.path.normpath(file_path)] = {
-                    'track_id': track_id,
-                    'title': title or '',
-                    'artist': artist_name or '',
-                    'album_thumb_url': album_thumb or None,
-                    'artist_thumb_url': artist_thumb or None,
+                track_id = row[0]
+                tracks[track_id] = {
+                    'title': row[1] or '',
+                    'artist': row[2] or '',
+                    'file_path': row[3] or '',
+                    'track_number': row[4],
+                    'album_title': row[5] or '',
+                    'album_thumb_url': row[6] or None,
+                    'artist_thumb_url': row[7] or None,
                }
        except Exception as e:
            logger.error("Error loading tracks from DB: %s", e)
@ -288,12 +274,21 @@ class AcoustIDScannerJob(RepairJob):
                conn.close()
        return tracks

-    def _save_checkpoint(self, context: JobContext, fpath):
-        """Save or clear the scan checkpoint."""
+    def _resolve_path(self, file_path, context):
+        """Resolve a DB file path to an actual file on disk."""
+        if not file_path:
+            return None
+        if os.path.exists(file_path):
+            return file_path
+        # Try the repair_worker's resolver
+        from core.repair_worker import _resolve_file_path
+        return _resolve_file_path(file_path, context.transfer_folder)
+
+    def _save_checkpoint_id(self, context: JobContext, track_id):
+        """Save or clear the scan checkpoint by track ID."""
        if context.config_manager:
            context.config_manager.set(
-                f'repair.jobs.{self.job_id}.checkpoint',
-                fpath
+                f'repair.jobs.{self.job_id}.checkpoint_id', track_id
            )

    def _get_settings(self, context: JobContext) -> dict:
@ -305,15 +300,21 @@ class AcoustIDScannerJob(RepairJob):
        return merged

    def estimate_scope(self, context: JobContext) -> int:
-        transfer = context.transfer_folder
-        if not os.path.isdir(transfer):
+        conn = None
+        try:
+            conn = context.db._get_connection()
+            cursor = conn.cursor()
+            cursor.execute("""
+                SELECT COUNT(*) FROM tracks
+                WHERE file_path IS NOT NULL AND file_path != ''
+                  AND title IS NOT NULL AND title != ''
+            """)
+            return cursor.fetchone()[0]
+        except Exception:
            return 0
-        count = 0
-        for _root, _dirs, files in os.walk(transfer):
-            for fname in files:
-                if os.path.splitext(fname)[1].lower() in AUDIO_EXTENSIONS:
-                    count += 1
-        return count
+        finally:
+            if conn:
+                conn.close()


 def _normalize(text: str) -> str:
--- a/core/repair_worker.py
+++ b/core/repair_worker.py
@ -840,6 +840,7 @@ class RepairWorker:
            'missing_lossy_copy': self._fix_missing_lossy_copy,
            'unwanted_content': self._fix_unwanted_content,
            'unknown_artist': self._fix_unknown_artist,
+            'acoustid_mismatch': self._fix_acoustid_mismatch,
        }
        handler = handlers.get(finding_type)
        if not handler:
@ -1509,6 +1510,108 @@ class RepairWorker:
        return {'success': True, 'action': 'fixed_unknown_artist',
                'message': f'Fixed: {corrected_artist} - {corrected_title}'}

+    def _fix_acoustid_mismatch(self, entity_type, entity_id, file_path, details):
+        """Fix an AcoustID mismatch. Actions:
+           'retag' (default): Update DB title/artist to match the actual audio content
+           'redownload': Add the expected (correct) track to wishlist and delete the wrong file
+           'delete': Just delete the wrong file and DB record
+        """
+        fix_action = details.get('_fix_action', 'retag')
+        track_id = entity_id
+
+        if fix_action == 'delete':
+            # Delete file + DB record
+            if file_path:
+                resolved = _resolve_file_path(file_path, self.transfer_folder)
+                if resolved and os.path.exists(resolved):
+                    try:
+                        os.remove(resolved)
+                    except Exception as e:
+                        logger.warning("Could not delete file %s: %s", resolved, e)
+            if track_id:
+                try:
+                    conn = self.db._get_connection()
+                    cursor = conn.cursor()
+                    cursor.execute("DELETE FROM tracks WHERE id = ?", (track_id,))
+                    conn.commit()
+                    conn.close()
+                except Exception as e:
+                    return {'success': False, 'error': f'DB delete failed: {e}'}
+            return {'success': True, 'action': 'deleted',
+                    'message': f'Deleted wrong file: {os.path.basename(file_path or "")}'}
+
+        if fix_action == 'redownload':
+            # Add expected track to wishlist, then delete the wrong file
+            expected_title = details.get('expected_title', '')
+            expected_artist = details.get('expected_artist', '')
+            album_title = details.get('album_title', '')
+            if expected_title and expected_artist:
+                try:
+                    import json, uuid
+                    track_data = {
+                        'id': f'acoustid_fix_{uuid.uuid4().hex[:8]}',
+                        'name': expected_title,
+                        'artists': [{'name': expected_artist}],
+                        'album': {'name': album_title} if album_title else {'name': expected_title},
+                    }
+                    self.db.add_to_wishlist(
+                        spotify_track_data=track_data,
+                        failure_reason='AcoustID mismatch — re-downloading correct track',
+                        source_type='repair',
+                    )
+                    logger.info("Added '%s' by '%s' to wishlist for re-download",
+                                expected_title, expected_artist)
+                except Exception as e:
+                    logger.warning("Could not add to wishlist: %s", e)
+            # Delete wrong file
+            if file_path:
+                resolved = _resolve_file_path(file_path, self.transfer_folder)
+                if resolved and os.path.exists(resolved):
+                    try:
+                        os.remove(resolved)
+                    except Exception:
+                        pass
+            if track_id:
+                try:
+                    conn = self.db._get_connection()
+                    cursor = conn.cursor()
+                    cursor.execute("DELETE FROM tracks WHERE id = ?", (track_id,))
+                    conn.commit()
+                    conn.close()
+                except Exception:
+                    pass
+            return {'success': True, 'action': 'redownload',
+                    'message': f'Added "{expected_title}" to wishlist, removed wrong file'}
+
+        # Default: retag — update DB record to match the actual audio content
+        aid_title = details.get('acoustid_title', '')
+        aid_artist = details.get('acoustid_artist', '')
+        if not aid_title:
+            return {'success': False, 'error': 'No AcoustID title available to retag'}
+
+        try:
+            conn = self.db._get_connection()
+            cursor = conn.cursor()
+            # Update track title
+            cursor.execute("UPDATE tracks SET title = ? WHERE id = ?", (aid_title, track_id))
+            # Update artist if we have one and it differs
+            if aid_artist:
+                cursor.execute("SELECT id FROM artists WHERE LOWER(name) = LOWER(?)", (aid_artist,))
+                row = cursor.fetchone()
+                if row:
+                    cursor.execute("UPDATE tracks SET artist_id = ? WHERE id = ?", (row[0], track_id))
+                else:
+                    cursor.execute("INSERT INTO artists (name) VALUES (?)", (aid_artist,))
+                    cursor.execute("UPDATE tracks SET artist_id = ? WHERE id = ?",
+                                   (cursor.lastrowid, track_id))
+            conn.commit()
+            conn.close()
+        except Exception as e:
+            return {'success': False, 'error': f'DB update failed: {e}'}
+
+        return {'success': True, 'action': 'retagged',
+                'message': f'Updated to: "{aid_title}" by {aid_artist}'}
+
    def _fix_mbid_mismatch(self, entity_type, entity_id, file_path, details):
        """Remove the mismatched MusicBrainz recording ID from the audio file."""
        if not file_path: