# SoulSync/core/auto_import_worker.py

"""Auto-Import Worker — watches staging folder, identifies music, and processes automatically.

Scans the staging folder for audio files and album folders, identifies them
using tags/filenames/AcoustID, matches to metadata source tracklists, and
processes high-confidence matches through the post-processing pipeline.
Lower-confidence matches are queued for user review.

Supports both album folders (directories containing audio files) and single
loose audio files in the staging root.
"""

import hashlib
import json
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from datetime import datetime
from difflib import SequenceMatcher
from typing import Any, Callable, Dict, List, Optional

from utils.logging_config import get_logger

# Module-level logger shared by every function and method in this file.
logger = get_logger("auto_import")

# Lower-case, dot-prefixed file extensions treated as audio.
# NOTE(review): presumably consumed by the staging enumerator, which is not
# part of this section — confirm against the callers.
AUDIO_EXTENSIONS = {'.mp3', '.flac', '.ogg', '.opus', '.m4a', '.aac', '.wav', '.wma', '.aiff', '.aif', '.ape'}
# Matches a whole folder name such as "Disc 1", "CD2" or "disk 03"
# (case-insensitive); group 1 captures the disc number. Only optional
# whitespace may sit between the word and the digits, so "Disc-1" or
# "CD1 Bonus" do not match.
DISC_FOLDER_RE = re.compile(r'^(?:disc|cd|disk)\s*(\d+)$', re.IGNORECASE)


@dataclass
class FolderCandidate:
    """One unit of import work found in the staging folder.

    A candidate is either an album folder or a group of loose audio files.
    The worker keys its submitted-set and its active-import table on
    `folder_hash`, not on `path`, because several loose-file groups can
    share the same parent directory.
    """
    # Location of the candidate on disk.
    path: str
    # Display name; shown in log lines and in the UI status entries.
    name: str
    # Audio files belonging to this candidate.
    audio_files: List[str] = field(default_factory=list)
    disc_structure: Dict[int, List[str]] = field(default_factory=dict)  # disc_num -> files
    # Content fingerprint used as the dedup key.
    # NOTE(review): presumably produced by `_compute_folder_hash` — confirm
    # in the enumerator.
    folder_hash: str = ''
    is_single: bool = False  # True for loose files in staging root
    # True when the candidate "folder" is the staging root itself (user dropped
    # disc folders directly into staging without an album wrapper). The name is
    # meaningless ("Staging", "Music", etc.) — folder-name identification must
    # be skipped or it will false-match against random albums.
    is_staging_root: bool = False


@dataclass
class _ActiveImport:
    """Per-candidate UI state for an in-flight import.

    Multiple instances can exist simultaneously when the executor pool
    runs candidates in parallel. Each is keyed on `folder_hash` in the
    worker's `_active_imports` dict; mutations are gated by
    `_active_lock` so the polling UI sees a coherent snapshot.

    Pre-refactor the worker had scalar `_current_folder` /
    `_current_status` / `_current_track_*` fields stomped by every pool
    worker — three concurrent imports would interleave each other's
    folder name + track index in the UI. This dataclass + the dict
    keyed on folder_hash makes per-candidate state isolated.
    """
    # Key of this entry in the worker's `_active_imports` dict.
    folder_hash: str
    # Candidate name as shown to the user; refreshed on re-registration.
    folder_name: str
    status: str = 'queued'   # 'queued' | 'identifying' | 'matching' | 'processing'
    # Per-track progress fields exposed through the status snapshot.
    # NOTE(review): presumably advanced by the per-track processing step via
    # `_update_active` — that code is outside this section.
    track_index: int = 0
    track_total: int = 0
    track_name: str = ''


def _compute_folder_hash(audio_files: List[str]) -> str:
    """Fingerprint a set of audio files by base name and byte size.

    Paths are visited in sorted order so the digest does not depend on the
    order the caller supplies them in. Each file contributes
    ``"<basename>:<size>"``; when the size cannot be read (``OSError``) only
    the base name is used. The pieces are joined with ``|`` and hashed with
    MD5 — used purely as a change-detection key, not for security.
    """
    def _describe(path: str) -> str:
        base = os.path.basename(path)
        try:
            return f"{base}:{os.path.getsize(path)}"
        except OSError:
            # Unreadable or vanished file: fall back to the name alone.
            return base

    fingerprint = '|'.join(_describe(path) for path in sorted(audio_files))
    return hashlib.md5(fingerprint.encode()).hexdigest()


def _first_tag_value(tags: Any, key: str) -> str:
    """Return the first value stored under *key* as a stripped string.

    Tag containers normally map a key to a list of strings, but a key can
    also be absent, map to an empty list, or hold a bare string. All of
    those collapse to a plain ``str`` here ('' when nothing usable is
    present), so one odd tag cannot raise and abort the rest of the read.
    """
    values = tags.get(key)
    if not values:
        return ''
    if isinstance(values, str):
        return values.strip()
    try:
        head = values[0]
    except (IndexError, KeyError, TypeError):
        return ''
    return str(head).strip() if head else ''


def _read_file_tags(file_path: str) -> Dict[str, Any]:
    """Read embedded tags from an audio file.

    Returns dict with: title, artist, album, track_number, disc_number,
    year, genres, isrc, mbid, duration_ms.

    The exact-identifier fields (``isrc``, ``mbid``) and the audio
    duration enable the ID-based fast paths + duration sanity gate in
    ``core/imports/album_matching.py``. Tagged files (Picard-tagged
    libraries always carry MBID; most metadata sources carry ISRC) get
    perfect-match identification without going through fuzzy scoring.

    ``genres`` is a list of strings — Mutagen's easy mode returns the
    GENRE tag as a list (some files carry multiple genres). Empty list
    when the tag is absent. Worker aggregates these across an album's
    tracks to populate the artist row's genres column at insert time
    (matches the soulsync_client deep-scan behaviour).

    All exact-identifier fields default to empty string when the tag
    isn't present — callers treat empty as "not available, fall back to
    fuzzy matching".

    Reading is best-effort: any failure (Mutagen missing, unreadable
    file, unexpected tag shape) is logged at debug level and whatever was
    collected up to that point is returned.
    """
    result = {
        'title': '', 'artist': '', 'album': '',
        'track_number': 0, 'disc_number': 1, 'year': '',
        'genres': [], 'isrc': '', 'mbid': '', 'duration_ms': 0,
    }
    try:
        from mutagen import File as MutagenFile
        audio = MutagenFile(file_path, easy=True)
        # Compare against None explicitly: Mutagen file objects are mappings
        # over their tags, so an audio file with zero tags is falsy and a
        # plain truthiness test would skip the duration read below.
        if audio is None:
            return result

        # Audio length comes off audio.info, not tags. Mutagen returns
        # seconds as a float; convert to int milliseconds to match the
        # metadata-source convention (Spotify/Deezer/iTunes all return
        # duration_ms).
        length_s = getattr(getattr(audio, 'info', None), 'length', 0) or 0
        try:
            result['duration_ms'] = int(round(float(length_s) * 1000))
        except (TypeError, ValueError, OverflowError):
            pass

        tags = audio.tags
        if not tags:
            return result

        result['title'] = _first_tag_value(tags, 'title')
        # Prefer albumartist for album-level identification (per-track
        # artist often includes features like "Kendrick Lamar, Drake"
        # which fragment consensus when grouping tracks into an album).
        # Fall back to artist for files that lack albumartist (or carry a
        # blank one).
        result['artist'] = (
            _first_tag_value(tags, 'albumartist')
            or _first_tag_value(tags, 'artist')
        )
        result['album'] = _first_tag_value(tags, 'album')

        # Date/year — try 'date' first, fall back to 'year'. Only the
        # leading four characters are kept.
        date_str = _first_tag_value(tags, 'date') or _first_tag_value(tags, 'year')
        if len(date_str) >= 4:
            result['year'] = date_str[:4]

        # Track / disc numbers may be written as "3/12"; keep the part
        # before the slash and leave the default in place when it does not
        # parse.
        for field_name, tag_key in (('track_number', 'tracknumber'),
                                    ('disc_number', 'discnumber')):
            raw_number = _first_tag_value(tags, tag_key)
            try:
                result[field_name] = int(raw_number.split('/')[0])
            except ValueError:
                pass

        # GENRE — Mutagen easy mode returns a list (some files carry
        # multiple genres). Skip empty / whitespace entries so the
        # aggregator doesn't have to filter them.
        raw_genres = tags.get('genre', []) or []
        if isinstance(raw_genres, str):
            raw_genres = [raw_genres]
        result['genres'] = [
            str(g).strip() for g in raw_genres if str(g).strip()
        ]

        # ISRC — International Standard Recording Code. Per-recording
        # unique identifier; metadata sources expose it as `isrc` on
        # tracks. Normalised to upper case.
        result['isrc'] = _first_tag_value(tags, 'isrc').upper()
        # MusicBrainz Recording ID — Picard's primary identifier, read
        # from the `musicbrainz_trackid` key. Normalised to lower case.
        result['mbid'] = _first_tag_value(tags, 'musicbrainz_trackid').lower()
    except Exception as e:
        logger.debug(f"Could not read tags from {os.path.basename(file_path)}: {e}")
    return result


def _parse_folder_name(folder_name: str):
    """Split an ``"Artist - Album"`` folder name into its two parts.

    The split happens at the first `` - `` only, so any later separators
    stay inside the album part. Returns ``(artist, album)`` with both sides
    stripped, or ``(None, folder_name.strip())`` when the separator is
    absent and the whole name is taken as the album.
    """
    artist, separator, album = folder_name.partition(' - ')
    if not separator:
        # No "Artist - Album" shape: the entire name is the album.
        return None, folder_name.strip()
    return artist.strip(), album.strip()


def _normalize(text: str) -> str:
    """Reduce *text* to a canonical form for fuzzy comparison.

    Lower-cases and strips the input, removes ``(...)`` and ``[...]``
    groups, drops every character that is neither a word character nor
    whitespace, then collapses runs of whitespace to single spaces.
    Falsy input yields ``''``.
    """
    if not text:
        return ''
    cleaned = text.lower().strip()
    # Order matters: bracketed groups go first, while their delimiters
    # still exist; the final pattern then removes leftover punctuation.
    for pattern in (r'\(.*?\)', r'\[.*?\]', r'[^\w\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    return ' '.join(cleaned.split())


def _similarity(a: str, b: str) -> float:
    """Return the fuzzy match ratio of two strings after normalisation.

    Both inputs go through `_normalize` before being compared with
    `SequenceMatcher`. If either input is falsy the result is ``0.0``
    without any comparison being made.
    """
    if a and b:
        return SequenceMatcher(None, _normalize(a), _normalize(b)).ratio()
    return 0.0


def _quality_rank(ext: str) -> int:
    """Rank an audio file extension by quality; higher is better.

    The extension is matched case-insensitively and must include the
    leading dot. Unknown extensions rank ``1``.
    """
    tiers = (
        (10, ('.flac',)),
        (9, ('.wav', '.aiff', '.aif')),
        (8, ('.ape',)),
        (7, ('.m4a',)),
        (6, ('.ogg', '.opus')),
        (5, ('.mp3', '.aac')),
        (3, ('.wma',)),
    )
    wanted = ext.lower()
    for rank, extensions in tiers:
        if wanted in extensions:
            return rank
    return 1


# Weight constants for `_score_album_search_result` — exposed at module
# level so they're greppable + bumpable in one place. Pre-fix these were
# magic numbers inline.
# The three weights add up to 1.0. A component that is skipped (no target
# artist, or a missing track count) contributes nothing, so the achievable
# maximum drops by that component's weight.
_ALBUM_NAME_WEIGHT = 0.5    # title fuzzy similarity
_ARTIST_NAME_WEIGHT = 0.2   # primary artist fuzzy similarity (skipped when target is empty)
_TRACK_COUNT_WEIGHT = 0.3   # how close the source's track count is to the file count


def _score_album_search_result(album_result, target_album: str,
                               target_artist: Optional[str],
                               file_count: int) -> float:
    """Pure scoring helper for `_search_metadata_source`.

    Weights how well an `album_result` from a metadata source's
    `search_albums` matches the search inputs. Returns float in [0.0, 1.0].

    `album_result` is expected to expose:
      - `.name` (str)
      - `.artists` (list of dict-like with 'name', optional 'id') or list[str]
      - `.total_tracks` (int, optional)

    Args:
        album_result: Search hit to score; missing attributes are treated
            as empty / zero.
        target_album: Album title being searched for.
        target_artist: Artist being searched for; when falsy the artist
            component is skipped entirely.
        file_count: Number of local audio files, compared against the
            hit's track count.

    Returns:
        The weighted sum of the album-name, artist and track-count
        components that could be evaluated.
    """
    score = 0.0

    # Album name similarity (default 50%)
    name = getattr(album_result, 'name', '') or ''
    score += _similarity(target_album, name) * _ALBUM_NAME_WEIGHT

    # Artist similarity (default 20%) — only when target_artist provided.
    # Only the first listed artist is compared.
    if target_artist:
        artists = getattr(album_result, 'artists', None) or []
        lead_artist = artists[0] if artists else ''
        if isinstance(lead_artist, dict):
            lead_artist = lead_artist.get('name')
        # A missing / None name must count as "no artist", not be
        # stringified to "None" and fuzzy-matched against the target.
        score += _similarity(target_artist, str(lead_artist or '')) * _ARTIST_NAME_WEIGHT

    # Track count match (default 30%) — only when both sides have a count
    r_tracks = getattr(album_result, 'total_tracks', 0) or 0
    if r_tracks > 0 and file_count > 0:
        count_ratio = 1.0 - abs(r_tracks - file_count) / max(r_tracks, file_count)
        score += max(0.0, count_ratio) * _TRACK_COUNT_WEIGHT

    return score


class AutoImportWorker:
    """Background worker that watches the staging folder and auto-imports music.

    Concurrency model:

    - **One scan thread** (the `_run` timer loop) enumerates the staging
      folder periodically. Manual "Scan Now" requests share the same
      scan via `trigger_scan()` — non-blocking lock means duplicate
      requests no-op instead of stacking up parallel scanners.
    - **Bounded process pool** (`ThreadPoolExecutor`, default 3 workers)
      handles per-candidate work: identification, matching, file move,
      tagging, DB write. Each candidate runs to completion in its own
      pool thread; multiple candidates run in parallel up to the pool
      size.
    - The scan thread is FAST (just enumeration + submit), the pool
      threads are SLOW (per-candidate work).

    Pre-refactor, the manual-scan endpoint spawned a fresh
    `threading.Thread(target=_scan_cycle)` per click — emergent
    parallelism with no upper bound, no shared queue, no graceful
    shutdown. Fixed by routing both the timer + the manual button
    through `trigger_scan()` and submitting per-candidate work to a
    shared executor.
    """

    def __init__(self, database, staging_path: str = './Staging',
                 transfer_path: str = './Transfer',
                 process_callback: Optional[Callable] = None,
                 config_manager: Any = None,
                 automation_engine: Any = None,
                 max_workers: int = 3):
        """Set up worker state; no threads are created until `start()`.

        Args:
            database: Database handle stored for later use by the worker.
            staging_path: Folder that is scanned for new music.
            transfer_path: Destination folder setting stored on the worker.
            process_callback: Optional callable stored for the processing
                step.
            config_manager: Optional object with ``get(key, default)``;
                when given, ``auto_import.max_workers`` overrides
                `max_workers`.
            automation_engine: Optional automation hook stored on the
                worker.
            max_workers: Pool size used when the config does not supply a
                usable value. The effective size is never below 1.
        """
        self.database = database
        self.staging_path = staging_path
        self.transfer_path = transfer_path
        self._process_callback = process_callback
        self._config_manager = config_manager
        self._automation_engine = automation_engine

        # Pool size — defaults to 3 to match the existing pool patterns
        # (`missing_download_executor`, `sync_executor`,
        # `import_singles_executor`). Configurable via the
        # `auto_import.max_workers` config key on init; not hot-
        # reloadable (the executor is created once and lives for the
        # worker's lifetime).
        configured_workers = max_workers
        if config_manager:
            configured_workers = config_manager.get('auto_import.max_workers', max_workers)
        try:
            worker_count = int(configured_workers)
        except (TypeError, ValueError, OverflowError):
            # A null or non-numeric config value must not prevent the
            # worker from being constructed — fall back to the argument.
            logger.warning(
                "[Auto-Import] Invalid auto_import.max_workers value %r — using %s",
                configured_workers, max_workers,
            )
            worker_count = int(max_workers)
        self._max_workers = max(1, worker_count)

        self.running = False
        self.paused = False
        self.should_stop = False
        self._thread = None
        self._stop_event = threading.Event()
        # Bounded executor for per-candidate processing work. Created
        # in `start()` so a stopped+restarted worker gets a fresh pool.
        self._executor: Optional[ThreadPoolExecutor] = None
        # Non-blocking lock that gates concurrent scans. Both the timer
        # loop and the manual "Scan Now" endpoint route through
        # `trigger_scan()`; a `try-acquire` here means whichever caller
        # gets there first runs the scan and the rest no-op.
        self._scan_lock = threading.Lock()

        # State
        self._folder_snapshots: Dict[str, float] = {}  # path -> mtime_sum
        # Candidates currently submitted to the pool OR running in a
        # pool worker. Keyed on folder_hash, NOT path — multiple
        # candidates can share a path (each loose-file group at staging
        # root has the same parent directory but a distinct hash from
        # its own audio files). Path-keyed dedup would treat siblings
        # as duplicates and silently skip all but the first.
        self._submitted_hashes: set = set()
        self._submitted_lock = threading.Lock()

        # Per-candidate UI state, keyed on folder_hash. Multiple pool
        # workers populate this dict simultaneously; `_active_lock`
        # gates every read/write so the polling UI sees a coherent
        # snapshot.
        self._active_imports: Dict[str, _ActiveImport] = {}
        self._active_lock = threading.Lock()

        # Whether a scan-cycle (enumeration phase) is currently
        # running. Distinct from per-candidate processing — the scan
        # runs at most once at a time (gated by `_scan_lock`), while
        # per-candidate work runs concurrently in the pool and is
        # tracked in `_active_imports`.
        self._scan_in_progress = False

        # `_stats[x] += 1` from multiple pool threads is read-modify-
        # write — under load the counters drift. `_stats_lock` gates
        # every mutation via `_bump_stat`.
        self._stats = {'scanned': 0, 'auto_processed': 0, 'pending_review': 0, 'failed': 0}
        self._stats_lock = threading.Lock()
        self._last_scan_time = None

    # ── Per-candidate UI state helpers ──

    def _register_active(self, candidate: 'FolderCandidate', status: str = 'queued') -> None:
        """Create the active-import entry for *candidate*, or refresh it.

        An existing entry (same `folder_hash`) keeps its progress fields;
        only its display name and status are overwritten. All access
        happens under `_active_lock`.
        """
        key = candidate.folder_hash
        with self._active_lock:
            existing = self._active_imports.get(key)
            if existing is not None:
                # Same candidate seen again — its name may have changed
                # between scans.
                existing.folder_name = candidate.name
                existing.status = status
                return
            self._active_imports[key] = _ActiveImport(
                folder_hash=key,
                folder_name=candidate.name,
                status=status,
            )

    def _update_active(self, folder_hash: str, **fields: Any) -> None:
        """Set the given fields on the active-import entry for *folder_hash*.

        Does nothing when no entry is registered under that hash (e.g. a
        test calling helpers directly). Keyword names that are not
        attributes of the entry are ignored rather than added.
        """
        with self._active_lock:
            target = self._active_imports.get(folder_hash)
            if target is not None:
                for attr_name, new_value in fields.items():
                    if not hasattr(target, attr_name):
                        continue
                    setattr(target, attr_name, new_value)

    def _unregister_active(self, folder_hash: str) -> None:
        """Drop the active-import entry for *folder_hash*, if one exists."""
        with self._active_lock:
            if folder_hash in self._active_imports:
                del self._active_imports[folder_hash]

    def _snapshot_active(self) -> List[Dict[str, Any]]:
        """Return a list of plain dicts describing every active import.

        The copy is taken under `_active_lock`, so the UI poller gets a
        coherent view. Entries appear in dict insertion order, which keeps
        the "first entry" used by the legacy single-import fields stable
        within a poll.
        """
        exported = (
            'folder_hash', 'folder_name', 'status',
            'track_index', 'track_total', 'track_name',
        )
        with self._active_lock:
            return [
                {attr: getattr(entry, attr) for attr in exported}
                for entry in self._active_imports.values()
            ]

    def _bump_stat(self, key: str) -> None:
        """Add one to `_stats[key]` while holding `_stats_lock`.

        Pool workers call this from several threads at once; the lock keeps
        the read-modify-write from losing counts. A key that is not yet
        present starts from zero.
        """
        with self._stats_lock:
            previous = self._stats.get(key, 0)
            self._stats[key] = previous + 1

    # Read-only back-compat properties — the test fixture (and the
    # polling UI's legacy fields) read these. Resolve to the FIRST
    # active import so the existing single-track-progress UI keeps
    # working when only one candidate is in flight (the common case).
    # When N candidates run in parallel the UI should iterate
    # `active_imports` from `get_status()` instead.

    @property
    def _current_folder(self) -> str:
        """Folder name of the first active import, or '' when none is active."""
        with self._active_lock:
            head = next(iter(self._active_imports.values()), None)
            return head.folder_name if head is not None else ''

    @property
    def _current_status(self) -> str:
        """Aggregate legacy status: 'processing', 'scanning' or 'idle'.

        'processing' wins as soon as any active import is in that state.
        Any other active import — still identifying or matching — is
        reported as 'scanning', because the legacy UI has no separate state
        for it. With nothing active, the scan-in-progress flag decides
        between 'scanning' and 'idle'.
        """
        with self._active_lock:
            if any(entry.status == 'processing'
                   for entry in self._active_imports.values()):
                return 'processing'
            if self._active_imports:
                return 'scanning'
        if self._scan_in_progress:
            return 'scanning'
        return 'idle'

    @property
    def _current_track_index(self) -> int:
        """Track index of the first active import, or 0 when none is active."""
        with self._active_lock:
            head = next(iter(self._active_imports.values()), None)
            return head.track_index if head is not None else 0

    @property
    def _current_track_total(self) -> int:
        """Track total of the first active import, or 0 when none is active."""
        with self._active_lock:
            head = next(iter(self._active_imports.values()), None)
            return head.track_total if head is not None else 0

    @property
    def _current_track_name(self) -> str:
        """Track name of the first active import, or '' when none is active."""
        with self._active_lock:
            head = next(iter(self._active_imports.values()), None)
            return head.track_name if head is not None else ''

    def start(self):
        """Start the scan thread and create the processing pool.

        Does nothing when the worker is already running. Each start gets a
        brand-new executor, because `stop()` shuts the previous one down.
        """
        if self.running:
            return

        # Reset the stop signalling before anything can observe it.
        self.should_stop = False
        self._stop_event.clear()
        self.running = True

        pool = ThreadPoolExecutor(
            max_workers=self._max_workers,
            thread_name_prefix='AutoImport',
        )
        self._executor = pool

        scan_thread = threading.Thread(
            target=self._run, daemon=True, name='AutoImportWorker',
        )
        self._thread = scan_thread
        scan_thread.start()
        logger.info(f"Auto-import worker started (max_workers={self._max_workers})")

    def stop(self):
        """Signal shutdown, then wait for the scan thread and the pool.

        The scan thread gets up to five seconds to exit. The executor is
        then shut down with ``wait=True``, so file moves, tag writes and DB
        inserts that are already in flight finish before this returns —
        returning earlier could corrupt state on shutdown.
        """
        self.should_stop = True
        self._stop_event.set()
        self.running = False

        scan_thread = self._thread
        if scan_thread and scan_thread.is_alive():
            scan_thread.join(timeout=5)

        pool = self._executor
        if pool is not None:
            pool.shutdown(wait=True)
            self._executor = None
        logger.info("Auto-import worker stopped")

    def pause(self):
        """Stop starting new work without shutting the worker down.

        Only sets the `paused` flag: the timer loop skips `trigger_scan()`
        while it is set, and a scan that is already enumerating stops
        submitting further candidates. Nothing here interrupts candidates
        that were already handed to the pool.
        """
        self.paused = True
        logger.info("Auto-import worker paused")

    def resume(self):
        """Clear the `paused` flag so later scan cycles run again.

        Does not trigger a scan itself; the next timer tick or manual
        trigger picks the work up.
        """
        self.paused = False
        logger.info("Auto-import worker resumed")

    def get_status(self) -> dict:
        """Build the status payload read by the polling UI.

        `current_status` is 'processing' when any active import is in the
        per-track loop, 'scanning' when a scan or an earlier-phase import is
        in flight, and 'idle' otherwise. The `current_*` scalars mirror the
        first active import so the legacy single-import UI keeps working;
        multi-import-aware UIs should read `active_imports`. `stats` is a
        copy taken under `_stats_lock`.
        """
        active = self._snapshot_active()

        if any(item['status'] == 'processing' for item in active):
            overall = 'processing'
        elif active or self._scan_in_progress:
            overall = 'scanning'
        else:
            overall = 'idle'

        # Legacy scalars come from the first snapshot entry, if there is one.
        head = active[0] if active else {}

        with self._stats_lock:
            stats_copy = dict(self._stats)

        return {
            'running': self.running,
            'paused': self.paused,
            'current_status': overall,
            'current_folder': head.get('folder_name', ''),
            'current_track_index': head.get('track_index', 0),
            'current_track_total': head.get('track_total', 0),
            'current_track_name': head.get('track_name', ''),
            'active_imports': active,
            'stats': stats_copy,
            'last_scan_time': self._last_scan_time,
        }

    def _interruptible_sleep(self, seconds: float) -> bool:
        """Wait up to *seconds*, waking early if a stop is requested.

        Blocks on `_stop_event` (set by `stop()`) rather than sleeping, so
        shutdown does not have to wait out the full interval.

        Returns:
            True when the stop event is set (the caller should exit),
            False when the timeout elapsed normally.
        """
        return self._stop_event.wait(seconds)

    def _run(self):
        """Main worker loop — calls `trigger_scan()` periodically.

        The interval comes from ``auto_import.scan_interval`` (seconds,
        default 60) and is read once when the loop starts. A value that is
        missing, non-numeric, non-finite or not positive falls back to the
        default. Passed straight to the wait, such a value would spin the
        loop, block it forever, or kill the thread with a ``TypeError``.

        While a config manager is present, scans only run when
        ``auto_import.enabled`` is truthy; without one they always run.
        Nothing is scanned while the worker is paused.
        """
        default_interval = 60
        raw_interval = default_interval
        if self._config_manager:
            raw_interval = self._config_manager.get('auto_import.scan_interval', default_interval)

        try:
            interval = float(raw_interval)
        except (TypeError, ValueError):
            interval = float('nan')
        # The chained comparison is False for NaN, infinity and anything
        # that is not strictly positive.
        if not (0 < interval < float('inf')):
            logger.warning(
                "[Auto-Import] Invalid auto_import.scan_interval %r — using %ss",
                raw_interval, default_interval,
            )
            interval = float(default_interval)

        # Initial delay to let the app start up
        if self._interruptible_sleep(10):
            return

        while not self.should_stop:
            if not self.paused:
                enabled = True
                if self._config_manager:
                    enabled = self._config_manager.get('auto_import.enabled', False)

                if enabled:
                    self.trigger_scan()

            if self._interruptible_sleep(interval):
                break

    def trigger_scan(self):
        """Run one scan cycle — the single entry point for scans.

        Used by both the timer loop and the manual "Scan Now" endpoint.
        The scan lock is taken without blocking: when another scan already
        holds it, this call returns immediately instead of stacking a
        second scanner on top.

        Only enumeration and submission happen here; the per-candidate work
        runs on the bounded executor pool. Errors raised by the scan are
        logged and swallowed, and `_last_scan_time` is updated only when the
        scan completed without raising.
        """
        got_lock = self._scan_lock.acquire(blocking=False)
        if not got_lock:
            logger.debug("[Auto-Import] Scan already running, skipping duplicate trigger")
            return

        try:
            self._scan_in_progress = True
            self._scan_and_submit()
        except Exception as e:
            logger.error(f"Auto-import scan cycle error: {e}")
        else:
            self._last_scan_time = datetime.now().isoformat()
        finally:
            # Always clear the flag and hand the lock back, success or not.
            self._scan_in_progress = False
            self._scan_lock.release()

    def _scan_and_submit(self):
        """Enumerate staging candidates + submit each to the executor.

        Fast — does NOT block on per-candidate processing. The pool
        runs `_process_one_candidate` in parallel up to `max_workers`.

        A candidate is submitted only when it has not been processed
        before, is not already claimed in `_submitted_hashes`, and its
        files are stable. The claim is taken before the submit and handed
        back if the submit itself fails, so a later scan can retry.
        """
        staging = self._resolve_staging_path()
        if not staging or not os.path.isdir(staging):
            logger.warning(f"[Auto-Import] Staging path not found or invalid: {self.staging_path}")
            return

        candidates = self._enumerate_folders(staging)
        logger.info(f"[Auto-Import] Scan cycle: {len(candidates)} candidates in {staging}")
        if not candidates:
            return

        # Capture the pool once. `stop()` sets `self._executor` to None
        # after shutting it down; re-reading the attribute per candidate
        # could hit None mid-loop and fail with an AttributeError that
        # bypasses the claim release below, leaving the hash stuck in
        # `_submitted_hashes`.
        executor = self._executor
        if executor is None:
            logger.warning("[Auto-Import] Executor not initialized — skipping scan")
            return

        for candidate in candidates:
            if self.should_stop or self.paused:
                break

            folder_hash = candidate.folder_hash

            # Skip if already processed (DB-level dedup)
            if self._is_already_processed(folder_hash):
                continue

            # Skip if already submitted to / running in the pool. This
            # de-dupes across the timer loop + manual scan triggers
            # (both share the `_submitted_hashes` set).
            with self._submitted_lock:
                if folder_hash in self._submitted_hashes:
                    logger.debug(
                        f"[Auto-Import] Skipping {candidate.name} — "
                        f"already queued in pool"
                    )
                    continue

            # Stability gate (files not changing). Done OUTSIDE the
            # submitted-hashes critical section so a slow stat() call
            # doesn't hold the lock across other candidates.
            if not self._is_folder_stable(candidate):
                continue

            with self._submitted_lock:
                # Re-check inside the lock — another scanner could have
                # claimed this candidate between the first check + here.
                if folder_hash in self._submitted_hashes:
                    continue
                self._submitted_hashes.add(folder_hash)

            try:
                executor.submit(self._process_one_candidate, candidate)
            except RuntimeError as exc:
                # Executor was shut down while we were submitting —
                # release our claim so a future scan can retry.
                logger.debug("[Auto-Import] Executor rejected submit: %s", exc)
                with self._submitted_lock:
                    self._submitted_hashes.discard(folder_hash)
            except Exception:
                # Anything else is unexpected: still release the claim so
                # the candidate is not blocked forever, then let the
                # caller's handler log the error.
                with self._submitted_lock:
                    self._submitted_hashes.discard(folder_hash)
                raise

    def _process_one_candidate(self, candidate: 'FolderCandidate'):
        """Per-candidate processing — runs in a pool worker thread.

        Phases: identify the album, match the files to its tracklist, then
        either auto-process, queue for review, or mark as needing manual
        identification, depending on confidence and the
        ``auto_import.auto_process`` / ``auto_import.confidence_threshold``
        settings.

        Each pool worker registers its candidate in `_active_imports`
        on entry + unregisters on exit. UI status fields are scoped
        per-candidate so concurrent workers don't stomp each other.

        Everything — including the registration and config reads — runs
        inside one try/finally. Whatever fails, the candidate's claim in
        `_submitted_hashes` and its active-import entry are always
        released, so a later scan can retry instead of finding the folder
        blocked.
        """
        try:
            self._bump_stat('scanned')
            self._register_active(candidate, status='identifying')
            logger.info(f"[Auto-Import] Processing folder: {candidate.name} ({len(candidate.audio_files)} files)")

            threshold = 0.9
            auto_process = True
            if self._config_manager:
                threshold = self._config_manager.get('auto_import.confidence_threshold', 0.9)
                auto_process = self._config_manager.get('auto_import.auto_process', True)

            # Phase 3: Identify
            identification = self._identify_folder(candidate)
            if not identification:
                self._record_result(candidate, 'needs_identification', 0.0,
                                    error_message='Could not identify album from tags, folder name, or fingerprint')
                self._bump_stat('failed')
                return

            # Album fields repeated on every result row recorded below.
            album_fields = {
                'album_id': identification.get('album_id'),
                'album_name': identification.get('album_name'),
                'artist_name': identification.get('artist_name'),
                'image_url': identification.get('image_url'),
            }

            # Phase 4: Match tracks
            self._update_active(candidate.folder_hash, status='matching')
            match_result = self._match_tracks(candidate, identification)
            if not match_result:
                self._record_result(candidate, 'needs_identification', 0.0,
                                    error_message='Could not match tracks to album tracklist',
                                    **album_fields)
                self._bump_stat('failed')
                return

            confidence = match_result['confidence']

            # Check if individual track matches are strong even if overall confidence
            # is low (e.g. only 2 of 18 album tracks present → low coverage kills
            # overall score, but the 2 tracks match perfectly and should still import)
            high_conf_matches = [m for m in match_result.get('matches', []) if m['confidence'] >= 0.8]
            has_strong_individual_matches = len(high_conf_matches) > 0

            if (confidence >= threshold or has_strong_individual_matches) and auto_process:
                # Phase 5: Auto-process — insert an in-progress row
                # so the UI sees the import the moment it starts,
                # then update it with the final status when done.
                effective_conf = max(confidence, min(m['confidence'] for m in high_conf_matches) if high_conf_matches else 0)
                logger.info(f"[Auto-Import] Processing {candidate.name} — "
                            f"overall: {confidence:.0%}, {len(high_conf_matches)} strong matches, "
                            f"{match_result.get('matched_count', 0)}/{match_result.get('total_tracks', '?')} tracks")

                in_progress_row_id = self._record_in_progress(
                    candidate, identification, match_result,
                )
                self._update_active(candidate.folder_hash, status='processing')

                # NOTE(review): if `_process_matches` raises, the handler
                # below records a separate 'failed' result and the
                # in-progress row is never finalized — confirm whether
                # `_record_result` reconciles that row.
                success = self._process_matches(candidate, identification, match_result)
                status = 'completed' if success else 'failed'
                confidence = max(confidence, effective_conf)
                if success:
                    self._bump_stat('auto_processed')
                else:
                    self._bump_stat('failed')

                # Update the in-progress row in place — UI shows the
                # final result without a separate insert race.
                self._finalize_result(in_progress_row_id, status, confidence)
            elif confidence >= 0.7:
                status = 'pending_review'
                self._bump_stat('pending_review')
                logger.info(f"[Auto-Import] Medium confidence ({confidence:.0%}) — pending review: {candidate.name}")
                self._record_result(candidate, status, confidence,
                                    identification_method=identification.get('method'),
                                    match_data=match_result,
                                    **album_fields)
            else:
                status = 'needs_identification'
                self._bump_stat('failed')
                logger.info(f"[Auto-Import] Low confidence ({confidence:.0%}) — needs manual ID: {candidate.name}")
                self._record_result(candidate, status, confidence,
                                    identification_method=identification.get('method'),
                                    match_data=match_result,
                                    **album_fields)

        except Exception as e:
            # The Future returned by the pool is never inspected, so this
            # handler is the only place a failure becomes visible.
            logger.error(f"[Auto-Import] Error processing {candidate.name}: {e}", exc_info=True)
            self._bump_stat('failed')
            try:
                self._record_result(candidate, 'failed', 0.0, error_message=str(e))
            except Exception as record_error:
                logger.error(
                    f"[Auto-Import] Could not record failure for {candidate.name}: {record_error}"
                )
        finally:
            with self._submitted_lock:
                self._submitted_hashes.discard(candidate.folder_hash)
            # Per-candidate UI state goes away with the candidate.
            # No stale "processing track 3/14" because the entry is
            # gone — the UI's polling read returns an empty array.
            self._unregister_active(candidate.folder_hash)

    # ── Scanning ──

    def _resolve_staging_path(self) -> Optional[str]:
        path = self.staging_path
        if self._config_manager:
            path = self._config_manager.get('import.staging_path', path)
        # Docker path resolution
        if os.path.isdir(path):
            return path
        for candidate in ['./Staging', '/app/Staging']:
            if os.path.isdir(candidate):
                return candidate
        return None

    def _enumerate_folders(self, staging: str) -> List[FolderCandidate]:
        """Collect every import candidate found beneath `staging`.

        Delegates to `_scan_directory`, which appends to the list in place.
        `staging` is passed both as the directory to scan and as the
        `staging_root` reference.
        """
        found: List[FolderCandidate] = []
        self._scan_directory(staging, found, staging_root=staging)
        return found

    def _scan_directory(self, directory: str, candidates: List[FolderCandidate], staging_root: str = ''):
        """Recursively scan a directory for album folders and loose audio files.

        Candidates are appended to `candidates` in place; nothing is
        returned. A directory that cannot be listed is skipped silently.

        Loose-file handling:
        - Read each loose file's `album` tag and group by normalised
          album name. Each group becomes its own candidate so a chaotic
          staging root (multiple albums dumped loose) imports correctly
          instead of bundling everything into one fake "album."
        - Untagged loose files become individual single candidates (they
          have nothing to group with).
        - Disc folders at the same level attach to the loose-file group
          whose album tag matches the disc-folder files (typical layout:
          loose files for disc 1 + `Disc 2/`, `Disc 3/` subfolders).
        - Disc folders with no matching loose group become standalone
          multi-disc candidates.

        Disc folders without loose files:
        - The directory itself becomes one multi-disc candidate. This
          holds even when non-disc siblings exist (an `Artwork/` folder,
          another album folder in staging, ...). Requiring a disc-only
          directory meant such disc folders never produced a candidate.

        Recursion rule:
        - Always recurse into non-disc subdirectories. The previous
          rule "only recurse when no loose files exist" silently
          ignored album subfolders sitting next to loose files —
          common when a user moves some tracks out of an album folder
          while leaving the parent album folder intact.

        Args:
            directory: Directory to scan.
            candidates: Output list, mutated in place.
            staging_root: Staging root path; used to flag a disc-only
                candidate that *is* the staging root.
        """
        try:
            entries = sorted(os.listdir(directory))
        except OSError:
            return

        loose_files: List[str] = []
        disc_subdirs = []      # (disc_num, path), in sorted-name order
        non_disc_subdirs = []  # paths, in sorted-name order

        for entry in entries:
            full_path = os.path.join(directory, entry)
            if os.path.isfile(full_path):
                if os.path.splitext(entry)[1].lower() in AUDIO_EXTENSIONS:
                    loose_files.append(full_path)
            elif os.path.isdir(full_path):
                disc_match = DISC_FOLDER_RE.match(entry)
                if disc_match:
                    disc_subdirs.append((int(disc_match.group(1)), full_path))
                else:
                    non_disc_subdirs.append(full_path)

        # Build the disc map once — used by both the loose-files branch
        # (to attach matching discs to the right loose-file group) and
        # the disc-folder branch. Empty/unreadable disc folders are left out.
        disc_files_by_num: Dict[int, List[str]] = {}
        for disc_num, sub_path in disc_subdirs:
            try:
                disc_files = [os.path.join(sub_path, f) for f in sorted(os.listdir(sub_path))
                              if os.path.isfile(os.path.join(sub_path, f))
                              and os.path.splitext(f)[1].lower() in AUDIO_EXTENSIONS]
            except OSError:
                disc_files = []
            if disc_files:
                disc_files_by_num[disc_num] = disc_files

        if loose_files:
            self._build_loose_file_candidates(
                directory, loose_files, disc_files_by_num, candidates,
            )
        elif disc_files_by_num:
            # Disc folders with no loose files — treat THIS directory as
            # the album. Common when a user drops `Disc 1/`, `Disc 2/`
            # straight into staging without an album-level wrapper.
            # Non-disc siblings do not veto this; they are still scanned
            # by the recursion below.
            audio_files: List[str] = [
                f for disc_files in disc_files_by_num.values() for f in disc_files
            ]
            is_staging_root = bool(staging_root) and os.path.normpath(directory) == os.path.normpath(staging_root)
            candidates.append(FolderCandidate(
                path=directory,
                name=os.path.basename(directory),
                audio_files=audio_files,
                disc_structure=dict(disc_files_by_num),
                folder_hash=_compute_folder_hash(audio_files),
                is_staging_root=is_staging_root,
            ))

        # Always recurse into non-disc subdirectories — even when this
        # level has loose files. Otherwise album subfolders sitting
        # beside loose tracks get silently ignored (the bug a chaotic
        # staging root surfaced on 2026-05-09).
        for sub_path in non_disc_subdirs:
            self._scan_directory(sub_path, candidates, staging_root=staging_root)

    def _build_loose_file_candidates(
        self,
        directory: str,
        loose_files: List[str],
        disc_files_by_num: Dict[int, List[str]],
        candidates: List[FolderCandidate],
    ) -> None:
        """Group loose audio files by `album` tag, build one candidate
        per album group + attach matching disc folders.

        - Tagged files cluster by their album name (case-insensitive,
          whitespace-stripped).
        - Untagged files become individual single candidates (can't
          group what we don't have a key for).
        - Disc folders attach to whichever loose group's album tag
          matches the first disc-folder track's album tag. Disc folders
          with no matching loose group fall through to a standalone
          multi-disc candidate scoped to that directory.
        - When all loose files share one album AND disc folders attach
          to it, the result matches the previous "bundle everything"
          behavior — so single-album staging with parallel disc folders
          (the user's Mr. Morale layout) keeps working unchanged.

        Args:
            directory: Directory the loose files live in.
            loose_files: Audio file paths found directly in `directory`.
            disc_files_by_num: Disc number -> audio files of the disc
                folder at this level.
            candidates: Output list, mutated in place.
        """
        # Group by normalised album tag. The raw tag of the first file in
        # each group is kept as the display name so the files do not have
        # to be read a second time later.
        groups: Dict[str, List[str]] = {}
        display_names: Dict[str, Any] = {}
        untagged: List[str] = []
        for f in loose_files:
            try:
                tags = _read_file_tags(f)
            except Exception as exc:
                logger.debug("scan tag read failed for %s: %s", f, exc)
                tags = {}
            raw_album = tags.get('album') or ''
            album_key = raw_album.strip().lower()
            if album_key:
                groups.setdefault(album_key, []).append(f)
                display_names.setdefault(album_key, raw_album)
            else:
                untagged.append(f)

        # Attach disc folders to matching groups. Read the first track
        # of each disc to find its album tag and merge accordingly.
        disc_attached_to: Dict[int, str] = {}  # disc_num → album_key
        for disc_num, disc_files in disc_files_by_num.items():
            if not disc_files:
                continue  # nothing to probe — the disc stays unattached
            try:
                first_disc_tags = _read_file_tags(disc_files[0])
            except Exception as exc:
                logger.debug("disc tag read failed for %s: %s", disc_files[0], exc)
                first_disc_tags = {}
            disc_album_key = (first_disc_tags.get('album') or '').strip().lower()
            if disc_album_key and disc_album_key in groups:
                disc_attached_to[disc_num] = disc_album_key

        # Build a candidate per loose-file group
        only_one_group = len(groups) == 1
        for album_key, group_files in groups.items():
            audio_files = list(group_files)
            # Loose files are recorded under disc key 0; attached disc
            # folders keep their own number.
            disc_structure: Dict[int, List[str]] = {0: list(group_files)}
            for disc_num, attached_album in disc_attached_to.items():
                if attached_album == album_key:
                    audio_files.extend(disc_files_by_num[disc_num])
                    disc_structure[disc_num] = list(disc_files_by_num[disc_num])

            # With several groups sharing one directory, use the album
            # tag for the candidate name so the import history shows
            # something meaningful instead of always the parent
            # directory name.
            if only_one_group:
                candidate_name = os.path.basename(directory)
            else:
                candidate_name = str(display_names[album_key])

            candidates.append(FolderCandidate(
                path=directory,
                name=candidate_name,
                audio_files=audio_files,
                # A group with no attached discs is a plain album.
                disc_structure=disc_structure if len(disc_structure) > 1 else {},
                folder_hash=_compute_folder_hash(audio_files),
            ))

        # Untagged singles — one candidate per file. Can't group them.
        for f in untagged:
            audio_files = [f]
            candidates.append(FolderCandidate(
                path=f, name=os.path.basename(f),
                audio_files=audio_files,
                folder_hash=_compute_folder_hash(audio_files),
                is_single=True,
            ))

        # Standalone disc folders (no loose group claimed them) — bundle
        # into a multi-disc candidate scoped to the directory.
        unattached_discs = {
            n: files for n, files in disc_files_by_num.items()
            if n not in disc_attached_to
        }
        if unattached_discs:
            audio_files = [f for files in unattached_discs.values() for f in files]
            candidates.append(FolderCandidate(
                path=directory,
                name=f"{os.path.basename(directory)} (loose discs)",
                audio_files=audio_files,
                disc_structure=unattached_discs,
                folder_hash=_compute_folder_hash(audio_files),
            ))

    def _is_folder_stable(self, candidate: FolderCandidate) -> bool:
        """Report whether the candidate's audio files look unchanged since the last scan.

        The summed mtime of the files that currently exist is compared with
        the value remembered from the previous call, and the remembered
        value is always refreshed. Snapshots are keyed on `folder_hash`
        rather than path: several candidates can share one path (loose-file
        groups in the same directory), and a path key would let siblings
        overwrite each other's snapshot.

        Returns:
            False on the first sighting or when an mtime cannot be read;
            otherwise True when the summed mtime moved by less than 0.01.
        """
        try:
            mtime_total = sum(
                os.path.getmtime(path)
                for path in candidate.audio_files
                if os.path.exists(path)
            )
        except OSError:
            return False

        key = candidate.folder_hash
        previous = self._folder_snapshots.get(key)
        self._folder_snapshots[key] = mtime_total

        if previous is not None:
            # Seen before — stable only if nothing moved.
            return abs(mtime_total - previous) < 0.01
        # First scan — wait for the next cycle to confirm stability.
        return False

    def _is_already_processed(self, folder_hash: str) -> bool:
        """Check if this folder was already processed."""
        try:
            conn = self.database._get_connection()
            cursor = conn.cursor()
            cursor.execute("SELECT status FROM auto_import_history WHERE folder_hash = ? ORDER BY created_at DESC LIMIT 1",
                           (folder_hash,))
            row = cursor.fetchone()
            conn.close()
            return row and row['status'] in ('completed', 'pending_review', 'needs_identification', 'failed', 'rejected')
        except Exception:
            return False

    # ── Identification ──

    def _identify_folder(self, candidate: FolderCandidate) -> Optional[Dict]:
        """Work out which album/track a candidate holds.

        Singles are handed to `_identify_single`. Album candidates go
        through tags, then the folder name, then AcoustID; the first
        strategy that yields a result wins.

        Returns:
            The winning strategy's result, or None when all come up empty.
        """
        if candidate.is_single:
            return self._identify_single(candidate)

        # Strategy 1: embedded tags
        from_tags = self._identify_from_tags(candidate)
        if from_tags:
            return from_tags

        # Strategy 2: folder name — skipped when the candidate is the
        # staging root itself, whose name is meaningless and would
        # false-match against random albums in the metadata source.
        if not candidate.is_staging_root:
            from_name = self._identify_from_folder_name(candidate)
            if from_name:
                return from_name
        else:
            logger.info(f"[Auto-Import] Skipping folder-name identification for staging root '{candidate.name}' — would false-match. Falling through to AcoustID.")

        # Strategy 3: AcoustID fingerprint (last resort)
        return self._identify_from_acoustid(candidate) or None

    def _identify_single(self, candidate: FolderCandidate) -> Optional[Dict]:
        """Identify a single audio file from tags, filename, or AcoustID.

        Resolution order:
        1. Embedded tags, with `Artist - Title` filename parsing filling
           any gaps.
        2. Metadata-source search; a match scoring >= 0.8 is returned as is.
        3. AcoustID fingerprint, re-searched against the metadata source;
           a >= 0.8 match is returned with method `acoustid`.
        4. Tag identity at a fixed 0.85 confidence when the file carries
           an artist tag, enriched with whatever weaker match was found.
        5. The weak match itself, else a filename-only result at 0.5.

        Returns:
            An identification dict, or None when no title can be derived.
        """
        file_path = candidate.audio_files[0]
        tags = _read_file_tags(file_path)

        # Normalise missing/None tag values to '' so the fallbacks below
        # behave the same for an absent key and an empty one.
        artist = tags.get('artist') or ''
        title = tags.get('title') or ''
        album = tags.get('album') or ''

        # Fallback: parse filename (Artist - Title.ext)
        if not artist or not title:
            basename = os.path.splitext(os.path.basename(file_path))[0]
            parts = re.split(r'\s*[-–—]\s*', basename, maxsplit=1)
            if len(parts) == 2:
                artist = artist or parts[0].strip()
                title = title or parts[1].strip()
            elif not title:
                title = basename.strip()

        if not title:
            return None

        # Search metadata source for track
        result = self._search_single_track(artist, title, album)
        if result and result.get('identification_confidence', 0) >= 0.8:
            return result

        # Fallback: AcoustID fingerprint (also used when metadata match is weak)
        try:
            from core.acoustid_client import AcoustIDClient
            client = AcoustIDClient()
            fp_result = client.fingerprint_and_lookup(file_path)
            if fp_result and fp_result.get('recordings'):
                best = fp_result['recordings'][0]
                # AcoustID can return None for artist/title on new releases —
                # fall back to tag data we already have
                fp_artist = best.get('artist') or artist
                fp_title = best.get('title') or title
                if fp_artist and fp_title:
                    fp_result2 = self._search_single_track(fp_artist, fp_title, '')
                    if fp_result2 and fp_result2.get('identification_confidence', 0) >= 0.8:
                        fp_result2['method'] = 'acoustid'
                        return fp_result2
                    # Keep weak AcoustID result as fallback
                    if fp_result2 and (not result or fp_result2.get('identification_confidence', 0) > result.get('identification_confidence', 0)):
                        result = fp_result2
        except Exception as e:
            logger.debug("acoustid fingerprint fallback failed: %s", e)

        # If we have good tag data (artist + title), prefer tag-based identification
        # over a weak metadata/AcoustID result — tags from post-processed files are reliable
        if artist and title and tags.get('artist'):
            tag_conf = 0.85  # High confidence for files with proper embedded tags
            weak = result or {}
            # Use the metadata result's image/album data if available, but trust tag identity
            return {
                'album_id': weak.get('album_id'),
                'album_name': album or weak.get('album_name') or title,
                'artist_name': artist,
                # Carry the metadata-source artist ID forward when the
                # search result had one — without this the standalone
                # library write can't populate the source-id column on
                # the artists row even though we know the ID.
                'artist_id': weak.get('artist_id', ''),
                'track_name': title,
                'image_url': weak.get('image_url', ''),
                'release_date': tags.get('year', '') or weak.get('release_date', ''),
                # `or 1` rather than a .get default: the default is only
                # used when the key is absent, so a present-but-empty
                # track number would otherwise be passed through.
                'track_number': tags.get('track_number') or 1,
                'total_tracks': weak.get('total_tracks', 1),
                'source': weak.get('source', 'tags'),
                'method': 'tags',
                'identification_confidence': tag_conf,
                'is_single': True,
                'track_id': weak.get('track_id', ''),
            }

        # If AcoustID didn't help but we had a weak metadata match, use it
        if result:
            return result

        # Last resort: filename-only identification. `title` is always
        # non-empty here — the empty case returned None above.
        return {
            'album_id': None,
            'album_name': title,
            'artist_name': artist or 'Unknown Artist',
            'track_name': title,
            'image_url': '',
            'release_date': '',
            'track_number': 1,
            'total_tracks': 1,
            'source': 'tags',
            'method': 'filename',
            'identification_confidence': 0.5,
            'is_single': True,
        }

    def _search_single_track(self, artist: str, title: str, album: str) -> Optional[Dict]:
        """Search the primary metadata source for a single track match.

        Up to five results are scored as 60% title similarity plus 40%
        artist similarity (the artist term is skipped when `artist` is
        empty). The best result is returned only when it scores >= 0.5.

        Args:
            artist: Artist name to search for; may be empty.
            title: Track title to search for.
            album: Accepted for interface compatibility; not used here.

        Returns:
            An identification dict (with `is_single` set), or None when
            no client is available, nothing scores high enough, or the
            search raises.
        """
        def first_artist(item) -> tuple:
            """Return (name, id) of the first artist credited on `item`."""
            credited = getattr(item, 'artists', None)
            if not credited:
                return '', ''
            lead = credited[0]
            if isinstance(lead, dict):
                return lead.get('name', str(lead)), str(lead.get('id', '') or '')
            return str(lead), ''

        def matched_title(item) -> str:
            """Return the result's title, read from `name` or `title`."""
            return getattr(item, 'name', '') or getattr(item, 'title', '') or ''

        try:
            from core.metadata_service import get_primary_source, get_client_for_source

            source = get_primary_source()
            client = get_client_for_source(source)
            if not client or not hasattr(client, 'search_tracks'):
                return None

            query = f"{artist} {title}" if artist else title
            results = client.search_tracks(query, limit=5)
            if not results:
                return None

            # Score results; only a strictly better score replaces the
            # current best, so the earliest result wins ties.
            best_result = None
            best_score = 0
            for r in results:
                score = _similarity(title, matched_title(r)) * 0.6
                if artist:
                    score += _similarity(artist, first_artist(r)[0]) * 0.4
                if score > best_score:
                    best_score = score
                    best_result = r

            if not best_result or best_score < 0.5:
                return None

            r_artist, r_artist_id = first_artist(best_result)

            # Extract image — try direct image_url first (Deezer), then album.images (Spotify)
            r_album = ''
            r_album_id = ''
            r_image = getattr(best_result, 'image_url', '') or ''
            if hasattr(best_result, 'album'):
                alb = best_result.album
                if isinstance(alb, dict):
                    r_album = alb.get('name', '')
                    r_album_id = alb.get('id', '')
                    if not r_image:
                        images = alb.get('images', [])
                        if images:
                            r_image = images[0].get('url', '') if isinstance(images[0], dict) else str(images[0])
                elif isinstance(alb, str):
                    r_album = alb

            # Extract track number and release date from the matched result
            r_track_number = getattr(best_result, 'track_number', None) or 1
            r_release_date = getattr(best_result, 'release_date', '') or ''

            return {
                'album_id': r_album_id or None,
                'album_name': r_album or title,
                'artist_name': r_artist or artist or '',
                'artist_id': r_artist_id,
                # Same name/title lookup the scoring used, so a result
                # exposing only `title` reports its own title rather
                # than echoing the query.
                'track_name': matched_title(best_result) or title,
                'track_id': getattr(best_result, 'id', ''),
                'image_url': r_image,
                'release_date': r_release_date,
                'track_number': r_track_number,
                'total_tracks': getattr(best_result, 'total_tracks', 1) or 1,
                'source': source,
                'method': 'tags',
                'identification_confidence': best_score,
                'is_single': True,
            }

        except Exception as e:
            logger.debug(f"Single track search failed for '{artist} - {title}': {e}")
            return None

    def _identify_from_tags(self, candidate: FolderCandidate) -> Optional[Dict]:
        """Identify an album from the tags embedded in its files.

        Samples at most 20 files. Requires at least half of them (and at
        least one) to carry both album and artist tags, and the dominant
        album to cover at least 60% of those tagged files. The dominant
        album and its most common artist are then searched in the
        metadata source.

        Returns:
            The metadata-source search result, or None when the tags do
            not agree strongly enough.
        """
        sampled = candidate.audio_files[:20]  # Cap at 20 files
        tags_list = [
            file_tags
            for file_tags in map(_read_file_tags, sampled)
            if file_tags['album'] and file_tags['artist']
        ]

        # Less than 50% of files have usable tags
        if len(tags_list) < max(1, len(sampled) * 0.5):
            logger.info(f"[Auto-Import] Tag identification rejected for '{candidate.name}' — only {len(tags_list)}/{len(sampled)} files have album+artist tags (need >=50%)")
            return None

        # Tally albums first (album-level identity). Per-track artist
        # often varies because of features ("Artist", "Artist, Drake", ...),
        # so keying on (album, artist) would fragment the consensus on a
        # real album.
        album_counts = {}
        for entry in tags_list:
            normalised_album = entry['album'].lower().strip()
            album_counts[normalised_album] = album_counts.get(normalised_album, 0) + 1

        if not album_counts:
            return None

        # max() keeps the first key on ties, i.e. the album seen first.
        best_album = max(album_counts, key=album_counts.get)
        best_album_count = album_counts[best_album]
        if best_album_count < len(tags_list) * 0.6:
            top_three = sorted(album_counts.items(), key=lambda pair: pair[1], reverse=True)[:3]
            sample = ', '.join([f"'{a}' x{c}" for a, c in top_three])
            logger.info(f"[Auto-Import] Tag identification rejected for '{candidate.name}' — best album '{best_album}' only {best_album_count}/{len(tags_list)} files (need >=60%). Top albums: {sample}")
            return None

        # Most-common artist among the files of the dominant album —
        # usually the album's primary artist.
        artist_counts = {}
        for entry in tags_list:
            if entry['album'].lower().strip() != best_album:
                continue
            normalised_artist = entry['artist'].lower().strip()
            if normalised_artist:
                artist_counts[normalised_artist] = artist_counts.get(normalised_artist, 0) + 1
        if not artist_counts:
            return None
        artist_name = max(artist_counts, key=artist_counts.get)

        return self._search_metadata_source(artist_name, best_album, 'tags', candidate)

    def _identify_from_folder_name(self, candidate: FolderCandidate) -> Optional[Dict]:
        """Identify an album from the candidate's folder name.

        The name is split into artist and album by `_parse_folder_name`,
        then searched in the metadata source with method `folder_name`.
        """
        parsed_artist, parsed_album = _parse_folder_name(candidate.name)
        if parsed_artist:
            search_text = f"{parsed_artist} {parsed_album}"
        else:
            search_text = parsed_album
        return self._search_metadata_source(
            parsed_artist, parsed_album, 'folder_name', candidate, query=search_text,
        )

    def _identify_from_acoustid(self, candidate: FolderCandidate) -> Optional[Dict]:
        """Try to identify an album by fingerprinting a few of its files.

        Fingerprints the first three files, takes the most common artist
        among the top recordings, and searches the metadata source for
        that artist together with the candidate's name as the album.

        Returns:
            The metadata-source search result, or None when the AcoustID
            client is unavailable or no artist could be identified.
        """
        try:
            from core.acoustid_client import AcoustIDClient
            client = AcoustIDClient()
        except Exception as exc:
            logger.debug("acoustid client unavailable: %s", exc)
            return None

        # Fingerprint first 3 files
        identified_artists = []
        for f in candidate.audio_files[:3]:
            try:
                result = client.fingerprint_and_lookup(f)
                if result and result.get('recordings'):
                    best = result['recordings'][0]
                    if best.get('artist'):
                        identified_artists.append(best['artist'])
                    # AcoustID doesn't directly give album — the artist
                    # is combined with the candidate name below.
                time.sleep(1)  # Rate limit
            except Exception as exc:
                # One bad file must not abort the whole attempt.
                logger.debug("acoustid lookup failed for %s: %s", f, exc)
                continue

        if not identified_artists:
            return None

        # Most common artist
        from collections import Counter
        artist = Counter(identified_artists).most_common(1)[0][0]
        # NOTE(review): candidate.name is used as the album here even for
        # staging-root candidates, whose name is considered meaningless
        # elsewhere — confirm this is intended.
        return self._search_metadata_source(artist, candidate.name, 'acoustid', candidate)

    def _search_metadata_source(self, artist: Optional[str], album: str,
                                 method: str, candidate: FolderCandidate,
                                 query: Optional[str] = None) -> Optional[Dict]:
        """Search configured metadata sources for an album match.

        Iterates `get_source_priority(get_primary_source())` so primary
        is tried first and the rest are tried as fallback. Returns the
        FIRST source whose best result clears the 0.4 score threshold.

        Pre-fix this only queried the primary, which meant indie/niche
        albums missing from the user's primary (e.g. Bandcamp releases
        not on Spotify) failed auto-import even when manual search
        could find them on Tidal/Deezer. The manual search bar at the
        bottom of the Import tab already iterates the full source
        chain via `search_import_albums` — this aligns auto-import
        with that behavior.

        Args:
            artist: Artist name, or None/empty when unknown.
            album: Album name to score results against.
            method: Identification method recorded in the result.
            candidate: Candidate being identified; its file count feeds
                the scoring helper.
            query: Search text override; defaults to "artist album"
                (or just the album when no artist is given).

        Returns:
            An identification dict from the first source with a usable
            match, or None when no source matches or the search fails.
        """
        try:
            from core.metadata_service import (
                get_primary_source,
                get_source_priority,
                get_client_for_source,
            )

            primary_source = get_primary_source()
            source_chain = get_source_priority(primary_source)
            search_query = query or (f"{artist} {album}" if artist else album)
            # Same for every source — compute once.
            file_count = len(candidate.audio_files)

            for source in source_chain:
                client = get_client_for_source(source)
                if not client or not hasattr(client, 'search_albums'):
                    continue

                try:
                    results = client.search_albums(search_query, limit=5)
                except Exception as e:
                    # Per-source failures (rate limit, auth, transient HTTP)
                    # shouldn't abort the fallback chain. Log + continue.
                    logger.debug(
                        f"Auto-import: search_albums failed on {source}: {e}"
                    )
                    continue

                if not results:
                    continue

                # Score each result via the pure helper. Helper is
                # tested independently in
                # `tests/imports/test_album_search_scoring.py` so the
                # weight math is pinned at the function boundary, not
                # through the orchestrator path.
                best_result = None
                best_score = 0.0
                for r in results:
                    score = _score_album_search_result(r, album, artist, file_count)
                    if score > best_score:
                        best_score = score
                        best_result = r

                if not best_result or best_score < 0.4:
                    # This source returned weak/no match — fall through to next source
                    if source != primary_source:
                        logger.debug(
                            f"Auto-import: {source} best score {best_score:.2f} "
                            f"below threshold for '{album}', trying next source"
                        )
                    continue

                # Get image — direct image_url first, then the first
                # entry of `images`. The fallback also applies when
                # image_url exists but is empty, so a result carrying
                # both attributes does not lose its cover art.
                image_url = getattr(best_result, 'image_url', '') or ''
                if not image_url:
                    images = getattr(best_result, 'images', None)
                    if images:
                        img = images[0]
                        image_url = img.get('url', '') if isinstance(img, dict) else str(img)

                r_artist = ''
                r_artist_id = ''
                if hasattr(best_result, 'artists') and best_result.artists:
                    a = best_result.artists[0]
                    if isinstance(a, dict):
                        r_artist = a.get('name', str(a))
                        # Surface the metadata-source artist ID so the
                        # standalone-library write can land it on the right
                        # `<source>_artist_id` column. Without this the
                        # artists row gets created but with NULL on the
                        # source-id, and watchlist scans can't recognise
                        # the artist as already in library by stable ID.
                        r_artist_id = str(a.get('id', '') or '')
                    else:
                        r_artist = str(a)

                # Get release date
                release_date = getattr(best_result, 'release_date', '') or ''

                if source != primary_source:
                    logger.info(
                        f"Auto-import: identified '{album}' via fallback "
                        f"source {source!r} (score {best_score:.2f}, primary "
                        f"{primary_source!r} returned nothing usable)"
                    )

                return {
                    'album_id': best_result.id,
                    'album_name': best_result.name,
                    'artist_name': r_artist or artist or '',
                    'artist_id': r_artist_id,
                    'image_url': image_url,
                    'release_date': release_date,
                    'total_tracks': getattr(best_result, 'total_tracks', 0),
                    'source': source,
                    'method': method,
                    'identification_confidence': best_score,
                }

            return None

        except Exception as e:
            logger.debug(f"Metadata search failed for '{album}': {e}")
            return None

    # ── Track Matching ──

    def _match_tracks(self, candidate: FolderCandidate, identification: Dict) -> Optional[Dict]:
        """Match the candidate's staged files against the identified release.

        Singles short-circuit: there is no tracklist to compare against, so
        the candidate's first audio file is paired with a track entry built
        from the identification itself. For albums the tracklist is fetched
        from the metadata client of ``identification['source']``, file tags
        are read, and the pairing is delegated to
        ``core.imports.album_matching.match_files_to_tracks``.

        Args:
            candidate: Staging folder (or loose file) under consideration.
            identification: Identification result; ``source`` and
                ``album_id`` are required on the album path.

        Returns:
            A dict with ``matches``, ``unmatched_files``, ``total_tracks``,
            ``matched_count``, ``coverage``, ``confidence`` and
            ``album_data``; or ``None`` when nothing could be matched (no
            client, no album data, no tracks, no matches, or an unexpected
            error, which is logged).
        """
        # Singles: no album tracklist to match against — the file IS the match
        if candidate.is_single or identification.get('is_single'):
            if not candidate.audio_files:
                # This branch runs outside the try/except below, so indexing
                # an empty list would escape as an IndexError. Treat it as a
                # failed match instead.
                logger.warning(
                    "[Auto-Import] Match aborted for '%s' — single candidate "
                    "has no audio file.",
                    candidate.name,
                )
                return None

            conf = identification.get('identification_confidence', 0.7)
            track_data = {
                'name': identification.get('track_name', identification.get('album_name', '')),
                'artists': [{'name': identification.get('artist_name', '')}],
                'id': identification.get('track_id', ''),
                'track_number': identification.get('track_number', 1),
                'disc_number': 1,
            }
            return {
                'matches': [{'track': track_data, 'file': candidate.audio_files[0], 'confidence': conf}],
                'unmatched_files': [],
                'total_tracks': 1,
                'matched_count': 1,
                'coverage': 1.0,
                'confidence': conf,
                'album_data': {'id': identification.get('album_id') or '', 'name': identification.get('album_name', ''),
                               'tracks': {'items': [track_data]}},
            }

        try:
            from core.metadata_service import get_client_for_source

            source = identification['source']
            album_id = identification['album_id']

            # Fetch album with tracks
            client = get_client_for_source(source)
            if not client:
                logger.warning(
                    "[Auto-Import] Match aborted for '%s' — no client available "
                    "for source '%s'. Identification probably came from a source "
                    "that's no longer configured.",
                    candidate.name, source,
                )
                return None

            album_data = None
            if hasattr(client, 'get_album'):
                album_data = client.get_album(album_id)

            # Fallback: try get_album_metadata (Deezer) or get_album_tracks
            if not album_data and hasattr(client, 'get_album_metadata'):
                album_data = client.get_album_metadata(str(album_id), include_tracks=True)
            if not album_data and hasattr(client, 'get_album_tracks'):
                tracks_data = client.get_album_tracks(str(album_id))
                if tracks_data:
                    album_data = {'id': album_id, 'name': identification.get('album_name', ''), 'tracks': tracks_data}

            if not album_data:
                logger.warning(
                    "[Auto-Import] Match aborted for '%s' — source '%s' returned "
                    "no album data for id %r. Album probably exists in the "
                    "search index but get_album endpoint can't fetch it (rate "
                    "limit / region restriction / id-format mismatch).",
                    candidate.name, source, album_id,
                )
                return None

            # Extract tracks — handle various response formats
            tracks = []
            if isinstance(album_data, dict):
                if 'tracks' in album_data:
                    raw = album_data['tracks']
                    if isinstance(raw, dict) and 'items' in raw:
                        tracks = raw['items']
                    elif isinstance(raw, dict) and 'data' in raw:
                        tracks = raw['data']  # Deezer format
                    elif isinstance(raw, list):
                        tracks = raw
                elif 'items' in album_data:
                    tracks = album_data['items']

            if not tracks:
                logger.warning(
                    "[Auto-Import] Match aborted for '%s' — source '%s' returned "
                    "album data but no tracks. album_data keys: %s",
                    candidate.name, source,
                    list(album_data.keys()) if isinstance(album_data, dict) else type(album_data).__name__,
                )
                return None

            # Read tags for all files
            file_tags = {f: _read_file_tags(f) for f in candidate.audio_files}

            # Dedupe + match live in core.imports.album_matching so the
            # algorithm can be exercised as a pure function over dicts; this
            # method only owns the I/O and the metadata fetch.
            from core.imports.album_matching import match_files_to_tracks
            target_album = identification.get('album_name', '')
            match_result = match_files_to_tracks(
                candidate.audio_files,
                file_tags,
                tracks,
                target_album=target_album,
                similarity=_similarity,
                quality_rank=_quality_rank,
            )
            matches = match_result['matches']
            unmatched_files = match_result['unmatched_files']

            if not matches:
                return None

            # Overall confidence = album confidence x mean track confidence x
            # tracklist coverage. `matches` and `tracks` are both non-empty
            # here (see the early returns above), so the divisions are safe.
            album_conf = identification.get('identification_confidence', 0.5)
            avg_track_conf = sum(m['confidence'] for m in matches) / len(matches)
            coverage = len(matches) / len(tracks)
            overall = album_conf * avg_track_conf * coverage

            return {
                'matches': matches,
                'unmatched_files': unmatched_files,
                'total_tracks': len(tracks),
                'matched_count': len(matches),
                'coverage': round(coverage, 3),
                'confidence': round(overall, 3),
                'album_data': album_data,
            }

        except Exception as e:
            logger.error(f"Track matching error: {e}")
            return None

    # ── Processing ──

    def _process_matches(self, candidate: FolderCandidate, identification: Dict, match_result: Dict) -> bool:
        """Run every matched file through the post-processing callback.

        For each entry in ``match_result['matches']`` a context dict in the
        manual-import format is built and handed to ``self._process_callback``
        together with the file path. Per-track failures are collected and
        logged; they do not abort the remaining tracks. Automation events
        are emitted afterwards when at least one track succeeded.

        Args:
            candidate: Staging folder (or loose file) being imported.
            identification: Album identification (source, ids, names, art).
            match_result: Matching output; ``matches`` and ``album_data``
                are read from it.

        Returns:
            True when at least one track was processed; False when no
            callback is configured or no track got through.
        """
        if not self._process_callback:
            logger.warning("No process callback configured — cannot auto-process")
            return False

        album_data = match_result.get('album_data', {})
        if not isinstance(album_data, dict):
            album_data = {}

        source = identification.get('source', 'deezer')
        artist_name = identification.get('artist_name', 'Unknown')
        album_name = identification.get('album_name', 'Unknown')
        image_url = identification.get('image_url', '')

        # Parent folder artist override: if the staging folder structure is
        # Artist/Albums/AlbumName or Artist/AlbumName, use the parent folder
        # as the artist name when it differs from the identified artist.
        # This handles mixtapes/compilations where embedded tags have DJ names.
        try:
            staging_root = self._resolve_staging_path() or self.staging_path
            rel_path = os.path.relpath(candidate.path, staging_root)
            parts = [p for p in rel_path.replace('\\', '/').split('/') if p]

            # parts[0] = artist folder, parts[1] = album or category subfolder, etc.
            # Only attempt override if there's at least 2 levels (artist/album).
            # A leading '..' means the candidate sits outside the staging
            # root; the path segments then say nothing about the artist and
            # '..' must never be adopted as a name.
            folder_artist = None
            if len(parts) >= 2 and parts[0] != '..':
                _category_names = {'albums', 'singles', 'eps', 'compilations', 'mixtapes',
                                   'discography', 'music', 'downloads'}
                if len(parts) >= 3 and parts[1].lower() in _category_names:
                    # Artist/Albums/AlbumFolder → parts[0] is artist
                    folder_artist = parts[0]
                elif parts[0].lower() not in _category_names:
                    # Artist/AlbumFolder → parts[0] is artist
                    folder_artist = parts[0]

            if folder_artist and folder_artist.lower() != artist_name.lower():
                logger.info(f"[Auto-Import] Parent folder artist '{folder_artist}' differs from tag artist '{artist_name}' — using folder artist")
                artist_name = folder_artist
        except Exception as e:
            logger.debug("folder artist override failed: %s", e)
        release_date = identification.get('release_date', '') or album_data.get('release_date', '')

        # Compute total discs
        total_discs = 1
        if candidate.disc_structure and len(candidate.disc_structure) > 1:
            total_discs = max(candidate.disc_structure.keys())

        processed = 0
        errors = []
        all_matches = list(match_result.get('matches', []))

        # Album total duration — the sum of every matched track's duration,
        # passed along as the album's `duration_ms` so the album row does not
        # end up with a single track's duration.
        album_total_duration_ms = sum(
            int(m.get('track', {}).get('duration_ms', 0) or 0)
            for m in all_matches
        )
        # Ensure an active-import entry exists for this candidate. Callers
        # that go through the pool have already registered; direct callers
        # (e.g. tests) have not, so register here to make both paths safe.
        self._register_active(candidate, status='processing')
        # Surface track total for the UI's live-progress widget. Matches
        # the loop denominator so users see "3/14" while it's working.
        self._update_active(candidate.folder_hash, track_total=len(all_matches))

        # Aggregate genres from the matched files' tags so the artist entry
        # in the context carries something meaningful instead of an empty
        # list. Deduplicated case-insensitively, original case preserved,
        # kept in first-seen order (all_matches order is deterministic).
        aggregated_genres: List[str] = []
        seen_genres: set = set()
        for _m in all_matches:
            try:
                _file_tags = _read_file_tags(_m['file']) or {}
                _genres = _file_tags.get('genres') or []
            except Exception as _tag_err:
                logger.debug("genre tag read failed for %s: %s", _m.get('file'), _tag_err)
                continue
            for g in _genres:
                # Genre aggregation is best-effort: a malformed (non-string)
                # tag value must not abort the whole import.
                if not isinstance(g, str):
                    continue
                key = g.lower()
                if key and key not in seen_genres:
                    seen_genres.add(key)
                    aggregated_genres.append(g)

        # Loop invariants, resolved once.
        # `album_data['id']` is the source-native album id; identification
        # carries the same value under `album_id`. Prefer album_data when
        # both are present.
        source_album_id = album_data.get('id') or identification.get('album_id') or ''
        source_artist_id = identification.get('artist_id') or ''

        for index, match in enumerate(all_matches, start=1):
            track = match['track']
            file_path = match['file']

            track_name = track.get('name', 'Unknown')
            track_number = track.get('track_number', 1)
            disc_number = track.get('disc_number', 1)
            track_id = track.get('id', '')

            # Update live progress BEFORE the per-track work so the UI
            # shows "now processing track N: <name>" as soon as it polls.
            self._update_active(
                candidate.folder_hash,
                track_index=index,
                track_name=track_name,
            )

            if not os.path.exists(file_path):
                errors.append(f"File not found: {os.path.basename(file_path)}")
                continue

            try:
                # Build context matching the manual import format.
                #
                # The post-process pipeline reads `source` to pick the right
                # source-id columns and `_download_username` to label the
                # history/provenance rows; without them the library row is
                # written with NULL source ids and the default "Soulseek"
                # label.
                #
                # NOTE(review): the key is built from track_number only.
                # Multi-disc releases repeat track numbers across discs, so
                # keys can repeat within one candidate — confirm the callback
                # treats the key as opaque and short-lived.
                context_key = f"auto_import_{candidate.folder_hash}_{track_number}"
                # ISRC + MusicBrainz Recording ID let later scans match the
                # file by stable id instead of fuzzy title comparison. The
                # `str()` cast is defensive: a source returning a non-string
                # here would otherwise break string handling downstream.
                track_isrc = str(track.get('isrc', '') or '')
                track_mbid = str(
                    track.get('musicbrainz_recording_id', '')
                    or track.get('mbid', '')
                    or ''
                )
                context = {
                    # Canonical signal for which metadata source the ids
                    # below belong to.
                    'source': source,
                    # Labels the history/provenance rows as auto-import
                    # rather than the default download source.
                    '_download_username': 'auto_import',
                    'spotify_artist': {
                        'id': source_artist_id,
                        'name': artist_name,
                        # Genres aggregated from the matched files' GENRE
                        # tags (deduped, original case preserved).
                        'genres': list(aggregated_genres),
                    },
                    'spotify_album': {
                        'id': source_album_id,
                        'name': album_name,
                        'release_date': release_date,
                        'total_tracks': album_data.get('total_tracks', match_result.get('total_tracks', 0)),
                        'total_discs': total_discs,
                        'image_url': image_url,
                        'images': album_data.get('images', [{'url': image_url}] if image_url else []),
                        'artists': [{'name': artist_name, 'id': source_artist_id}],
                        'album_type': album_data.get('album_type', 'album'),
                        # Album total duration in ms (sum of every matched
                        # track), not a single track's duration.
                        'duration_ms': album_total_duration_ms,
                    },
                    'track_info': {
                        'name': track_name,
                        'id': track_id,
                        'track_number': track_number,
                        'disc_number': disc_number,
                        'duration_ms': track.get('duration_ms', 0),
                        'artists': track.get('artists', [{'name': artist_name}]),
                        'uri': track.get('uri', ''),
                        # Album-id back-reference + per-recording ids so the
                        # ids resolve even when the source nests the album
                        # id under `album.id` rather than `track.album_id`.
                        'album_id': source_album_id,
                        'isrc': track_isrc,
                        'musicbrainz_recording_id': track_mbid,
                    },
                    'original_search_result': {
                        'title': track_name,
                        'artist': artist_name,
                        'album': album_name,
                        'track_number': track_number,
                        'disc_number': disc_number,
                        'spotify_clean_title': track_name,
                        'spotify_clean_album': album_name,
                        'spotify_clean_artist': artist_name,
                        'artists': track.get('artists', [{'name': artist_name}]),
                    },
                    'is_album_download': True,
                    'has_clean_spotify_data': True,
                    'has_full_spotify_metadata': True,
                }

                self._process_callback(context_key, context, file_path)
                processed += 1
                logger.info(f"[Auto-Import] Processed: {track_number}. {track_name}")

            except Exception as e:
                errors.append(f"{track.get('name', '?')}: {str(e)}")
                logger.warning(f"[Auto-Import] Error processing track: {e}")

        # Emit automation events
        if processed > 0 and self._automation_engine:
            try:
                self._automation_engine.emit('import_completed', {
                    'track_count': str(processed),
                    'album_name': album_name,
                    'artist': artist_name,
                })
                self._automation_engine.emit('batch_complete', {
                    'playlist_name': f'Import: {album_name}',
                    'total_tracks': str(len(all_matches)),
                    'completed_tracks': str(processed),
                    'failed_tracks': str(len(errors)),
                })
            except Exception as e:
                logger.debug("automation emit failed: %s", e)

        return processed > 0

    # ── Database ──

    def _record_in_progress(self, candidate: FolderCandidate, identification: Dict,
                            match_result: Dict) -> Optional[int]:
        """Insert a status='processing' row up-front so the UI can see
        an in-flight import while it's still running. Returns the row's
        id so ``_finalize_result`` can update the same row when done.

        Without this, auto-import goes silent for the entire processing
        window (5+ minutes for a full album) — the existing
        ``_record_result`` only fires after every track is post-
        processed, so the UI sees nothing in history while the user
        waits.

        Args:
            candidate: Candidate being imported (name, path, hash, files).
            identification: Identification result for the candidate.
            match_result: Matching result; serialized into ``match_data``.

        Returns:
            The inserted row id, or ``None`` when the insert failed (the
            failure is logged, never raised).
        """
        try:
            match_json = self._serialize_match_data(match_result)
            conn = self.database._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    INSERT INTO auto_import_history
                    (folder_name, folder_path, folder_hash, status, confidence, album_id, album_name,
                     artist_name, image_url, total_files, matched_files, match_data,
                     identification_method, error_message, processed_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    candidate.name, candidate.path, candidate.folder_hash,
                    'processing', match_result.get('confidence', 0.0),
                    identification.get('album_id'), identification.get('album_name'),
                    identification.get('artist_name'), identification.get('image_url'),
                    len(candidate.audio_files),
                    match_result.get('matched_count', 0),
                    match_json, identification.get('method'), None, None,
                ))
                row_id = cursor.lastrowid
                conn.commit()
            finally:
                # Close even when execute/commit raises so a failed insert
                # does not leave the connection open.
                conn.close()
            return row_id
        except Exception as e:
            logger.error(f"Error recording in-progress auto-import row: {e}")
            return None

    def _finalize_result(self, row_id: int, status: str, confidence: float,
                         error_message: Optional[str] = None) -> None:
        """Update the in-progress row created by ``_record_in_progress``
        with the final outcome. Idempotent — safe to call even if the
        row creation failed (row_id is None)."""
        if not row_id:
            return
        try:
            conn = self.database._get_connection()
            cursor = conn.cursor()
            cursor.execute("""
                UPDATE auto_import_history
                SET status = ?, confidence = ?, error_message = ?, processed_at = ?
                WHERE id = ?
            """, (
                status, confidence, error_message,
                datetime.now().isoformat() if status == 'completed' else None,
                row_id,
            ))
            conn.commit()
            conn.close()
        except Exception as e:
            logger.error(f"Error finalizing auto-import row {row_id}: {e}")

    def _serialize_match_data(self, match_data: Optional[Dict]) -> Optional[str]:
        """Serialize match_result for storage. Strips the non-JSON-safe
        ``album_data`` reference and per-match track dicts down to just
        the fields the review UI uses."""
        if not match_data:
            return None
        try:
            serializable = {
                'matches': [{'track_name': m['track']['name'],
                             'track_number': m['track'].get('track_number', 0),
                             'file': os.path.basename(m['file']),
                             'confidence': m['confidence']} for m in match_data.get('matches', [])],
                'unmatched_files': [os.path.basename(f) for f in match_data.get('unmatched_files', [])],
                'total_tracks': match_data.get('total_tracks', 0),
                'matched_count': match_data.get('matched_count', 0),
                'coverage': match_data.get('coverage', 0),
            }
            return json.dumps(serializable)
        except Exception:
            return None

    def _record_result(self, candidate: FolderCandidate, status: str, confidence: float,
                       album_id: Optional[str] = None, album_name: Optional[str] = None,
                       artist_name: Optional[str] = None, image_url: Optional[str] = None,
                       identification_method: Optional[str] = None,
                       match_data: Optional[Dict] = None, error_message: Optional[str] = None):
        """Record an auto-import result in one shot (no in-progress row).

        Used for early-failure paths that never enter the per-track
        processing loop (identification failures, match failures,
        low-confidence skips).

        Args:
            candidate: Candidate the result belongs to.
            status: Status to store.
            confidence: Confidence to store.
            album_id, album_name, artist_name, image_url: Identified
                release details, when available.
            identification_method: How the release was identified.
            match_data: Match result; serialized into ``match_data`` and
                used for the ``matched_files`` count.
            error_message: Optional failure description.

        ``processed_at`` is stamped only when ``status`` is ``'completed'``.
        Database errors are logged, never raised.
        """
        try:
            match_json = self._serialize_match_data(match_data)
            conn = self.database._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    INSERT INTO auto_import_history
                    (folder_name, folder_path, folder_hash, status, confidence, album_id, album_name,
                     artist_name, image_url, total_files, matched_files, match_data,
                     identification_method, error_message, processed_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    candidate.name, candidate.path, candidate.folder_hash, status, confidence,
                    album_id, album_name, artist_name, image_url,
                    len(candidate.audio_files),
                    match_data.get('matched_count', 0) if match_data else 0,
                    match_json, identification_method, error_message,
                    datetime.now().isoformat() if status == 'completed' else None,
                ))
                conn.commit()
            finally:
                # Close even when the insert raises so the connection is
                # not leaked on the error path.
                conn.close()
        except Exception as e:
            logger.error(f"Error recording auto-import result: {e}")

    def get_results(self, status_filter: Optional[str] = None, limit: int = 50) -> List[Dict]:
        """Get auto-import results from database, newest first.

        Args:
            status_filter: When given, only rows with this status are
                returned.
            limit: Maximum number of rows to return.

        Returns:
            A list of row dicts ordered by ``created_at`` descending; an
            empty list when the query fails (the error is logged).
        """
        try:
            conn = self.database._get_connection()
            try:
                cursor = conn.cursor()
                if status_filter:
                    cursor.execute("""
                        SELECT * FROM auto_import_history WHERE status = ?
                        ORDER BY created_at DESC LIMIT ?
                    """, (status_filter, limit))
                else:
                    cursor.execute("""
                        SELECT * FROM auto_import_history ORDER BY created_at DESC LIMIT ?
                    """, (limit,))
                rows = cursor.fetchall()
            finally:
                # Close even when the query raises so the connection is not
                # leaked on the error path.
                conn.close()
            return [dict(r) for r in rows]
        except Exception as e:
            # An empty list is also a valid "no history" answer, so log the
            # failure to keep the two cases distinguishable.
            logger.error(f"Error fetching auto-import results: {e}")
            return []

    def approve_item(self, item_id: int) -> Dict:
        """Approve a pending_review item so a later scan can process it.

        The stored row does not carry the full album data or file paths,
        so nothing is re-processed here: the row is only flipped to
        ``'approved'``.

        Args:
            item_id: Id of the ``auto_import_history`` row.

        Returns:
            ``{'success': True, 'message': ...}`` when the row was marked
            approved; otherwise ``{'success': False, 'error': ...}`` (row
            missing or not pending review, no stored match data, or a
            database/JSON error).
        """
        try:
            conn = self.database._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute("SELECT * FROM auto_import_history WHERE id = ? AND status = 'pending_review'", (item_id,))
                row = cursor.fetchone()

                if not row:
                    return {'success': False, 'error': 'Item not found or not pending review'}

                # Rows without usable match data cannot be approved.
                match_data_raw = json.loads(row['match_data']) if row['match_data'] else None
                if not match_data_raw:
                    return {'success': False, 'error': 'No match data available'}

                cursor.execute("UPDATE auto_import_history SET status = 'approved' WHERE id = ?", (item_id,))
                conn.commit()
            finally:
                # One connection serves both statements and is closed on
                # every exit path, including when a query raises.
                conn.close()

            return {'success': True, 'message': 'Item approved — will be processed on next scan'}

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def reject_item(self, item_id: int) -> Dict:
        """Reject/dismiss an auto-import item.

        Sets the row's status to ``'rejected'`` regardless of its current
        status.

        Args:
            item_id: Id of the ``auto_import_history`` row.

        Returns:
            ``{'success': True}`` when the update ran (also when no row has
            that id); ``{'success': False, 'error': ...}`` on a database
            error.
        """
        try:
            conn = self.database._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute("UPDATE auto_import_history SET status = 'rejected' WHERE id = ?", (item_id,))
                conn.commit()
            finally:
                # Close even when the update raises so the connection is
                # not leaked on the error path.
                conn.close()
            return {'success': True}
        except Exception as e:
            return {'success': False, 'error': str(e)}