# SoulSync/core/imports/album_matching.py

"""Album-track matching helpers — lifted out of
``AutoImportWorker._match_tracks`` so the matching logic is testable in
isolation without instantiating the worker, mocking the metadata
client, or monkey-patching ``_read_file_tags``.

The worker still owns:
- File-system traversal + tag reads
- Metadata client lookup + album_data fetch
- Album-vs-single routing

This module owns:
- Quality-aware deduplication keyed on the ``(disc_number, track_number)``
  position tuple
- Weighted match scoring against the album's tracklist
- Returning the list of (track, file, confidence) matches + leftover
  unmatched files

Both behaviors are pure functions over already-fetched data, so the
test surface is just dicts in / dicts out.
"""

from __future__ import annotations

import os
from typing import Any, Callable, Dict, List, Set, Tuple


# ---------------------------------------------------------------------------
# Match-scoring weights
# ---------------------------------------------------------------------------
# Each weight is a fraction of the 0..1 confidence score the matcher
# accumulates per (file, track) pair. Sum of all maximum-bonus paths
# equals 1.0 in the happy case (perfect title + artist + position +
# album tag agreement).
#
# History note: the position bonus (30%) used to fire on track_number
# alone, which broke multi-disc albums where every disc has tracks 1..N.
# Disc-aware split (POSITION + CROSS_DISC) shipped 2026-05-09 after
# user reported Mr. Morale & The Big Steppers losing half its tracks
# during auto-import.

# The four "best case" weights (title + artist + exact position + album)
# add up to 1.0. The three position weights are mutually exclusive: the
# scorer awards at most one of them per (file, track) pair.
TITLE_WEIGHT = 0.45                 # title similarity (title tag, or filename stem when untagged)
ARTIST_WEIGHT = 0.15                # file 'artist' tag vs. the track's first listed artist
POSITION_WEIGHT = 0.30              # exact (disc_number, track_number) match
NEAR_POSITION_WEIGHT = 0.12         # off-by-one track number, same disc
CROSS_DISC_POSITION_WEIGHT = 0.05   # same track_number, different disc
ALBUM_WEIGHT = 0.10                 # album tag similarity to target album

# A file scoring below this threshold against every track is treated
# as unmatched. The threshold sits ABOVE what a half-similar title
# (~0.5 × 0.45 ≈ 0.22) plus a position consolation bonus (0.05
# cross-disc or 0.12 near-position) can reach on its own, so files
# with weak title agreement still need at least one more strong signal
# (exact position, artist, or album agreement) to be accepted.
MATCH_THRESHOLD = 0.4


# Injected string-similarity callback: takes two strings and returns a
# float. The weights above assume the result lies in 0..1 — confirm
# against the implementation the caller passes in.
SimilarityFn = Callable[[str, str], float]
# Injected quality ranking: called with a lower-cased file extension
# (leading dot included, e.g. '.flac'); a larger result means the file
# is preferred when two files claim the same position.
QualityRankFn = Callable[[str], int]


def dedupe_files_by_position(
    audio_files: List[str],
    file_tags: Dict[str, Dict[str, Any]],
    *,
    quality_rank: QualityRankFn,
) -> List[str]:
    """Collapse files that claim the same ``(disc, track)`` slot down to
    the single best-quality one.

    Slots are keyed on the ``(disc_number, track_number)`` pair, never
    on ``track_number`` by itself, so disc 1 track 3 and disc 2 track 3
    are distinct and both survive.

    Args:
        audio_files: Candidate file paths, in the order they should be
            considered.
        file_tags: Per-path tag dicts. A missing entry, or a missing /
            falsy ``track_number``, counts as track 0; a missing / falsy
            ``disc_number`` counts as disc 1.
        quality_rank: Called with the lower-cased file extension
            (leading dot included); the larger rank wins a slot.

    Returns:
        The surviving paths. Files without a positive track number are
        always kept. When a later file outranks the current holder of a
        slot, the holder is removed and the newcomer is appended at the
        END of the result (it does not inherit the holder's index). On
        equal rank the first-seen file keeps the slot.
    """
    kept: List[str] = []
    slot_holder: Dict[Tuple[int, int], str] = {}

    for path in audio_files:
        info = file_tags.get(path, {})
        track_no = info.get('track_number', 0) or 0
        disc_no = info.get('disc_number', 1) or 1
        suffix = os.path.splitext(path)[1].lower()
        slot = (disc_no, track_no)

        # Untagged position: nothing to dedupe against, always keep.
        if not track_no > 0:
            kept.append(path)
            continue

        # First claimant of this slot.
        if slot not in slot_holder:
            kept.append(path)
            slot_holder[slot] = path
            continue

        # Contested slot: only a strictly better rank displaces the holder.
        holder = slot_holder[slot]
        holder_suffix = os.path.splitext(holder)[1].lower()
        if quality_rank(suffix) > quality_rank(holder_suffix):
            kept.remove(holder)
            kept.append(path)
            slot_holder[slot] = path

    return kept


def _extract_track_disc(track: Dict[str, Any]) -> int:
    """Return the disc number carried by an API track dict.

    Metadata sources disagree on the key name, so the known spellings
    are probed in a fixed order: ``disc_number`` (Spotify),
    ``disk_number`` (Deezer), then ``discNumber`` (iTunes). The first
    truthy value wins and is returned as-is. If none is present (or all
    are falsy, e.g. ``0`` / ``None``) the track is treated as disc 1 so
    single-disc albums still line up.
    """
    for key in ('disc_number', 'disk_number', 'discNumber'):
        value = track.get(key)
        if value:
            return value
    return 1


def _extract_track_artist(track: Dict[str, Any]) -> str:
    """Return the first listed artist of an API track dict.

    ``track['artists']`` may hold dicts (the ``name`` entry is used,
    falling back to the dict's string form when ``name`` is absent) or
    plain values (converted with ``str``). A missing or empty artist
    list yields ``''``.
    """
    credited = track.get('artists')
    if not credited:
        return ''

    lead = credited[0]
    if isinstance(lead, dict):
        return lead.get('name', str(lead))
    return str(lead)


def score_file_against_track(
    file_path: str,
    file_tags: Dict[str, Any],
    track: Dict[str, Any],
    *,
    target_album: str,
    similarity: SimilarityFn,
) -> float:
    """Score how well one staged file fits one API track.

    The result is the sum of up to four weighted components: title,
    artist, position and album. No thresholding happens here; the
    caller compares the value with ``MATCH_THRESHOLD`` or ranks
    candidates as it sees fit.

    Args:
        file_path: Path of the file; its basename (minus extension)
            stands in for the title when the ``title`` tag is missing
            or empty.
        file_tags: Tag dict of that file. Keys read: ``title``,
            ``artist``, ``track_number``, ``disc_number``, ``album``.
        track: API track dict. Keys read: ``name``, ``track_number``,
            ``artists`` and the disc-number spellings handled by
            ``_extract_track_disc``.
        target_album: Album name the file's ``album`` tag is compared to.
        similarity: String-similarity callback; the documented 0..1
            range of the score assumes it returns values in 0..1.

    Returns:
        The accumulated confidence score.
    """
    total = 0.0

    # -- Title (TITLE_WEIGHT): always contributes; untagged files are
    #    judged on their filename stem.
    candidate_title = file_tags.get('title')
    if not candidate_title:
        candidate_title = os.path.splitext(os.path.basename(file_path))[0]
    api_title = track.get('name', '')
    total += similarity(candidate_title, api_title) * TITLE_WEIGHT

    # -- Artist (ARTIST_WEIGHT): contributes only when both sides have
    #    a non-empty artist to compare.
    tag_artist = file_tags.get('artist', '')
    api_artist = _extract_track_artist(track)
    if tag_artist and api_artist:
        total += similarity(tag_artist, api_artist) * ARTIST_WEIGHT

    # -- Position: at most ONE of POSITION_WEIGHT,
    #    CROSS_DISC_POSITION_WEIGHT or NEAR_POSITION_WEIGHT is awarded,
    #    and only when both track numbers are known. The comparison is
    #    on the (disc, track) pair rather than the bare track number;
    #    see the history note next to the weight constants for why.
    tag_track_no = file_tags.get('track_number', 0) or 0
    api_track_no = track.get('track_number', 0) or 0
    if tag_track_no > 0 and api_track_no > 0:
        tag_disc = file_tags.get('disc_number', 1) or 1
        api_disc = _extract_track_disc(track)
        same_disc = tag_disc == api_disc

        if tag_track_no == api_track_no:
            # Same number on another disc earns only a token bonus, so
            # title/artist agreement has to carry the match. This
            # collision is routine on deluxe / multi-disc releases
            # where each disc restarts numbering at 1.
            total += POSITION_WEIGHT if same_disc else CROSS_DISC_POSITION_WEIGHT
        elif same_disc and abs(tag_track_no - api_track_no) <= 1:
            total += NEAR_POSITION_WEIGHT

    # -- Album (ALBUM_WEIGHT): contributes only when the file carries
    #    an album tag.
    tag_album = file_tags.get('album', '')
    if tag_album:
        total += similarity(tag_album, target_album) * ALBUM_WEIGHT

    return total


def match_files_to_tracks(
    audio_files: List[str],
    file_tags: Dict[str, Dict[str, Any]],
    tracks: List[Dict[str, Any]],
    *,
    target_album: str,
    similarity: SimilarityFn,
    quality_rank: QualityRankFn,
) -> Dict[str, Any]:
    """Match staging files to album tracks.

    Files are first de-duplicated per ``(disc, track)`` position via
    ``dedupe_files_by_position``. Every remaining (track, file) pair is
    then scored with ``score_file_against_track`` and pairs are accepted
    best-score-first, so a file always goes to the track it fits best
    among those still open. Assigning strictly in tracklist order would
    let an early track with only a mediocre candidate take a file that a
    later track matches far better, leaving that later track unmatched.

    Args:
        audio_files: Candidate file paths.
        file_tags: Per-path tag dicts; a path without an entry is scored
            with empty tags.
        tracks: API track dicts for the target album, in tracklist order.
        target_album: Album name used for the album-tag bonus.
        similarity: String-similarity callback forwarded to the scorer.
        quality_rank: Extension-ranking callback forwarded to the
            de-duplication step.

    Returns:
        A dict with:

        - ``matches``: list of
          ``{'track': dict, 'file': str, 'confidence': float}`` in
          tracklist order, one per track that was paired with a file
          scoring at or above ``MATCH_THRESHOLD``. ``confidence`` is the
          score rounded to 3 decimals.
        - ``unmatched_files``: de-duplicated files that were not paired
          with any track, in de-duplicated order.

        Each file is used by at most one track and each track receives
        at most one file.

    Pure function — no side effects, no I/O, no metadata client. Easy
    to unit-test by feeding tag dicts and track dicts directly.
    """
    deduped = dedupe_files_by_position(audio_files, file_tags, quality_rank=quality_rank)
    track_list = list(tracks)

    # Score every pair once, keeping only those that clear the threshold.
    # Entries are (score, track index, file index).
    candidates: List[Tuple[float, int, int]] = []
    for track_idx, track in enumerate(track_list):
        for file_idx, f in enumerate(deduped):
            score = score_file_against_track(
                f, file_tags.get(f, {}), track,
                target_album=target_album,
                similarity=similarity,
            )
            if score >= MATCH_THRESHOLD:
                candidates.append((score, track_idx, file_idx))

    # Strongest pairs first. Equal scores fall back to tracklist order,
    # then file order, which keeps the outcome deterministic.
    candidates.sort(key=lambda c: (-c[0], c[1], c[2]))

    assigned: Dict[int, Tuple[str, float]] = {}
    used_files: Set[str] = set()
    for score, track_idx, file_idx in candidates:
        f = deduped[file_idx]
        if track_idx in assigned or f in used_files:
            continue  # track already served, or file already claimed
        assigned[track_idx] = (f, score)
        used_files.add(f)

    # Emit matches in tracklist order, independent of acceptance order.
    matches: List[Dict[str, Any]] = []
    for track_idx in sorted(assigned):
        f, score = assigned[track_idx]
        matches.append({
            'track': track_list[track_idx],
            'file': f,
            'confidence': round(score, 3),
        })

    return {
        'matches': matches,
        'unmatched_files': [f for f in deduped if f not in used_files],
    }


# Public surface of this module: the scoring weights and the match
# threshold, plus the three entry points. The underscore-prefixed
# ``_extract_*`` helpers are not exported.
__all__ = [
    'TITLE_WEIGHT',
    'ARTIST_WEIGHT',
    'POSITION_WEIGHT',
    'NEAR_POSITION_WEIGHT',
    'CROSS_DISC_POSITION_WEIGHT',
    'ALBUM_WEIGHT',
    'MATCH_THRESHOLD',
    'dedupe_files_by_position',
    'score_file_against_track',
    'match_files_to_tracks',
]