SoulSync/tests/test_replaygain_summary_par...

"""Pin the ReplayGain analysis fix.

User report: every track in a downloaded album got the same
``replaygain_track_gain`` of ``+52.00 dB`` after post-processing.
Smoking gun: ``-18 (RG2 reference) - (-70.0) = +52.00``. Every track's
first ebur128 measurement window reads ~-70 LUFS because the first
window covers the silent intro / encoder padding.

The old code used ``re.search('I:\\s+...')`` which returns the FIRST
match — capturing that initial -70 LUFS reading instead of the final
integrated value from the Summary block.

These tests use representative ffmpeg ebur128 output (per-window
progress + final Summary block) to pin: parser anchors to the
Summary, ignores per-window partials, and falls back gracefully when
Summary is absent.
"""

from __future__ import annotations

import re
import subprocess
from unittest.mock import patch

import pytest

from core.replaygain import analyze_track


# ---------------------------------------------------------------------------
# Fabricated ebur128 stderr samples
# ---------------------------------------------------------------------------

_REAL_EBUR128_STDERR = """
[Parsed_ebur128_0 @ 0x000001] Summary:
[Parsed_ebur128_0 @ 0x000001] t: 0.500000   TARGET:-23 LUFS    M: -70.0 S:-70.0     I: -70.0 LUFS       LRA:   0.0 LU  FTPK: -70.0 dBFS  TPK: -70.0 dBFS
[Parsed_ebur128_0 @ 0x000001] t: 1.000000   TARGET:-23 LUFS    M: -50.0 S:-60.0     I: -60.0 LUFS       LRA:   0.0 LU  FTPK: -50.0 dBFS  TPK: -50.0 dBFS
[Parsed_ebur128_0 @ 0x000001] t: 1.500000   TARGET:-23 LUFS    M: -20.0 S:-30.0     I: -25.0 LUFS       LRA:   0.0 LU  FTPK: -2.5 dBFS   TPK: -2.5 dBFS
[Parsed_ebur128_0 @ 0x000001] t: 2.000000   TARGET:-23 LUFS    M: -18.0 S:-20.0     I: -14.5 LUFS       LRA:   1.5 LU  FTPK: -0.4 dBFS   TPK: -0.4 dBFS
[Parsed_ebur128_0 @ 0x000001] Summary:

  Integrated loudness:
    I:         -14.3 LUFS
    Threshold: -24.3 LUFS

  Loudness range:
    LRA:         3.2 LU
    Threshold: -34.3 LUFS
    LRA low:   -16.5 LUFS
    LRA high:  -13.3 LUFS

  True peak:
    Peak:       -0.4 dBFS
[out#0/null @ 0x000002] video:0KiB audio:172KiB
"""


def _stub_ffmpeg(stderr_output: str):
    """Patch subprocess.run to return a fake ffmpeg result with the
    given stderr."""
    class _FakeResult:
        def __init__(self):
            self.stderr = stderr_output
            self.returncode = 0
    return patch.object(subprocess, 'run', return_value=_FakeResult())


# ---------------------------------------------------------------------------
# Headline regression: don't grab the first per-window reading
# ---------------------------------------------------------------------------


def test_parses_summary_lufs_not_first_per_window_reading():
    """The per-window stream contains 'I: -70.0 LUFS' (silent intro)
    BEFORE the Summary block's 'I: -14.3 LUFS'. Parser must return
    -14.3 (summary), NOT -70.0 (first per-window).

    This is the exact bug from the user's +52.00 dB report:
    -18 RG2 reference - (-70.0) = +52.00 was the symptom."""
    with _stub_ffmpeg(_REAL_EBUR128_STDERR):
        lufs, peak = analyze_track('/fake/path.flac')

    assert lufs == pytest.approx(-14.3, abs=0.01)
    assert peak == pytest.approx(-0.4, abs=0.01)


def test_resulting_gain_is_realistic_not_plus_52():
    """Computed gain must be a normal real-world value (a few dB
    range), NOT the symptomatic +52.00 dB the bug produced."""
    with _stub_ffmpeg(_REAL_EBUR128_STDERR):
        lufs, _peak = analyze_track('/fake/path.flac')
    gain = -18.0 - lufs  # RG2 reference
    assert -10.0 < gain < 10.0, f"Unrealistic gain {gain:+.2f} dB — bug regression"


# ---------------------------------------------------------------------------
# Different per-track values stay different
# ---------------------------------------------------------------------------


def _make_stderr(per_window_lufs: list[float], summary_lufs: float, summary_peak: float) -> str:
    """Build an ebur128 stderr blob with controllable per-window and
    summary values. Lets each test verify the summary is what gets
    used regardless of what's in the per-window stream."""
    per_window = '\n'.join(
        f"[Parsed_ebur128_0 @ 0x1] t: {(i + 1) * 0.5:.6f}   TARGET:-23 LUFS    "
        f"M: {lufs:.1f} S:{lufs:.1f}     I: {lufs:.1f} LUFS       LRA:   0.0 LU  "
        f"FTPK: {lufs / 2:.1f} dBFS  TPK: {lufs / 2:.1f} dBFS"
        for i, lufs in enumerate(per_window_lufs)
    )
    return f"""{per_window}
[Parsed_ebur128_0 @ 0x1] Summary:

  Integrated loudness:
    I:         {summary_lufs:.1f} LUFS
    Threshold: -24.0 LUFS

  Loudness range:
    LRA:         3.2 LU

  True peak:
    Peak:       {summary_peak:+.1f} dBFS
"""


def test_two_tracks_with_different_summaries_get_different_lufs():
    """Two simulated tracks with the SAME first per-window value (-70)
    but DIFFERENT summary integrated loudness values. Old buggy parser
    would report -70 for both. Fixed parser correctly reports the
    distinct summary values."""
    track_a_stderr = _make_stderr([-70.0, -50.0, -20.0], summary_lufs=-14.3, summary_peak=-0.4)
    track_b_stderr = _make_stderr([-70.0, -45.0, -10.0], summary_lufs=-7.8, summary_peak=-1.2)

    with _stub_ffmpeg(track_a_stderr):
        lufs_a, _ = analyze_track('/fake/a.flac')
    with _stub_ffmpeg(track_b_stderr):
        lufs_b, _ = analyze_track('/fake/b.flac')

    assert lufs_a != lufs_b
    assert lufs_a == pytest.approx(-14.3, abs=0.01)
    assert lufs_b == pytest.approx(-7.8, abs=0.01)


def test_per_window_lufs_with_higher_value_doesnt_leak_into_summary():
    """If per-window readings include a value HIGHER than the summary
    (a transient loud window), the parser must still return the
    summary value, not the loudest per-window."""
    stderr = _make_stderr([-70.0, -5.0, -3.0], summary_lufs=-12.0, summary_peak=-0.5)
    with _stub_ffmpeg(stderr):
        lufs, _ = analyze_track('/fake/loud.flac')
    assert lufs == pytest.approx(-12.0, abs=0.01)


# ---------------------------------------------------------------------------
# Defensive fallback when Summary block is absent
# ---------------------------------------------------------------------------


def test_falls_back_to_last_per_window_when_no_summary():
    """Some ffmpeg versions or truncated outputs may lack a Summary
    block. Defensive fallback uses the LAST per-window reading (still
    closer to the final integrated value than the first)."""
    stderr = """
[Parsed_ebur128_0 @ 0x1] t: 0.5   I: -70.0 LUFS  LRA: 0.0 LU
[Parsed_ebur128_0 @ 0x1] t: 1.0   I: -25.0 LUFS  LRA: 0.0 LU
[Parsed_ebur128_0 @ 0x1] t: 1.5   I: -14.5 LUFS  LRA: 1.5 LU
""".strip()

    with _stub_ffmpeg(stderr):
        lufs, _peak = analyze_track('/fake/no_summary.flac')

    # Last per-window reading, NOT the first
    assert lufs == pytest.approx(-14.5, abs=0.01)


def test_raises_when_no_lufs_anywhere():
    """If ffmpeg output contains no LUFS values at all (parse failure
    / wrong format), surface a clear RuntimeError so the caller can
    decide whether to skip RG analysis."""
    stderr = "ffmpeg: garbled output, no LUFS data anywhere\n"
    with _stub_ffmpeg(stderr):
        with pytest.raises(RuntimeError, match='Could not parse'):
            analyze_track('/fake/garbage.flac')


# ---------------------------------------------------------------------------
# Peak parsing — ensure it stays anchored to summary too
# ---------------------------------------------------------------------------


def test_peak_uses_summary_value_not_per_window_max():
    """Per-window output uses 'TPK:'/'FTPK:' labels; only the summary
    uses 'Peak:'. Pin that the parser only catches the summary peak
    even when per-window TPK values would be larger."""
    stderr = _make_stderr([-70.0, -45.0, -10.0], summary_lufs=-12.0, summary_peak=-0.4)
    with _stub_ffmpeg(stderr):
        _lufs, peak = analyze_track('/fake/peak.flac')
    assert peak == pytest.approx(-0.4, abs=0.01)