mirror of https://github.com/Nezreka/SoulSync.git
Discord report (fresh.dumbledore [VRN]): slskd sometimes ships broken files
(truncated transfers, corrupt FLAC, wrong file substituted on filename match).
They flowed through post-processing and only surfaced later — Plex/Jellyfin
scan failures, dead-air playback, duplicate detector tripping over the wrong
length. By that point the file was already tagged, copied, mirrored to the
media server, and recorded in provenance.
New module `core/imports/file_integrity.py`:
- `check_audio_integrity(path, expected_duration_ms=None) -> IntegrityResult`
- Three tiered checks, cheapest to most expensive:
1. File size sanity (catches 0-byte stubs and stub transfers)
2. Mutagen parse (catches header damage, wrong-format-with-right-extension)
3. Duration agreement vs. metadata source's expected length, ±3s tolerance
(5s for tracks over 10 minutes — long tracks naturally drift more)
- Returns IntegrityResult with `ok`, human-readable `reason`, and per-check
`checks` dict for debugging
- Never raises; pathological inputs return ok=False with explanation
Pipeline integration in `core/imports/pipeline.py:post_process_matched_download`:
- Hooks between the existing file-stability wait and AcoustID verification
- On failure: quarantine via existing `move_to_quarantine` helper, mark task
failed with descriptive error, clear matched-context, fire
`on_download_completed(success=False)` so the slot is released for retry
- Mirrors the existing AcoustID-failure path so retry behavior stays consistent
- Wrapped in try/except so an unexpected failure inside the check itself
cannot block downloads — logs and continues
This is intentionally tier 1: universal across formats, no external deps.
A future tier could verify FLAC STREAMINFO MD5 by decoding audio (needs
flac binary or libflac wrapper) — skipped for now since tier 1 catches the
dominant Discord-reported cases (truncated, 0-byte, wrong file).
Tests:
- `tests/imports/test_file_integrity.py` — 14 cases covering all three check
tiers, edge cases (zero/negative expected duration, long-track wider
tolerance, caller tolerance override), and the mutagen-unavailable
degradation path
- `tests/imports/test_import_pipeline.py` — two existing tests use 5-byte
fixture files that the new check would reject; they monkeypatch the
integrity check since they're testing plumbing (notification +
metadata_runtime forwarding), not integrity behavior
WHATS_NEW entry under '2.4.2' dev cycle.
pull/477/head
parent
bcb91a1a1a
commit
42f3026eef
@ -0,0 +1,192 @@
|
||||
"""Audio file integrity checks for downloaded files.
|
||||
|
||||
slskd (and other download sources) sometimes ship broken files: truncated
|
||||
transfers, corrupted FLAC frames, mp3s with bad headers, or wrong files
|
||||
that share a name with the target. These slip past the slskd "completed"
|
||||
status and only get caught later (often by Plex/Jellyfin failing to scan
|
||||
the file, or by users hearing dead air).
|
||||
|
||||
Verification runs after the slskd transfer settles but before the heavy
|
||||
post-processing work (tagging, copying, server sync). Failed files get
|
||||
quarantined and the slot is freed for a retry from another candidate.
|
||||
|
||||
Three checks, in order from cheapest to most expensive:
|
||||
|
||||
1. **File-size sanity** — anything below ~10KB is almost certainly a
|
||||
stub, broken transfer, or non-audio masquerading as audio.
|
||||
2. **Mutagen parse** — catches truncated headers, corrupted streamheaders,
|
||||
wrong-format files (mp3 with .flac extension, etc). If mutagen can't
|
||||
parse the audio info block, the file won't import cleanly downstream.
|
||||
3. **Duration agreement** — if the caller provides an expected duration
|
||||
(Spotify/MusicBrainz `duration_ms`), the decoded length must agree
|
||||
within tolerance. Catches truncated files whose headers parse fine
|
||||
but whose audio is incomplete, and "wrong file" cases the slskd
|
||||
transfer matched on a similarly-named track.
|
||||
|
||||
This is the "tier 1" integrity layer — universal across formats, no
|
||||
external binary dep. A future tier could verify the FLAC STREAMINFO MD5
|
||||
by actually decoding the audio (requires `flac` binary or libflac
|
||||
wrapper); skipped for now since tier 1 catches the vast majority of
|
||||
real-world corruption.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from utils.logging_config import get_logger
|
||||
|
||||
|
||||
logger = get_logger("imports.file_integrity")
|
||||
|
||||
# Minimum plausible audio file size. A 1-second 64kbps mp3 is ~8KB; a
|
||||
# 1-second FLAC is much larger. Anything under this is a broken stub.
|
||||
_MIN_FILE_SIZE_BYTES = 10 * 1024
|
||||
|
||||
# Default tolerance for duration agreement. Most legitimate length
|
||||
# variations (intro silence, encoder padding, live recording trims) sit
|
||||
# inside 3 seconds. Goes up to 5s if the expected duration is itself
|
||||
# long (>10 minutes) since absolute drift scales with length.
|
||||
_DEFAULT_LENGTH_TOLERANCE_S = 3.0
|
||||
_LENGTH_TOLERANCE_LONG_TRACK_S = 5.0
|
||||
_LONG_TRACK_THRESHOLD_S = 600.0 # 10 minutes
|
||||
|
||||
|
||||
@dataclass
|
||||
class IntegrityResult:
|
||||
"""Outcome of an integrity check.
|
||||
|
||||
`ok` is the single bit the caller cares about. `reason` is the
|
||||
human-readable explanation when `ok` is False (suitable for
|
||||
quarantine sidecar / log lines / UI). `checks` carries the
|
||||
per-check details — useful for debugging and tests.
|
||||
"""
|
||||
|
||||
ok: bool
|
||||
reason: str = ""
|
||||
checks: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
def check_audio_integrity(
|
||||
file_path: str,
|
||||
expected_duration_ms: Optional[int] = None,
|
||||
*,
|
||||
length_tolerance_s: Optional[float] = None,
|
||||
min_file_size_bytes: int = _MIN_FILE_SIZE_BYTES,
|
||||
) -> IntegrityResult:
|
||||
"""Verify a downloaded audio file is not broken.
|
||||
|
||||
Args:
|
||||
file_path: Path to the audio file on disk.
|
||||
expected_duration_ms: Expected track length from the metadata
|
||||
source (Spotify/MB/etc). If None, the duration check is
|
||||
skipped and only the size + parse checks run.
|
||||
length_tolerance_s: Override the default tolerance for the
|
||||
duration check. None uses the auto-scaled default
|
||||
(3s for normal tracks, 5s for >10min tracks).
|
||||
min_file_size_bytes: Override the minimum size threshold.
|
||||
|
||||
Returns:
|
||||
IntegrityResult with `ok`, `reason`, and per-check details.
|
||||
Never raises — all errors become `ok=False` with an explanatory
|
||||
reason, so callers can rely on a clean boolean.
|
||||
"""
|
||||
import os
|
||||
|
||||
checks: Dict[str, Any] = {}
|
||||
|
||||
# --- Check 1: file size ---
|
||||
try:
|
||||
size = os.path.getsize(file_path)
|
||||
except OSError as exc:
|
||||
return IntegrityResult(ok=False, reason=f"Cannot stat file: {exc}",
|
||||
checks={"size": "stat_failed"})
|
||||
|
||||
checks["size_bytes"] = size
|
||||
if size < min_file_size_bytes:
|
||||
return IntegrityResult(
|
||||
ok=False,
|
||||
reason=f"File too small ({size} bytes, minimum {min_file_size_bytes}) — "
|
||||
"likely truncated transfer or empty stub",
|
||||
checks=checks,
|
||||
)
|
||||
|
||||
# --- Check 2: mutagen parse ---
|
||||
try:
|
||||
from mutagen import File as MutagenFile
|
||||
except ImportError:
|
||||
# mutagen is a hard dep elsewhere in the codebase, but degrade
|
||||
# gracefully if it's somehow missing — pass with a warning
|
||||
# rather than failing every download.
|
||||
logger.warning("[Integrity] mutagen unavailable — skipping parse check")
|
||||
checks["mutagen_parse"] = "unavailable"
|
||||
return IntegrityResult(ok=True, checks=checks)
|
||||
|
||||
try:
|
||||
audio = MutagenFile(file_path)
|
||||
except Exception as exc:
|
||||
return IntegrityResult(
|
||||
ok=False,
|
||||
reason=f"Mutagen could not parse file: {exc}",
|
||||
checks={**checks, "mutagen_parse": "exception"},
|
||||
)
|
||||
|
||||
if audio is None:
|
||||
return IntegrityResult(
|
||||
ok=False,
|
||||
reason="Mutagen could not identify file format — likely corrupted "
|
||||
"or wrong file extension",
|
||||
checks={**checks, "mutagen_parse": "unidentified"},
|
||||
)
|
||||
|
||||
if audio.info is None:
|
||||
return IntegrityResult(
|
||||
ok=False,
|
||||
reason="Mutagen parsed file but found no audio info block — "
|
||||
"header damage suspected",
|
||||
checks={**checks, "mutagen_parse": "no_info"},
|
||||
)
|
||||
|
||||
actual_length_s = float(getattr(audio.info, "length", 0) or 0)
|
||||
checks["actual_length_s"] = actual_length_s
|
||||
|
||||
if actual_length_s <= 0:
|
||||
return IntegrityResult(
|
||||
ok=False,
|
||||
reason="Mutagen reports zero-length audio — file has no playable "
|
||||
"audio data",
|
||||
checks={**checks, "mutagen_parse": "zero_length"},
|
||||
)
|
||||
|
||||
# --- Check 3: duration agreement (optional) ---
|
||||
if expected_duration_ms is None or expected_duration_ms <= 0:
|
||||
checks["length_check"] = "skipped"
|
||||
return IntegrityResult(ok=True, checks=checks)
|
||||
|
||||
expected_length_s = expected_duration_ms / 1000.0
|
||||
checks["expected_length_s"] = expected_length_s
|
||||
|
||||
if length_tolerance_s is None:
|
||||
length_tolerance_s = (
|
||||
_LENGTH_TOLERANCE_LONG_TRACK_S
|
||||
if expected_length_s > _LONG_TRACK_THRESHOLD_S
|
||||
else _DEFAULT_LENGTH_TOLERANCE_S
|
||||
)
|
||||
checks["length_tolerance_s"] = length_tolerance_s
|
||||
|
||||
drift_s = abs(actual_length_s - expected_length_s)
|
||||
checks["length_drift_s"] = drift_s
|
||||
|
||||
if drift_s > length_tolerance_s:
|
||||
return IntegrityResult(
|
||||
ok=False,
|
||||
reason=f"Duration mismatch: file is {actual_length_s:.1f}s, "
|
||||
f"expected {expected_length_s:.1f}s "
|
||||
f"(drift {drift_s:.1f}s > tolerance {length_tolerance_s:.1f}s) — "
|
||||
"likely truncated download or wrong file matched",
|
||||
checks=checks,
|
||||
)
|
||||
|
||||
checks["length_check"] = "passed"
|
||||
return IntegrityResult(ok=True, checks=checks)
|
||||
@ -0,0 +1,254 @@
|
||||
"""Regression tests for file integrity checks on downloaded audio.
|
||||
|
||||
Discord-reported (fresh.dumbledore [VRN]): slskd sometimes hosts broken
|
||||
files (truncated transfers, corrupted FLAC, wrong file masquerading as
|
||||
the target). The integrity layer at ``core/imports/file_integrity.py``
|
||||
catches these before they reach tagging/library sync, using three
|
||||
universal checks: file-size sanity, mutagen parse, and duration
|
||||
agreement against the metadata-source-provided expected length.
|
||||
|
||||
These tests exercise the module directly with fabricated files (real
|
||||
mp3 + flac samples generated via mutagen-friendly stubs and a couple of
|
||||
hand-written WAV/FLAC files) so we don't need ffmpeg or live downloads.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import struct
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
from core.imports import file_integrity
|
||||
|
||||
|
||||
def _write_minimal_wav(path: Path, duration_s: float = 1.0, sample_rate: int = 8000) -> None:
|
||||
"""Write a minimal valid WAV file. Mutagen parses WAV via the
|
||||
standard wave module wrapper, giving us a real `info.length`
|
||||
we can assert against without needing ffmpeg."""
|
||||
n_samples = int(duration_s * sample_rate)
|
||||
n_channels = 1
|
||||
bits_per_sample = 16
|
||||
byte_rate = sample_rate * n_channels * bits_per_sample // 8
|
||||
block_align = n_channels * bits_per_sample // 8
|
||||
data_size = n_samples * block_align
|
||||
fmt_chunk = struct.pack(
|
||||
"<4sIHHIIHH",
|
||||
b"fmt ", 16, 1, n_channels, sample_rate, byte_rate, block_align, bits_per_sample,
|
||||
)
|
||||
data_chunk = struct.pack("<4sI", b"data", data_size) + (b"\x00\x00" * n_samples)
|
||||
riff = struct.pack("<4sI4s", b"RIFF", 4 + len(fmt_chunk) + len(data_chunk), b"WAVE")
|
||||
path.write_bytes(riff + fmt_chunk + data_chunk)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File size check
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_rejects_zero_byte_file(tmp_path: Path) -> None:
|
||||
"""A 0-byte file is the most common slskd-broken case."""
|
||||
f = tmp_path / "empty.flac"
|
||||
f.write_bytes(b"")
|
||||
|
||||
result = file_integrity.check_audio_integrity(str(f))
|
||||
|
||||
assert result.ok is False
|
||||
assert "too small" in result.reason.lower()
|
||||
assert result.checks["size_bytes"] == 0
|
||||
|
||||
|
||||
def test_rejects_tiny_stub(tmp_path: Path) -> None:
|
||||
"""A few hundred bytes can't be a real audio file — slskd dropped a stub."""
|
||||
f = tmp_path / "stub.mp3"
|
||||
f.write_bytes(b"x" * 500)
|
||||
|
||||
result = file_integrity.check_audio_integrity(str(f))
|
||||
|
||||
assert result.ok is False
|
||||
assert "too small" in result.reason.lower()
|
||||
|
||||
|
||||
def test_size_threshold_is_overridable(tmp_path: Path) -> None:
|
||||
"""Tests / dev workflows can lower the size threshold."""
|
||||
f = tmp_path / "small_but_intentional.bin"
|
||||
f.write_bytes(b"y" * 100)
|
||||
|
||||
# Should pass the size check at threshold=50, then fail mutagen parse
|
||||
# since it's not real audio.
|
||||
result = file_integrity.check_audio_integrity(str(f), min_file_size_bytes=50)
|
||||
|
||||
assert result.ok is False
|
||||
assert "mutagen" in result.reason.lower() or "could not" in result.reason.lower()
|
||||
|
||||
|
||||
def test_missing_file_returns_clean_failure(tmp_path: Path) -> None:
|
||||
"""No exception should escape — caller wants a clean boolean."""
|
||||
result = file_integrity.check_audio_integrity(str(tmp_path / "ghost.flac"))
|
||||
|
||||
assert result.ok is False
|
||||
assert "stat" in result.reason.lower() or "cannot" in result.reason.lower()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mutagen parse check
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_rejects_non_audio_file_with_audio_extension(tmp_path: Path) -> None:
|
||||
"""A text file renamed to .flac (sometimes happens when slskd matches
|
||||
a wrong file) should fail the parse check, not slip through."""
|
||||
f = tmp_path / "fake.flac"
|
||||
# Big enough to clear the size check, but not audio.
|
||||
f.write_bytes(b"this is definitely not flac data\n" * 1000)
|
||||
|
||||
result = file_integrity.check_audio_integrity(str(f))
|
||||
|
||||
assert result.ok is False
|
||||
# Either mutagen returns None (unidentified) or raises — either is a fail.
|
||||
assert "mutagen" in result.reason.lower() or "no info" in result.reason.lower() or "identify" in result.reason.lower()
|
||||
|
||||
|
||||
def test_accepts_valid_wav_with_no_expected_duration(tmp_path: Path) -> None:
|
||||
"""Real audio with no caller-provided duration should pass — only
|
||||
size + parse run."""
|
||||
f = tmp_path / "real.wav"
|
||||
_write_minimal_wav(f, duration_s=2.0)
|
||||
|
||||
result = file_integrity.check_audio_integrity(str(f))
|
||||
|
||||
assert result.ok is True
|
||||
assert result.checks["actual_length_s"] == pytest.approx(2.0, abs=0.1)
|
||||
assert result.checks["length_check"] == "skipped"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Duration agreement check
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_accepts_when_length_within_tolerance(tmp_path: Path) -> None:
|
||||
"""A 5-second file claiming 5.5 seconds (within 3s tolerance) passes."""
|
||||
f = tmp_path / "track.wav"
|
||||
_write_minimal_wav(f, duration_s=5.0)
|
||||
|
||||
result = file_integrity.check_audio_integrity(str(f), expected_duration_ms=5500)
|
||||
|
||||
assert result.ok is True
|
||||
assert result.checks["length_check"] == "passed"
|
||||
assert result.checks["length_drift_s"] == pytest.approx(0.5, abs=0.2)
|
||||
|
||||
|
||||
def test_rejects_truncated_file(tmp_path: Path) -> None:
|
||||
"""A 2-second file claiming to be a 30-second track is truncated.
|
||||
This is the headline slskd case — bytes stopped flowing partway
|
||||
through but slskd reported success."""
|
||||
f = tmp_path / "truncated.wav"
|
||||
_write_minimal_wav(f, duration_s=2.0)
|
||||
|
||||
result = file_integrity.check_audio_integrity(str(f), expected_duration_ms=30_000)
|
||||
|
||||
assert result.ok is False
|
||||
assert "duration" in result.reason.lower() or "drift" in result.reason.lower()
|
||||
assert result.checks["length_drift_s"] > 3.0
|
||||
|
||||
|
||||
def test_rejects_wrong_file_substituted(tmp_path: Path) -> None:
|
||||
"""A 10-second clip masquerading as a 3-minute album track. slskd
|
||||
matched on a similar filename but the actual content is a snippet."""
|
||||
f = tmp_path / "wrong.wav"
|
||||
_write_minimal_wav(f, duration_s=10.0)
|
||||
|
||||
result = file_integrity.check_audio_integrity(str(f), expected_duration_ms=180_000)
|
||||
|
||||
assert result.ok is False
|
||||
assert result.checks["length_drift_s"] > 100
|
||||
|
||||
|
||||
def test_long_track_uses_wider_tolerance(tmp_path: Path) -> None:
|
||||
"""Tracks over 10 minutes get 5s tolerance instead of 3s — long
|
||||
tracks naturally drift more (intros, outros, encoder padding)."""
|
||||
# Write a 12-minute file (720s) but at minimum sample rate to keep
|
||||
# the test fast — under 30KB total.
|
||||
f = tmp_path / "long.wav"
|
||||
_write_minimal_wav(f, duration_s=720.0, sample_rate=8000)
|
||||
|
||||
# Claim 724 seconds — 4s drift, which would fail the 3s default but
|
||||
# passes the 5s long-track threshold.
|
||||
result = file_integrity.check_audio_integrity(str(f), expected_duration_ms=724_000)
|
||||
|
||||
assert result.ok is True
|
||||
assert result.checks["length_tolerance_s"] == pytest.approx(5.0)
|
||||
|
||||
|
||||
def test_caller_can_override_tolerance(tmp_path: Path) -> None:
|
||||
"""Edge cases (e.g. live recordings, known-flaky sources) can opt
|
||||
into a wider tolerance per-call."""
|
||||
f = tmp_path / "loose.wav"
|
||||
_write_minimal_wav(f, duration_s=5.0)
|
||||
|
||||
# 8-second drift — would fail default 3s, passes explicit 10s.
|
||||
result = file_integrity.check_audio_integrity(
|
||||
str(f), expected_duration_ms=13_000, length_tolerance_s=10.0,
|
||||
)
|
||||
|
||||
assert result.ok is True
|
||||
|
||||
|
||||
def test_zero_expected_duration_skips_length_check(tmp_path: Path) -> None:
|
||||
"""Some metadata sources don't carry duration — duration check
|
||||
must be skipped, not treated as a 0-length match."""
|
||||
f = tmp_path / "no_duration.wav"
|
||||
_write_minimal_wav(f, duration_s=5.0)
|
||||
|
||||
result = file_integrity.check_audio_integrity(str(f), expected_duration_ms=0)
|
||||
|
||||
assert result.ok is True
|
||||
assert result.checks["length_check"] == "skipped"
|
||||
|
||||
|
||||
def test_negative_expected_duration_skips_length_check(tmp_path: Path) -> None:
|
||||
"""Defensive: bad metadata returning negative duration shouldn't
|
||||
crash or false-reject."""
|
||||
f = tmp_path / "neg_duration.wav"
|
||||
_write_minimal_wav(f, duration_s=5.0)
|
||||
|
||||
result = file_integrity.check_audio_integrity(str(f), expected_duration_ms=-100)
|
||||
|
||||
assert result.ok is True
|
||||
assert result.checks["length_check"] == "skipped"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Failure-mode robustness
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_check_never_raises(tmp_path: Path, monkeypatch) -> None:
|
||||
"""The integrity check is wrapped in try/except in pipeline.py but
|
||||
callers shouldn't have to. Verify that even pathological inputs
|
||||
return a clean IntegrityResult."""
|
||||
f = tmp_path / "test.wav"
|
||||
_write_minimal_wav(f, duration_s=2.0)
|
||||
|
||||
# Force a mutagen import-time failure by stubbing the import.
|
||||
# Should NOT raise — should pass gracefully (mutagen unavailable).
|
||||
real_import = __builtins__["__import__"] if isinstance(__builtins__, dict) else __builtins__.__import__
|
||||
|
||||
def _broken_import(name, *args, **kwargs):
|
||||
if name == "mutagen":
|
||||
raise ImportError("simulated missing mutagen")
|
||||
return real_import(name, *args, **kwargs)
|
||||
|
||||
monkeypatch.setitem(__builtins__ if isinstance(__builtins__, dict) else __builtins__.__dict__,
|
||||
"__import__", _broken_import)
|
||||
|
||||
try:
|
||||
result = file_integrity.check_audio_integrity(str(f))
|
||||
except Exception as e:
|
||||
pytest.fail(f"check_audio_integrity raised: {e}")
|
||||
|
||||
assert result.ok is True
|
||||
assert result.checks.get("mutagen_parse") == "unavailable"
|
||||
Loading…
Reference in new issue