mirror of https://github.com/Nezreka/SoulSync.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
224 lines
8.2 KiB
224 lines
8.2 KiB
"""Audio file integrity checks for downloaded files.
|
|
|
|
slskd (and other download sources) sometimes ship broken files: truncated
transfers, corrupted FLAC frames, mp3s with bad headers, or wrong files
that share a name with the target. These slip past the slskd "completed"
status and only get caught later (often by Plex/Jellyfin failing to scan
the file, or by users hearing dead air).

Verification runs after the slskd transfer settles but before the heavy
post-processing work (tagging, copying, server sync). Failed files get
quarantined and the slot is freed for a retry from another candidate.

Three checks, in order from cheapest to most expensive:

1. **File-size sanity** — anything below ~10KB is almost certainly a
   stub, broken transfer, or non-audio masquerading as audio.
2. **Mutagen parse** — catches truncated headers, corrupted stream headers,
   wrong-format files (mp3 with .flac extension, etc). If mutagen can't
   parse the audio info block, the file won't import cleanly downstream.
3. **Duration agreement** — if the caller provides an expected duration
   (Spotify/MusicBrainz `duration_ms`), the length mutagen reports must
   agree within tolerance. Catches truncated files whose headers parse
   fine but whose audio is incomplete, and "wrong file" cases where the
   slskd transfer matched on a similarly-named track.

This is the "tier 1" integrity layer — universal across formats, no
external binary dep. A future tier could verify the FLAC STREAMINFO MD5
by actually decoding the audio (requires `flac` binary or libflac
wrapper); skipped for now since tier 1 catches the vast majority of
real-world corruption.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, Dict, Optional
|
|
|
|
from utils.logging_config import get_logger
|
|
|
|
|
|
logger = get_logger("imports.file_integrity")
|
|
|
|
# Smallest size accepted as a plausible audio file. One second of 64kbps
# mp3 is roughly 8KB and one second of FLAC is far larger, so anything
# below this threshold is treated as a broken stub.
_MIN_FILE_SIZE_BYTES = 10 * 1024

# Tolerances for the duration-agreement check. Legitimate length
# differences (intro silence, encoder padding, live recording trims)
# mostly stay within 3 seconds. Tracks whose expected duration exceeds
# the long-track threshold get 5 seconds instead, because absolute drift
# scales with length.
_DEFAULT_LENGTH_TOLERANCE_S = 3.0
_LENGTH_TOLERANCE_LONG_TRACK_S = 5.0
_LONG_TRACK_THRESHOLD_S = 600.0  # 10 minutes

# Ceiling applied to the user-configurable tolerance override. Beyond 60s
# the check is effectively off, so the cap keeps accidental nonsense such
# as 9999 from producing misleading log lines. Users who genuinely want
# to disable the check can set 60.
_MAX_USER_TOLERANCE_S = 60.0
|
|
|
|
|
|
def resolve_duration_tolerance(value: Any) -> Optional[float]:
    """Coerce a user-configured tolerance value to a float override.

    Args:
        value: Raw setting as read from configuration — may be None, a
            number, or a numeric string.

    Returns:
        - None when value is missing / 0 / negative / NaN / unparseable,
          so callers fall back to the auto-scaled defaults (3s/5s).
        - float in (0, _MAX_USER_TOLERANCE_S] when value is a positive
          number or numeric string — clamped to the upper bound (so
          "inf" yields the cap).

    Pure helper. No I/O. Drives the `length_tolerance_s` override on
    `check_audio_integrity`.
    """
    import math

    if value is None:
        return None
    try:
        parsed = float(value)
    except (TypeError, ValueError):
        return None
    # NaN compares False against every bound, so without this explicit
    # test it would be returned as the override and make every later
    # `drift > tolerance` comparison False — silently disabling the
    # duration check. Treat it like any other unparseable value.
    if math.isnan(parsed) or parsed <= 0:
        return None
    return min(parsed, _MAX_USER_TOLERANCE_S)
|
|
|
|
|
|
@dataclass
class IntegrityResult:
    """Result record produced by an integrity check.

    Callers normally only need `ok`. When `ok` is False, `reason` holds
    a human-readable explanation (suitable for a quarantine sidecar, log
    lines, or the UI). `checks` collects the per-check details, which
    is mainly useful for debugging and tests.
    """

    # Single pass/fail bit for the whole check.
    ok: bool
    # Explanation of the failure; left empty by default.
    reason: str = ""
    # Per-check details; each instance gets its own fresh dict.
    checks: Dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
def check_audio_integrity(
    file_path: str,
    expected_duration_ms: Optional[int] = None,
    *,
    length_tolerance_s: Optional[float] = None,
    min_file_size_bytes: int = _MIN_FILE_SIZE_BYTES,
) -> IntegrityResult:
    """Verify a downloaded audio file is not broken.

    Runs up to three checks, cheapest first, and returns at the first
    failure: file size, mutagen parse (including a non-zero reported
    length), and — when an expected duration is supplied — duration
    agreement.

    Args:
        file_path: Path to the audio file on disk.
        expected_duration_ms: Expected track length from the metadata
            source (Spotify/MB/etc). If None or non-positive, the
            duration check is skipped and only the size + parse checks
            run.
        length_tolerance_s: Override the default tolerance for the
            duration check. None (or NaN) uses the auto-scaled default
            (3s for normal tracks, 5s for >10min tracks).
        min_file_size_bytes: Override the minimum size threshold.

    Returns:
        IntegrityResult with `ok`, `reason`, and per-check details.
        Failures surface as `ok=False` with an explanatory reason
        rather than as exceptions, so callers can rely on a clean
        boolean.
    """
    import math
    import os

    checks: Dict[str, Any] = {}

    # --- Check 1: file size ---
    try:
        size = os.path.getsize(file_path)
    except (OSError, TypeError, ValueError) as exc:
        # OSError: missing / unreadable file. TypeError: file_path is not
        # path-like (e.g. None). ValueError: embedded NUL in the path.
        # All three mean "cannot stat" and must not escape to the caller.
        return IntegrityResult(ok=False, reason=f"Cannot stat file: {exc}",
                               checks={"size": "stat_failed"})

    checks["size_bytes"] = size
    if size < min_file_size_bytes:
        return IntegrityResult(
            ok=False,
            reason=f"File too small ({size} bytes, minimum {min_file_size_bytes}) — "
                   "likely truncated transfer or empty stub",
            checks=checks,
        )

    # --- Check 2: mutagen parse ---
    try:
        from mutagen import File as MutagenFile
    except ImportError:
        # mutagen is a hard dep elsewhere in the codebase, but degrade
        # gracefully if it's somehow missing — pass with a warning
        # rather than failing every download.
        logger.warning("[Integrity] mutagen unavailable — skipping parse check")
        checks["mutagen_parse"] = "unavailable"
        return IntegrityResult(ok=True, checks=checks)

    try:
        audio = MutagenFile(file_path)
    except Exception as exc:
        # Deliberately broad: any parser failure means the file is unusable.
        return IntegrityResult(
            ok=False,
            reason=f"Mutagen could not parse file: {exc}",
            checks={**checks, "mutagen_parse": "exception"},
        )

    if audio is None:
        return IntegrityResult(
            ok=False,
            reason="Mutagen could not identify file format — likely corrupted "
                   "or wrong file extension",
            checks={**checks, "mutagen_parse": "unidentified"},
        )

    if audio.info is None:
        return IntegrityResult(
            ok=False,
            reason="Mutagen parsed file but found no audio info block — "
                   "header damage suspected",
            checks={**checks, "mutagen_parse": "no_info"},
        )

    # A missing or falsy `length` attribute is normalised to 0.0 so it is
    # rejected by the zero-length test below.
    actual_length_s = float(getattr(audio.info, "length", 0) or 0)
    checks["actual_length_s"] = actual_length_s

    if actual_length_s <= 0:
        return IntegrityResult(
            ok=False,
            reason="Mutagen reports zero-length audio — file has no playable "
                   "audio data",
            checks={**checks, "mutagen_parse": "zero_length"},
        )

    # --- Check 3: duration agreement (optional) ---
    if expected_duration_ms is None or expected_duration_ms <= 0:
        checks["length_check"] = "skipped"
        return IntegrityResult(ok=True, checks=checks)

    expected_length_s = expected_duration_ms / 1000.0
    checks["expected_length_s"] = expected_length_s

    # A NaN tolerance would make `drift_s > tolerance` always False and
    # silently pass every file, so treat it the same as "no override".
    if length_tolerance_s is None or math.isnan(length_tolerance_s):
        length_tolerance_s = (
            _LENGTH_TOLERANCE_LONG_TRACK_S
            if expected_length_s > _LONG_TRACK_THRESHOLD_S
            else _DEFAULT_LENGTH_TOLERANCE_S
        )
    checks["length_tolerance_s"] = length_tolerance_s

    drift_s = abs(actual_length_s - expected_length_s)
    checks["length_drift_s"] = drift_s

    if drift_s > length_tolerance_s:
        return IntegrityResult(
            ok=False,
            reason=f"Duration mismatch: file is {actual_length_s:.1f}s, "
                   f"expected {expected_length_s:.1f}s "
                   f"(drift {drift_s:.1f}s > tolerance {length_tolerance_s:.1f}s) — "
                   "likely truncated download or wrong file matched",
            checks=checks,
        )

    checks["length_check"] = "passed"
    return IntegrityResult(ok=True, checks=checks)
|