diff --git a/core/downloads/validation.py b/core/downloads/validation.py
index 79131472..3ac38751 100644
--- a/core/downloads/validation.py
+++ b/core/downloads/validation.py
@@ -9,6 +9,7 @@ import logging
 import re
 
 from config.settings import config_manager
+from core.imports.file_integrity import resolve_duration_tolerance
 
 logger = logging.getLogger(__name__)
 
@@ -61,6 +62,49 @@ def filter_soundcloud_previews(results, expected_track):
     return [r for r in results if not _is_preview(r)]
 
 
+# Structured sources whose candidates are rejected before download when their
+# reported duration drifts beyond the integrity tolerance.
+_STRICT_DURATION_SOURCES = frozenset({'tidal', 'qobuz', 'hifi', 'deezer_dl', 'amazon'})
+
+# Fallback tolerances used when no override is configured.
+# NOTE(review): presumably mirrors the core.imports.file_integrity defaults; confirm.
+_LONG_TRACK_THRESHOLD_SECONDS = 600.0
+_LONG_TRACK_TOLERANCE_SECONDS = 5.0
+_DEFAULT_TOLERANCE_SECONDS = 3.0
+
+
+def _duration_tolerance_seconds(expected_duration_ms):
+    """Return the allowed duration drift in seconds for a track.
+
+    A tolerance resolved from the ``post_processing.duration_tolerance_seconds``
+    setting takes precedence; otherwise tracks longer than ten minutes get a
+    wider tolerance than shorter ones.
+    """
+    override = resolve_duration_tolerance(
+        config_manager.get('post_processing.duration_tolerance_seconds', 0)
+    )
+    if override is not None:
+        return override
+    expected_seconds = expected_duration_ms / 1000.0
+    if expected_seconds > _LONG_TRACK_THRESHOLD_SECONDS:
+        return _LONG_TRACK_TOLERANCE_SECONDS
+    return _DEFAULT_TOLERANCE_SECONDS
+
+
+def _duration_mismatch_exceeds_integrity_tolerance(expected_duration_ms, candidate_duration_ms):
+    """Return True when the candidate duration drifts beyond the tolerance.
+
+    Both durations are in milliseconds. A missing or zero duration on either
+    side returns False, so candidates without duration data are never rejected
+    by this check.
+    """
+    if not expected_duration_ms or not candidate_duration_ms:
+        return False
+    tolerance = _duration_tolerance_seconds(expected_duration_ms)
+    drift = abs((candidate_duration_ms / 1000.0) - (expected_duration_ms / 1000.0))
+    return drift > tolerance
+
+
 def get_valid_candidates(results, spotify_track, query):
     """
     This function is a direct port from sync.py. It scores and filters
@@ -98,7 +142,22 @@ def get_valid_candidates(results, spotify_track, query):
     expected_is_version = any(kw in expected_title_lower for kw in _version_keywords)
 
     scored = []
     for r in results:
+        # Skip structured-source candidates that post-processing would
+        # quarantine for the same duration mismatch after download.
+        if (
+            r.username in _STRICT_DURATION_SOURCES
+            and _duration_mismatch_exceeds_integrity_tolerance(expected_duration, r.duration or 0)
+        ):
+            logger.info(
+                "[%s] Rejecting candidate due to duration mismatch before download: "
+                "expected %.1fs, candidate %.1fs",
+                source_label,
+                expected_duration / 1000.0,
+                (r.duration or 0) / 1000.0,
+            )
+            continue
+
         # Score using matching engine's generic scorer (same weights as Soulseek)
         confidence, match_type = matching_engine.score_track_match(
             source_title=expected_title,
diff --git a/tests/downloads/test_downloads_validation.py b/tests/downloads/test_downloads_validation.py
index 62798523..ddb5346a 100644
--- a/tests/downloads/test_downloads_validation.py
+++ b/tests/downloads/test_downloads_validation.py
@@ -11,19 +11,32 @@ from __future__ import annotations
 from dataclasses import dataclass
 from typing import Optional
 
-from core.downloads.validation import filter_soundcloud_previews
+from core.downloads import validation
+from core.downloads.validation import filter_soundcloud_previews, get_valid_candidates
 
 
 @dataclass
 class _Track:
     duration_ms: int
+    name: str = 'Song'
+    artists: tuple[str, ...] = ('Artist',)
 
 
 @dataclass
 class _Candidate:
     username: str
     duration: Optional[int]  # milliseconds
-    title: str = ''
+    title: str = 'Song'
+    artist: str = 'Artist'
+    filename: str = 'candidate'
+
+
+class _MatchingEngine:
+    def score_track_match(self, **kwargs):
+        return 0.99, 'core_title_match'
+
+    def normalize_string(self, text):
+        return (text or '').lower()
 
 
 def test_drops_soundcloud_30s_preview_when_expected_long():
@@ -90,3 +103,23 @@ def test_keeps_soundcloud_candidate_at_threshold():
     # 110s passes both checks: > 35s AND > 100s (half of 200s)
     cand = _Candidate(username='soundcloud', duration=110_000)
     assert filter_soundcloud_previews([cand], expected) == [cand]
+
+
+def test_rejects_tidal_candidate_that_would_fail_integrity_duration(monkeypatch):
+    """Structured sources should not download candidates that post-processing
+    will immediately quarantine for the same duration mismatch."""
+    monkeypatch.setattr(validation, 'matching_engine', _MatchingEngine())
+    expected = _Track(duration_ms=338_000)
+    wrong_tidal = _Candidate(username='tidal', duration=30_000)
+
+    assert get_valid_candidates([wrong_tidal], expected, 'Artist Song') == []
+
+
+def test_keeps_tidal_candidate_inside_integrity_duration_tolerance(monkeypatch):
+    monkeypatch.setattr(validation, 'matching_engine', _MatchingEngine())
+    expected = _Track(duration_ms=338_000)
+    tidal = _Candidate(username='tidal', duration=340_000)
+
+    result = get_valid_candidates([tidal], expected, 'Artist Song')
+
+    assert result == [tidal]