You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/core/downloads/validation.py

263 lines
12 KiB

"""Soulseek/streaming candidate validation — lifted from web_server.py.
Body is byte-identical to the original. ``matching_engine`` and
``download_orchestrator`` are injected via init() because both are
constructed in web_server.py and referenced by name throughout
the body.
"""
import logging
import re
from difflib import SequenceMatcher

from config.settings import config_manager
# Logger for this module, keyed by the module's import name.
logger = logging.getLogger(__name__)

# Collaborators constructed in web_server.py. Both remain None until init()
# rebinds them, so nothing in this module may use them at import time.
matching_engine = download_orchestrator = None
def init(matching_engine_obj, download_orchestrator_obj):
    """Store the web_server-owned collaborators in this module's globals.

    Args:
        matching_engine_obj: Object bound to the module-level
            ``matching_engine`` name.
        download_orchestrator_obj: Object bound to the module-level
            ``download_orchestrator`` name.
    """
    global matching_engine, download_orchestrator
    matching_engine, download_orchestrator = (
        matching_engine_obj,
        download_orchestrator_obj,
    )
def filter_soundcloud_previews(results, expected_track):
    """Strip SoundCloud preview clips out of a list of search results.

    SoundCloud serves a ~30s preview clip for tracks gated behind Go+ /
    login. If such a clip stays in the candidate list it is downloaded,
    fails the integrity check as a truncated file, and remains clickable
    in the modal for a retry that fetches the same preview again. This
    helper is meant to run wherever raw search results enter a task
    (validation scoring, the modal-cache fallback, the not-found cache).

    A candidate is dropped only when all of the following hold:
      * its ``username`` is ``'soundcloud'``;
      * it reports a positive ``duration`` (milliseconds);
      * it is shorter than 35 seconds, or shorter than half of the
        expected track's length.

    Genuinely short expected tracks (60 seconds or less: intros, sound
    effects) disable the filter entirely, as does a missing or
    non-positive expected duration.

    Args:
        results: Candidate objects; ``username`` and ``duration`` are read
            with ``getattr`` so missing attributes are tolerated.
        expected_track: Object exposing ``duration_ms``; may be falsy.

    Returns:
        ``results`` itself when the filter does not apply, otherwise a new
        list holding the surviving candidates in their original order.
    """
    if not (results and expected_track):
        return results

    track_ms = getattr(expected_track, 'duration_ms', 0) or 0
    if track_ms <= 0:
        return results

    track_secs = track_ms / 1000.0
    if track_secs <= 60:
        # Short expected track: a short candidate may be the real thing.
        return results

    kept = []
    for candidate in results:
        if getattr(candidate, 'username', None) == 'soundcloud':
            clip_ms = getattr(candidate, 'duration', None) or 0
            # Unknown or zero durations cannot be judged, so they are kept.
            if clip_ms > 0:
                clip_secs = clip_ms / 1000.0
                if clip_secs < 35 or clip_secs < track_secs * 0.5:
                    continue
        kept.append(candidate)
    return kept
# Sources that return structured API results (artist/title metadata) rather
# than Soulseek-style file paths.
_STREAMING_SOURCES = ("youtube", "tidal", "qobuz", "hifi", "deezer_dl", "soundcloud")

# Title keywords that mark a specific version of a track.
_VERSION_KEYWORDS = (
    'remix', 'live', 'acoustic', 'instrumental', 'radio edit',
    'extended', 'slowed', 'sped up', 'reverb', 'karaoke',
    # Producer-tag noise common on SoundCloud — "type beat" is an
    # instrumental track produced in someone's style, tagged with the
    # artist name to game search. NEVER the real song.
    'type beat',
)

# A keyword only counts when it is not glued to a preceding ASCII letter.
# A plain substring test finds "live" inside "Believer" / "Alive" / "Olive",
# which misclassifies the expected track as a specific version and lets
# remixes through unpenalized. Suffixed forms ("remixed", "instrumentals")
# still match because only the left edge is anchored. Titles are lower-cased
# before matching.
_VERSION_KEYWORD_PATTERNS = tuple(
    re.compile(r'(?<![a-z])' + re.escape(keyword)) for keyword in _VERSION_KEYWORDS
)


def _source_label(username):
    """Return the log label for a source username (``deezer_dl`` -> ``Deezer``)."""
    return username.replace('_dl', '').title()


def _best_artist_similarity(expected_artists, candidate_artist_raw):
    """Return the best 0.0-1.0 similarity between a candidate artist and any
    expected artist, or ``None`` when no expected artist could be compared.
    """
    candidate_raw = candidate_artist_raw or ''
    candidate_norm = matching_engine.normalize_string(candidate_raw) or ''
    best = None
    for expected in expected_artists or []:
        if not expected or not expected.strip():
            continue
        expected_norm = matching_engine.normalize_string(expected) or ''
        if len(expected_norm) <= 2:
            # For short normalized names (e.g. "B小町" -> "b") containment is
            # useless, so compare the original Unicode strings instead.
            # NOTE(review): a name that normalizes to nothing at all
            # (presumably possible for fully non-Latin names) takes this
            # path too instead of being skipped and scored as 0.0.
            ratio = SequenceMatcher(None, expected.lower(), candidate_raw.lower()).ratio()
        elif re.search(r'\b' + re.escape(expected_norm) + r'\b', candidate_norm):
            # Word-boundary containment covers formatting variations such as
            # "Beatles"/"The Beatles" or "Maduk"/"Maduk feat. X".
            return 1.0
        elif expected_norm == candidate_norm:
            # Reached for names whose edges are non-word characters, where
            # the \b anchors above cannot match.
            return 1.0
        else:
            ratio = SequenceMatcher(None, expected_norm, candidate_norm).ratio()
        best = ratio if best is None else max(best, ratio)
    return best


def _score_streaming_candidates(results, spotify_track):
    """Score streaming results against the expected track and return those
    that clear the version penalty, the artist gate and the 0.60 threshold.
    """
    expected_artists = spotify_track.artists if spotify_track else []
    expected_title = spotify_track.name if spotify_track else ''
    expected_duration = spotify_track.duration_ms if spotify_track else 0

    # Non-empty when the expected track itself is a specific version
    # (live, remix, acoustic, ...).
    expected_title_lower = (expected_title or '').lower()
    expected_version_patterns = [
        pattern for pattern in _VERSION_KEYWORD_PATTERNS
        if pattern.search(expected_title_lower)
    ]

    scored = []
    for r in results:
        # Generic scorer from the matching engine (same weights as Soulseek).
        confidence, match_type = matching_engine.score_track_match(
            source_title=expected_title,
            source_artists=expected_artists,
            source_duration_ms=expected_duration,
            candidate_title=r.title or '',
            candidate_artists=[r.artist] if r.artist else [],
            candidate_duration_ms=r.duration or 0,
        )

        r_title_lower = (r.title or '').lower()
        if expected_version_patterns:
            # Expecting a specific version — penalize results that lack it.
            is_wrong_version = any(
                not pattern.search(r_title_lower) for pattern in expected_version_patterns
            )
            penalty = 0.5
        else:
            # Expecting the original — heavily penalize any version.
            is_wrong_version = any(
                pattern.search(r_title_lower) for pattern in _VERSION_KEYWORD_PATTERNS
            )
            penalty = 0.4
        if is_wrong_version:
            confidence *= penalty

        # Artist gate — streaming APIs (Tidal/Qobuz/HiFi/Deezer) have reliable
        # metadata, so "My Will" by "B. Starr" should never match expected
        # "B小町". Skipped for YouTube, whose artist is parsed from video
        # titles and often unreliable.
        if r.username != 'youtube':
            artist_similarity = _best_artist_similarity(expected_artists, r.artist)
            # The 0.5 bound (rather than 0.4) closes a fencepost case where
            # SequenceMatcher scores unrelated names such as "maduk" vs
            # "tom walker" at exactly 0.400. Candidates at 0.85+ confidence
            # bypass the gate. When there is no expected artist to compare
            # against, the gate is skipped, mirroring the Soulseek path's
            # "can't verify, accept candidate" rule.
            if artist_similarity is not None and artist_similarity < 0.5 and confidence < 0.85:
                continue

        r.confidence = confidence
        r.version_type = 'wrong_version' if is_wrong_version else match_type
        if confidence >= 0.60:
            scored.append(r)
    return scored


def _path_mentions_artist(filename, artist_word_sets):
    """Return True when one path segment of *filename* contains every word of
    at least one artist.

    Checking segment by segment (folders + file name) avoids false positives
    where a short artist name like "Sia" matches inside a folder name like
    "Enthusiastic".
    """
    for segment in re.split(r'[/\\]', filename):
        if not segment:
            continue
        segment_words = set(matching_engine.normalize_string(segment).split())
        if not segment_words:
            continue
        if any(words.issubset(segment_words) for words in artist_word_sets):
            return True
    return False


def get_valid_candidates(results, spotify_track, query):
    """Score and filter search results against the expected Spotify track.

    Streaming results (first result's ``username`` is one of the streaming
    sources) are scored with ``matching_engine.score_track_match``, penalized
    for version mismatches, passed through an artist gate, and returned
    sorted by confidence. If none survive, every streaming source except
    YouTube returns ``[]``; YouTube falls through to the Soulseek pipeline.

    Soulseek results go through
    ``matching_engine.find_best_slskd_matches_enhanced``, the user's quality
    profile, and a path-based artist verification.

    Args:
        results: Search result objects. Streaming results expose
            ``username``, ``title``, ``artist`` and ``duration``; Soulseek
            results expose ``username`` and ``filename``.
        spotify_track: Expected track exposing ``name``, ``artists`` and
            ``duration_ms``; ``None`` is tolerated by the code in this
            function.
        query: Unused; kept for interface compatibility with callers.

    Returns:
        A list of accepted candidates (possibly empty). Streaming candidates
        that get past the artist gate have ``confidence`` and
        ``version_type`` set on them, including those later dropped by the
        confidence threshold.
    """
    if not results:
        return []

    # Pre-filter: drop SoundCloud preview snippets when the expected duration
    # is non-trivially long.
    results = filter_soundcloud_previews(results, spotify_track)
    if not results:
        return []

    first_source = results[0].username
    if first_source in _STREAMING_SOURCES:
        source_label = _source_label(first_source)
        scored = _score_streaming_candidates(results, spotify_track)
        if scored:
            # Best match first.
            scored.sort(key=lambda x: x.confidence, reverse=True)
            best = scored[0]
            logger.info(f"[{source_label}] {len(scored)}/{len(results)} candidates passed validation "
                        f"(best: {best.confidence:.2f} '{best.artist} - {best.title}')")
            return scored
        if first_source != 'youtube':
            # Tidal/Qobuz/HiFi/Deezer have structured metadata; don't fall
            # back to filename matching.
            logger.warning(f"[{source_label}] No streaming results passed validation (threshold: 0.60, artist gate: 0.50) — rejecting all candidates")
            return []
        # YouTube artist data is unreliable, so allow the filename-based
        # matching below as a fallback.
        logger.warning(f"[{source_label}] No streaming results passed validation — falling through to filename matching")

    # Soulseek P2P results: score with the matching engine.
    _max_q = config_manager.get('soulseek.max_peer_queue', 0) or 0
    initial_candidates = matching_engine.find_best_slskd_matches_enhanced(
        spotify_track, results, max_peer_queue=_max_q)
    if not initial_candidates:
        return []

    # Streaming results that got here skip both the quality filter and the
    # path-based artist check.
    if initial_candidates[0].username in _STREAMING_SOURCES:
        source_label = _source_label(initial_candidates[0].username)
        logger.info(f"[{source_label}] Skipping quality filter - streaming source handles quality internally")
        return list(initial_candidates)

    # Filter by the user's quality profile before artist verification, using
    # the already-constructed download_orchestrator.
    quality_filtered_candidates = download_orchestrator.client('soulseek').filter_results_by_quality_preference(initial_candidates)

    # IMPORTANT: respect empty results from the quality filter. With strict
    # requirements (e.g. FLAC-only with fallback disabled) and no match, the
    # download should fail rather than be forced into a fallback.
    if not quality_filtered_candidates:
        logger.error("[Quality Filter] No candidates match quality profile - download will fail per user preferences")
        return []

    # Pre-normalize artist names into word sets via the matching engine.
    spotify_artists = getattr(spotify_track, 'artists', None) or []
    artist_word_sets = []
    for artist_name in spotify_artists:
        words = set(matching_engine.normalize_string(artist_name).split())
        if words:
            artist_word_sets.append(words)

    # No artist info available — can't verify, accept every candidate.
    if not artist_word_sets:
        return list(quality_filtered_candidates)

    return [
        candidate for candidate in quality_filtered_candidates
        if _path_mentions_artist(candidate.filename, artist_word_sets)
    ]