You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/core/downloads/validation.py

214 lines
10 KiB

"""Soulseek/streaming candidate validation — lifted from web_server.py.
Body is byte-identical to the original. ``matching_engine`` and
``soulseek_client`` are injected via init() because both are
constructed in web_server.py and referenced by name throughout
the body.
"""
import logging
import re
from config.settings import config_manager
logger = logging.getLogger(__name__)
# Injected at runtime via init().
matching_engine = None
soulseek_client = None
def init(matching_engine_obj, soulseek_client_obj):
"""Bind the matching engine and download orchestrator from web_server."""
global matching_engine, soulseek_client
matching_engine = matching_engine_obj
soulseek_client = soulseek_client_obj
def get_valid_candidates(results, spotify_track, query):
"""
This function is a direct port from sync.py. It scores and filters
Soulseek search results against a Spotify track to find the best, most
accurate download candidates.
"""
if not results:
return []
# Streaming sources (YouTube, Tidal, Qobuz, HiFi, Deezer) return structured API results
# with proper artist/title metadata — score using the same matching engine as Soulseek
_streaming_sources = ("youtube", "tidal", "qobuz", "hifi", "deezer_dl")
if results[0].username in _streaming_sources:
source_label = results[0].username.replace('_dl', '').title()
expected_artists = spotify_track.artists if spotify_track else []
expected_title = spotify_track.name if spotify_track else ''
expected_duration = spotify_track.duration_ms if spotify_track else 0
# Detect if the expected track is a specific version (live, remix, acoustic, etc.)
expected_title_lower = (expected_title or '').lower()
_version_keywords = ['remix', 'live', 'acoustic', 'instrumental', 'radio edit',
'extended', 'slowed', 'sped up', 'reverb', 'karaoke']
expected_is_version = any(kw in expected_title_lower for kw in _version_keywords)
scored = []
for r in results:
# Score using matching engine's generic scorer (same weights as Soulseek)
confidence, match_type = matching_engine.score_track_match(
source_title=expected_title,
source_artists=expected_artists,
source_duration_ms=expected_duration,
candidate_title=r.title or '',
candidate_artists=[r.artist] if r.artist else [],
candidate_duration_ms=r.duration or 0,
)
# Version detection penalty — reject live/remix/acoustic when expecting original
r_title_lower = (r.title or '').lower()
is_wrong_version = False
if not expected_is_version:
# Expecting original — penalize versions
for kw in _version_keywords:
if kw in r_title_lower and kw not in expected_title_lower:
confidence *= 0.4 # Heavy penalty
is_wrong_version = True
break
else:
# Expecting specific version — penalize results that don't have it
for kw in _version_keywords:
if kw in expected_title_lower and kw not in r_title_lower:
confidence *= 0.5
is_wrong_version = True
break
# Artist gate — streaming APIs (Tidal/Qobuz/HiFi/Deezer) have reliable metadata,
# so "My Will" by "B. Starr" should never match expected "B小町".
# Skip for YouTube — artist is parsed from video titles and often unreliable.
if r.username != 'youtube':
from difflib import SequenceMatcher
import re as _re
_cand_artist_raw = r.artist or ''
_cand_artist = matching_engine.normalize_string(_cand_artist_raw)
_best_artist = 0.0
for _ea in expected_artists:
_ea_norm = matching_engine.normalize_string(_ea)
if not _ea_norm:
continue
# For short normalized names (e.g. "B小町"→"b"), containment is useless.
# Compare original Unicode strings directly via similarity instead.
if len(_ea_norm) <= 2:
_best_artist = max(_best_artist, SequenceMatcher(None, _ea.lower(), _cand_artist_raw.lower()).ratio())
elif _re.search(r'\b' + _re.escape(_ea_norm) + r'\b', _cand_artist):
_best_artist = 1.0
break
elif _ea_norm == _cand_artist:
_best_artist = 1.0
break
else:
_best_artist = max(_best_artist, SequenceMatcher(None, _ea_norm, _cand_artist).ratio())
# Raised from 0.4 → 0.5 to close a fencepost bug: SequenceMatcher
# returns exactly 0.400 for "maduk" vs "tom walker" (5 chars vs
# 10 chars with 2 coincidental char matches), which bypassed the
# strict `< 0.4` check and let Tom Walker through as a candidate
# for a Maduk track. The word-boundary containment check above
# already short-circuits legitimate formatting variations
# ("Beatles"/"The Beatles", "Maduk"/"Maduk feat. X") to sim=1.0,
# so falling to SequenceMatcher means the strings are genuinely
# different. 0.5 gives a safer buffer without blocking real
# matches that would have scored above 0.85 anyway.
if _best_artist < 0.5 and confidence < 0.85:
continue
r.confidence = confidence
r.version_type = 'wrong_version' if is_wrong_version else match_type
if confidence >= 0.60:
scored.append(r)
if scored:
# Sort by confidence (best match first)
scored.sort(key=lambda x: x.confidence, reverse=True)
best = scored[0]
logger.info(f"[{source_label}] {len(scored)}/{len(results)} candidates passed validation "
f"(best: {best.confidence:.2f} '{best.artist} - {best.title}')")
return scored
else:
if results[0].username == 'youtube':
logger.warning(f"[{source_label}] No streaming results passed validation — falling through to filename matching")
# YouTube artist data is unreliable, allow fallback to filename-based matching
else:
logger.warning(f"[{source_label}] No streaming results passed validation (threshold: 0.60, artist gate: 0.50) — rejecting all candidates")
return [] # Tidal/Qobuz/HiFi/Deezer have structured metadata; don't fall back to filename matching
# Uses the existing, powerful matching engine for scoring (Soulseek P2P results)
_max_q = config_manager.get('soulseek.max_peer_queue', 0) or 0
initial_candidates = matching_engine.find_best_slskd_matches_enhanced(spotify_track, results, max_peer_queue=_max_q)
if not initial_candidates:
return []
# Skip quality filtering for streaming source results that somehow got here
is_streaming_source = initial_candidates[0].username in _streaming_sources if initial_candidates else False
if is_streaming_source:
source_label = initial_candidates[0].username.title()
logger.info(f"[{source_label}] Skipping quality filter - streaming source handles quality internally")
quality_filtered_candidates = initial_candidates
else:
# Filter by user's quality profile before artist verification (Soulseek only)
# Use existing soulseek_client to avoid re-initializing (which accesses download_path filesystem)
quality_filtered_candidates = soulseek_client.soulseek.filter_results_by_quality_preference(initial_candidates)
# IMPORTANT: Respect empty results from quality filter
# If user has strict quality requirements (e.g., FLAC-only with fallback disabled),
# and no results match, we should fail the download rather than force a fallback.
# The quality filter already has its own fallback logic controlled by the user's settings.
if not quality_filtered_candidates:
logger.error("[Quality Filter] No candidates match quality profile - download will fail per user preferences")
return []
verified_candidates = []
spotify_artists = spotify_track.artists if spotify_track.artists else []
# Pre-normalize all artist names into word sets using the matching engine
# This handles Cyrillic, accents, special chars ($), separators, etc.
artist_word_sets = []
for artist_name in spotify_artists:
normalized = matching_engine.normalize_string(artist_name)
words = set(normalized.split())
if words:
artist_word_sets.append(words)
for candidate in quality_filtered_candidates:
# Skip artist check for streaming results (title matching is sufficient as processed by matching engine)
if is_streaming_source:
verified_candidates.append(candidate)
continue
# No artist info available — can't verify, accept candidate
if not artist_word_sets:
verified_candidates.append(candidate)
continue
# Split the Soulseek path into segments (folders + filename) and check each one.
# This prevents false positives where a short artist name like "Sia" accidentally
# matches inside a folder name like "Enthusiastic" — by checking words within
# individual segments rather than a flat substring of the entire path.
path_segments = re.split(r'[/\\]', candidate.filename)
artist_found = False
for segment in path_segments:
if not segment:
continue
seg_words = set(matching_engine.normalize_string(segment).split())
if not seg_words:
continue
# Check if ANY artist's words are ALL present in this segment
for artist_words in artist_word_sets:
if artist_words.issubset(seg_words):
artist_found = True
break
if artist_found:
break
if artist_found:
verified_candidates.append(candidate)
return verified_candidates