mirror of https://github.com/Nezreka/SoulSync.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
214 lines
10 KiB
214 lines
10 KiB
"""Soulseek/streaming candidate validation — lifted from web_server.py.
|
|
|
|
Body is byte-identical to the original. ``matching_engine`` and
|
|
``soulseek_client`` are injected via init() because both are
|
|
constructed in web_server.py and referenced by name throughout
|
|
the body.
|
|
"""
|
|
import logging
import re
from difflib import SequenceMatcher

from config.settings import config_manager
|
|
|
|
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# Collaborators supplied at runtime through init(); both stay None until
# init() has been called.
matching_engine = soulseek_client = None
|
|
|
|
|
|
def init(matching_engine_obj, soulseek_client_obj):
    """Store the injected collaborators in this module's globals.

    Args:
        matching_engine_obj: Object bound to the module-level
            ``matching_engine`` name.
        soulseek_client_obj: Object bound to the module-level
            ``soulseek_client`` name.
    """
    global matching_engine, soulseek_client
    matching_engine, soulseek_client = matching_engine_obj, soulseek_client_obj
|
|
|
|
|
|
# Usernames that mark results as coming from a streaming API rather than a
# Soulseek peer.
_STREAMING_SOURCES = ("youtube", "tidal", "qobuz", "hifi", "deezer_dl")

# Title keywords that mark a track as a specific version (live, remix, ...)
# rather than the original recording.
_VERSION_KEYWORDS = ('remix', 'live', 'acoustic', 'instrumental', 'radio edit',
                     'extended', 'slowed', 'sped up', 'reverb', 'karaoke')

# Each keyword is matched as a standalone word: a plain substring test would
# find "live" inside "Deliver" or "Alive" and misclassify an original track as
# a live version. ASCII lookarounds are used instead of ``\b`` so a keyword
# directly adjacent to non-Latin text still counts as standalone.
_VERSION_PATTERNS = {
    kw: re.compile(r'(?<![a-z0-9])' + re.escape(kw) + r'(?![a-z0-9])')
    for kw in _VERSION_KEYWORDS
}


def _version_keywords_in(title):
    """Return the set of version keywords that appear as whole words in *title*."""
    lowered = (title or '').lower()
    return {kw for kw, pattern in _VERSION_PATTERNS.items() if pattern.search(lowered)}


def _source_label(username):
    """Return a display label for a source username (e.g. 'deezer_dl' -> 'Deezer')."""
    return username.replace('_dl', '').title()


def _best_artist_similarity(expected_artists, candidate_artist_raw):
    """Return the best similarity (0.0-1.0) between the candidate artist and any expected artist."""
    candidate_norm = matching_engine.normalize_string(candidate_artist_raw)
    best = 0.0
    for expected in expected_artists:
        expected_norm = matching_engine.normalize_string(expected)
        if not expected_norm:
            continue
        if len(expected_norm) <= 2:
            # For short normalized names (e.g. "B小町" -> "b") containment is
            # useless; compare the original Unicode strings instead.
            ratio = SequenceMatcher(None, expected.lower(), candidate_artist_raw.lower()).ratio()
            best = max(best, ratio)
        elif (re.search(r'\b' + re.escape(expected_norm) + r'\b', candidate_norm)
                or expected_norm == candidate_norm):
            # Whole-word containment covers formatting variations such as
            # "Beatles"/"The Beatles" or "Maduk"/"Maduk feat. X"; the equality
            # test covers names where ``\b`` cannot anchor.
            return 1.0
        else:
            best = max(best, SequenceMatcher(None, expected_norm, candidate_norm).ratio())
    return best


def _score_streaming_results(results, spotify_track):
    """Score streaming-API results against the expected track; return the accepted ones, best first."""
    expected_artists = (spotify_track.artists or []) if spotify_track else []
    expected_title = spotify_track.name if spotify_track else ''
    expected_duration = spotify_track.duration_ms if spotify_track else 0
    expected_versions = _version_keywords_in(expected_title)

    scored = []
    for r in results:
        # Score using the matching engine's generic scorer.
        confidence, match_type = matching_engine.score_track_match(
            source_title=expected_title,
            source_artists=expected_artists,
            source_duration_ms=expected_duration,
            candidate_title=r.title or '',
            candidate_artists=[r.artist] if r.artist else [],
            candidate_duration_ms=r.duration or 0,
        )

        candidate_versions = _version_keywords_in(r.title)
        if not expected_versions:
            # Expecting the original: any version keyword on the candidate is wrong.
            is_wrong_version = bool(candidate_versions)
            if is_wrong_version:
                confidence *= 0.4  # Heavy penalty
        else:
            # Expecting a specific version: the candidate must carry every
            # version keyword the expected title has.
            is_wrong_version = not expected_versions <= candidate_versions
            if is_wrong_version:
                confidence *= 0.5

        # Artist gate: streaming APIs (Tidal/Qobuz/HiFi/Deezer) have reliable
        # metadata, so "My Will" by "B. Starr" should never match expected
        # "B小町". Skipped for YouTube, where the artist is parsed from video
        # titles and is often unreliable.
        if r.username != 'youtube':
            best_artist = _best_artist_similarity(expected_artists, r.artist or '')
            # The gate is 0.5 rather than 0.4 because SequenceMatcher returns
            # exactly 0.400 for "maduk" vs "tom walker", which slipped past a
            # strict `< 0.4` check. Legitimate formatting variations already
            # short-circuit to 1.0 in the helper, so reaching the ratio means
            # the names are genuinely different. Very confident overall
            # matches (>= 0.85) bypass the gate.
            if best_artist < 0.5 and confidence < 0.85:
                continue

        r.confidence = confidence
        r.version_type = 'wrong_version' if is_wrong_version else match_type
        if confidence >= 0.60:
            scored.append(r)

    # Best match first.
    scored.sort(key=lambda x: x.confidence, reverse=True)
    return scored


def _artist_in_path(filename, artist_word_sets):
    """Return True if one path segment of *filename* contains all words of any artist."""
    # Checking words per segment (folders + filename) prevents a short artist
    # name like "Sia" from matching inside a folder name like "Enthusiastic".
    for segment in re.split(r'[/\\]', filename):
        if not segment:
            continue
        segment_words = set(matching_engine.normalize_string(segment).split())
        if any(words <= segment_words for words in artist_word_sets):
            return True
    return False


def get_valid_candidates(results, spotify_track, query):
    """Score and filter search results against a Spotify track.

    Streaming results (identified by the first result's ``username``) are
    scored with ``matching_engine.score_track_match``, penalized for version
    mismatches and gated on artist similarity. Soulseek results go through
    ``matching_engine.find_best_slskd_matches_enhanced``, the quality-profile
    filter on ``soulseek_client.soulseek`` and a per-path-segment artist check.

    Args:
        results: Search results. Streaming results are read via ``username``,
            ``title``, ``artist`` and ``duration``; Soulseek results via
            ``username`` and ``filename``.
        spotify_track: Expected track exposing ``artists``, ``name`` and
            ``duration_ms``. May be None; artist checks are then skipped.
        query: Search query that produced ``results``. Currently unused and
            kept for interface compatibility.

    Returns:
        A list of accepted candidates, empty when nothing qualifies. Accepted
        streaming results are sorted by descending ``confidence``; every
        streaming result that passes the artist gate gets ``confidence`` and
        ``version_type`` set on it, even if it is then rejected by the
        confidence threshold.
    """
    if not results:
        return []

    # Streaming sources (YouTube, Tidal, Qobuz, HiFi, Deezer) return structured
    # API results with proper artist/title metadata.
    first_source = results[0].username
    if first_source in _STREAMING_SOURCES:
        source_label = _source_label(first_source)
        scored = _score_streaming_results(results, spotify_track)
        if scored:
            best = scored[0]
            logger.info(f"[{source_label}] {len(scored)}/{len(results)} candidates passed validation "
                        f"(best: {best.confidence:.2f} '{best.artist} - {best.title}')")
            return scored
        if first_source != 'youtube':
            logger.warning(f"[{source_label}] No streaming results passed validation (threshold: 0.60, artist gate: 0.50) — rejecting all candidates")
            return []  # Tidal/Qobuz/HiFi/Deezer have structured metadata; don't fall back to filename matching
        # YouTube artist data is unreliable, allow fallback to filename-based matching
        logger.warning(f"[{source_label}] No streaming results passed validation — falling through to filename matching")

    # Soulseek P2P results (and the YouTube fallback) use the matching engine.
    _max_q = config_manager.get('soulseek.max_peer_queue', 0) or 0
    initial_candidates = matching_engine.find_best_slskd_matches_enhanced(spotify_track, results, max_peer_queue=_max_q)
    if not initial_candidates:
        return []

    # Streaming results that got here skip both the quality filter and the
    # artist check.
    if initial_candidates[0].username in _STREAMING_SOURCES:
        source_label = _source_label(initial_candidates[0].username)
        logger.info(f"[{source_label}] Skipping quality filter - streaming source handles quality internally")
        return list(initial_candidates)

    # Filter by the user's quality profile before artist verification.
    # The existing soulseek_client is reused to avoid re-initializing it.
    quality_filtered_candidates = soulseek_client.soulseek.filter_results_by_quality_preference(initial_candidates)

    # IMPORTANT: respect empty results from the quality filter. If the user
    # has strict quality requirements and nothing matches, fail the download
    # rather than force a fallback; the filter has its own fallback logic
    # controlled by the user's settings.
    if not quality_filtered_candidates:
        logger.error("[Quality Filter] No candidates match quality profile - download will fail per user preferences")
        return []

    # Pre-normalize every artist name into a word set via the matching engine.
    spotify_artists = getattr(spotify_track, 'artists', None) or []
    artist_word_sets = []
    for artist_name in spotify_artists:
        words = set(matching_engine.normalize_string(artist_name).split())
        if words:
            artist_word_sets.append(words)

    # No artist info available — can't verify, accept every candidate.
    if not artist_word_sets:
        return list(quality_filtered_candidates)

    return [
        candidate for candidate in quality_filtered_candidates
        if _artist_in_path(candidate.filename, artist_word_sets)
    ]
|