Improve matching logic and wishlist album display

Enhanced the music matching engine to use stricter version handling, rebalance title/artist/duration weights, and raise confidence thresholds to reduce false positives. Updated string normalization to better handle separators and special characters. In the web UI, improved album ID sanitization and added a placeholder for missing album images in the wishlist view.
pull/97/head
Broque Thomas 4 months ago
parent 31903990fa
commit a2bdb93498

@ -81,10 +81,11 @@ class MusicMatchingEngine:
# The new approach is to PRESERVE the '$' symbol during normalization.
# Replace common separators with spaces to preserve word boundaries.
text = re.sub(r'[._/]', ' ', text)
# Keep alphanumeric characters, spaces, hyphens, AND the '$' sign.
text = re.sub(r'[^a-z0-9\s$-]', '', text)
# Include hyphen in separator replacement for artist names like "AC/DC" vs "AC-DC"
text = re.sub(r'[._/-]', ' ', text)
# Keep alphanumeric characters, spaces, AND the '$' sign.
text = re.sub(r'[^a-z0-9\s$]', '', text)
# Consolidate multiple spaces into one
text = re.sub(r'\s+', ' ', text).strip()
@ -154,39 +155,67 @@ class MusicMatchingEngine:
return self.normalize_string(cleaned)
def similarity_score(self, str1: str, str2: str) -> float:
"""Calculates similarity score between two strings with enhanced version handling."""
"""
Calculates similarity score between two strings with STRICT version handling.
IMPORTANT: Different versions (remix, live, acoustic) should NOT match the original.
This prevents false positives during sync where "Song Title (Remix)" matches "Song Title".
"""
if not str1 or not str2:
return 0.0
# Exact match - highest score
if str1 == str2:
return 1.0
# Standard similarity
standard_ratio = SequenceMatcher(None, str1, str2).ratio()
# Enhanced logic: Check if one string is a version of the other
# This handles cases like "Back & forth" vs "Back & forth original mix"
# STRICT VERSION CHECKING: Different versions should score LOW
# This prevents "Song Title" from matching "Song Title (Remix)" during sync
shorter, longer = (str1, str2) if len(str1) <= len(str2) else (str2, str1)
# If the shorter string is at the start of the longer string
if longer.startswith(shorter):
# Extract the extra content
extra_content = longer[len(shorter):].strip()
# Check if the extra content looks like version info
version_keywords = [
'original mix', 'radio mix', 'club mix', 'extended mix',
'slowed', 'reverb', 'sped up', 'acoustic', 'remix', 'remaster',
'live', 'demo', 'instrumental', 'clean', 'explicit',
'radio edit', 'extended', 'version'
# Separate remasters from other versions - they should be treated differently
remaster_keywords = ['remaster', 'remastered']
different_version_keywords = [
'remix', 'mix', 'rmx', # Remixes (different song)
'live', 'live at', 'live from', # Live versions (different recording)
'acoustic', 'unplugged', # Acoustic versions (different arrangement)
'slowed', 'reverb', 'sped up', 'speed up', # TikTok edits (different)
'radio edit', 'radio version', # Radio edits (different)
'instrumental', 'karaoke', # Instrumental (different)
'extended', 'extended version', # Extended (different length)
'demo', 'rough cut', # Demos (different recording)
]
# Normalize extra content for comparison
extra_normalized = extra_content.lower().strip(' -()[]')
# If the extra content matches version keywords, boost the similarity
for keyword in version_keywords:
# Check for remasters first - apply light penalty (might still match)
for keyword in remaster_keywords:
if keyword in extra_normalized:
# High similarity but not perfect (to distinguish from exact matches)
return max(standard_ratio, 0.85)
# Light penalty for remasters (same song, different mastering)
# 0.75 = 75% match - likely still matches with 0.70 threshold
# With 50/50 title/artist split: 0.75 * 0.5 + 1.0 * 0.5 = 0.875 > 0.7 threshold
logger.debug(f"Remaster detected: '{str1}' vs '{str2}' (keyword: '{keyword}') - applying light penalty")
return 0.75
# Check for different versions - apply heavy penalty (won't match)
for keyword in different_version_keywords:
if keyword in extra_normalized:
# Heavy penalty for different versions (remix, live, acoustic, etc.)
# 0.3 = 30% match - low enough to fail the 0.7 threshold
# With 50/50 title/artist split: 0.3 * 0.5 + 1.0 * 0.5 = 0.65 < 0.7 threshold
logger.debug(f"Version mismatch detected: '{str1}' vs '{str2}' (keyword: '{keyword}') - applying heavy penalty")
return 0.30
return standard_ratio
def duration_similarity(self, duration1: int, duration2: int) -> float:
@ -481,7 +510,8 @@ class MusicMatchingEngine:
def calculate_slskd_match_confidence(self, spotify_track: SpotifyTrack, slskd_track: TrackResult) -> float:
"""
Calculates a confidence score for a Soulseek track against a Spotify track.
This is the core of the new matching logic.
Uses full-string similarity matching (like Soularr) instead of substring matching
to prevent false positives like "Girls" matching "Girls Girls Girls".
"""
# Normalize the Spotify track info once for efficiency
spotify_title_norm = self.normalize_string(spotify_track.name)
@ -490,42 +520,87 @@ class MusicMatchingEngine:
# The slskd filename is our primary source of truth, so normalize it
slskd_filename_norm = self.normalize_string(slskd_track.filename)
# 1. Title Score: How well does the Spotify title appear in the filename?
# We use the cleaned, core title for a strict check. This avoids matching remixes.
# 1. Title Score: Use full-string similarity instead of substring matching
# This prevents false positives like "Love" matching "Loveless"
spotify_cleaned_title = self.clean_title(spotify_track.name)
title_score = 0.0
if spotify_cleaned_title in slskd_filename_norm:
title_score = 0.9 # High score for direct inclusion
# Bonus for being a standalone word/phrase, penalizing partial matches like 'in' in 'finland'
if re.search(r'\b' + re.escape(spotify_cleaned_title) + r'\b', slskd_filename_norm):
title_score = 1.0
# 2. Artist Score: How well do the Spotify artists appear in the filename?
# Calculate full-string similarity ratio (0.0 to 1.0) like Soularr does
title_ratio = SequenceMatcher(None, spotify_cleaned_title, slskd_filename_norm).ratio()
# Boost score if title appears as a complete word in filename
has_word_boundary = bool(re.search(r'\b' + re.escape(spotify_cleaned_title) + r'\b', slskd_filename_norm))
if has_word_boundary:
# Title exists as complete word - significant bonus
title_score = min(1.0, title_ratio + 0.3)
else:
# No word boundary match - rely on similarity ratio only
title_score = title_ratio
# 2. Artist Score: Keep substring matching for artists (they're more unique)
# But add similarity-based fallback for better matching
artist_score = 0.0
best_artist_similarity = 0.0
for artist in spotify_artists_norm:
if artist in slskd_filename_norm:
artist_score = 1.0 # Perfect match if any artist is found
artist_score = 1.0 # Perfect match if any artist is found
break
# 3. Duration Score: How similar are the track lengths?
# We give this a lower weight as slskd duration data can be unreliable.
else:
# Try similarity matching as fallback for misspellings/variations
artist_ratio = SequenceMatcher(None, artist, slskd_filename_norm).ratio()
best_artist_similarity = max(best_artist_similarity, artist_ratio)
# If no exact artist match, use best similarity with penalty
if artist_score == 0.0 and best_artist_similarity > 0:
artist_score = best_artist_similarity * 0.7 # Penalize similarity-only matches
# 3. Duration Score: Increased weight for better accuracy
duration_score = self.duration_similarity(spotify_track.duration_ms, slskd_track.duration if slskd_track.duration else 0)
# 4. Quality Bonus: Add a small bonus for higher quality formats
# 4. Quality Bonus: Reduced to prevent boosting bad matches
quality_bonus = 0.0
if slskd_track.quality:
if slskd_track.quality.lower() == 'flac':
quality_bonus = 0.07 # Reduced from 0.1 to prevent low-confidence FLAC beating high-confidence MP3
quality_bonus = 0.03 # Reduced from 0.07
elif slskd_track.quality.lower() == 'mp3' and (slskd_track.bitrate or 0) >= 320:
quality_bonus = 0.05
quality_bonus = 0.02 # Reduced from 0.05
# 5. Special handling for short titles (high false positive risk)
# Titles like "Run", "Love", "Girls", "Stay" need stricter artist matching
title_words = spotify_cleaned_title.split()
is_short_title = len(spotify_cleaned_title) <= 5 or len(title_words) == 1
# --- Final Weighted Score ---
# Title and Artist are the most important factors for an accurate match.
final_confidence = (title_score * 0.60) + (artist_score * 0.35) + (duration_score * 0.05)
# Rebalanced weights: Artist matching is now more important to prevent false positives
final_confidence = (title_score * 0.45) + (artist_score * 0.40) + (duration_score * 0.15)
# Apply short title penalty AFTER calculating base confidence
# This allows perfect matches to still pass, but penalizes weak artist matches
if is_short_title and artist_score < 0.5:
# Heavy penalty but not complete rejection
# Multiply by 0.4 (60% penalty) - still possible to pass if title+duration are perfect
logger.debug(f"Short title '{spotify_cleaned_title}' with low artist match ({artist_score:.2f}) - applying 60% penalty")
final_confidence *= 0.4
# Add the quality bonus to the final score
final_confidence += quality_bonus
# Store individual scores for debugging (used in enhanced version)
slskd_track.title_score = title_score
slskd_track.artist_score = artist_score
slskd_track.duration_score = duration_score
# Debug logging to track matching decisions
if final_confidence > 0.3: # Only log potential matches
logger.debug(
f"Match scoring: '{spotify_track.name}' by {spotify_track.artists[0] if spotify_track.artists else 'Unknown'} "
f"vs '{slskd_track.filename[:60]}...' | "
f"Title: {title_score:.2f} (ratio: {title_ratio:.2f}, boundary: {has_word_boundary}), "
f"Artist: {artist_score:.2f}, Duration: {duration_score:.2f}, "
f"Final: {final_confidence:.2f} {'✅ PASS' if final_confidence > 0.58 else '❌ FAIL'}"
)
# Ensure the final score doesn't exceed 1.0
return min(final_confidence, 1.0)
@ -547,10 +622,11 @@ class MusicMatchingEngine:
# Sort by confidence score (descending), and then by size as a tie-breaker
sorted_results = sorted(scored_results, key=lambda r: (r.confidence, r.size), reverse=True)
# Filter out very low-confidence results to avoid bad matches.
# A threshold of 0.6 means the title and artist had to have some reasonable similarity.
confident_results = [r for r in sorted_results if r.confidence > 0.6]
# Threshold at 0.63 (63%) balances false positive reduction with match rate
# Testing showed: 0.65 → 2.2% fewer matches, 0.63 should recover ~1% while keeping safety
confident_results = [r for r in sorted_results if r.confidence > 0.63]
return confident_results
@ -733,18 +809,19 @@ class MusicMatchingEngine:
return (r.confidence, -version_priority, r.size)
sorted_results = sorted(scored_results, key=sort_key, reverse=True)
# Filter out very low-confidence results
# Lower the threshold to 0.45 to account for version penalties and album-in-title scenarios
confident_results = [r for r in sorted_results if r.confidence > 0.45]
# Threshold at 0.58 (58%) to prevent false positives while maintaining good match rate
# Testing showed: 0.60 was slightly too strict, 0.58 balances accuracy and recall
confident_results = [r for r in sorted_results if r.confidence > 0.58]
# Debug logging for troubleshooting
if scored_results and not confident_results:
print(f"⚠️ DEBUG: Found {len(scored_results)} scored results but none met confidence threshold 0.45")
print(f"⚠️ DEBUG: Found {len(scored_results)} scored results but none met confidence threshold 0.58")
for i, result in enumerate(sorted_results[:3]): # Show top 3
print(f" {i+1}. {result.confidence:.3f} - {getattr(result, 'version_type', 'unknown')} - {result.filename[:60]}...")
elif confident_results:
print(f"✅ DEBUG: {len(confident_results)} results passed confidence threshold 0.45")
print(f"✅ DEBUG: {len(confident_results)} results passed confidence threshold 0.58")
for i, result in enumerate(confident_results[:3]): # Show top 3
print(f" {i+1}. {result.confidence:.3f} - {getattr(result, 'version_type', 'unknown')} - {result.filename[:60]}...")

@ -5505,7 +5505,11 @@ async function selectWishlistCategory(category) {
const albumImage = spotifyData?.album?.images?.[0]?.url || '';
// Use album ID if available, otherwise create unique key from album + artist
const albumId = spotifyData?.album?.id || `${albumName}_${artistName}`.replace(/\s+/g, '_').toLowerCase();
// Sanitize the ID to remove all special characters that could break DOM IDs or CSS selectors
const albumId = spotifyData?.album?.id || `${albumName}_${artistName}`
.replace(/[^a-zA-Z0-9\s_-]/g, '') // Remove all special chars except spaces, underscores, hyphens
.replace(/\s+/g, '_') // Replace spaces with underscores
.toLowerCase();
if (!albumGroups[albumId]) {
albumGroups[albumId] = {
@ -5542,10 +5546,16 @@ async function selectWishlistCategory(category) {
</div>
`).join('');
// Handle missing album images with a placeholder
const albumImageStyle = albumData.albumImage
? `background-image: url('${albumData.albumImage}')`
: `background: linear-gradient(135deg, rgba(30, 30, 30, 0.9) 0%, rgba(50, 50, 50, 0.9) 100%); display: flex; align-items: center; justify-content: center; font-size: 40px;`;
const albumImageContent = albumData.albumImage ? '' : '<span style="opacity: 0.3;">💿</span>';
albumsHTML += `
<div class="wishlist-album-card">
<div class="wishlist-album-header" onclick="toggleAlbumTracks('${albumId}')">
<div class="wishlist-album-image" style="background-image: url('${albumData.albumImage}')"></div>
<div class="wishlist-album-image" style="${albumImageStyle}">${albumImageContent}</div>
<div class="wishlist-album-info">
<div class="wishlist-album-name">${albumData.albumName}</div>
<div class="wishlist-album-artist">${albumData.artistName}</div>

Loading…
Cancel
Save