@ -81,10 +81,11 @@ class MusicMatchingEngine:
# The new approach is to PRESERVE the '$' symbol during normalization.
# Replace common separators with spaces to preserve word boundaries.
text = re . sub ( r ' [._/] ' , ' ' , text )
# Keep alphanumeric characters, spaces, hyphens, AND the '$' sign.
text = re . sub ( r ' [^a-z0-9 \ s$-] ' , ' ' , text )
# Include hyphen in separator replacement for artist names like "AC/DC" vs "AC-DC"
text = re . sub ( r ' [._/-] ' , ' ' , text )
# Keep alphanumeric characters, spaces, AND the '$' sign.
text = re . sub ( r ' [^a-z0-9 \ s$] ' , ' ' , text )
# Consolidate multiple spaces into one
text = re . sub ( r ' \ s+ ' , ' ' , text ) . strip ( )
@ -154,39 +155,67 @@ class MusicMatchingEngine:
return self . normalize_string ( cleaned )
def similarity_score ( self , str1 : str , str2 : str ) - > float :
""" Calculates similarity score between two strings with enhanced version handling. """
"""
Calculates similarity score between two strings with STRICT version handling .
IMPORTANT : Different versions ( remix , live , acoustic ) should NOT match the original .
This prevents false positives during sync where " Song Title (Remix) " matches " Song Title " .
"""
if not str1 or not str2 :
return 0.0
# Exact match - highest score
if str1 == str2 :
return 1.0
# Standard similarity
standard_ratio = SequenceMatcher ( None , str1 , str2 ) . ratio ( )
# Enhanced logic: Check if one string is a version of the other
# This handles cases like "Back & forth" vs "Back & forth original mix"
# STRICT VERSION CHECKING: Different versions should score LOW
# This prevents "Song Title" from matching "Song Title (Remix)" during sync
shorter , longer = ( str1 , str2 ) if len ( str1 ) < = len ( str2 ) else ( str2 , str1 )
# If the shorter string is at the start of the longer string
if longer . startswith ( shorter ) :
# Extract the extra content
extra_content = longer [ len ( shorter ) : ] . strip ( )
# Check if the extra content looks like version info
version_keywords = [
' original mix ' , ' radio mix ' , ' club mix ' , ' extended mix ' ,
' slowed ' , ' reverb ' , ' sped up ' , ' acoustic ' , ' remix ' , ' remaster ' ,
' live ' , ' demo ' , ' instrumental ' , ' clean ' , ' explicit ' ,
' radio edit ' , ' extended ' , ' version '
# Separate remasters from other versions - they should be treated differently
remaster_keywords = [ ' remaster ' , ' remastered ' ]
different_version_keywords = [
' remix ' , ' mix ' , ' rmx ' , # Remixes (different song)
' live ' , ' live at ' , ' live from ' , # Live versions (different recording)
' acoustic ' , ' unplugged ' , # Acoustic versions (different arrangement)
' slowed ' , ' reverb ' , ' sped up ' , ' speed up ' , # TikTok edits (different)
' radio edit ' , ' radio version ' , # Radio edits (different)
' instrumental ' , ' karaoke ' , # Instrumental (different)
' extended ' , ' extended version ' , # Extended (different length)
' demo ' , ' rough cut ' , # Demos (different recording)
]
# Normalize extra content for comparison
extra_normalized = extra_content . lower ( ) . strip ( ' -()[] ' )
# If the extra content matches version keywords, boost the similarity
for keyword in version_keywords :
# Check for remasters first - apply light penalty (might still match)
for keyword in remaster _keywords:
if keyword in extra_normalized :
# High similarity but not perfect (to distinguish from exact matches)
return max ( standard_ratio , 0.85 )
# Light penalty for remasters (same song, different mastering)
# 0.75 = 75% match - likely still matches with 0.70 threshold
# With 50/50 title/artist split: 0.75 * 0.5 + 1.0 * 0.5 = 0.875 > 0.7 threshold
logger . debug ( f " Remaster detected: ' { str1 } ' vs ' { str2 } ' (keyword: ' { keyword } ' ) - applying light penalty " )
return 0.75
# Check for different versions - apply heavy penalty (won't match)
for keyword in different_version_keywords :
if keyword in extra_normalized :
# Heavy penalty for different versions (remix, live, acoustic, etc.)
# 0.3 = 30% match - low enough to fail the 0.7 threshold
# With 50/50 title/artist split: 0.3 * 0.5 + 1.0 * 0.5 = 0.65 < 0.7 threshold
logger . debug ( f " Version mismatch detected: ' { str1 } ' vs ' { str2 } ' (keyword: ' { keyword } ' ) - applying heavy penalty " )
return 0.30
return standard_ratio
def duration_similarity ( self , duration1 : int , duration2 : int ) - > float :
@ -481,7 +510,8 @@ class MusicMatchingEngine:
def calculate_slskd_match_confidence ( self , spotify_track : SpotifyTrack , slskd_track : TrackResult ) - > float :
"""
Calculates a confidence score for a Soulseek track against a Spotify track .
This is the core of the new matching logic .
Uses full - string similarity matching ( like Soularr ) instead of substring matching
to prevent false positives like " Girls " matching " Girls Girls Girls " .
"""
# Normalize the Spotify track info once for efficiency
spotify_title_norm = self . normalize_string ( spotify_track . name )
@ -490,42 +520,87 @@ class MusicMatchingEngine:
# The slskd filename is our primary source of truth, so normalize it
slskd_filename_norm = self . normalize_string ( slskd_track . filename )
# 1. Title Score: How well does the Spotify title appear in the filename?
# We use the cleaned, core title for a strict check. This avoids matching remixes.
# 1. Title Score: Use full-string similarity instead of substring matching
# This prevents false positives like "Love" matching "Loveless"
spotify_cleaned_title = self . clean_title ( spotify_track . name )
title_score = 0.0
if spotify_cleaned_title in slskd_filename_norm :
title_score = 0.9 # High score for direct inclusion
# Bonus for being a standalone word/phrase, penalizing partial matches like 'in' in 'finland'
if re . search ( r ' \ b ' + re . escape ( spotify_cleaned_title ) + r ' \ b ' , slskd_filename_norm ) :
title_score = 1.0
# 2. Artist Score: How well do the Spotify artists appear in the filename?
# Calculate full-string similarity ratio (0.0 to 1.0) like Soularr does
title_ratio = SequenceMatcher ( None , spotify_cleaned_title , slskd_filename_norm ) . ratio ( )
# Boost score if title appears as a complete word in filename
has_word_boundary = bool ( re . search ( r ' \ b ' + re . escape ( spotify_cleaned_title ) + r ' \ b ' , slskd_filename_norm ) )
if has_word_boundary :
# Title exists as complete word - significant bonus
title_score = min ( 1.0 , title_ratio + 0.3 )
else :
# No word boundary match - rely on similarity ratio only
title_score = title_ratio
# 2. Artist Score: Keep substring matching for artists (they're more unique)
# But add similarity-based fallback for better matching
artist_score = 0.0
best_artist_similarity = 0.0
for artist in spotify_artists_norm :
if artist in slskd_filename_norm :
artist_score = 1.0 # Perfect match if any artist is found
artist_score = 1.0 # Perfect match if any artist is found
break
# 3. Duration Score: How similar are the track lengths?
# We give this a lower weight as slskd duration data can be unreliable.
else :
# Try similarity matching as fallback for misspellings/variations
artist_ratio = SequenceMatcher ( None , artist , slskd_filename_norm ) . ratio ( )
best_artist_similarity = max ( best_artist_similarity , artist_ratio )
# If no exact artist match, use best similarity with penalty
if artist_score == 0.0 and best_artist_similarity > 0 :
artist_score = best_artist_similarity * 0.7 # Penalize similarity-only matches
# 3. Duration Score: Increased weight for better accuracy
duration_score = self . duration_similarity ( spotify_track . duration_ms , slskd_track . duration if slskd_track . duration else 0 )
# 4. Quality Bonus: Add a small bonus for higher quality formats
# 4. Quality Bonus: Reduced to prevent boosting bad matche s
quality_bonus = 0.0
if slskd_track . quality :
if slskd_track . quality . lower ( ) == ' flac ' :
quality_bonus = 0.07 # Reduced from 0.1 to prevent low-confidence FLAC beating high-confidence MP3
quality_bonus = 0.0 3 # Reduced from 0.07
elif slskd_track . quality . lower ( ) == ' mp3 ' and ( slskd_track . bitrate or 0 ) > = 320 :
quality_bonus = 0.05
quality_bonus = 0.02 # Reduced from 0.05
# 5. Special handling for short titles (high false positive risk)
# Titles like "Run", "Love", "Girls", "Stay" need stricter artist matching
title_words = spotify_cleaned_title . split ( )
is_short_title = len ( spotify_cleaned_title ) < = 5 or len ( title_words ) == 1
# --- Final Weighted Score ---
# Title and Artist are the most important factors for an accurate match.
final_confidence = ( title_score * 0.60 ) + ( artist_score * 0.35 ) + ( duration_score * 0.05 )
# Rebalanced weights: Artist matching is now more important to prevent false positives
final_confidence = ( title_score * 0.45 ) + ( artist_score * 0.40 ) + ( duration_score * 0.15 )
# Apply short title penalty AFTER calculating base confidence
# This allows perfect matches to still pass, but penalizes weak artist matches
if is_short_title and artist_score < 0.5 :
# Heavy penalty but not complete rejection
# Multiply by 0.4 (60% penalty) - still possible to pass if title+duration are perfect
logger . debug ( f " Short title ' { spotify_cleaned_title } ' with low artist match ( { artist_score : .2f } ) - applying 60% penalty " )
final_confidence * = 0.4
# Add the quality bonus to the final score
final_confidence + = quality_bonus
# Store individual scores for debugging (used in enhanced version)
slskd_track . title_score = title_score
slskd_track . artist_score = artist_score
slskd_track . duration_score = duration_score
# Debug logging to track matching decisions
if final_confidence > 0.3 : # Only log potential matches
logger . debug (
f " Match scoring: ' { spotify_track . name } ' by { spotify_track . artists [ 0 ] if spotify_track . artists else ' Unknown ' } "
f " vs ' { slskd_track . filename [ : 60 ] } ... ' | "
f " Title: { title_score : .2f } (ratio: { title_ratio : .2f } , boundary: { has_word_boundary } ), "
f " Artist: { artist_score : .2f } , Duration: { duration_score : .2f } , "
f " Final: { final_confidence : .2f } { ' ✅ PASS ' if final_confidence > 0.58 else ' ❌ FAIL ' } "
)
# Ensure the final score doesn't exceed 1.0
return min ( final_confidence , 1.0 )
@ -547,10 +622,11 @@ class MusicMatchingEngine:
# Sort by confidence score (descending), and then by size as a tie-breaker
sorted_results = sorted ( scored_results , key = lambda r : ( r . confidence , r . size ) , reverse = True )
# Filter out very low-confidence results to avoid bad matches.
# A threshold of 0.6 means the title and artist had to have some reasonable similarity.
confident_results = [ r for r in sorted_results if r . confidence > 0.6 ]
# Threshold at 0.63 (63%) balances false positive reduction with match rate
# Testing showed: 0.65 → 2.2% fewer matches, 0.63 should recover ~1% while keeping safety
confident_results = [ r for r in sorted_results if r . confidence > 0.63 ]
return confident_results
@ -733,18 +809,19 @@ class MusicMatchingEngine:
return ( r . confidence , - version_priority , r . size )
sorted_results = sorted ( scored_results , key = sort_key , reverse = True )
# Filter out very low-confidence results
# Lower the threshold to 0.45 to account for version penalties and album-in-title scenarios
confident_results = [ r for r in sorted_results if r . confidence > 0.45 ]
# Threshold at 0.58 (58%) to prevent false positives while maintaining good match rate
# Testing showed: 0.60 was slightly too strict, 0.58 balances accuracy and recall
confident_results = [ r for r in sorted_results if r . confidence > 0.58 ]
# Debug logging for troubleshooting
if scored_results and not confident_results :
print ( f " ⚠️ DEBUG: Found { len ( scored_results ) } scored results but none met confidence threshold 0. 4 5" )
print ( f " ⚠️ DEBUG: Found { len ( scored_results ) } scored results but none met confidence threshold 0. 58 " )
for i , result in enumerate ( sorted_results [ : 3 ] ) : # Show top 3
print ( f " { i + 1 } . { result . confidence : .3f } - { getattr ( result , ' version_type ' , ' unknown ' ) } - { result . filename [ : 60 ] } ... " )
elif confident_results :
print ( f " ✅ DEBUG: { len ( confident_results ) } results passed confidence threshold 0. 4 5" )
print ( f " ✅ DEBUG: { len ( confident_results ) } results passed confidence threshold 0. 58 " )
for i , result in enumerate ( confident_results [ : 3 ] ) : # Show top 3
print ( f " { i + 1 } . { result . confidence : .3f } - { getattr ( result , ' version_type ' , ' unknown ' ) } - { result . filename [ : 60 ] } ... " )