from typing import List, Optional, Dict, Any, Tuple import re from dataclasses import dataclass from difflib import SequenceMatcher from unidecode import unidecode from utils.logging_config import get_logger from core.spotify_client import Track as SpotifyTrack from core.plex_client import PlexTrackInfo from core.soulseek_client import TrackResult logger = get_logger("matching_engine") @dataclass class MatchResult: spotify_track: SpotifyTrack plex_track: Optional[PlexTrackInfo] confidence: float match_type: str @property def is_match(self) -> bool: return self.plex_track is not None and self.confidence >= 0.8 class MusicMatchingEngine: def __init__(self): # Conservative title patterns - only remove clear noise, preserve meaningful differences like remixes self.title_patterns = [ # Only remove explicit/clean markers - preserve remixes, versions, and content after hyphens r'\s*\(explicit\)', r'\s*\(clean\)', # Remove featuring artists from the title itself r'\sfeat\.?.*', r'\sft\.?.*', r'\sfeaturing.*' ] self.artist_patterns = [ # Only remove featured artists, not parts of main artist names r'\s*feat\..*', r'\s*ft\..*', r'\s*featuring.*', # REMOVED: r'\s*&.*' - This breaks "Daryl Hall & John Oates", "Blood & Water" # REMOVED: r'\s*and.*' - This breaks artist names with "and" # REMOVED: r',.*' - This can break legitimate artist names with commas ] def normalize_string(self, text: str) -> str: """ Normalizes string by handling common stylizations, converting to ASCII, lowercasing, and replacing separators with spaces. """ if not text: return "" # Handle Korn/KoЯn variations - both uppercase Я (U+042F) and lowercase я (U+044F) char_map = { 'Я': 'R', # Cyrillic 'Ya' to 'R' 'я': 'r', # Lowercase Cyrillic 'ya' to 'r' } # Apply the character replacements before other normalization steps for original, replacement in char_map.items(): text = text.replace(original, replacement) text = unidecode(text) text = text.lower() # Expand specific abbreviations for better matching abbreviation_map = { r'\bpt\.': 'part', # "pt." → "part" r'\bvol\.': 'volume', # "vol." → "volume" r'\bfeat\.': 'featured' # "feat." → "featured" # Removed "ft." → "featured" (ambiguous: could be "feet" in measurements) } for pattern, replacement in abbreviation_map.items(): text = re.sub(pattern, replacement, text) # --- IMPROVEMENT V4 --- # The user correctly pointed out that replacing '$' with 's' was incorrect # as it breaks searching for stylized names like A$AP Rocky. # The new approach is to PRESERVE the '$' symbol during normalization. # Replace common separators with spaces to preserve word boundaries. # Include hyphen in separator replacement for artist names like "AC/DC" vs "AC-DC" text = re.sub(r'[._/-]', ' ', text) # Keep alphanumeric characters, spaces, AND the '$' sign. text = re.sub(r'[^a-z0-9\s$]', '', text) # Consolidate multiple spaces into one text = re.sub(r'\s+', ' ', text).strip() return text def get_core_string(self, text: str) -> str: """Returns a 'core' version of a string with only letters and numbers for a strict comparison.""" if not text: return "" # Use normalize_string first to get abbreviation expansion, then strip to core normalized = self.normalize_string(text) return re.sub(r'[^a-z0-9]', '', normalized) def clean_title(self, title: str) -> str: """Cleans title by removing common extra info using regex for fuzzy matching.""" cleaned = title for pattern in self.title_patterns: cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip() return self.normalize_string(cleaned) def clean_artist(self, artist: str) -> str: """Cleans artist name by removing featured artists and other noise.""" cleaned = artist for pattern in self.artist_patterns: cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip() return self.normalize_string(cleaned) def clean_album_name(self, album_name: str) -> str: """Clean album name by removing version info, deluxe editions, etc.""" if not album_name: return "" cleaned = album_name # Common album suffixes to remove album_patterns = [ # Add pattern to remove trailing info after a hyphen, common for remasters/editions. r'\s-\s.*', r'\s*\(deluxe\s*edition?\)', r'\s*\(expanded\s*edition?\)', r'\s*\(platinum\s*edition?\)', # Fix for "Fearless (Platinum Edition)" r'\s*\(remastered?\)', r'\s*\(remaster\)', r'\s*\(anniversary\s*edition?\)', r'\s*\(special\s*edition?\)', r'\s*\(bonus\s*track\s*version\)', r'\s*\(.*version\)', # Covers "Taylor's Version", "Radio Version", etc. r'\s*\[deluxe\]', r'\s*\[remastered?\]', r'\s*\[.*version\]', r'\s*-\s*deluxe', r'\s*-\s*platinum\s*edition?', # Handle "Album - Platinum Edition" r'\s*-\s*remastered?', r'\s+platinum\s*edition?$', # Handle "Album Platinum Edition" at end r'\s*\d{4}\s*remaster', # Year remaster r'\s*\(\d{4}\s*remaster\)' ] for pattern in album_patterns: cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip() return self.normalize_string(cleaned) def similarity_score(self, str1: str, str2: str) -> float: """ Calculates similarity score between two strings with STRICT version handling. IMPORTANT: Different versions (remix, live, acoustic) should NOT match the original. This prevents false positives during sync where "Song Title (Remix)" matches "Song Title". """ if not str1 or not str2: return 0.0 # Exact match - highest score if str1 == str2: return 1.0 # Standard similarity standard_ratio = SequenceMatcher(None, str1, str2).ratio() # STRICT VERSION CHECKING: Different versions should score LOW # This prevents "Song Title" from matching "Song Title (Remix)" during sync shorter, longer = (str1, str2) if len(str1) <= len(str2) else (str2, str1) # If the shorter string is at the start of the longer string if longer.startswith(shorter): # Extract the extra content extra_content = longer[len(shorter):].strip() # Check if the extra content looks like version info # Separate remasters from other versions - they should be treated differently remaster_keywords = ['remaster', 'remastered'] different_version_keywords = [ 'remix', 'mix', 'rmx', # Remixes (different song) 'live', 'live at', 'live from', # Live versions (different recording) 'acoustic', 'unplugged', # Acoustic versions (different arrangement) 'slowed', 'reverb', 'sped up', 'speed up', # TikTok edits (different) 'radio edit', 'radio version', # Radio edits (different) 'instrumental', 'karaoke', # Instrumental (different) 'extended', 'extended version', # Extended (different length) 'demo', 'rough cut', # Demos (different recording) ] # Normalize extra content for comparison extra_normalized = extra_content.lower().strip(' -()[]') # Check for remasters first - apply light penalty (might still match) for keyword in remaster_keywords: if keyword in extra_normalized: # Light penalty for remasters (same song, different mastering) # 0.75 = 75% match - likely still matches with 0.70 threshold # With 50/50 title/artist split: 0.75 * 0.5 + 1.0 * 0.5 = 0.875 > 0.7 threshold logger.debug(f"Remaster detected: '{str1}' vs '{str2}' (keyword: '{keyword}') - applying light penalty") return 0.75 # Check for different versions - apply heavy penalty (won't match) for keyword in different_version_keywords: if keyword in extra_normalized: # Heavy penalty for different versions (remix, live, acoustic, etc.) # 0.3 = 30% match - low enough to fail the 0.7 threshold # With 50/50 title/artist split: 0.3 * 0.5 + 1.0 * 0.5 = 0.65 < 0.7 threshold logger.debug(f"Version mismatch detected: '{str1}' vs '{str2}' (keyword: '{keyword}') - applying heavy penalty") return 0.30 return standard_ratio def duration_similarity(self, duration1: int, duration2: int) -> float: """Calculates similarity score based on track duration (in ms).""" if duration1 == 0 or duration2 == 0: return 0.5 # Neutral score if a duration is missing # Allow a 5-second tolerance (5000 ms) if abs(duration1 - duration2) <= 5000: return 1.0 diff_ratio = abs(duration1 - duration2) / max(duration1, duration2) return max(0, 1.0 - diff_ratio * 5) def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]: """Calculates a confidence score using a prioritized model, starting with a strict 'core' title check.""" # --- Artist Scoring (calculated once) --- spotify_artists_cleaned = [self.clean_artist(a) for a in spotify_track.artists if a] plex_artist_normalized = self.normalize_string(plex_track.artist) plex_artist_cleaned = self.clean_artist(plex_track.artist) best_artist_score = 0.0 for spotify_artist in spotify_artists_cleaned: if spotify_artist and spotify_artist in plex_artist_normalized: best_artist_score = 1.0 break score = self.similarity_score(spotify_artist, plex_artist_cleaned) if score > best_artist_score: best_artist_score = score artist_score = best_artist_score # --- Priority 1: Core Title Match (for exact matches like "Girls", "APT.", "LIL DEMON") --- spotify_core_title = self.get_core_string(spotify_track.name) plex_core_title = self.get_core_string(plex_track.title) if spotify_core_title and spotify_core_title == plex_core_title: # SAFETY CHECK: Only give high confidence if artist also matches reasonably well # This prevents "Artist A - Girls" from matching "Artist Z - Girls" with high confidence if artist_score >= 0.75: # Require decent artist match # If the core titles are identical and artists match, we are highly confident confidence = 0.90 + (artist_score * 0.09) # Max score of 0.99 return confidence, "core_title_match" # If artist score is too low, fall through to standard weighted calculation # --- Priority 2: Fuzzy Title Match (for variations, typos, etc.) --- spotify_title_cleaned = self.clean_title(spotify_track.name) plex_title_cleaned = self.clean_title(plex_track.title) title_score = self.similarity_score(spotify_title_cleaned, plex_title_cleaned) duration_score = self.duration_similarity(spotify_track.duration_ms, plex_track.duration if plex_track.duration else 0) # Use a standard weighted calculation if the core titles didn't match confidence = (title_score * 0.60) + (artist_score * 0.30) + (duration_score * 0.10) match_type = "standard_match" return confidence, match_type def find_best_match(self, spotify_track: SpotifyTrack, plex_tracks: List[PlexTrackInfo]) -> MatchResult: """Finds the best Plex track match from a list of candidates.""" best_match = None best_confidence = 0.0 best_match_type = "no_match" if not plex_tracks: return MatchResult(spotify_track, None, 0.0, "no_candidates") for plex_track in plex_tracks: confidence, match_type = self.calculate_match_confidence(spotify_track, plex_track) if confidence > best_confidence: best_confidence = confidence best_match = plex_track best_match_type = match_type return MatchResult( spotify_track=spotify_track, plex_track=best_match, confidence=best_confidence, match_type=best_match_type ) def detect_album_in_title(self, track_title: str, album_name: str = None) -> Tuple[str, bool]: """ Detect if album name appears in track title and return cleaned version. Returns (cleaned_title, album_detected) tuple. """ if not track_title: return "", False original_title = track_title title_lower = track_title.lower() # Common patterns where album name appears in track titles album_patterns = [ r'\s*-\s*(.+)$', # "Track - Album" (most common) r'\s*\|\s*(.+)$', # "Track | Album" r'\s*\(\s*(.+)\s*\)$' # "Track (Album)" ] # If we have album name, check if it appears in the title if album_name: album_clean = album_name.lower().strip() for pattern in album_patterns: match = re.search(pattern, track_title) if match: potential_album = match.group(1).lower().strip() # Check if the extracted part matches the album name with better fuzzy matching similarity_threshold = 0.8 # Calculate similarity between potential album and actual album if potential_album == album_clean: similarity = 1.0 # Exact match elif potential_album in album_clean or album_clean in potential_album: # Substring match - calculate how much overlap shorter = min(len(potential_album), len(album_clean)) longer = max(len(potential_album), len(album_clean)) similarity = shorter / longer if longer > 0 else 0.0 else: # Use string similarity for fuzzy matching similarity = self.similarity_score(potential_album, album_clean) if similarity >= similarity_threshold: # Remove the album part from the title cleaned_title = re.sub(pattern, '', track_title).strip() # SAFETY CHECK: Don't return empty or too-short titles if not cleaned_title or len(cleaned_title.strip()) < 2: logger.warning(f"Album removal would create empty title: '{original_title}' → '{cleaned_title}' - keeping original") return track_title, False # SAFETY CHECK: Don't remove if it would leave only articles or very short words words = cleaned_title.split() meaningful_words = [w for w in words if len(w) > 2 and w.lower() not in ['the', 'and', 'or', 'of', 'a', 'an']] if not meaningful_words: logger.warning(f"Album removal would leave only short words: '{original_title}' → '{cleaned_title}' - keeping original") return track_title, False logger.debug(f"Detected album in title: '{original_title}' → '{cleaned_title}' (removed: '{match.group(1)}', similarity: {similarity:.2f})") return cleaned_title, True # Fallback: detect common album-like suffixes even without album context # Look for patterns that might be album names (usually after dash) dash_pattern = r'\s*-\s*([A-Za-z][A-Za-z0-9\s&\-\']{3,30})$' match = re.search(dash_pattern, track_title) if match: potential_album_part = match.group(1).strip() # Heuristics: likely an album name if it: # - Doesn't contain common track descriptors # - Is reasonable length (4-30 chars) # - Doesn't look like a feature/remix indicator exclude_patterns = [ r'\b(remix|mix|edit|version|live|acoustic|instrumental|demo|feat|ft|featuring)\b' ] is_likely_album = True for exclude_pattern in exclude_patterns: if re.search(exclude_pattern, potential_album_part.lower()): is_likely_album = False break if is_likely_album and 4 <= len(potential_album_part) <= 30: cleaned_title = re.sub(dash_pattern, '', track_title).strip() print(f"🎵 Heuristic album detection: '{original_title}' → '{cleaned_title}' (removed: '{potential_album_part}')") return cleaned_title, True return track_title, False def generate_download_queries(self, spotify_track: SpotifyTrack) -> List[str]: """ Generate multiple search query variations for better matching. Returns queries in order of preference (cleaned titles first, then original). """ queries = [] if not spotify_track.artists: # No artist info - just use track name variations queries.append(self.clean_title(spotify_track.name)) return queries artist = self.clean_artist(spotify_track.artists[0]) original_title = spotify_track.name # Get album name if available - try multiple attribute names album_name = None for attr in ['album', 'album_name', 'album_title']: album_name = getattr(spotify_track, attr, None) if album_name: break # PRIORITY 1: Try removing potential album from title FIRST cleaned_title, album_detected = self.detect_album_in_title(original_title, album_name) if album_detected and cleaned_title != original_title: cleaned_track = self.clean_title(cleaned_title) if cleaned_track: queries.append(f"{artist} {cleaned_track}".strip()) logger.debug(f"PRIORITY 1: Album-cleaned query: '{artist} {cleaned_track}'") # PRIORITY 2: Try simplified versions, but preserve important version info # Only remove content that's likely to be album names or noise, not version info # Pattern 1: Intelligently handle content after " - " # Only remove if it looks like album names, preserve version info like "slowed", "remix", etc. dash_pattern = r'^([^-]+?)\s*-\s*(.+)$' match = re.search(dash_pattern, original_title.strip()) if match: title_part = match.group(1).strip() dash_content = match.group(2).strip().lower() # Define version keywords that should be preserved preserve_keywords = [ 'slowed', 'reverb', 'sped up', 'speed up', 'spedup', 'slowdown', 'remix', 'mix', 'edit', 'version', 'remaster', 'acoustic', 'live', 'demo', 'instrumental', 'radio', 'extended', 'club', 'original', 'clean', 'explicit', 'mashup', 'bootleg' ] # Check if the dash content contains version keywords should_preserve = any(keyword in dash_content for keyword in preserve_keywords) if not should_preserve and title_part and len(title_part) >= 3: # This looks like album content, safe to remove dash_clean = self.clean_title(title_part) if dash_clean and dash_clean not in [self.clean_title(q.split(' ', 1)[1]) for q in queries if ' ' in q]: queries.append(f"{artist} {dash_clean}".strip()) logger.debug(f"PRIORITY 2: Dash-cleaned query (removed album): '{artist} {dash_clean}'") elif should_preserve: logger.debug(f"PRESERVED: Keeping dash content '{dash_content}' as it appears to be version info") # Pattern 2: Only remove parentheses that contain noise (feat, explicit, etc), not version info # Check if parentheses contain version-related keywords before removing paren_pattern = r'^(.+?)\s*\(([^)]+)\)(.*)$' paren_match = re.search(paren_pattern, original_title) if paren_match: before_paren = paren_match.group(1).strip() paren_content = paren_match.group(2).strip().lower() after_paren = paren_match.group(3).strip() # Define what we consider "noise" vs "important version info" noise_keywords = ['feat', 'ft', 'featuring', 'explicit', 'clean'] # Expanded version keywords to match the dash preserve keywords version_keywords = [ 'slowed', 'reverb', 'sped up', 'speed up', 'spedup', 'slowdown', 'remix', 'mix', 'edit', 'version', 'remaster', 'acoustic', 'live', 'demo', 'instrumental', 'radio', 'extended', 'club', 'original', 'mashup', 'bootleg' ] # Only remove parentheses if they contain noise, not version info is_noise = any(keyword in paren_content for keyword in noise_keywords) is_version = any(keyword in paren_content for keyword in version_keywords) if is_noise and not is_version and before_paren: simple_title = (before_paren + ' ' + after_paren).strip() if simple_title and len(simple_title) >= 3: simple_clean = self.clean_title(simple_title) if simple_clean and simple_clean not in [self.clean_title(q.split(' ', 1)[1]) for q in queries if ' ' in q]: queries.append(f"{artist} {simple_clean}".strip()) logger.debug(f"PRIORITY 2: Noise-removed query: '{artist} {simple_clean}'") elif is_version: logger.debug(f"PRESERVED: Keeping parentheses content '({paren_content})' as it appears to be version info") # PRIORITY 3: Original query (ONLY if no album was detected or if it's different) original_track_clean = self.clean_title(original_title) if not album_detected or not queries: # Only add original if no album detected or no other queries if original_track_clean not in [q.split(' ', 1)[1] for q in queries if ' ' in q]: queries.append(f"{artist} {original_track_clean}".strip()) logger.debug(f"PRIORITY 3: Original query: '{artist} {original_track_clean}'") # Remove duplicates while preserving order unique_queries = [] seen = set() for query in queries: if query.lower() not in seen: unique_queries.append(query) seen.add(query.lower()) return unique_queries def generate_download_query(self, spotify_track: SpotifyTrack) -> str: """ Generate optimized search query for downloading tracks. Returns the most specific query (backward compatibility). """ queries = self.generate_download_queries(spotify_track) return queries[0] if queries else "" def calculate_slskd_match_confidence(self, spotify_track: SpotifyTrack, slskd_track: TrackResult) -> float: """ Calculates a confidence score for a Soulseek track against a Spotify track. Uses full-string similarity matching (like Soularr) instead of substring matching to prevent false positives like "Girls" matching "Girls Girls Girls". """ # Normalize the Spotify track info once for efficiency spotify_title_norm = self.normalize_string(spotify_track.name) spotify_artists_norm = [self.normalize_string(a) for a in spotify_track.artists] # The slskd filename is our primary source of truth, so normalize it slskd_filename_norm = self.normalize_string(slskd_track.filename) # 1. Title Score: Use full-string similarity instead of substring matching # This prevents false positives like "Love" matching "Loveless" spotify_cleaned_title = self.clean_title(spotify_track.name) # Calculate full-string similarity ratio (0.0 to 1.0) like Soularr does title_ratio = SequenceMatcher(None, spotify_cleaned_title, slskd_filename_norm).ratio() # Boost score if title appears as a complete word in filename has_word_boundary = bool(re.search(r'\b' + re.escape(spotify_cleaned_title) + r'\b', slskd_filename_norm)) if has_word_boundary: # Title exists as complete word - significant bonus title_score = min(1.0, title_ratio + 0.3) else: # No word boundary match - rely on similarity ratio only title_score = title_ratio # 2. Artist Score: Keep substring matching for artists (they're more unique) # But add similarity-based fallback for better matching artist_score = 0.0 best_artist_similarity = 0.0 for artist in spotify_artists_norm: if artist in slskd_filename_norm: artist_score = 1.0 # Perfect match if any artist is found break else: # Try similarity matching as fallback for misspellings/variations artist_ratio = SequenceMatcher(None, artist, slskd_filename_norm).ratio() best_artist_similarity = max(best_artist_similarity, artist_ratio) # If no exact artist match, use best similarity with penalty if artist_score == 0.0 and best_artist_similarity > 0: artist_score = best_artist_similarity * 0.7 # Penalize similarity-only matches # 3. Duration Score: Increased weight for better accuracy duration_score = self.duration_similarity(spotify_track.duration_ms, slskd_track.duration if slskd_track.duration else 0) # 4. Quality Bonus: Reduced to prevent boosting bad matches quality_bonus = 0.0 if slskd_track.quality: if slskd_track.quality.lower() == 'flac': quality_bonus = 0.03 # Reduced from 0.07 elif slskd_track.quality.lower() == 'mp3' and (slskd_track.bitrate or 0) >= 320: quality_bonus = 0.02 # Reduced from 0.05 # 5. Special handling for short titles (high false positive risk) # Titles like "Run", "Love", "Girls", "Stay" need stricter artist matching title_words = spotify_cleaned_title.split() is_short_title = len(spotify_cleaned_title) <= 5 or len(title_words) == 1 # --- Final Weighted Score --- # Rebalanced weights: Artist matching is now more important to prevent false positives final_confidence = (title_score * 0.45) + (artist_score * 0.40) + (duration_score * 0.15) # Apply short title penalty AFTER calculating base confidence # This allows perfect matches to still pass, but penalizes weak artist matches if is_short_title and artist_score < 0.5: # Heavy penalty but not complete rejection # Multiply by 0.4 (60% penalty) - still possible to pass if title+duration are perfect logger.debug(f"Short title '{spotify_cleaned_title}' with low artist match ({artist_score:.2f}) - applying 60% penalty") final_confidence *= 0.4 # Add the quality bonus to the final score final_confidence += quality_bonus # Store individual scores for debugging (used in enhanced version) slskd_track.title_score = title_score slskd_track.artist_score = artist_score slskd_track.duration_score = duration_score # Debug logging to track matching decisions if final_confidence > 0.3: # Only log potential matches logger.debug( f"Match scoring: '{spotify_track.name}' by {spotify_track.artists[0] if spotify_track.artists else 'Unknown'} " f"vs '{slskd_track.filename[:60]}...' | " f"Title: {title_score:.2f} (ratio: {title_ratio:.2f}, boundary: {has_word_boundary}), " f"Artist: {artist_score:.2f}, Duration: {duration_score:.2f}, " f"Final: {final_confidence:.2f} {'✅ PASS' if final_confidence > 0.58 else '❌ FAIL'}" ) # Ensure the final score doesn't exceed 1.0 return min(final_confidence, 1.0) def find_best_slskd_matches(self, spotify_track: SpotifyTrack, slskd_results: List[TrackResult]) -> List[TrackResult]: """ Scores and sorts a list of Soulseek results against a Spotify track. Returns the list of candidates sorted from best to worst match. """ if not slskd_results: return [] scored_results = [] for slskd_track in slskd_results: confidence = self.calculate_slskd_match_confidence(spotify_track, slskd_track) # We temporarily store the confidence score on the object itself for sorting slskd_track.confidence = confidence scored_results.append(slskd_track) # Sort by confidence score (descending), and then by size as a tie-breaker sorted_results = sorted(scored_results, key=lambda r: (r.confidence, r.size), reverse=True) # Filter out very low-confidence results to avoid bad matches. # Threshold at 0.63 (63%) balances false positive reduction with match rate # Testing showed: 0.65 → 2.2% fewer matches, 0.63 should recover ~1% while keeping safety confident_results = [r for r in sorted_results if r.confidence > 0.63] return confident_results def detect_version_type(self, filename: str) -> Tuple[str, float]: """ Detect version type from filename and return (version_type, penalty). Penalties are applied to prefer original versions over variants. """ if not filename: return 'original', 0.0 filename_lower = filename.lower() # Define version patterns and their penalties (higher penalty = lower priority) version_patterns = { 'remix': { 'patterns': [r'\bremix\b', r'\brmx\b', r'\brework\b', r'\bedit\b(?!ion)'], 'penalty': 0.15 # -15% penalty for remixes }, 'live': { 'patterns': [r'\blive\b', r'\bconcert\b', r'\btour\b', r'\bperformance\b'], 'penalty': 0.20 # -20% penalty for live versions }, 'acoustic': { 'patterns': [r'\bacoustic\b', r'\bunplugged\b', r'\bstripped\b'], 'penalty': 0.12 # -12% penalty for acoustic }, 'instrumental': { 'patterns': [r'\binstrumental\b', r'\bkaraoke\b', r'\bminus one\b'], 'penalty': 0.25 # -25% penalty for instrumentals (most different from original) }, 'radio': { 'patterns': [r'\bradio\s*edit\b', r'\bradio\s*version\b', r'\bclean\s*edit\b'], 'penalty': 0.08 # -8% penalty for radio edits (minor difference) }, 'extended': { 'patterns': [r'\bextended\b', r'\bfull\s*version\b', r'\blong\s*version\b'], 'penalty': 0.05 # -5% penalty for extended (close to original) }, 'demo': { 'patterns': [r'\bdemo\b', r'\broughcut\b', r'\bunreleased\b'], 'penalty': 0.18 # -18% penalty for demos }, 'explicit': { 'patterns': [r'\bexplicit\b', r'\buncensored\b'], 'penalty': 0.02 # -2% minor penalty (might be preferred by some) } } # Check each version type for version_type, config in version_patterns.items(): for pattern in config['patterns']: if re.search(pattern, filename_lower): return version_type, config['penalty'] # No version indicators found - assume original return 'original', 0.0 def calculate_slskd_match_confidence_enhanced(self, spotify_track: SpotifyTrack, slskd_track: TrackResult) -> Tuple[float, str]: """ Enhanced version of calculate_slskd_match_confidence with version-aware scoring. Returns (confidence, version_type) tuple. STRICT VERSION MATCHING: - Live versions are ONLY accepted if Spotify track title contains "live" or "live version" - Remixes are ONLY accepted if Spotify track title contains "remix" or "mix" - Acoustic versions are ONLY accepted if Spotify track title contains "acoustic" - etc. """ # Get base confidence using existing logic base_confidence = self.calculate_slskd_match_confidence(spotify_track, slskd_track) # Detect version type in Soulseek result version_type, penalty = self.detect_version_type(slskd_track.filename) # Check if Spotify track title contains version indicators spotify_title_lower = spotify_track.name.lower() # STRICT VERSION MATCHING: Reject mismatched versions if version_type == 'live': # Only accept live versions if Spotify title has live as a VERSION INDICATOR # Patterns: (Live), - Live, [Live], Live at, Live from, Live in, Live Version # NOT: words ending with 'live' like "Let Me Live" or starting like "Lively" live_patterns = [ r'\(live\)', # (Live) or (Live at Wembley) r'\[live\]', # [Live] r'[-–—]\s*live\b', # - Live or – Live r'\blive\s+at\b', # Live at r'\blive\s+from\b', # Live from r'\blive\s+in\b', # Live in r'\blive\s+version\b', # Live Version r'\blive\s+recording\b' # Live Recording ] has_live_indicator = any(re.search(pattern, spotify_title_lower) for pattern in live_patterns) if not has_live_indicator: # Reject: Soulseek has live version but Spotify doesn't want it return 0.0, 'rejected_version_mismatch' elif version_type == 'remix': # Only accept remixes if Spotify title has remix as a VERSION INDICATOR # Patterns: (Remix), - Remix, [Remix], Remix, Mix remix_patterns = [ r'\(.*?(remix|mix|rmx).*?\)', # (Remix) or (DJ Remix) r'\[.*?(remix|mix|rmx).*?\]', # [Remix] r'[-–—]\s*(remix|mix|rmx)\b', # - Remix r'\b(remix|mix|rmx)\s*$', # Remix at end ] has_remix_indicator = any(re.search(pattern, spotify_title_lower) for pattern in remix_patterns) if not has_remix_indicator: # Reject: Soulseek has remix but Spotify wants original return 0.0, 'rejected_version_mismatch' elif version_type == 'acoustic': # Only accept acoustic if Spotify title has acoustic as a VERSION INDICATOR acoustic_patterns = [ r'\(.*?acoustic.*?\)', # (Acoustic) r'\[.*?acoustic.*?\]', # [Acoustic] r'[-–—]\s*acoustic\b', # - Acoustic r'\bacoustic\s+version\b', # Acoustic Version ] has_acoustic_indicator = any(re.search(pattern, spotify_title_lower) for pattern in acoustic_patterns) if not has_acoustic_indicator: # Reject: Soulseek has acoustic but Spotify wants original return 0.0, 'rejected_version_mismatch' elif version_type == 'instrumental': # Only accept instrumental if Spotify title has instrumental as a VERSION INDICATOR instrumental_patterns = [ r'\(.*?instrumental.*?\)', # (Instrumental) r'\[.*?instrumental.*?\]', # [Instrumental] r'[-–—]\s*instrumental\b', # - Instrumental r'\binstrumental\s+version\b', # Instrumental Version ] has_instrumental_indicator = any(re.search(pattern, spotify_title_lower) for pattern in instrumental_patterns) if not has_instrumental_indicator: # Reject: Soulseek has instrumental but Spotify wants original return 0.0, 'rejected_version_mismatch' # Apply version penalty (for matching versions, slight penalty for quality differences) if version_type != 'original': adjusted_confidence = max(0.0, base_confidence - (penalty * 0.5)) # Reduced penalty since it's a match # Store version info on the track object for UI display slskd_track.version_type = version_type slskd_track.version_penalty = penalty else: adjusted_confidence = base_confidence slskd_track.version_type = 'original' slskd_track.version_penalty = 0.0 return adjusted_confidence, version_type def find_best_slskd_matches_enhanced(self, spotify_track: SpotifyTrack, slskd_results: List[TrackResult]) -> List[TrackResult]: """ Enhanced version of find_best_slskd_matches with version-aware scoring. Returns candidates sorted by adjusted confidence (preferring originals). """ if not slskd_results: return [] scored_results = [] for slskd_track in slskd_results: # Use enhanced confidence calculation confidence, version_type = self.calculate_slskd_match_confidence_enhanced(spotify_track, slskd_track) # Store the adjusted confidence and version info slskd_track.confidence = confidence slskd_track.version_type = getattr(slskd_track, 'version_type', 'original') scored_results.append(slskd_track) # Sort by confidence score (descending), then by version preference, then by size def sort_key(r): # Primary: confidence score # Secondary: prefer originals (original=0, others=penalty value for tie-breaking) version_priority = 0.0 if r.version_type == 'original' else getattr(r, 'version_penalty', 0.1) # Tertiary: file size return (r.confidence, -version_priority, r.size) sorted_results = sorted(scored_results, key=sort_key, reverse=True) # Filter out very low-confidence results # Threshold at 0.58 (58%) to prevent false positives while maintaining good match rate # Testing showed: 0.60 was slightly too strict, 0.58 balances accuracy and recall confident_results = [r for r in sorted_results if r.confidence > 0.58] # Debug logging for troubleshooting if scored_results and not confident_results: print(f"⚠️ DEBUG: Found {len(scored_results)} scored results but none met confidence threshold 0.58") for i, result in enumerate(sorted_results[:3]): # Show top 3 print(f" {i+1}. {result.confidence:.3f} - {getattr(result, 'version_type', 'unknown')} - {result.filename[:60]}...") elif confident_results: print(f"✅ DEBUG: {len(confident_results)} results passed confidence threshold 0.58") for i, result in enumerate(confident_results[:3]): # Show top 3 print(f" {i+1}. {result.confidence:.3f} - {getattr(result, 'version_type', 'unknown')} - {result.filename[:60]}...") return confident_results def calculate_album_confidence(self, spotify_album, plex_album_info: Dict[str, Any]) -> float: """Calculate confidence score for album matching""" if not spotify_album or not plex_album_info: return 0.0 score = 0.0 # 1. Album name similarity (40% weight) spotify_album_clean = self.clean_album_name(spotify_album.name) plex_album_clean = self.clean_album_name(plex_album_info['title']) name_similarity = self.similarity_score(spotify_album_clean, plex_album_clean) score += name_similarity * 0.4 # 2. Artist similarity (40% weight) if spotify_album.artists and plex_album_info.get('artist'): spotify_artist_clean = self.clean_artist(spotify_album.artists[0]) plex_artist_clean = self.clean_artist(plex_album_info['artist']) artist_similarity = self.similarity_score(spotify_artist_clean, plex_artist_clean) score += artist_similarity * 0.4 # 3. Track count similarity (10% weight) spotify_track_count = getattr(spotify_album, 'total_tracks', 0) plex_track_count = plex_album_info.get('track_count', 0) if spotify_track_count > 0 and plex_track_count > 0: # Calculate track count similarity (perfect match = 1.0, close matches get partial credit) track_diff = abs(spotify_track_count - plex_track_count) if track_diff == 0: track_similarity = 1.0 elif track_diff <= 2: # Allow for slight differences (bonus tracks, etc.) track_similarity = 0.8 elif track_diff <= 5: track_similarity = 0.5 else: track_similarity = 0.2 score += track_similarity * 0.1 # 4. Year similarity bonus (10% weight) spotify_year = spotify_album.release_date[:4] if spotify_album.release_date else None plex_year = str(plex_album_info.get('year', '')) if plex_album_info.get('year') else None if spotify_year and plex_year: if spotify_year == plex_year: score += 0.1 # Perfect year match elif abs(int(spotify_year) - int(plex_year)) <= 1: score += 0.05 # Close year match (remaster, etc.) return min(score, 1.0) # Cap at 1.0 def find_best_album_match(self, spotify_album, plex_albums: List[Dict[str, Any]]) -> Tuple[Optional[Dict[str, Any]], float]: """Find the best matching album from Plex candidates""" if not plex_albums: return None, 0.0 best_match = None best_confidence = 0.0 for plex_album in plex_albums: confidence = self.calculate_album_confidence(spotify_album, plex_album) if confidence > best_confidence: best_confidence = confidence best_match = plex_album # Only return matches above confidence threshold if best_confidence >= 0.8: # High threshold for album matching return best_match, best_confidence else: return None, best_confidence