From f1e4539936c3cb53f015e706e50c911f7967e4db Mon Sep 17 00:00:00 2001 From: Broque Thomas Date: Thu, 24 Jul 2025 16:20:36 -0700 Subject: [PATCH] better --- core/matching_engine.py | 211 ++++++++++++++++------------------------ ui/pages/sync.py | 89 ++++++++--------- 2 files changed, 124 insertions(+), 176 deletions(-) diff --git a/core/matching_engine.py b/core/matching_engine.py index c3bac689..bf9a00ce 100644 --- a/core/matching_engine.py +++ b/core/matching_engine.py @@ -2,6 +2,7 @@ from typing import List, Optional, Dict, Any, Tuple import re from dataclasses import dataclass from difflib import SequenceMatcher +from unidecode import unidecode from utils.logging_config import get_logger from core.spotify_client import Track as SpotifyTrack from core.plex_client import PlexTrackInfo @@ -17,20 +18,28 @@ class MatchResult: @property def is_match(self) -> bool: - return self.plex_track is not None and self.confidence >= 0.7 + return self.plex_track is not None and self.confidence >= 0.8 class MusicMatchingEngine: def __init__(self): + # More comprehensive patterns to strip extra info from titles self.title_patterns = [ - r'\(.*?\)', - r'\[.*?\]', - r'\s*-\s*remaster.*', - r'\s*-\s*remix.*', - r'\s*-\s*live.*', - r'\s*-\s*acoustic.*', - r'\s*feat\..*', - r'\s*ft\..*', - r'\s*featuring.*', + r'\(feat\.?.*\)', + r'\[feat\.?.*\]', + r'\(with.*\)', + r'\(ft\.?.*\)', + r'\[ft\.?.*\]', + r'\(remix\)', + r'\(live\)', + r'\(acoustic\)', + r'\(radio edit\)', + r'\(album version\)', + r'\(original mix\)', + r'-\s*single version', + r'-\s*remaster.*', + r'-\s*live.*', + r'-\s*remix', + r'-\s*radio edit', ] self.artist_patterns = [ @@ -39,37 +48,51 @@ class MusicMatchingEngine: r'\s*featuring.*', r'\s*&.*', r'\s*and.*', + r',.*' ] def normalize_string(self, text: str) -> str: + """ + Normalizes string by converting to ASCII, lowercasing, and removing + specific punctuation while keeping alphanumeric characters. + """ if not text: return "" - text = text.lower().strip() + # Transliterate Unicode characters (e.g., ñ -> n, é -> e) to ASCII + text = unidecode(text) - text = re.sub(r'[^\w\s]', '', text) + # Convert to lowercase + text = text.lower() - text = re.sub(r'\s+', ' ', text) + # Remove specific punctuation but keep alphanumeric and spaces + text = re.sub(r'[^\w\s-]', '', text) + + # Collapse multiple spaces into one + text = re.sub(r'\s+', ' ', text).strip() return text def clean_title(self, title: str) -> str: + """Cleans title by removing common extra info using regex.""" cleaned = title for pattern in self.title_patterns: - cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE) + cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip() return self.normalize_string(cleaned) def clean_artist(self, artist: str) -> str: + """Cleans artist name by removing featured artists and other noise.""" cleaned = artist for pattern in self.artist_patterns: - cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE) + cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip() return self.normalize_string(cleaned) def extract_main_artist(self, artists: List[str]) -> str: + """Extracts and cleans the primary artist from a list.""" if not artists: return "" @@ -77,68 +100,69 @@ class MusicMatchingEngine: return self.clean_artist(main_artist) def similarity_score(self, str1: str, str2: str) -> float: + """Calculates similarity score between two strings.""" if not str1 or not str2: return 0.0 return SequenceMatcher(None, str1, str2).ratio() def duration_similarity(self, duration1: int, duration2: int) -> float: + """Calculates similarity score based on track duration (in ms).""" if duration1 == 0 or duration2 == 0: - return 0.5 - - max_duration = max(duration1, duration2) - min_duration = min(duration1, duration2) + return 0.5 # Neutral score if a duration is missing - if max_duration == 0: - return 0.5 - - diff_ratio = abs(max_duration - min_duration) / max_duration - - if diff_ratio <= 0.05: + # Allow a 5-second tolerance (5000 ms) + if abs(duration1 - duration2) <= 5000: return 1.0 - elif diff_ratio <= 0.1: - return 0.8 - elif diff_ratio <= 0.2: - return 0.6 - else: - return 0.3 - - def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]: - spotify_title = self.clean_title(spotify_track.name) - plex_title = self.clean_title(plex_track.title) - - spotify_artist = self.extract_main_artist(spotify_track.artists) - plex_artist = self.clean_artist(plex_track.artist) - - spotify_album = self.normalize_string(spotify_track.album) - plex_album = self.normalize_string(plex_track.album) - title_score = self.similarity_score(spotify_title, plex_title) - artist_score = self.similarity_score(spotify_artist, plex_artist) - album_score = self.similarity_score(spotify_album, plex_album) - - # CORRECTED: Plex duration is already in milliseconds. - duration_score = self.duration_similarity( - spotify_track.duration_ms, - plex_track.duration if plex_track.duration else 0 - ) + # Penalize larger differences + diff_ratio = abs(duration1 - duration2) / max(duration1, duration2) + return max(0, 1.0 - diff_ratio * 5) # Scale penalty + + def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]: + """Calculates a confidence score for a potential match with weighted factors.""" - if title_score >= 0.9 and artist_score >= 0.9 and album_score >= 0.8: - return 0.95, "exact_match" - elif title_score >= 0.8 and artist_score >= 0.8: - return 0.85, "high_confidence" - elif title_score >= 0.7 and artist_score >= 0.7: - return 0.75, "medium_confidence" - elif title_score >= 0.6 and artist_score >= 0.6: - return 0.65, "low_confidence" + # Clean titles and artists for comparison + spotify_title_cleaned = self.clean_title(spotify_track.name) + plex_title_cleaned = self.clean_title(plex_track.title) + + spotify_main_artist_cleaned = self.extract_main_artist(spotify_track.artists) + plex_artist_normalized = self.normalize_string(plex_track.artist) + + # --- Calculate individual scores --- + title_score = self.similarity_score(spotify_title_cleaned, plex_title_cleaned) + + # Artist score: check if main Spotify artist is in the Plex artist string + artist_score = 1.0 if spotify_main_artist_cleaned in plex_artist_normalized else self.similarity_score(spotify_main_artist_cleaned, self.clean_artist(plex_track.artist)) + + duration_score = self.duration_similarity(spotify_track.duration_ms, plex_track.duration if plex_track.duration else 0) + + # --- Weighted confidence calculation --- + # Weights: Title (50%), Artist (30%), Duration (20%) + confidence = (title_score * 0.5) + (artist_score * 0.3) + (duration_score * 0.2) + + # Determine match type based on scores + if title_score > 0.95 and artist_score > 0.9 and duration_score > 0.9: + match_type = "perfect_match" + confidence = max(confidence, 0.98) # Boost confidence for perfect matches + elif title_score > 0.85 and artist_score > 0.8: + match_type = "high_confidence" + elif title_score > 0.75: + match_type = "medium_confidence" else: - return 0.0, "no_match" + match_type = "low_confidence" + + return confidence, match_type def find_best_match(self, spotify_track: SpotifyTrack, plex_tracks: List[PlexTrackInfo]) -> MatchResult: + """Finds the best Plex track match from a list of candidates.""" best_match = None best_confidence = 0.0 best_match_type = "no_match" + if not plex_tracks: + return MatchResult(spotify_track, None, 0.0, "no_candidates") + for plex_track in plex_tracks: confidence, match_type = self.calculate_match_confidence(spotify_track, plex_track) @@ -153,72 +177,3 @@ class MusicMatchingEngine: confidence=best_confidence, match_type=best_match_type ) - - def match_playlist_tracks(self, spotify_tracks: List[SpotifyTrack], plex_tracks: List[PlexTrackInfo]) -> List[MatchResult]: - results = [] - - logger.info(f"Matching {len(spotify_tracks)} Spotify tracks against {len(plex_tracks)} Plex tracks") - - for spotify_track in spotify_tracks: - match_result = self.find_best_match(spotify_track, plex_tracks) - results.append(match_result) - - if match_result.is_match: - logger.debug(f"Matched: {spotify_track.name} by {spotify_track.artists[0]} -> {match_result.plex_track.title} (confidence: {match_result.confidence:.2f})") - else: - logger.debug(f"No match found for: {spotify_track.name} by {spotify_track.artists[0]}") - - matched_count = sum(1 for r in results if r.is_match) - logger.info(f"Successfully matched {matched_count}/{len(spotify_tracks)} tracks") - - return results - - def get_match_statistics(self, match_results: List[MatchResult]) -> Dict[str, Any]: - total_tracks = len(match_results) - matched_tracks = sum(1 for r in match_results if r.is_match) - - match_types = {} - for result in match_results: - if result.is_match: - match_types[result.match_type] = match_types.get(result.match_type, 0) + 1 - - confidence_distribution = { - "high (>0.8)": sum(1 for r in match_results if r.confidence > 0.8), - "medium (0.7-0.8)": sum(1 for r in match_results if 0.7 <= r.confidence <= 0.8), - "low (0.6-0.7)": sum(1 for r in match_results if 0.6 <= r.confidence < 0.7), - "no_match (<0.6)": sum(1 for r in match_results if r.confidence < 0.6) - } - - return { - "total_tracks": total_tracks, - "matched_tracks": matched_tracks, - "match_percentage": (matched_tracks / total_tracks * 100) if total_tracks > 0 else 0, - "match_types": match_types, - "confidence_distribution": confidence_distribution - } - - def create_search_queries(self, spotify_track: SpotifyTrack) -> List[str]: - queries = [] - - main_artist = self.extract_main_artist(spotify_track.artists) - clean_title = self.clean_title(spotify_track.name) - clean_album = self.normalize_string(spotify_track.album) - - queries.append(f"{clean_title} {main_artist}") - queries.append(f"{main_artist} {clean_title}") - queries.append(f"{clean_title} {main_artist} {clean_album}") - queries.append(f"{clean_album} {main_artist}") - - if len(spotify_track.artists) > 1: - all_artists = " ".join([self.clean_artist(a) for a in spotify_track.artists]) - queries.append(f"{clean_title} {all_artists}") - - return queries - - def generate_download_query(self, spotify_track: SpotifyTrack) -> str: - main_artist = self.extract_main_artist(spotify_track.artists) - clean_title = self.clean_title(spotify_track.name) - - return f"{main_artist} {clean_title}" - -matching_engine = MusicMatchingEngine() diff --git a/ui/pages/sync.py b/ui/pages/sync.py index 8b041b12..2e59bf77 100644 --- a/ui/pages/sync.py +++ b/ui/pages/sync.py @@ -121,72 +121,65 @@ class PlaylistTrackAnalysisWorker(QRunnable): def _check_track_in_plex(self, spotify_track): """ Check if a Spotify track exists in Plex by trying several search strategies - and using the MusicMatchingEngine to find the best match. + across ALL artists associated with the track. """ try: - # Use the first artist for the primary search query - artist_name = spotify_track.artists[0] if spotify_track.artists else "" original_title = spotify_track.name - - # --- Generate a list of search queries, from most specific to most broad --- - search_queries = [] - - # Strategy 1: Original, unmodified title. Catches exact matches. - search_queries.append(original_title) - - # Strategy 2: Title with content after a hyphen removed. - # e.g., "Song Title - Remaster" -> "Song Title" - if " - " in original_title: - title_before_hyphen = original_title.split(' - ')[0].strip() - if title_before_hyphen: - search_queries.append(title_before_hyphen) - - # Strategy 3: Title with parenthetical/bracketed content removed. - # (Uses the simple cleaner from this file for an intermediate search) - cleaned_for_search = clean_track_name_for_search(original_title) - if cleaned_for_search.lower() != original_title.lower(): - search_queries.append(cleaned_for_search) - # Strategy 4: A "base" title with all extra info removed (remixes, feats, etc.) - # using the more aggressive cleaning from the matching engine. - base_title = self.matching_engine.clean_title(original_title) - if base_title.lower() != cleaned_for_search.lower() and base_title.lower() != original_title.lower(): - search_queries.append(base_title) - - # Remove duplicate queries that might have resulted from the cleaning steps, preserving order. - unique_queries = list(dict.fromkeys(search_queries)) + # --- Generate a list of title variations --- + title_variations = [] + title_variations.append(original_title) # Strategy 1: Original title + if " - " in original_title: # Strategy 2: Strip content after hyphen + title_variations.append(original_title.split(' - ')[0].strip()) - print(f"🧠 Generated search queries for '{original_title}': {unique_queries}") + cleaned_for_search = clean_track_name_for_search(original_title) # Strategy 3: Strip parenthetical content + if cleaned_for_search.lower() != original_title.lower(): + title_variations.append(cleaned_for_search) + + base_title = self.matching_engine.clean_title(original_title) # Strategy 4: Aggressively cleaned title + if base_title.lower() not in [t.lower() for t in title_variations]: + title_variations.append(base_title) - # --- Execute searches and collect all potential matches --- + unique_title_variations = list(dict.fromkeys(title_variations)) + + # --- Execute searches for EACH artist and collect all potential matches --- all_potential_matches = [] found_match_ids = set() + + # Use all artists from Spotify, not just the first one + artists_to_search = spotify_track.artists if spotify_track.artists else [""] - for query_title in unique_queries: - if self._cancelled: - return None, 0.0 - - # Call the updated search_tracks with the query title and artist - potential_plex_matches = self.plex_client.search_tracks( - title=query_title, - artist=artist_name, - limit=15 # Increased limit to get more candidates - ) + for artist_name in artists_to_search: + if self._cancelled: return None, 0.0 - for track in potential_plex_matches: - if track.id not in found_match_ids: - all_potential_matches.append(track) - found_match_ids.add(track.id) + print(f"🎤 Searching for artist: '{artist_name}'") + for query_title in unique_title_variations: + if self._cancelled: return None, 0.0 + + potential_plex_matches = self.plex_client.search_tracks( + title=query_title, + artist=artist_name, + limit=15 + ) + + for track in potential_plex_matches: + if track.id not in found_match_ids: + all_potential_matches.append(track) + found_match_ids.add(track.id) if not all_potential_matches: - print(f"❌ No Plex candidates found for '{original_title}' after trying all strategies.") + print(f"❌ No Plex candidates found for '{original_title}' after trying all artists and title variations.") return None, 0.0 # --- Use the matching engine to find the best match among ALL candidates --- print(f"✅ Found {len(all_potential_matches)} potential Plex matches for '{original_title}'. Scoring now...") match_result = self.matching_engine.find_best_match(spotify_track, all_potential_matches) - # Return the best Plex track found and its confidence score. + if match_result.is_match: + print(f"✔️ Best match for '{original_title}': '{match_result.plex_track.title}' with confidence {match_result.confidence:.2f}") + else: + print(f"⚠️ No confident match found for '{original_title}'. Best attempt scored {match_result.confidence:.2f}.") + return match_result.plex_track, match_result.confidence except Exception as e: