From f1e4539936c3cb53f015e706e50c911f7967e4db Mon Sep 17 00:00:00 2001
From: Broque Thomas <batmanesignal@gmail.com>
Date: Thu, 24 Jul 2025 16:20:36 -0700
Subject: [PATCH] Use weighted match confidence; search Plex across all artists

---
 core/matching_engine.py | 211 ++++++++++++++++------------------------
 ui/pages/sync.py        |  89 ++++++++---------
 2 files changed, 124 insertions(+), 176 deletions(-)

diff --git a/core/matching_engine.py b/core/matching_engine.py
index c3bac689..bf9a00ce 100644
--- a/core/matching_engine.py
+++ b/core/matching_engine.py
@@ -2,6 +2,7 @@ from typing import List, Optional, Dict, Any, Tuple
 import re
 from dataclasses import dataclass
 from difflib import SequenceMatcher
+from unidecode import unidecode
 from utils.logging_config import get_logger
 from core.spotify_client import Track as SpotifyTrack
 from core.plex_client import PlexTrackInfo
@@ -17,20 +18,28 @@ class MatchResult:
     
     @property
     def is_match(self) -> bool:
-        return self.plex_track is not None and self.confidence >= 0.7
+        return self.plex_track is not None and self.confidence >= 0.8
 
 class MusicMatchingEngine:
     def __init__(self):
+        # More comprehensive patterns to strip extra info from titles
         self.title_patterns = [
-            r'\(.*?\)',
-            r'\[.*?\]',
-            r'\s*-\s*remaster.*',
-            r'\s*-\s*remix.*',
-            r'\s*-\s*live.*',
-            r'\s*-\s*acoustic.*',
-            r'\s*feat\..*',
-            r'\s*ft\..*',
-            r'\s*featuring.*',
+            r'\(feat\.?.*\)',
+            r'\[feat\.?.*\]',
+            r'\(with.*\)',
+            r'\(ft\.?.*\)',
+            r'\[ft\.?.*\]',
+            r'\(remix\)',
+            r'\(live\)',
+            r'\(acoustic\)',
+            r'\(radio edit\)',
+            r'\(album version\)',
+            r'\(original mix\)',
+            r'-\s*single version',
+            r'-\s*remaster.*',
+            r'-\s*live.*',
+            r'-\s*remix',
+            r'-\s*radio edit',
         ]
         
         self.artist_patterns = [
@@ -39,37 +48,51 @@ class MusicMatchingEngine:
             r'\s*featuring.*',
             r'\s*&.*',
             r'\s*and.*',
+            r',.*'
         ]
     
     def normalize_string(self, text: str) -> str:
+        """
+        Normalizes a string by transliterating to ASCII, lowercasing, removing
+        punctuation (hyphens and underscores are kept), and collapsing whitespace.
+        """
         if not text:
             return ""
         
-        text = text.lower().strip()
+        # Transliterate Unicode characters (e.g., ñ -> n, é -> e) to ASCII
+        text = unidecode(text)
         
-        text = re.sub(r'[^\w\s]', '', text)
+        # Convert to lowercase
+        text = text.lower()
         
-        text = re.sub(r'\s+', ' ', text)
+        # Remove punctuation but keep word characters, whitespace, and hyphens
+        text = re.sub(r'[^\w\s-]', '', text)
+        
+        # Collapse multiple spaces into one
+        text = re.sub(r'\s+', ' ', text).strip()
         
         return text
     
     def clean_title(self, title: str) -> str:
+        """Cleans title by removing common extra info using regex."""
         cleaned = title
         
         for pattern in self.title_patterns:
-            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
+            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
         
         return self.normalize_string(cleaned)
     
     def clean_artist(self, artist: str) -> str:
+        """Cleans artist name by removing featured artists and other noise."""
         cleaned = artist
         
         for pattern in self.artist_patterns:
-            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
+            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
         
         return self.normalize_string(cleaned)
     
     def extract_main_artist(self, artists: List[str]) -> str:
+        """Extracts and cleans the primary artist from a list."""
         if not artists:
             return ""
         
@@ -77,68 +100,69 @@ class MusicMatchingEngine:
         return self.clean_artist(main_artist)
     
     def similarity_score(self, str1: str, str2: str) -> float:
+        """Calculates similarity score between two strings."""
         if not str1 or not str2:
             return 0.0
         
         return SequenceMatcher(None, str1, str2).ratio()
     
     def duration_similarity(self, duration1: int, duration2: int) -> float:
+        """Calculates similarity score based on track duration (in ms)."""
         if duration1 == 0 or duration2 == 0:
-            return 0.5
-        
-        max_duration = max(duration1, duration2)
-        min_duration = min(duration1, duration2)
+            return 0.5 # Neutral score if a duration is missing
         
-        if max_duration == 0:
-            return 0.5
-        
-        diff_ratio = abs(max_duration - min_duration) / max_duration
-        
-        if diff_ratio <= 0.05:
+        # Allow a 5-second tolerance (5000 ms)
+        if abs(duration1 - duration2) <= 5000:
             return 1.0
-        elif diff_ratio <= 0.1:
-            return 0.8
-        elif diff_ratio <= 0.2:
-            return 0.6
-        else:
-            return 0.3
-    
-    def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]:
-        spotify_title = self.clean_title(spotify_track.name)
-        plex_title = self.clean_title(plex_track.title)
-        
-        spotify_artist = self.extract_main_artist(spotify_track.artists)
-        plex_artist = self.clean_artist(plex_track.artist)
-        
-        spotify_album = self.normalize_string(spotify_track.album)
-        plex_album = self.normalize_string(plex_track.album)
         
-        title_score = self.similarity_score(spotify_title, plex_title)
-        artist_score = self.similarity_score(spotify_artist, plex_artist)
-        album_score = self.similarity_score(spotify_album, plex_album)
-        
-        # CORRECTED: Plex duration is already in milliseconds.
-        duration_score = self.duration_similarity(
-            spotify_track.duration_ms, 
-            plex_track.duration if plex_track.duration else 0
-        )
+        # Penalize larger differences linearly; the score reaches 0 at a 20% gap
+        diff_ratio = abs(duration1 - duration2) / max(duration1, duration2)
+        return max(0, 1.0 - diff_ratio * 5) # Scale penalty
+
+    def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]:
+        """Calculates a confidence score for a potential match with weighted factors."""
         
-        if title_score >= 0.9 and artist_score >= 0.9 and album_score >= 0.8:
-            return 0.95, "exact_match"
-        elif title_score >= 0.8 and artist_score >= 0.8:
-            return 0.85, "high_confidence"
-        elif title_score >= 0.7 and artist_score >= 0.7:
-            return 0.75, "medium_confidence"
-        elif title_score >= 0.6 and artist_score >= 0.6:
-            return 0.65, "low_confidence"
+        # Clean titles and artists for comparison
+        spotify_title_cleaned = self.clean_title(spotify_track.name)
+        plex_title_cleaned = self.clean_title(plex_track.title)
+
+        spotify_main_artist_cleaned = self.extract_main_artist(spotify_track.artists)
+        plex_artist_normalized = self.normalize_string(plex_track.artist)
+
+        # --- Calculate individual scores ---
+        title_score = self.similarity_score(spotify_title_cleaned, plex_title_cleaned)
+        
+        # Artist score: 1.0 if the main Spotify artist is a substring of the Plex artist, else fuzzy similarity
+        artist_score = 1.0 if spotify_main_artist_cleaned in plex_artist_normalized else self.similarity_score(spotify_main_artist_cleaned, self.clean_artist(plex_track.artist))
+        
+        duration_score = self.duration_similarity(spotify_track.duration_ms, plex_track.duration if plex_track.duration else 0)
+        
+        # --- Weighted confidence calculation ---
+        # Weights: Title (50%), Artist (30%), Duration (20%)
+        confidence = (title_score * 0.5) + (artist_score * 0.3) + (duration_score * 0.2)
+        
+        # Determine match type based on scores
+        if title_score > 0.95 and artist_score > 0.9 and duration_score > 0.9:
+            match_type = "perfect_match"
+            confidence = max(confidence, 0.98) # Boost confidence for perfect matches
+        elif title_score > 0.85 and artist_score > 0.8:
+            match_type = "high_confidence"
+        elif title_score > 0.75:
+            match_type = "medium_confidence"
         else:
-            return 0.0, "no_match"
+            match_type = "low_confidence"
+
+        return confidence, match_type
     
     def find_best_match(self, spotify_track: SpotifyTrack, plex_tracks: List[PlexTrackInfo]) -> MatchResult:
+        """Finds the best Plex track match from a list of candidates."""
         best_match = None
         best_confidence = 0.0
         best_match_type = "no_match"
         
+        if not plex_tracks:
+            return MatchResult(spotify_track, None, 0.0, "no_candidates")
+
         for plex_track in plex_tracks:
             confidence, match_type = self.calculate_match_confidence(spotify_track, plex_track)
             
@@ -153,72 +177,3 @@ class MusicMatchingEngine:
             confidence=best_confidence,
             match_type=best_match_type
         )
-    
-    def match_playlist_tracks(self, spotify_tracks: List[SpotifyTrack], plex_tracks: List[PlexTrackInfo]) -> List[MatchResult]:
-        results = []
-        
-        logger.info(f"Matching {len(spotify_tracks)} Spotify tracks against {len(plex_tracks)} Plex tracks")
-        
-        for spotify_track in spotify_tracks:
-            match_result = self.find_best_match(spotify_track, plex_tracks)
-            results.append(match_result)
-            
-            if match_result.is_match:
-                logger.debug(f"Matched: {spotify_track.name} by {spotify_track.artists[0]} -> {match_result.plex_track.title} (confidence: {match_result.confidence:.2f})")
-            else:
-                logger.debug(f"No match found for: {spotify_track.name} by {spotify_track.artists[0]}")
-        
-        matched_count = sum(1 for r in results if r.is_match)
-        logger.info(f"Successfully matched {matched_count}/{len(spotify_tracks)} tracks")
-        
-        return results
-    
-    def get_match_statistics(self, match_results: List[MatchResult]) -> Dict[str, Any]:
-        total_tracks = len(match_results)
-        matched_tracks = sum(1 for r in match_results if r.is_match)
-        
-        match_types = {}
-        for result in match_results:
-            if result.is_match:
-                match_types[result.match_type] = match_types.get(result.match_type, 0) + 1
-        
-        confidence_distribution = {
-            "high (>0.8)": sum(1 for r in match_results if r.confidence > 0.8),
-            "medium (0.7-0.8)": sum(1 for r in match_results if 0.7 <= r.confidence <= 0.8),
-            "low (0.6-0.7)": sum(1 for r in match_results if 0.6 <= r.confidence < 0.7),
-            "no_match (<0.6)": sum(1 for r in match_results if r.confidence < 0.6)
-        }
-        
-        return {
-            "total_tracks": total_tracks,
-            "matched_tracks": matched_tracks,
-            "match_percentage": (matched_tracks / total_tracks * 100) if total_tracks > 0 else 0,
-            "match_types": match_types,
-            "confidence_distribution": confidence_distribution
-        }
-    
-    def create_search_queries(self, spotify_track: SpotifyTrack) -> List[str]:
-        queries = []
-        
-        main_artist = self.extract_main_artist(spotify_track.artists)
-        clean_title = self.clean_title(spotify_track.name)
-        clean_album = self.normalize_string(spotify_track.album)
-        
-        queries.append(f"{clean_title} {main_artist}")
-        queries.append(f"{main_artist} {clean_title}")
-        queries.append(f"{clean_title} {main_artist} {clean_album}")
-        queries.append(f"{clean_album} {main_artist}")
-        
-        if len(spotify_track.artists) > 1:
-            all_artists = " ".join([self.clean_artist(a) for a in spotify_track.artists])
-            queries.append(f"{clean_title} {all_artists}")
-        
-        return queries
-    
-    def generate_download_query(self, spotify_track: SpotifyTrack) -> str:
-        main_artist = self.extract_main_artist(spotify_track.artists)
-        clean_title = self.clean_title(spotify_track.name)
-        
-        return f"{main_artist} {clean_title}"
-
-matching_engine = MusicMatchingEngine()
diff --git a/ui/pages/sync.py b/ui/pages/sync.py
index 8b041b12..2e59bf77 100644
--- a/ui/pages/sync.py
+++ b/ui/pages/sync.py
@@ -121,72 +121,65 @@ class PlaylistTrackAnalysisWorker(QRunnable):
     def _check_track_in_plex(self, spotify_track):
         """
         Check if a Spotify track exists in Plex by trying several search strategies
-        and using the MusicMatchingEngine to find the best match.
+        across ALL artists associated with the track.
         """
         try:
-            # Use the first artist for the primary search query
-            artist_name = spotify_track.artists[0] if spotify_track.artists else ""
             original_title = spotify_track.name
-
-            # --- Generate a list of search queries, from most specific to most broad ---
-            search_queries = []
-
-            # Strategy 1: Original, unmodified title. Catches exact matches.
-            search_queries.append(original_title)
-
-            # Strategy 2: Title with content after a hyphen removed.
-            # e.g., "Song Title - Remaster" -> "Song Title"
-            if " - " in original_title:
-                title_before_hyphen = original_title.split(' - ')[0].strip()
-                if title_before_hyphen:
-                    search_queries.append(title_before_hyphen)
-
-            # Strategy 3: Title with parenthetical/bracketed content removed.
-            # (Uses the simple cleaner from this file for an intermediate search)
-            cleaned_for_search = clean_track_name_for_search(original_title)
-            if cleaned_for_search.lower() != original_title.lower():
-                search_queries.append(cleaned_for_search)
             
-            # Strategy 4: A "base" title with all extra info removed (remixes, feats, etc.)
-            # using the more aggressive cleaning from the matching engine.
-            base_title = self.matching_engine.clean_title(original_title)
-            if base_title.lower() != cleaned_for_search.lower() and base_title.lower() != original_title.lower():
-                search_queries.append(base_title)
-                
-            # Remove duplicate queries that might have resulted from the cleaning steps, preserving order.
-            unique_queries = list(dict.fromkeys(search_queries))
+            # --- Generate a list of title variations ---
+            title_variations = []
+            title_variations.append(original_title) # Strategy 1: Original title
+            if " - " in original_title: # Strategy 2: Strip content after hyphen
+                title_variations.append(original_title.split(' - ')[0].strip())
             
-            print(f"🧠 Generated search queries for '{original_title}': {unique_queries}")
+            cleaned_for_search = clean_track_name_for_search(original_title) # Strategy 3: Strip parenthetical content
+            if cleaned_for_search.lower() != original_title.lower():
+                title_variations.append(cleaned_for_search)
+
+            base_title = self.matching_engine.clean_title(original_title) # Strategy 4: Aggressively cleaned title
+            if base_title.lower() not in [t.lower() for t in title_variations]:
+                title_variations.append(base_title)
 
-            # --- Execute searches and collect all potential matches ---
+            unique_title_variations = list(dict.fromkeys(title_variations))
+            
+            # --- Execute searches for EACH artist and collect all potential matches ---
             all_potential_matches = []
             found_match_ids = set()
+            
+            # Use all artists from Spotify, not just the first one
+            artists_to_search = spotify_track.artists if spotify_track.artists else [""]
 
-            for query_title in unique_queries:
-                if self._cancelled:
-                    return None, 0.0
-
-                # Call the updated search_tracks with the query title and artist
-                potential_plex_matches = self.plex_client.search_tracks(
-                    title=query_title, 
-                    artist=artist_name, 
-                    limit=15 # Increased limit to get more candidates
-                )
+            for artist_name in artists_to_search:
+                if self._cancelled: return None, 0.0
                 
-                for track in potential_plex_matches:
-                    if track.id not in found_match_ids:
-                        all_potential_matches.append(track)
-                        found_match_ids.add(track.id)
+                print(f"🎤 Searching for artist: '{artist_name}'")
+                for query_title in unique_title_variations:
+                    if self._cancelled: return None, 0.0
+
+                    potential_plex_matches = self.plex_client.search_tracks(
+                        title=query_title, 
+                        artist=artist_name, 
+                        limit=15
+                    )
+                    
+                    for track in potential_plex_matches:
+                        if track.id not in found_match_ids:
+                            all_potential_matches.append(track)
+                            found_match_ids.add(track.id)
             
             if not all_potential_matches:
-                print(f"❌ No Plex candidates found for '{original_title}' after trying all strategies.")
+                print(f"❌ No Plex candidates found for '{original_title}' after trying all artists and title variations.")
                 return None, 0.0
             
             # --- Use the matching engine to find the best match among ALL candidates ---
             print(f"✅ Found {len(all_potential_matches)} potential Plex matches for '{original_title}'. Scoring now...")
             match_result = self.matching_engine.find_best_match(spotify_track, all_potential_matches)
             
-            # Return the best Plex track found and its confidence score.
+            if match_result.is_match:
+                print(f"✔️ Best match for '{original_title}': '{match_result.plex_track.title}' with confidence {match_result.confidence:.2f}")
+            else:
+                print(f"⚠️ No confident match found for '{original_title}'. Best attempt scored {match_result.confidence:.2f}.")
+
             return match_result.plex_track, match_result.confidence
             
         except Exception as e: