From 44c262c7b4539485740f5fcbeccc403bf6939dd3 Mon Sep 17 00:00:00 2001
From: Broque Thomas <batmanesignal@gmail.com>
Date: Thu, 24 Jul 2025 17:03:30 -0700
Subject: [PATCH] Simplify title patterns and prioritize match confidence scoring

---
 core/matching_engine.py | 80 +++++++++++++++++------------------------
 1 file changed, 33 insertions(+), 47 deletions(-)

diff --git a/core/matching_engine.py b/core/matching_engine.py
index 7e66826f..54e30b22 100644
--- a/core/matching_engine.py
+++ b/core/matching_engine.py
@@ -22,21 +22,17 @@ class MatchResult:
 
 class MusicMatchingEngine:
     def __init__(self):
-        # More comprehensive patterns to strip extra info from titles
+        # The order of these patterns is important. More general patterns go first.
         self.title_patterns = [
-            # NEW: General patterns to remove all content in brackets/parentheses first
+            # Greedy patterns: match from the first opening to the last closing parenthesis/bracket
             r'\(.*\)',
             r'\[.*\]',
-            # Patterns after a hyphen
-            r'-\s*single version',
-            r'-\s*remaster.*',
-            r'-\s*live.*',
-            r'-\s*remix',
-            r'-\s*radio edit',
-            # Patterns in the open title string (not in brackets)
-            r'\s+feat\.?.*',
-            r'\s+ft\.?.*',
-            r'\s+featuring.*'
+            # General pattern to remove a whitespace-delimited hyphen (' - ') and everything after it
+            r'\s-\s.*',
+            # Patterns to remove featuring artists from the title itself
+            r'\sfeat\.?.*',
+            r'\sft\.?.*',
+            r'\sfeaturing.*'
         ]
         
         self.artist_patterns = [
@@ -56,16 +52,10 @@ class MusicMatchingEngine:
         if not text:
             return ""
         
-        # Transliterate Unicode characters (e.g., ñ -> n, é -> e) to ASCII
         text = unidecode(text)
-        
-        # Convert to lowercase
         text = text.lower()
-        
-        # Remove specific punctuation but keep alphanumeric and spaces
+        # Keep word characters (letters, digits, underscore), whitespace, and hyphens; remove other punctuation like '.' or ','
         text = re.sub(r'[^\w\s-]', '', text)
-        
-        # Collapse multiple spaces into one
         text = re.sub(r'\s+', ' ', text).strip()
         
         return text
@@ -104,56 +94,52 @@ class MusicMatchingEngine:
         if abs(duration1 - duration2) <= 5000:
             return 1.0
         
-        # Penalize larger differences
         diff_ratio = abs(duration1 - duration2) / max(duration1, duration2)
-        return max(0, 1.0 - diff_ratio * 5) # Scale penalty
+        return max(0, 1.0 - diff_ratio * 5)
 
     def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]:
-        """Calculates a confidence score for a potential match with weighted factors."""
+        """Calculates a confidence score and match type for a potential match using prioritized, tiered logic."""
         
         spotify_title_cleaned = self.clean_title(spotify_track.name)
         plex_title_cleaned = self.clean_title(plex_track.title)
 
-        # --- Enhanced Artist Scoring ---
+        # --- Artist Scoring ---
         spotify_artists_cleaned = [self.clean_artist(a) for a in spotify_track.artists if a]
-        plex_artist_cleaned = self.clean_artist(plex_track.artist)
         plex_artist_normalized = self.normalize_string(plex_track.artist)
 
         best_artist_score = 0.0
         for spotify_artist in spotify_artists_cleaned:
-            if spotify_artist in plex_artist_normalized:
-                score = 1.0
-            else:
-                score = self.similarity_score(spotify_artist, plex_artist_cleaned)
-            
+            if spotify_artist and spotify_artist in plex_artist_normalized:
+                best_artist_score = 1.0
+                break
+            score = self.similarity_score(spotify_artist, self.clean_artist(plex_track.artist))
             if score > best_artist_score:
                 best_artist_score = score
-                if best_artist_score == 1.0:
-                    break
         
         artist_score = best_artist_score
         
-        # --- Calculate other scores ---
+        # --- Title and Duration Scoring ---
         title_score = self.similarity_score(spotify_title_cleaned, plex_title_cleaned)
         duration_score = self.duration_similarity(spotify_track.duration_ms, plex_track.duration if plex_track.duration else 0)
         
-        # --- Weighted confidence calculation ---
-        confidence = (title_score * 0.5) + (artist_score * 0.3) + (duration_score * 0.2)
-        
-        # --- NEW: Add confidence boost for exact title matches ---
-        if spotify_title_cleaned == plex_title_cleaned and len(spotify_title_cleaned) > 0:
-            confidence = max(confidence, 0.85) # Boost to at least 0.85 for exact titles
-        
-        # Determine match type based on scores
-        if title_score > 0.95 and artist_score > 0.9 and duration_score > 0.9:
-            match_type = "perfect_match"
-            confidence = max(confidence, 0.98)
-        elif title_score > 0.85 and artist_score > 0.8:
+        # --- Prioritized Confidence Logic ---
+        # Priority 1: Near-perfect title and artist match is a very strong signal.
+        if title_score > 0.98 and artist_score > 0.9:
+            confidence = 0.98
+            match_type = "strong_match"
+        # Priority 2: Near-exact title match (title_score > 0.98), even with a weaker artist match, should have high confidence.
+        # This helps with short titles like "Girls" or "LIL DEMON".
+        elif title_score > 0.98:
+            confidence = 0.90 + (artist_score * 0.05) # Base of 0.9, with a small artist bonus
+            match_type = "exact_title_match"
+        # Priority 3: High title similarity is still a good indicator.
+        elif title_score > 0.9:
+            confidence = (title_score * 0.6) + (artist_score * 0.3) + (duration_score * 0.1)
             match_type = "high_confidence"
-        elif title_score > 0.75:
-            match_type = "medium_confidence"
+        # Default: Standard weighted calculation for all other cases.
         else:
-            match_type = "low_confidence"
+            confidence = (title_score * 0.5) + (artist_score * 0.3) + (duration_score * 0.2)
+            match_type = "standard_match"
 
         return confidence, match_type