From 44c262c7b4539485740f5fcbeccc403bf6939dd3 Mon Sep 17 00:00:00 2001 From: Broque Thomas Date: Thu, 24 Jul 2025 17:03:30 -0700 Subject: [PATCH] better --- core/matching_engine.py | 80 +++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/core/matching_engine.py b/core/matching_engine.py index 7e66826f..54e30b22 100644 --- a/core/matching_engine.py +++ b/core/matching_engine.py @@ -22,21 +22,17 @@ class MatchResult: class MusicMatchingEngine: def __init__(self): - # More comprehensive patterns to strip extra info from titles + # The order of these patterns is important. More general patterns go first. self.title_patterns = [ - # NEW: General patterns to remove all content in brackets/parentheses first + # General patterns to remove all content in brackets/parentheses r'\(.*\)', r'\[.*\]', - # Patterns after a hyphen - r'-\s*single version', - r'-\s*remaster.*', - r'-\s*live.*', - r'-\s*remix', - r'-\s*radio edit', - # Patterns in the open title string (not in brackets) - r'\s+feat\.?.*', - r'\s+ft\.?.*', - r'\s+featuring.*' + # General pattern to remove everything after a hyphen + r'\s-\s.*', + # Patterns to remove featuring artists from the title itself + r'\sfeat\.?.*', + r'\sft\.?.*', + r'\sfeaturing.*' ] self.artist_patterns = [ @@ -56,16 +52,10 @@ class MusicMatchingEngine: if not text: return "" - # Transliterate Unicode characters (e.g., ñ -> n, é -> e) to ASCII text = unidecode(text) - - # Convert to lowercase text = text.lower() - - # Remove specific punctuation but keep alphanumeric and spaces + # Keep alphanumeric, spaces, and hyphens, but remove other punctuation like '.' or ',' text = re.sub(r'[^\w\s-]', '', text) - - # Collapse multiple spaces into one text = re.sub(r'\s+', ' ', text).strip() return text @@ -104,56 +94,52 @@ class MusicMatchingEngine: if abs(duration1 - duration2) <= 5000: return 1.0 - # Penalize larger differences diff_ratio = abs(duration1 - duration2) / max(duration1, duration2) - return max(0, 1.0 - diff_ratio * 5) # Scale penalty + return max(0, 1.0 - diff_ratio * 5) def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]: - """Calculates a confidence score for a potential match with weighted factors.""" + """Calculates a confidence score for a potential match with a more robust, prioritized logic.""" spotify_title_cleaned = self.clean_title(spotify_track.name) plex_title_cleaned = self.clean_title(plex_track.title) - # --- Enhanced Artist Scoring --- + # --- Artist Scoring --- spotify_artists_cleaned = [self.clean_artist(a) for a in spotify_track.artists if a] - plex_artist_cleaned = self.clean_artist(plex_track.artist) plex_artist_normalized = self.normalize_string(plex_track.artist) best_artist_score = 0.0 for spotify_artist in spotify_artists_cleaned: - if spotify_artist in plex_artist_normalized: - score = 1.0 - else: - score = self.similarity_score(spotify_artist, plex_artist_cleaned) - + if spotify_artist and spotify_artist in plex_artist_normalized: + best_artist_score = 1.0 + break + score = self.similarity_score(spotify_artist, self.clean_artist(plex_track.artist)) if score > best_artist_score: best_artist_score = score - if best_artist_score == 1.0: - break artist_score = best_artist_score - # --- Calculate other scores --- + # --- Title and Duration Scoring --- title_score = self.similarity_score(spotify_title_cleaned, plex_title_cleaned) duration_score = self.duration_similarity(spotify_track.duration_ms, plex_track.duration if plex_track.duration else 0) - # --- Weighted confidence calculation --- - confidence = (title_score * 0.5) + (artist_score * 0.3) + (duration_score * 0.2) - - # --- NEW: Add confidence boost for exact title matches --- - if spotify_title_cleaned == plex_title_cleaned and len(spotify_title_cleaned) > 0: - confidence = max(confidence, 0.85) # Boost to at least 0.85 for exact titles - - # Determine match type based on scores - if title_score > 0.95 and artist_score > 0.9 and duration_score > 0.9: - match_type = "perfect_match" - confidence = max(confidence, 0.98) - elif title_score > 0.85 and artist_score > 0.8: + # --- Prioritized Confidence Logic --- + # Priority 1: Near-perfect title and artist match is a very strong signal. + if title_score > 0.98 and artist_score > 0.9: + confidence = 0.98 + match_type = "strong_match" + # Priority 2: Exact title match, even with a weaker artist match, should have high confidence. + # This helps with short titles like "Girls" or "LIL DEMON". + elif title_score > 0.98: + confidence = 0.90 + (artist_score * 0.05) # Base of 0.9, with a small artist bonus + match_type = "exact_title_match" + # Priority 3: High title similarity is still a good indicator. + elif title_score > 0.9: + confidence = (title_score * 0.6) + (artist_score * 0.3) + (duration_score * 0.1) match_type = "high_confidence" - elif title_score > 0.75: - match_type = "medium_confidence" + # Default: Standard weighted calculation for all other cases. else: - match_type = "low_confidence" + confidence = (title_score * 0.5) + (artist_score * 0.3) + (duration_score * 0.2) + match_type = "standard_match" return confidence, match_type