pull/2/head
Broque Thomas 10 months ago
parent fda2b92724
commit 44c262c7b4

@ -22,21 +22,17 @@ class MatchResult:
class MusicMatchingEngine:
def __init__(self):
# More comprehensive patterns to strip extra info from titles
# The order of these patterns is important. More general patterns go first.
self.title_patterns = [
# NEW: General patterns to remove all content in brackets/parentheses first
# General patterns to remove all content in brackets/parentheses
r'\(.*\)',
r'\[.*\]',
# Patterns after a hyphen
r'-\s*single version',
r'-\s*remaster.*',
r'-\s*live.*',
r'-\s*remix',
r'-\s*radio edit',
# Patterns in the open title string (not in brackets)
r'\s+feat\.?.*',
r'\s+ft\.?.*',
r'\s+featuring.*'
# General pattern to remove everything after a hyphen
r'\s-\s.*',
# Patterns to remove featuring artists from the title itself
r'\sfeat\.?.*',
r'\sft\.?.*',
r'\sfeaturing.*'
]
self.artist_patterns = [
@ -56,16 +52,10 @@ class MusicMatchingEngine:
if not text:
return ""
# Transliterate Unicode characters (e.g., ñ -> n, é -> e) to ASCII
text = unidecode(text)
# Convert to lowercase
text = text.lower()
# Remove specific punctuation but keep alphanumeric and spaces
# Keep alphanumeric, spaces, and hyphens, but remove other punctuation like '.' or ','
text = re.sub(r'[^\w\s-]', '', text)
# Collapse multiple spaces into one
text = re.sub(r'\s+', ' ', text).strip()
return text
@ -104,56 +94,52 @@ class MusicMatchingEngine:
if abs(duration1 - duration2) <= 5000:
return 1.0
# Penalize larger differences
diff_ratio = abs(duration1 - duration2) / max(duration1, duration2)
return max(0, 1.0 - diff_ratio * 5) # Scale penalty
return max(0, 1.0 - diff_ratio * 5)
def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]:
"""Calculates a confidence score for a potential match with weighted factors."""
"""Calculates a confidence score for a potential match with a more robust, prioritized logic."""
spotify_title_cleaned = self.clean_title(spotify_track.name)
plex_title_cleaned = self.clean_title(plex_track.title)
# --- Enhanced Artist Scoring ---
# --- Artist Scoring ---
spotify_artists_cleaned = [self.clean_artist(a) for a in spotify_track.artists if a]
plex_artist_cleaned = self.clean_artist(plex_track.artist)
plex_artist_normalized = self.normalize_string(plex_track.artist)
best_artist_score = 0.0
for spotify_artist in spotify_artists_cleaned:
if spotify_artist in plex_artist_normalized:
score = 1.0
else:
score = self.similarity_score(spotify_artist, plex_artist_cleaned)
if spotify_artist and spotify_artist in plex_artist_normalized:
best_artist_score = 1.0
break
score = self.similarity_score(spotify_artist, self.clean_artist(plex_track.artist))
if score > best_artist_score:
best_artist_score = score
if best_artist_score == 1.0:
break
artist_score = best_artist_score
# --- Calculate other scores ---
# --- Title and Duration Scoring ---
title_score = self.similarity_score(spotify_title_cleaned, plex_title_cleaned)
duration_score = self.duration_similarity(spotify_track.duration_ms, plex_track.duration if plex_track.duration else 0)
# --- Weighted confidence calculation ---
confidence = (title_score * 0.5) + (artist_score * 0.3) + (duration_score * 0.2)
# --- NEW: Add confidence boost for exact title matches ---
if spotify_title_cleaned == plex_title_cleaned and len(spotify_title_cleaned) > 0:
confidence = max(confidence, 0.85) # Boost to at least 0.85 for exact titles
# Determine match type based on scores
if title_score > 0.95 and artist_score > 0.9 and duration_score > 0.9:
match_type = "perfect_match"
confidence = max(confidence, 0.98)
elif title_score > 0.85 and artist_score > 0.8:
# --- Prioritized Confidence Logic ---
# Priority 1: Near-perfect title and artist match is a very strong signal.
if title_score > 0.98 and artist_score > 0.9:
confidence = 0.98
match_type = "strong_match"
# Priority 2: Exact title match, even with a weaker artist match, should have high confidence.
# This helps with short titles like "Girls" or "LIL DEMON".
elif title_score > 0.98:
confidence = 0.90 + (artist_score * 0.05) # Base of 0.9, with a small artist bonus
match_type = "exact_title_match"
# Priority 3: High title similarity is still a good indicator.
elif title_score > 0.9:
confidence = (title_score * 0.6) + (artist_score * 0.3) + (duration_score * 0.1)
match_type = "high_confidence"
elif title_score > 0.75:
match_type = "medium_confidence"
# Default: Standard weighted calculation for all other cases.
else:
match_type = "low_confidence"
confidence = (title_score * 0.5) + (artist_score * 0.3) + (duration_score * 0.2)
match_type = "standard_match"
return confidence, match_type

Loading…
Cancel
Save