pull/2/head
Broque Thomas 9 months ago
parent 3f04d7f984
commit f1e4539936

@ -2,6 +2,7 @@ from typing import List, Optional, Dict, Any, Tuple
import re
from dataclasses import dataclass
from difflib import SequenceMatcher
from unidecode import unidecode
from utils.logging_config import get_logger
from core.spotify_client import Track as SpotifyTrack
from core.plex_client import PlexTrackInfo
@ -17,20 +18,28 @@ class MatchResult:
@property
def is_match(self) -> bool:
    """Report whether this result counts as a confident match.

    A result is a match only when a Plex track was actually selected and
    the confidence reaches the 0.8 threshold. The superseded 0.7 check has
    been removed; it ran first and made the 0.8 check unreachable.
    """
    return self.plex_track is not None and self.confidence >= 0.8
class MusicMatchingEngine:
def __init__(self):
# More comprehensive patterns to strip extra info from titles
self.title_patterns = [
r'\(.*?\)',
r'\[.*?\]',
r'\s*-\s*remaster.*',
r'\s*-\s*remix.*',
r'\s*-\s*live.*',
r'\s*-\s*acoustic.*',
r'\s*feat\..*',
r'\s*ft\..*',
r'\s*featuring.*',
r'\(feat\.?.*\)',
r'\[feat\.?.*\]',
r'\(with.*\)',
r'\(ft\.?.*\)',
r'\[ft\.?.*\]',
r'\(remix\)',
r'\(live\)',
r'\(acoustic\)',
r'\(radio edit\)',
r'\(album version\)',
r'\(original mix\)',
r'-\s*single version',
r'-\s*remaster.*',
r'-\s*live.*',
r'-\s*remix',
r'-\s*radio edit',
]
self.artist_patterns = [
@ -39,37 +48,51 @@ class MusicMatchingEngine:
r'\s*featuring.*',
r'\s*&.*',
r'\s*and.*',
r',.*'
]
def normalize_string(self, text: str) -> str:
    """Normalize a string for fuzzy comparison.

    The text is transliterated to ASCII (e.g. "ñ" -> "n"), lowercased,
    stripped of punctuation other than hyphens, and has runs of whitespace
    collapsed to a single space.

    Args:
        text: Raw title, artist or album text. Falsy values are accepted.

    Returns:
        The normalized string, or "" when ``text`` is empty or None.
    """
    if not text:
        return ""

    # Transliterate first so accented letters survive the punctuation strip
    # as their ASCII equivalents instead of being handled inconsistently.
    text = unidecode(text).lower()

    # Keep word characters, whitespace and hyphens; drop everything else.
    # (A leftover `[^\w\s]` pass used to run before this one and removed the
    # hyphens this pattern is meant to preserve.)
    text = re.sub(r'[^\w\s-]', '', text)

    # Collapse whitespace runs and trim the ends in a single final pass.
    return re.sub(r'\s+', ' ', text).strip()
def clean_title(self, title: str) -> str:
    """Strip version/feature noise from a track title and normalize it.

    Every regex in ``self.title_patterns`` is removed case-insensitively,
    in list order, and the remainder is passed through
    ``self.normalize_string``.

    Args:
        title: Raw track title. Falsy values are accepted.

    Returns:
        The cleaned, normalized title, or "" when ``title`` is empty or None.
    """
    # A missing title would make re.sub raise TypeError; treat it as empty.
    if not title:
        return ""

    cleaned = title
    for pattern in self.title_patterns:
        # Strip after each pass so later patterns see no leading or
        # trailing whitespace left behind by an earlier removal.
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
    return self.normalize_string(cleaned)
def clean_artist(self, artist: str) -> str:
    """Reduce an artist credit to its primary name and normalize it.

    Every regex in ``self.artist_patterns`` is removed case-insensitively,
    in list order, and the remainder is passed through
    ``self.normalize_string``.

    Args:
        artist: Raw artist credit. Falsy values are accepted.

    Returns:
        The cleaned, normalized artist name, or "" when ``artist`` is empty
        or None.
    """
    # A missing artist would make re.sub raise TypeError; treat it as empty.
    if not artist:
        return ""

    cleaned = artist
    for pattern in self.artist_patterns:
        # Strip after each pass so later patterns see no leading or
        # trailing whitespace left behind by an earlier removal.
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
    return self.normalize_string(cleaned)
def extract_main_artist(self, artists: List[str]) -> str:
"""Extracts and cleans the primary artist from a list."""
if not artists:
return ""
@ -77,68 +100,69 @@ class MusicMatchingEngine:
return self.clean_artist(main_artist)
def similarity_score(self, str1: str, str2: str) -> float:
    """Return the difflib similarity ratio of two strings.

    The result is 0.0 when either string is empty or None; otherwise it is
    ``SequenceMatcher.ratio()`` for the pair, a value in [0.0, 1.0].
    """
    both_present = bool(str1) and bool(str2)
    if both_present:
        matcher = SequenceMatcher(a=str1, b=str2)
        return matcher.ratio()
    return 0.0
def duration_similarity(self, duration1: int, duration2: int) -> float:
    """Score how closely two track durations (in milliseconds) agree.

    Args:
        duration1: First duration in ms; 0 or None means "unknown".
        duration2: Second duration in ms; 0 or None means "unknown".

    Returns:
        0.5 (neutral) when either duration is unknown, 1.0 when the two are
        within 5000 ms of each other, otherwise ``1.0 - 5 * relative_diff``
        floored at 0.0.
    """
    # Neutral score if a duration is missing, so it neither helps nor hurts.
    if not duration1 or not duration2:
        return 0.5

    gap = abs(duration1 - duration2)

    # Allow a 5-second tolerance (5000 ms) for encoding/padding differences.
    if gap <= 5000:
        return 1.0

    # Penalize larger differences: a 20% relative gap already scores 0.0.
    diff_ratio = gap / max(duration1, duration2)
    return max(0.0, 1.0 - diff_ratio * 5)

def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]:
    """Calculate a weighted confidence score for a candidate match.

    Title, artist and duration scores are combined with weights of 50%,
    30% and 20% respectively.

    Args:
        spotify_track: Source track; ``name``, ``artists`` and
            ``duration_ms`` are read.
        plex_track: Candidate track; ``title``, ``artist`` and ``duration``
            are read.

    Returns:
        ``(confidence, match_type)`` where ``match_type`` is one of
        "perfect_match", "high_confidence", "medium_confidence" or
        "low_confidence".
    """
    # Clean titles and artists for comparison.
    spotify_title_cleaned = self.clean_title(spotify_track.name)
    plex_title_cleaned = self.clean_title(plex_track.title)
    spotify_main_artist_cleaned = self.extract_main_artist(spotify_track.artists)
    plex_artist_normalized = self.normalize_string(plex_track.artist)

    # --- Calculate individual scores ---
    title_score = self.similarity_score(spotify_title_cleaned, plex_title_cleaned)

    # Artist score: full credit when the main Spotify artist appears inside
    # the Plex artist string. The emptiness guard matters because "" is a
    # substring of every string and would otherwise score a perfect 1.0.
    if spotify_main_artist_cleaned and spotify_main_artist_cleaned in plex_artist_normalized:
        artist_score = 1.0
    else:
        artist_score = self.similarity_score(
            spotify_main_artist_cleaned,
            self.clean_artist(plex_track.artist),
        )

    duration_score = self.duration_similarity(
        spotify_track.duration_ms,
        plex_track.duration if plex_track.duration else 0,
    )

    # --- Weighted confidence calculation ---
    # Weights: Title (50%), Artist (30%), Duration (20%)
    confidence = (title_score * 0.5) + (artist_score * 0.3) + (duration_score * 0.2)

    # Determine match type based on the individual scores.
    if title_score > 0.95 and artist_score > 0.9 and duration_score > 0.9:
        match_type = "perfect_match"
        confidence = max(confidence, 0.98)  # Boost confidence for perfect matches
    elif title_score > 0.85 and artist_score > 0.8:
        match_type = "high_confidence"
    elif title_score > 0.75:
        match_type = "medium_confidence"
    else:
        match_type = "low_confidence"

    return confidence, match_type
def find_best_match(self, spotify_track: SpotifyTrack, plex_tracks: List[PlexTrackInfo]) -> MatchResult:
"""Finds the best Plex track match from a list of candidates."""
best_match = None
best_confidence = 0.0
best_match_type = "no_match"
if not plex_tracks:
return MatchResult(spotify_track, None, 0.0, "no_candidates")
for plex_track in plex_tracks:
confidence, match_type = self.calculate_match_confidence(spotify_track, plex_track)
@ -153,72 +177,3 @@ class MusicMatchingEngine:
confidence=best_confidence,
match_type=best_match_type
)
def match_playlist_tracks(self, spotify_tracks: List[SpotifyTrack], plex_tracks: List[PlexTrackInfo]) -> List[MatchResult]:
    """Match every Spotify track against the given Plex candidates.

    Args:
        spotify_tracks: Tracks to look up.
        plex_tracks: Candidate pool handed to ``find_best_match`` for each
            Spotify track.

    Returns:
        One ``MatchResult`` per Spotify track, in input order. Unmatched
        tracks are included; check ``is_match`` on each result.
    """
    results = []
    matched_count = 0

    logger.info(f"Matching {len(spotify_tracks)} Spotify tracks against {len(plex_tracks)} Plex tracks")

    for spotify_track in spotify_tracks:
        match_result = self.find_best_match(spotify_track, plex_tracks)
        results.append(match_result)

        # The log f-strings are evaluated eagerly, so indexing artists[0] on
        # a track with no artists would raise IndexError and abort the run.
        primary_artist = spotify_track.artists[0] if spotify_track.artists else "Unknown Artist"

        if match_result.is_match:
            matched_count += 1
            logger.debug(f"Matched: {spotify_track.name} by {primary_artist} -> {match_result.plex_track.title} (confidence: {match_result.confidence:.2f})")
        else:
            logger.debug(f"No match found for: {spotify_track.name} by {primary_artist}")

    logger.info(f"Successfully matched {matched_count}/{len(spotify_tracks)} tracks")

    return results
def get_match_statistics(self, match_results: List[MatchResult]) -> Dict[str, Any]:
    """Summarize a batch of match results.

    Returns a dict with the total and matched track counts, the match
    percentage (0 for an empty batch), a per-``match_type`` tally of the
    matched results, and a count of results per confidence band.
    """
    hits = [entry for entry in match_results if entry.is_match]

    # Tally match types over confident matches only, in first-seen order.
    type_tally = {}
    for hit in hits:
        type_tally[hit.match_type] = type_tally.get(hit.match_type, 0) + 1

    # Each band is an explicit predicate, so a score is counted only where
    # its condition holds.
    bands = (
        ("high (>0.8)", lambda score: score > 0.8),
        ("medium (0.7-0.8)", lambda score: 0.7 <= score <= 0.8),
        ("low (0.6-0.7)", lambda score: 0.6 <= score < 0.7),
        ("no_match (<0.6)", lambda score: score < 0.6),
    )
    scores = [entry.confidence for entry in match_results]
    band_counts = {
        label: sum(1 for score in scores if in_band(score))
        for label, in_band in bands
    }

    total = len(match_results)
    matched = len(hits)
    if total > 0:
        percentage = matched / total * 100
    else:
        percentage = 0

    return {
        "total_tracks": total,
        "matched_tracks": matched,
        "match_percentage": percentage,
        "match_types": type_tally,
        "confidence_distribution": band_counts
    }
def create_search_queries(self, spotify_track: SpotifyTrack) -> List[str]:
    """Build the ordered list of text search queries for a Spotify track.

    The first four queries combine the cleaned title, main artist and
    normalized album in different orders. When the track credits more than
    one artist, a final query pairs the title with every cleaned artist.
    """
    artist = self.extract_main_artist(spotify_track.artists)
    title = self.clean_title(spotify_track.name)
    album = self.normalize_string(spotify_track.album)

    queries = [
        f"{title} {artist}",
        f"{artist} {title}",
        f"{title} {artist} {album}",
        f"{album} {artist}",
    ]

    if len(spotify_track.artists) > 1:
        everyone = " ".join(self.clean_artist(credit) for credit in spotify_track.artists)
        queries.append(f"{title} {everyone}")

    return queries
def generate_download_query(self, spotify_track: SpotifyTrack) -> str:
    """Return the "<main artist> <cleaned title>" query string for a track."""
    artist_part = self.extract_main_artist(spotify_track.artists)
    title_part = self.clean_title(spotify_track.name)
    return "{} {}".format(artist_part, title_part)
# Module-level engine instance, constructed once when this module is imported.
matching_engine = MusicMatchingEngine()

@ -121,72 +121,65 @@ class PlaylistTrackAnalysisWorker(QRunnable):
def _check_track_in_plex(self, spotify_track):
"""
Check if a Spotify track exists in Plex by trying several search strategies
and using the MusicMatchingEngine to find the best match.
across ALL artists associated with the track.
"""
try:
# Use the first artist for the primary search query
artist_name = spotify_track.artists[0] if spotify_track.artists else ""
original_title = spotify_track.name
# --- Generate a list of search queries, from most specific to most broad ---
search_queries = []
# Strategy 1: Original, unmodified title. Catches exact matches.
search_queries.append(original_title)
# Strategy 2: Title with content after a hyphen removed.
# e.g., "Song Title - Remaster" -> "Song Title"
if " - " in original_title:
title_before_hyphen = original_title.split(' - ')[0].strip()
if title_before_hyphen:
search_queries.append(title_before_hyphen)
# Strategy 3: Title with parenthetical/bracketed content removed.
# (Uses the simple cleaner from this file for an intermediate search)
cleaned_for_search = clean_track_name_for_search(original_title)
if cleaned_for_search.lower() != original_title.lower():
search_queries.append(cleaned_for_search)
# Strategy 4: A "base" title with all extra info removed (remixes, feats, etc.)
# using the more aggressive cleaning from the matching engine.
base_title = self.matching_engine.clean_title(original_title)
if base_title.lower() != cleaned_for_search.lower() and base_title.lower() != original_title.lower():
search_queries.append(base_title)
# Remove duplicate queries that might have resulted from the cleaning steps, preserving order.
unique_queries = list(dict.fromkeys(search_queries))
# --- Generate a list of title variations ---
title_variations = []
title_variations.append(original_title) # Strategy 1: Original title
if " - " in original_title: # Strategy 2: Strip content after hyphen
title_variations.append(original_title.split(' - ')[0].strip())
print(f"🧠 Generated search queries for '{original_title}': {unique_queries}")
cleaned_for_search = clean_track_name_for_search(original_title) # Strategy 3: Strip parenthetical content
if cleaned_for_search.lower() != original_title.lower():
title_variations.append(cleaned_for_search)
base_title = self.matching_engine.clean_title(original_title) # Strategy 4: Aggressively cleaned title
if base_title.lower() not in [t.lower() for t in title_variations]:
title_variations.append(base_title)
# --- Execute searches and collect all potential matches ---
unique_title_variations = list(dict.fromkeys(title_variations))
# --- Execute searches for EACH artist and collect all potential matches ---
all_potential_matches = []
found_match_ids = set()
# Use all artists from Spotify, not just the first one
artists_to_search = spotify_track.artists if spotify_track.artists else [""]
for query_title in unique_queries:
if self._cancelled:
return None, 0.0
# Call the updated search_tracks with the query title and artist
potential_plex_matches = self.plex_client.search_tracks(
title=query_title,
artist=artist_name,
limit=15 # Increased limit to get more candidates
)
for artist_name in artists_to_search:
if self._cancelled: return None, 0.0
for track in potential_plex_matches:
if track.id not in found_match_ids:
all_potential_matches.append(track)
found_match_ids.add(track.id)
print(f"🎤 Searching for artist: '{artist_name}'")
for query_title in unique_title_variations:
if self._cancelled: return None, 0.0
potential_plex_matches = self.plex_client.search_tracks(
title=query_title,
artist=artist_name,
limit=15
)
for track in potential_plex_matches:
if track.id not in found_match_ids:
all_potential_matches.append(track)
found_match_ids.add(track.id)
if not all_potential_matches:
print(f"❌ No Plex candidates found for '{original_title}' after trying all strategies.")
print(f"❌ No Plex candidates found for '{original_title}' after trying all artists and title variations.")
return None, 0.0
# --- Use the matching engine to find the best match among ALL candidates ---
print(f"✅ Found {len(all_potential_matches)} potential Plex matches for '{original_title}'. Scoring now...")
match_result = self.matching_engine.find_best_match(spotify_track, all_potential_matches)
# Return the best Plex track found and its confidence score.
if match_result.is_match:
print(f"✔️ Best match for '{original_title}': '{match_result.plex_track.title}' with confidence {match_result.confidence:.2f}")
else:
print(f"⚠️ No confident match found for '{original_title}'. Best attempt scored {match_result.confidence:.2f}.")
return match_result.plex_track, match_result.confidence
except Exception as e:

Loading…
Cancel
Save