# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# SoulSync/core/matching_engine.py
#
# 223 lines
# 8.0 KiB

from typing import List, Optional, Dict, Any, Tuple
import re
from dataclasses import dataclass
from difflib import SequenceMatcher
from utils.logging_config import get_logger
from core.spotify_client import Track as SpotifyTrack
from core.plex_client import PlexTrackInfo
# Module-level logger, obtained from the project's get_logger helper under the name "matching_engine".
logger = get_logger("matching_engine")
@dataclass
class MatchResult:
    """Outcome of matching one Spotify track against a set of Plex tracks.

    A result always carries the Spotify track that was searched for; the
    Plex side is ``None`` when no candidate scored above zero.
    """

    # Minimum confidence for a result to count as a match. Left un-annotated
    # on purpose so the dataclass treats it as a class constant, not a field.
    MATCH_THRESHOLD = 0.7

    # The Spotify track that was being looked up.
    spotify_track: SpotifyTrack
    # The best Plex candidate found, or None when nothing matched.
    plex_track: Optional[PlexTrackInfo]
    # Confidence assigned to the pairing (0.0 means no match).
    confidence: float
    # Label of the confidence tier, e.g. "exact_match" or "no_match".
    match_type: str

    @property
    def is_match(self) -> bool:
        """Return True when a Plex track is attached and confidence meets the threshold."""
        return self.plex_track is not None and self.confidence >= self.MATCH_THRESHOLD
class MusicMatchingEngine:
    """Fuzzy-match Spotify tracks against Plex tracks.

    Titles and artists are stripped of decorations (parentheticals,
    "feat." credits, remaster/live suffixes, ...) and normalized before being
    compared with ``difflib.SequenceMatcher``. The title/artist/album
    similarity scores are then mapped onto fixed confidence tiers.
    """

    def __init__(self):
        """Set up the regex patterns used to strip titles and artist names."""
        # Applied in order, case-insensitively, to track titles.
        # The \b anchors keep "feat."/"ft." from matching inside a longer
        # word (e.g. "Defeat.").
        self.title_patterns = [
            r'\(.*?\)',
            r'\[.*?\]',
            r'\s*-\s*remaster.*',
            r'\s*-\s*remix.*',
            r'\s*-\s*live.*',
            r'\s*-\s*acoustic.*',
            r'\s*\bfeat\..*',
            r'\s*\bft\..*',
            r'\s*\bfeaturing\b.*',
        ]
        # Applied in order, case-insensitively, to artist names.
        # "and" must be a standalone word surrounded by whitespace; matching
        # the bare substring reduced names such as "Band of Horses" to "B".
        self.artist_patterns = [
            r'\s*\bfeat\..*',
            r'\s*\bft\..*',
            r'\s*\bfeaturing\b.*',
            r'\s*&.*',
            r'\s+and\s+.*',
        ]

    def normalize_string(self, text: str) -> str:
        """Lower-case ``text``, drop punctuation and collapse whitespace.

        Returns an empty string for empty or ``None`` input.
        """
        if not text:
            return ""
        text = re.sub(r'[^\w\s]', '', text.lower())
        text = re.sub(r'\s+', ' ', text)
        # Strip last: removing punctuation can expose new leading/trailing
        # whitespace (e.g. "Hello !" -> "hello ").
        return text.strip()

    def _strip_patterns(self, text: str, patterns: List[str]) -> str:
        """Remove every pattern from ``text`` and return the normalized rest."""
        if not text:
            return ""
        stripped = text
        for pattern in patterns:
            stripped = re.sub(pattern, '', stripped, flags=re.IGNORECASE)
        # If stripping removed everything (e.g. a title that is only a
        # parenthetical), fall back to the normalized original instead of "".
        return self.normalize_string(stripped) or self.normalize_string(text)

    def clean_title(self, title: str) -> str:
        """Return ``title`` without the ``title_patterns`` decorations, normalized."""
        return self._strip_patterns(title, self.title_patterns)

    def clean_artist(self, artist: str) -> str:
        """Return ``artist`` without the ``artist_patterns`` credits, normalized."""
        return self._strip_patterns(artist, self.artist_patterns)

    def extract_main_artist(self, artists: List[str]) -> str:
        """Return the cleaned first entry of ``artists``, or "" when it is empty."""
        if not artists:
            return ""
        return self.clean_artist(artists[0])

    def similarity_score(self, str1: str, str2: str) -> float:
        """Return the SequenceMatcher ratio of the two strings (0.0 if either is empty)."""
        if not str1 or not str2:
            return 0.0
        return SequenceMatcher(None, str1, str2).ratio()

    def duration_similarity(self, duration1: int, duration2: int) -> float:
        """Score how close two durations are, based on their relative difference.

        Both values must be in the same unit. Returns 0.5 when either duration
        is missing (``None``, zero or negative); otherwise 1.0 / 0.8 / 0.6 /
        0.3 for a relative difference of at most 5% / 10% / 20% / more.
        """
        if not duration1 or not duration2 or duration1 < 0 or duration2 < 0:
            return 0.5
        longer = max(duration1, duration2)
        shorter = min(duration1, duration2)
        diff_ratio = (longer - shorter) / longer
        if diff_ratio <= 0.05:
            return 1.0
        if diff_ratio <= 0.1:
            return 0.8
        if diff_ratio <= 0.2:
            return 0.6
        return 0.3

    def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]:
        """Compare one Spotify track with one Plex track.

        Returns:
            A ``(confidence, match_type)`` tuple, one of ``(0.95, "exact_match")``,
            ``(0.85, "high_confidence")``, ``(0.75, "medium_confidence")``,
            ``(0.65, "low_confidence")`` or ``(0.0, "no_match")``.
        """
        title_score = self.similarity_score(
            self.clean_title(spotify_track.name),
            self.clean_title(plex_track.title),
        )
        artist_score = self.similarity_score(
            self.extract_main_artist(spotify_track.artists),
            self.clean_artist(plex_track.artist),
        )
        album_score = self.similarity_score(
            self.normalize_string(spotify_track.album),
            self.normalize_string(plex_track.album),
        )
        # NOTE: track duration is not part of the tiers. The previous code
        # computed a duration score here but never used it, so that dead
        # computation was removed rather than silently changing the scoring.

        if title_score >= 0.9 and artist_score >= 0.9 and album_score >= 0.8:
            return 0.95, "exact_match"

        # The remaining tiers only require title and artist to clear the bar.
        weakest = min(title_score, artist_score)
        if weakest >= 0.8:
            return 0.85, "high_confidence"
        if weakest >= 0.7:
            return 0.75, "medium_confidence"
        if weakest >= 0.6:
            return 0.65, "low_confidence"
        return 0.0, "no_match"

    def find_best_match(self, spotify_track: SpotifyTrack, plex_tracks: List[PlexTrackInfo]) -> MatchResult:
        """Return the highest-confidence Plex candidate for ``spotify_track``.

        Ties keep the earliest candidate. When no candidate scores above zero
        the result has ``plex_track=None`` and ``match_type="no_match"``.
        """
        best_match = None
        best_confidence = 0.0
        best_match_type = "no_match"
        for plex_track in plex_tracks or ():
            confidence, match_type = self.calculate_match_confidence(spotify_track, plex_track)
            if confidence > best_confidence:
                best_confidence = confidence
                best_match = plex_track
                best_match_type = match_type
        return MatchResult(
            spotify_track=spotify_track,
            plex_track=best_match,
            confidence=best_confidence,
            match_type=best_match_type,
        )

    def match_playlist_tracks(self, spotify_tracks: List[SpotifyTrack], plex_tracks: List[PlexTrackInfo]) -> List[MatchResult]:
        """Match every Spotify track against ``plex_tracks``.

        Returns one ``MatchResult`` per Spotify track, in input order, and
        logs each outcome plus a summary.
        """
        results = []
        logger.info(f"Matching {len(spotify_tracks)} Spotify tracks against {len(plex_tracks)} Plex tracks")
        for spotify_track in spotify_tracks:
            match_result = self.find_best_match(spotify_track, plex_tracks)
            results.append(match_result)
            # A track with no artists must not crash the run just for logging.
            artist_label = spotify_track.artists[0] if spotify_track.artists else "Unknown Artist"
            if match_result.is_match:
                logger.debug(f"Matched: {spotify_track.name} by {artist_label} -> {match_result.plex_track.title} (confidence: {match_result.confidence:.2f})")
            else:
                logger.debug(f"No match found for: {spotify_track.name} by {artist_label}")
        matched_count = sum(1 for r in results if r.is_match)
        logger.info(f"Successfully matched {matched_count}/{len(spotify_tracks)} tracks")
        return results

    def get_match_statistics(self, match_results: List[MatchResult]) -> Dict[str, Any]:
        """Summarize ``match_results``.

        Returns:
            A dict with ``total_tracks``, ``matched_tracks``,
            ``match_percentage`` (0 for empty input), ``match_types`` (count
            per match type, matched results only) and
            ``confidence_distribution`` (count per confidence bucket, all
            results).
        """
        total_tracks = len(match_results)
        matched_tracks = 0
        match_types: Dict[str, int] = {}
        confidence_distribution = {
            "high (>0.8)": 0,
            "medium (0.7-0.8)": 0,
            "low (0.6-0.7)": 0,
            "no_match (<0.6)": 0,
        }
        # Single pass instead of one scan per statistic.
        for result in match_results:
            if result.is_match:
                matched_tracks += 1
                match_types[result.match_type] = match_types.get(result.match_type, 0) + 1
            confidence = result.confidence
            if confidence > 0.8:
                bucket = "high (>0.8)"
            elif confidence >= 0.7:
                bucket = "medium (0.7-0.8)"
            elif confidence >= 0.6:
                bucket = "low (0.6-0.7)"
            else:
                bucket = "no_match (<0.6)"
            confidence_distribution[bucket] += 1
        return {
            "total_tracks": total_tracks,
            "matched_tracks": matched_tracks,
            "match_percentage": (matched_tracks / total_tracks * 100) if total_tracks > 0 else 0,
            "match_types": match_types,
            "confidence_distribution": confidence_distribution,
        }

    def create_search_queries(self, spotify_track: SpotifyTrack) -> List[str]:
        """Build search strings for ``spotify_track``, most specific first.

        Queries are whitespace-trimmed, de-duplicated (keeping the first
        occurrence) and never empty. Album-based queries are skipped when the
        track has no album.
        """
        artists = spotify_track.artists or []
        main_artist = self.extract_main_artist(artists)
        clean_title = self.clean_title(spotify_track.name)
        clean_album = self.normalize_string(spotify_track.album)

        candidates = [
            f"{clean_title} {main_artist}",
            f"{main_artist} {clean_title}",
        ]
        if clean_album:
            candidates.append(f"{clean_title} {main_artist} {clean_album}")
            candidates.append(f"{clean_album} {main_artist}")
        if len(artists) > 1:
            all_artists = " ".join(self.clean_artist(a) for a in artists)
            candidates.append(f"{clean_title} {all_artists}")

        # Collapse stray spaces left by empty parts, then drop empties and
        # duplicates while preserving order.
        tidy = (" ".join(query.split()) for query in candidates)
        return list(dict.fromkeys(query for query in tidy if query))

    def generate_download_query(self, spotify_track: SpotifyTrack) -> str:
        """Return "<main artist> <title>" for ``spotify_track``, omitting empty parts."""
        main_artist = self.extract_main_artist(spotify_track.artists)
        clean_title = self.clean_title(spotify_track.name)
        return " ".join(part for part in (main_artist, clean_title) if part)
# Module-level engine instance created at import time (presumably shared by importers — verify against callers).
matching_engine = MusicMatchingEngine()