|
|
|
|
@ -2,6 +2,7 @@ from typing import List, Optional, Dict, Any, Tuple
|
|
|
|
|
import re
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
from difflib import SequenceMatcher
|
|
|
|
|
from unidecode import unidecode
|
|
|
|
|
from utils.logging_config import get_logger
|
|
|
|
|
from core.spotify_client import Track as SpotifyTrack
|
|
|
|
|
from core.plex_client import PlexTrackInfo
|
|
|
|
|
@ -17,20 +18,28 @@ class MatchResult:
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def is_match(self) -> bool:
|
|
|
|
|
return self.plex_track is not None and self.confidence >= 0.7
|
|
|
|
|
return self.plex_track is not None and self.confidence >= 0.8
|
|
|
|
|
|
|
|
|
|
class MusicMatchingEngine:
|
|
|
|
|
def __init__(self):
|
|
|
|
|
# More comprehensive patterns to strip extra info from titles
|
|
|
|
|
self.title_patterns = [
|
|
|
|
|
r'\(.*?\)',
|
|
|
|
|
r'\[.*?\]',
|
|
|
|
|
r'\s*-\s*remaster.*',
|
|
|
|
|
r'\s*-\s*remix.*',
|
|
|
|
|
r'\s*-\s*live.*',
|
|
|
|
|
r'\s*-\s*acoustic.*',
|
|
|
|
|
r'\s*feat\..*',
|
|
|
|
|
r'\s*ft\..*',
|
|
|
|
|
r'\s*featuring.*',
|
|
|
|
|
r'\(feat\.?.*\)',
|
|
|
|
|
r'\[feat\.?.*\]',
|
|
|
|
|
r'\(with.*\)',
|
|
|
|
|
r'\(ft\.?.*\)',
|
|
|
|
|
r'\[ft\.?.*\]',
|
|
|
|
|
r'\(remix\)',
|
|
|
|
|
r'\(live\)',
|
|
|
|
|
r'\(acoustic\)',
|
|
|
|
|
r'\(radio edit\)',
|
|
|
|
|
r'\(album version\)',
|
|
|
|
|
r'\(original mix\)',
|
|
|
|
|
r'-\s*single version',
|
|
|
|
|
r'-\s*remaster.*',
|
|
|
|
|
r'-\s*live.*',
|
|
|
|
|
r'-\s*remix',
|
|
|
|
|
r'-\s*radio edit',
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
self.artist_patterns = [
|
|
|
|
|
@ -39,37 +48,51 @@ class MusicMatchingEngine:
|
|
|
|
|
r'\s*featuring.*',
|
|
|
|
|
r'\s*&.*',
|
|
|
|
|
r'\s*and.*',
|
|
|
|
|
r',.*'
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
def normalize_string(self, text: str) -> str:
|
|
|
|
|
"""
|
|
|
|
|
Normalizes string by converting to ASCII, lowercasing, and removing
|
|
|
|
|
specific punctuation while keeping alphanumeric characters.
|
|
|
|
|
"""
|
|
|
|
|
if not text:
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
text = text.lower().strip()
|
|
|
|
|
# Transliterate Unicode characters (e.g., ñ -> n, é -> e) to ASCII
|
|
|
|
|
text = unidecode(text)
|
|
|
|
|
|
|
|
|
|
text = re.sub(r'[^\w\s]', '', text)
|
|
|
|
|
# Convert to lowercase
|
|
|
|
|
text = text.lower()
|
|
|
|
|
|
|
|
|
|
text = re.sub(r'\s+', ' ', text)
|
|
|
|
|
# Remove specific punctuation but keep alphanumeric and spaces
|
|
|
|
|
text = re.sub(r'[^\w\s-]', '', text)
|
|
|
|
|
|
|
|
|
|
# Collapse multiple spaces into one
|
|
|
|
|
text = re.sub(r'\s+', ' ', text).strip()
|
|
|
|
|
|
|
|
|
|
return text
|
|
|
|
|
|
|
|
|
|
def clean_title(self, title: str) -> str:
|
|
|
|
|
"""Cleans title by removing common extra info using regex."""
|
|
|
|
|
cleaned = title
|
|
|
|
|
|
|
|
|
|
for pattern in self.title_patterns:
|
|
|
|
|
cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
|
|
|
|
|
cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
|
|
|
|
|
|
|
|
|
|
return self.normalize_string(cleaned)
|
|
|
|
|
|
|
|
|
|
def clean_artist(self, artist: str) -> str:
|
|
|
|
|
"""Cleans artist name by removing featured artists and other noise."""
|
|
|
|
|
cleaned = artist
|
|
|
|
|
|
|
|
|
|
for pattern in self.artist_patterns:
|
|
|
|
|
cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
|
|
|
|
|
cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
|
|
|
|
|
|
|
|
|
|
return self.normalize_string(cleaned)
|
|
|
|
|
|
|
|
|
|
def extract_main_artist(self, artists: List[str]) -> str:
|
|
|
|
|
"""Extracts and cleans the primary artist from a list."""
|
|
|
|
|
if not artists:
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
@ -77,68 +100,69 @@ class MusicMatchingEngine:
|
|
|
|
|
return self.clean_artist(main_artist)
|
|
|
|
|
|
|
|
|
|
def similarity_score(self, str1: str, str2: str) -> float:
|
|
|
|
|
"""Calculates similarity score between two strings."""
|
|
|
|
|
if not str1 or not str2:
|
|
|
|
|
return 0.0
|
|
|
|
|
|
|
|
|
|
return SequenceMatcher(None, str1, str2).ratio()
|
|
|
|
|
|
|
|
|
|
def duration_similarity(self, duration1: int, duration2: int) -> float:
|
|
|
|
|
"""Calculates similarity score based on track duration (in ms)."""
|
|
|
|
|
if duration1 == 0 or duration2 == 0:
|
|
|
|
|
return 0.5
|
|
|
|
|
|
|
|
|
|
max_duration = max(duration1, duration2)
|
|
|
|
|
min_duration = min(duration1, duration2)
|
|
|
|
|
return 0.5 # Neutral score if a duration is missing
|
|
|
|
|
|
|
|
|
|
if max_duration == 0:
|
|
|
|
|
return 0.5
|
|
|
|
|
|
|
|
|
|
diff_ratio = abs(max_duration - min_duration) / max_duration
|
|
|
|
|
|
|
|
|
|
if diff_ratio <= 0.05:
|
|
|
|
|
# Allow a 5-second tolerance (5000 ms)
|
|
|
|
|
if abs(duration1 - duration2) <= 5000:
|
|
|
|
|
return 1.0
|
|
|
|
|
elif diff_ratio <= 0.1:
|
|
|
|
|
return 0.8
|
|
|
|
|
elif diff_ratio <= 0.2:
|
|
|
|
|
return 0.6
|
|
|
|
|
else:
|
|
|
|
|
return 0.3
|
|
|
|
|
|
|
|
|
|
def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]:
|
|
|
|
|
spotify_title = self.clean_title(spotify_track.name)
|
|
|
|
|
plex_title = self.clean_title(plex_track.title)
|
|
|
|
|
|
|
|
|
|
spotify_artist = self.extract_main_artist(spotify_track.artists)
|
|
|
|
|
plex_artist = self.clean_artist(plex_track.artist)
|
|
|
|
|
|
|
|
|
|
spotify_album = self.normalize_string(spotify_track.album)
|
|
|
|
|
plex_album = self.normalize_string(plex_track.album)
|
|
|
|
|
|
|
|
|
|
title_score = self.similarity_score(spotify_title, plex_title)
|
|
|
|
|
artist_score = self.similarity_score(spotify_artist, plex_artist)
|
|
|
|
|
album_score = self.similarity_score(spotify_album, plex_album)
|
|
|
|
|
|
|
|
|
|
# CORRECTED: Plex duration is already in milliseconds.
|
|
|
|
|
duration_score = self.duration_similarity(
|
|
|
|
|
spotify_track.duration_ms,
|
|
|
|
|
plex_track.duration if plex_track.duration else 0
|
|
|
|
|
)
|
|
|
|
|
# Penalize larger differences
|
|
|
|
|
diff_ratio = abs(duration1 - duration2) / max(duration1, duration2)
|
|
|
|
|
return max(0, 1.0 - diff_ratio * 5) # Scale penalty
|
|
|
|
|
|
|
|
|
|
def calculate_match_confidence(self, spotify_track: SpotifyTrack, plex_track: PlexTrackInfo) -> Tuple[float, str]:
|
|
|
|
|
"""Calculates a confidence score for a potential match with weighted factors."""
|
|
|
|
|
|
|
|
|
|
if title_score >= 0.9 and artist_score >= 0.9 and album_score >= 0.8:
|
|
|
|
|
return 0.95, "exact_match"
|
|
|
|
|
elif title_score >= 0.8 and artist_score >= 0.8:
|
|
|
|
|
return 0.85, "high_confidence"
|
|
|
|
|
elif title_score >= 0.7 and artist_score >= 0.7:
|
|
|
|
|
return 0.75, "medium_confidence"
|
|
|
|
|
elif title_score >= 0.6 and artist_score >= 0.6:
|
|
|
|
|
return 0.65, "low_confidence"
|
|
|
|
|
# Clean titles and artists for comparison
|
|
|
|
|
spotify_title_cleaned = self.clean_title(spotify_track.name)
|
|
|
|
|
plex_title_cleaned = self.clean_title(plex_track.title)
|
|
|
|
|
|
|
|
|
|
spotify_main_artist_cleaned = self.extract_main_artist(spotify_track.artists)
|
|
|
|
|
plex_artist_normalized = self.normalize_string(plex_track.artist)
|
|
|
|
|
|
|
|
|
|
# --- Calculate individual scores ---
|
|
|
|
|
title_score = self.similarity_score(spotify_title_cleaned, plex_title_cleaned)
|
|
|
|
|
|
|
|
|
|
# Artist score: check if main Spotify artist is in the Plex artist string
|
|
|
|
|
artist_score = 1.0 if spotify_main_artist_cleaned in plex_artist_normalized else self.similarity_score(spotify_main_artist_cleaned, self.clean_artist(plex_track.artist))
|
|
|
|
|
|
|
|
|
|
duration_score = self.duration_similarity(spotify_track.duration_ms, plex_track.duration if plex_track.duration else 0)
|
|
|
|
|
|
|
|
|
|
# --- Weighted confidence calculation ---
|
|
|
|
|
# Weights: Title (50%), Artist (30%), Duration (20%)
|
|
|
|
|
confidence = (title_score * 0.5) + (artist_score * 0.3) + (duration_score * 0.2)
|
|
|
|
|
|
|
|
|
|
# Determine match type based on scores
|
|
|
|
|
if title_score > 0.95 and artist_score > 0.9 and duration_score > 0.9:
|
|
|
|
|
match_type = "perfect_match"
|
|
|
|
|
confidence = max(confidence, 0.98) # Boost confidence for perfect matches
|
|
|
|
|
elif title_score > 0.85 and artist_score > 0.8:
|
|
|
|
|
match_type = "high_confidence"
|
|
|
|
|
elif title_score > 0.75:
|
|
|
|
|
match_type = "medium_confidence"
|
|
|
|
|
else:
|
|
|
|
|
return 0.0, "no_match"
|
|
|
|
|
match_type = "low_confidence"
|
|
|
|
|
|
|
|
|
|
return confidence, match_type
|
|
|
|
|
|
|
|
|
|
def find_best_match(self, spotify_track: SpotifyTrack, plex_tracks: List[PlexTrackInfo]) -> MatchResult:
|
|
|
|
|
"""Finds the best Plex track match from a list of candidates."""
|
|
|
|
|
best_match = None
|
|
|
|
|
best_confidence = 0.0
|
|
|
|
|
best_match_type = "no_match"
|
|
|
|
|
|
|
|
|
|
if not plex_tracks:
|
|
|
|
|
return MatchResult(spotify_track, None, 0.0, "no_candidates")
|
|
|
|
|
|
|
|
|
|
for plex_track in plex_tracks:
|
|
|
|
|
confidence, match_type = self.calculate_match_confidence(spotify_track, plex_track)
|
|
|
|
|
|
|
|
|
|
@ -153,72 +177,3 @@ class MusicMatchingEngine:
|
|
|
|
|
confidence=best_confidence,
|
|
|
|
|
match_type=best_match_type
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def match_playlist_tracks(self, spotify_tracks: List[SpotifyTrack], plex_tracks: List[PlexTrackInfo]) -> List[MatchResult]:
|
|
|
|
|
results = []
|
|
|
|
|
|
|
|
|
|
logger.info(f"Matching {len(spotify_tracks)} Spotify tracks against {len(plex_tracks)} Plex tracks")
|
|
|
|
|
|
|
|
|
|
for spotify_track in spotify_tracks:
|
|
|
|
|
match_result = self.find_best_match(spotify_track, plex_tracks)
|
|
|
|
|
results.append(match_result)
|
|
|
|
|
|
|
|
|
|
if match_result.is_match:
|
|
|
|
|
logger.debug(f"Matched: {spotify_track.name} by {spotify_track.artists[0]} -> {match_result.plex_track.title} (confidence: {match_result.confidence:.2f})")
|
|
|
|
|
else:
|
|
|
|
|
logger.debug(f"No match found for: {spotify_track.name} by {spotify_track.artists[0]}")
|
|
|
|
|
|
|
|
|
|
matched_count = sum(1 for r in results if r.is_match)
|
|
|
|
|
logger.info(f"Successfully matched {matched_count}/{len(spotify_tracks)} tracks")
|
|
|
|
|
|
|
|
|
|
return results
|
|
|
|
|
|
|
|
|
|
def get_match_statistics(self, match_results: List[MatchResult]) -> Dict[str, Any]:
|
|
|
|
|
total_tracks = len(match_results)
|
|
|
|
|
matched_tracks = sum(1 for r in match_results if r.is_match)
|
|
|
|
|
|
|
|
|
|
match_types = {}
|
|
|
|
|
for result in match_results:
|
|
|
|
|
if result.is_match:
|
|
|
|
|
match_types[result.match_type] = match_types.get(result.match_type, 0) + 1
|
|
|
|
|
|
|
|
|
|
confidence_distribution = {
|
|
|
|
|
"high (>0.8)": sum(1 for r in match_results if r.confidence > 0.8),
|
|
|
|
|
"medium (0.7-0.8)": sum(1 for r in match_results if 0.7 <= r.confidence <= 0.8),
|
|
|
|
|
"low (0.6-0.7)": sum(1 for r in match_results if 0.6 <= r.confidence < 0.7),
|
|
|
|
|
"no_match (<0.6)": sum(1 for r in match_results if r.confidence < 0.6)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
"total_tracks": total_tracks,
|
|
|
|
|
"matched_tracks": matched_tracks,
|
|
|
|
|
"match_percentage": (matched_tracks / total_tracks * 100) if total_tracks > 0 else 0,
|
|
|
|
|
"match_types": match_types,
|
|
|
|
|
"confidence_distribution": confidence_distribution
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
def create_search_queries(self, spotify_track: SpotifyTrack) -> List[str]:
|
|
|
|
|
queries = []
|
|
|
|
|
|
|
|
|
|
main_artist = self.extract_main_artist(spotify_track.artists)
|
|
|
|
|
clean_title = self.clean_title(spotify_track.name)
|
|
|
|
|
clean_album = self.normalize_string(spotify_track.album)
|
|
|
|
|
|
|
|
|
|
queries.append(f"{clean_title} {main_artist}")
|
|
|
|
|
queries.append(f"{main_artist} {clean_title}")
|
|
|
|
|
queries.append(f"{clean_title} {main_artist} {clean_album}")
|
|
|
|
|
queries.append(f"{clean_album} {main_artist}")
|
|
|
|
|
|
|
|
|
|
if len(spotify_track.artists) > 1:
|
|
|
|
|
all_artists = " ".join([self.clean_artist(a) for a in spotify_track.artists])
|
|
|
|
|
queries.append(f"{clean_title} {all_artists}")
|
|
|
|
|
|
|
|
|
|
return queries
|
|
|
|
|
|
|
|
|
|
def generate_download_query(self, spotify_track: SpotifyTrack) -> str:
|
|
|
|
|
main_artist = self.extract_main_artist(spotify_track.artists)
|
|
|
|
|
clean_title = self.clean_title(spotify_track.name)
|
|
|
|
|
|
|
|
|
|
return f"{main_artist} {clean_title}"
|
|
|
|
|
|
|
|
|
|
matching_engine = MusicMatchingEngine()
|
|
|
|
|
|