consistent matching between sync and artists.

pull/2/head
Broque Thomas 9 months ago
parent 35fdef76ce
commit aafc0ca49f

@ -336,21 +336,44 @@ class MusicMatchingEngine:
queries.append(f"{artist} {cleaned_track}".strip())
print(f"🎯 PRIORITY 1: Album-cleaned query: '{artist} {cleaned_track}'")
# PRIORITY 2: Try with just the first part before any dash/parentheses
simple_patterns = [
r'^([^-\(]+)', # Everything before first dash or parenthesis
r'^([^-]+)', # Everything before first dash only
]
# PRIORITY 2: Try simplified versions, but preserve important version info
# Only remove content that's likely to be album names or noise, not version info
for pattern in simple_patterns:
match = re.search(pattern, original_title.strip())
if match:
simple_title = match.group(1).strip()
if simple_title and len(simple_title) >= 3: # Avoid too-short titles
# Pattern 1: Remove content after " - " (likely album names)
dash_pattern = r'^([^-]+?)(?:\s*-\s*.+)?$'
match = re.search(dash_pattern, original_title.strip())
if match:
dash_title = match.group(1).strip()
if dash_title and len(dash_title) >= 3 and dash_title != original_title:
dash_clean = self.clean_title(dash_title)
if dash_clean and dash_clean not in [self.clean_title(q.split(' ', 1)[1]) for q in queries if ' ' in q]:
queries.append(f"{artist} {dash_clean}".strip())
print(f"🎯 PRIORITY 2: Dash-cleaned query: '{artist} {dash_clean}'")
# Pattern 2: Only remove parentheses that contain noise (feat, explicit, etc), not version info
# Check if parentheses contain version-related keywords before removing
paren_pattern = r'^(.+?)\s*\(([^)]+)\)(.*)$'
paren_match = re.search(paren_pattern, original_title)
if paren_match:
before_paren = paren_match.group(1).strip()
paren_content = paren_match.group(2).strip().lower()
after_paren = paren_match.group(3).strip()
# Define what we consider "noise" vs "important version info"
noise_keywords = ['feat', 'ft', 'featuring', 'explicit', 'clean', 'radio edit', 'radio version']
version_keywords = ['extended', 'live', 'acoustic', 'remix', 'remaster', 'demo', 'instrumental', 'version', 'edit', 'mix']
# Only remove parentheses if they contain noise, not version info
is_noise = any(keyword in paren_content for keyword in noise_keywords)
is_version = any(keyword in paren_content for keyword in version_keywords)
if is_noise and not is_version and before_paren:
simple_title = (before_paren + ' ' + after_paren).strip()
if simple_title and len(simple_title) >= 3:
simple_clean = self.clean_title(simple_title)
if simple_clean and simple_clean not in [self.clean_title(q.split(' ', 1)[1]) for q in queries if ' ' in q]:
queries.append(f"{artist} {simple_clean}".strip())
print(f"🎯 PRIORITY 2: Simple-cleaned query: '{artist} {simple_clean}'")
print(f"🎯 PRIORITY 2: Noise-removed query: '{artist} {simple_clean}'")
# PRIORITY 3: Original query (ONLY if no album was detected or if it's different)
original_track_clean = self.clean_title(original_title)

@ -458,67 +458,216 @@ class MusicDatabase:
return []
def search_tracks(self, title: str = "", artist: str = "", limit: int = 50) -> List[DatabaseTrack]:
    """Search tracks by title and/or artist name with Unicode-aware fuzzy matching.

    Strategies are tried from cheapest to broadest; the first one that
    returns any rows wins:

    1. ``_search_tracks_basic`` - plain SQL ``LIKE`` substring match.
    2. ``_search_tracks_unicode_fallback`` - accent-normalized match, only
       attempted when the optional ``unidecode`` package can be imported.
    3. ``_search_tracks_fuzzy_fallback`` - per-word matching with scoring.

    Args:
        title: Track title (or fragment) to look for; may be empty.
        artist: Artist name (or fragment) to look for; may be empty.
        limit: Maximum number of tracks to return.

    Returns:
        The matching tracks. An empty list is returned when neither ``title``
        nor ``artist`` is given, when nothing matches, or when any error
        occurs (the error is logged, never raised to the caller).
    """
    try:
        if not title and not artist:
            return []

        conn = self._get_connection()
        cursor = conn.cursor()

        # STRATEGY 1: Try basic SQL LIKE search first (fastest)
        basic_results = self._search_tracks_basic(cursor, title, artist, limit)
        if basic_results:
            logger.debug(f"🔍 Basic search found {len(basic_results)} results")
            return basic_results

        # STRATEGY 2: If basic search fails and we have Unicode support, try normalized search.
        # The import is only an availability probe for the optional dependency.
        try:
            from unidecode import unidecode  # noqa: F401
            unicode_support = True
        except ImportError:
            unicode_support = False

        if unicode_support:
            normalized_results = self._search_tracks_unicode_fallback(cursor, title, artist, limit)
            if normalized_results:
                logger.debug(f"🔍 Unicode fallback search found {len(normalized_results)} results")
                return normalized_results

        # STRATEGY 3: Last resort - broader fuzzy search with Python filtering
        fuzzy_results = self._search_tracks_fuzzy_fallback(cursor, title, artist, limit)
        if fuzzy_results:
            logger.debug(f"🔍 Fuzzy fallback search found {len(fuzzy_results)} results")
        return fuzzy_results

    except Exception as e:
        # Search is best-effort for callers: log and report "no matches".
        logger.error(f"Error searching tracks with title='{title}', artist='{artist}': {e}")
        return []
def _search_tracks_basic(self, cursor, title: str, artist: str, limit: int) -> List[DatabaseTrack]:
    """Run the cheapest lookup: a plain SQL ``LIKE`` substring match.

    Each non-empty criterion contributes one ``column LIKE '%value%'`` test;
    when both are given they are combined with ``AND``. Rows are ordered by
    track title, then artist name, and capped at ``limit``.

    Returns an empty list when neither ``title`` nor ``artist`` is provided.
    """
    # Keep only the (column, value) pairs the caller actually supplied.
    criteria = [
        (column, value)
        for column, value in (("tracks.title", title), ("artists.name", artist))
        if value
    ]
    if not criteria:
        return []

    where_clause = " AND ".join(f"{column} LIKE ?" for column, _ in criteria)
    # Values are always bound as parameters; only fixed column names are
    # interpolated into the SQL text.
    params = [f"%{value}%" for _, value in criteria]
    params.append(limit)

    cursor.execute(f"""
        SELECT tracks.*, artists.name as artist_name, albums.title as album_title
        FROM tracks
        JOIN artists ON tracks.artist_id = artists.id
        JOIN albums ON tracks.album_id = albums.id
        WHERE {where_clause}
        ORDER BY tracks.title, artists.name
        LIMIT ?
    """, params)

    matched_rows = cursor.fetchall()
    return self._rows_to_tracks(matched_rows)
def _search_tracks_unicode_fallback(self, cursor, title: str, artist: str, limit: int) -> List[DatabaseTrack]:
    """Accent-insensitive fallback search using ``unidecode`` normalization.

    Both the search terms and the stored values are folded the same way
    (transliterate to ASCII, then lowercase), so "Beyonce" finds "Beyoncé"
    and vice versa.

    Args:
        cursor: Database cursor used to run the query.
        title: Track title (or fragment); may be empty.
        artist: Artist name (or fragment); may be empty.
        limit: Maximum number of tracks to return.

    Returns:
        Up to ``limit`` tracks whose folded title/artist contain the folded
        search terms. A term that folds to nothing (for example one made only
        of untransliterable symbols) is ignored; if no usable term remains an
        empty list is returned instead of matching every row.

    Raises:
        ImportError: If ``unidecode`` is not installed (callers are expected
            to check availability first).
    """
    import sqlite3
    from unidecode import unidecode

    def fold(text) -> str:
        # Transliterate BEFORE lowercasing: unidecode may emit capital letters
        # (e.g. for CJK input), so lowercasing must be the last step on both
        # the search side and the database side.
        if not text or not isinstance(text, str):
            return ""
        return unidecode(text).lower()

    # (SQL column, row key, folded term) for every usable criterion.
    wanted = [
        (column, key, term)
        for column, key, term in (
            ("tracks.title", "title", fold(title).strip()),
            ("artists.name", "artist_name", fold(artist).strip()),
        )
        if term
    ]
    if not wanted:
        return []

    # Prefer folding the stored values inside SQL so accented rows are not
    # discarded by the prefilter. This needs a connection that supports
    # user-defined functions (sqlite3 does); otherwise fall back to LOWER(),
    # which only helps when the accents are on the search side.
    sql_fold = "LOWER"
    connection = getattr(cursor, "connection", None)
    if hasattr(connection, "create_function"):
        try:
            connection.create_function("unidecode_lower", 1, fold)
            sql_fold = "unidecode_lower"
        except sqlite3.Error:
            # e.g. the function cannot be (re)registered while other
            # statements are active - keep the LOWER() prefilter.
            sql_fold = "LOWER"

    where_clause = " AND ".join(f"{sql_fold}({column}) LIKE ?" for column, _, _ in wanted)
    params = [f"%{term}%" for _, _, term in wanted]
    params.append(limit * 2)  # Get more results for filtering

    cursor.execute(f"""
        SELECT tracks.*, artists.name as artist_name, albums.title as album_title
        FROM tracks
        JOIN artists ON tracks.artist_id = artists.id
        JOIN albums ON tracks.album_id = albums.id
        WHERE {where_clause}
        ORDER BY tracks.title, artists.name
        LIMIT ?
    """, params)

    # Re-check in Python: LIKE treats % and _ in the term as wildcards, the
    # substring test below does not.
    filtered_rows = []
    for row in cursor.fetchall():
        if all(term in fold(row[key]) for _, key, term in wanted):
            filtered_rows.append(row)
            if len(filtered_rows) >= limit:
                break

    return self._rows_to_tracks(filtered_rows)
def _search_tracks_fuzzy_fallback(self, cursor, title: str, artist: str, limit: int) -> List[DatabaseTrack]:
    """Broadest fuzzy search - partial word matching.

    The title and artist are split into lowercase words of at least three
    characters. Rows whose title or artist name contains ANY of the first
    five words are fetched, then ranked by how many of the words they
    contain (title or artist), best first.

    Args:
        cursor: Database cursor used to run the query.
        title: Track title to split into search words; may be empty.
        artist: Artist name to split into search words; may be empty.
        limit: Maximum number of tracks to return.

    Returns:
        Up to ``limit`` tracks ordered by descending word-match count, or an
        empty list when no word of three or more characters is available.
    """
    # Title words come first, so they take priority under the 5-term cap below.
    search_terms = []
    if title:
        search_terms.extend(word for word in title.lower().split() if len(word) >= 3)
    if artist:
        search_terms.extend(word for word in artist.lower().split() if len(word) >= 3)

    if not search_terms:
        return []

    # Build a query that searches for any of the words
    like_conditions = []
    params = []
    for term in search_terms[:5]:  # Limit to 5 terms to avoid too broad search
        like_conditions.append("(LOWER(tracks.title) LIKE ? OR LOWER(artists.name) LIKE ?)")
        params.extend([f"%{term}%", f"%{term}%"])

    where_clause = " OR ".join(like_conditions)
    params.append(limit * 3)  # Get more results for scoring

    cursor.execute(f"""
        SELECT tracks.*, artists.name as artist_name, albums.title as album_title
        FROM tracks
        JOIN artists ON tracks.artist_id = artists.id
        JOIN albums ON tracks.album_id = albums.id
        WHERE {where_clause}
        ORDER BY tracks.title, artists.name
        LIMIT ?
    """, params)

    # Score and filter results
    scored_results = []
    for row in cursor.fetchall():
        # A row can match on the artist alone, so either column may be NULL;
        # treat NULL as empty text instead of crashing the whole search.
        db_title_lower = (row['title'] or "").lower()
        db_artist_lower = (row['artist_name'] or "").lower()

        # Simple scoring based on how many search terms match
        score = sum(
            1 for term in search_terms
            if term in db_title_lower or term in db_artist_lower
        )
        if score > 0:
            scored_results.append((score, row))

    # Stable sort: rows with equal scores keep the SQL (title, artist) order.
    scored_results.sort(key=lambda pair: pair[0], reverse=True)
    top_rows = [row for _, row in scored_results[:limit]]

    return self._rows_to_tracks(top_rows)
def _rows_to_tracks(self, rows) -> List[DatabaseTrack]:
    """Build a ``DatabaseTrack`` for every row, preserving row order.

    Each row must be indexable by column name and carry the track columns
    plus the joined ``artist_name`` and ``album_title`` columns.
    """

    def parse_timestamp(raw):
        # Empty/NULL timestamps stay None; anything else must be ISO formatted.
        return datetime.fromisoformat(raw) if raw else None

    def build(row):
        track = DatabaseTrack(
            id=row['id'],
            album_id=row['album_id'],
            artist_id=row['artist_id'],
            title=row['title'],
            track_number=row['track_number'],
            duration=row['duration'],
            file_path=row['file_path'],
            bitrate=row['bitrate'],
            created_at=parse_timestamp(row['created_at']),
            updated_at=parse_timestamp(row['updated_at']),
        )
        # Extra attributes kept for compatibility with Plex responses.
        track.artist_name = row['artist_name']
        track.album_title = row['album_title']
        return track

    return [build(row) for row in rows]
def search_albums(self, title: str = "", artist: str = "", limit: int = 50) -> List[DatabaseAlbum]:
"""Search albums by title and/or artist name with fuzzy matching"""
try:
@ -1007,12 +1156,43 @@ class MusicDatabase:
return unique_variations
def _normalize_for_comparison(self, text: str) -> str:
    """Normalize text for comparison with Unicode accent handling.

    Accented characters are reduced to their plain counterparts
    (é→e, ñ→n, ü→u, ...), then the result is lowercased and stripped of
    surrounding whitespace.

    Args:
        text: The text to normalize; ``None`` or an empty string is allowed.

    Returns:
        The normalized text, or ``""`` when ``text`` is empty or ``None``.
    """
    if not text:
        return ""

    try:
        from unidecode import unidecode
    except ImportError:
        # unidecode is optional. Instead of skipping accent handling (and
        # logging a warning on every single call), fall back to the standard
        # library: decompose characters and drop the combining marks.
        import unicodedata

        decomposed = unicodedata.normalize("NFKD", text)
        normalized = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    else:
        # Convert accents: é→e, ñ→n, ü→u, etc.
        normalized = unidecode(text)

    # Convert to lowercase and strip
    return normalized.lower().strip()
def _calculate_track_confidence(self, search_title: str, search_artist: str, db_track: DatabaseTrack) -> float:
"""Calculate confidence score for track match with enhanced cleaning"""
"""Calculate confidence score for track match with enhanced cleaning and Unicode normalization"""
try:
# Direct similarity
title_similarity = self._string_similarity(search_title.lower(), db_track.title.lower())
artist_similarity = self._string_similarity(search_artist.lower(), db_track.artist_name.lower())
# Unicode-aware normalization for accent matching (é→e, ñ→n, etc.)
search_title_norm = self._normalize_for_comparison(search_title)
search_artist_norm = self._normalize_for_comparison(search_artist)
db_title_norm = self._normalize_for_comparison(db_track.title)
db_artist_norm = self._normalize_for_comparison(db_track.artist_name)
# Debug logging for Unicode normalization
if search_title != search_title_norm or search_artist != search_artist_norm or \
db_track.title != db_title_norm or db_track.artist_name != db_artist_norm:
logger.debug(f"🔤 Unicode normalization:")
logger.debug(f" Search: '{search_title}''{search_title_norm}' | '{search_artist}''{search_artist_norm}'")
logger.debug(f" Database: '{db_track.title}''{db_title_norm}' | '{db_track.artist_name}''{db_artist_norm}'")
# Direct similarity with Unicode normalization
title_similarity = self._string_similarity(search_title_norm, db_title_norm)
artist_similarity = self._string_similarity(search_artist_norm, db_artist_norm)
# Also try with cleaned versions (removing parentheses, brackets, etc.)
clean_search_title = self._clean_track_title_for_comparison(search_title)

@ -83,25 +83,51 @@ def save_sync_status(data):
def clean_track_name_for_search(track_name):
    """
    Intelligently cleans a track name for searching by removing noise while preserving important version information.

    Removes: (feat. Artist), (ft. Artist), (featuring Artist), (with Artist),
             (Explicit), (Clean), (Radio Edit), (Radio Version), and square
             brackets mentioning "explicit" or "clean".
    Keeps:   (Extended Version), (Live), (Acoustic), (Remix), (Remastered),
             (Demo), (Instrumental), album/year info such as (2023) or
             (Deluxe Edition), and any other parenthetical - including titles
             that merely start with the same letters, e.g. (Without You).

    Non-string or empty input is returned unchanged. If cleaning would leave
    nothing (the name was only noise, e.g. "(Explicit)"), the original name is
    returned so the search is never empty.
    """
    if not track_name or not isinstance(track_name, str):
        return track_name

    # Patterns to REMOVE (noise that doesn't affect track identity).
    # \b / \s+ keep credit keywords from matching longer words such as
    # "Feature" or "Without".
    remove_patterns = (
        r'\s*\(explicit\)',              # (Explicit)
        r'\s*\(clean\)',                 # (Clean)
        r'\s*\(radio\s*edit\)',          # (Radio Edit)
        r'\s*\(radio\s*version\)',       # (Radio Version)
        r'\s*\(feat\b\.?\s*[^)]+\)',     # (feat. Artist) or (feat Artist)
        r'\s*\(ft\b\.?\s*[^)]+\)',       # (ft. Artist) or (ft Artist)
        r'\s*\(featuring\b\s*[^)]+\)',   # (featuring Artist)
        r'\s*\(with\s+[^)]+\)',          # (with Artist)
        r'\s*\[[^\]]*explicit[^\]]*\]',  # [Explicit] in brackets
        r'\s*\[[^\]]*clean[^\]]*\]',     # [Clean] in brackets
    )

    cleaned_name = track_name
    for pattern in remove_patterns:
        cleaned_name = re.sub(pattern, '', cleaned_name, flags=re.IGNORECASE).strip()

    # If cleaning results in an empty string, return the original track name
    if not cleaned_name.strip():
        return track_name

    # Log cleaning if significant changes were made
    if cleaned_name != track_name:
        print(f"🧹 Intelligent track cleaning: '{track_name}' -> '{cleaned_name}'")

    return cleaned_name

Loading…
Cancel
Save