diff --git a/core/__pycache__/matching_engine.cpython-310.pyc b/core/__pycache__/matching_engine.cpython-310.pyc new file mode 100644 index 00000000..25e6e581 Binary files /dev/null and b/core/__pycache__/matching_engine.cpython-310.pyc differ diff --git a/core/__pycache__/matching_engine.cpython-312.pyc b/core/__pycache__/matching_engine.cpython-312.pyc index 8605fe69..8cd5e1a9 100644 Binary files a/core/__pycache__/matching_engine.cpython-312.pyc and b/core/__pycache__/matching_engine.cpython-312.pyc differ diff --git a/core/matching_engine.py b/core/matching_engine.py index 0b1cbc42..ac44bddf 100644 --- a/core/matching_engine.py +++ b/core/matching_engine.py @@ -336,21 +336,44 @@ class MusicMatchingEngine: queries.append(f"{artist} {cleaned_track}".strip()) print(f"🎯 PRIORITY 1: Album-cleaned query: '{artist} {cleaned_track}'") - # PRIORITY 2: Try with just the first part before any dash/parentheses - simple_patterns = [ - r'^([^-\(]+)', # Everything before first dash or parenthesis - r'^([^-]+)', # Everything before first dash only - ] + # PRIORITY 2: Try simplified versions, but preserve important version info + # Only remove content that's likely to be album names or noise, not version info - for pattern in simple_patterns: - match = re.search(pattern, original_title.strip()) - if match: - simple_title = match.group(1).strip() - if simple_title and len(simple_title) >= 3: # Avoid too-short titles + # Pattern 1: Remove content after " - " (likely album names) + dash_pattern = r'^([^-]+?)(?:\s*-\s*.+)?$' + match = re.search(dash_pattern, original_title.strip()) + if match: + dash_title = match.group(1).strip() + if dash_title and len(dash_title) >= 3 and dash_title != original_title: + dash_clean = self.clean_title(dash_title) + if dash_clean and dash_clean not in [self.clean_title(q.split(' ', 1)[1]) for q in queries if ' ' in q]: + queries.append(f"{artist} {dash_clean}".strip()) + print(f"🎯 PRIORITY 2: Dash-cleaned query: '{artist} {dash_clean}'") + + # Pattern 2: Only remove parentheses that contain noise (feat, explicit, etc), not version info + # Check if parentheses contain version-related keywords before removing + paren_pattern = r'^(.+?)\s*\(([^)]+)\)(.*)$' + paren_match = re.search(paren_pattern, original_title) + if paren_match: + before_paren = paren_match.group(1).strip() + paren_content = paren_match.group(2).strip().lower() + after_paren = paren_match.group(3).strip() + + # Define what we consider "noise" vs "important version info" + noise_keywords = ['feat', 'ft', 'featuring', 'explicit', 'clean', 'radio edit', 'radio version'] + version_keywords = ['extended', 'live', 'acoustic', 'remix', 'remaster', 'demo', 'instrumental', 'version', 'edit', 'mix'] + + # Only remove parentheses if they contain noise, not version info + is_noise = any(keyword in paren_content for keyword in noise_keywords) + is_version = any(keyword in paren_content for keyword in version_keywords) + + if is_noise and not is_version and before_paren: + simple_title = (before_paren + ' ' + after_paren).strip() + if simple_title and len(simple_title) >= 3: simple_clean = self.clean_title(simple_title) if simple_clean and simple_clean not in [self.clean_title(q.split(' ', 1)[1]) for q in queries if ' ' in q]: queries.append(f"{artist} {simple_clean}".strip()) - print(f"🎯 PRIORITY 2: Simple-cleaned query: '{artist} {simple_clean}'") + print(f"🎯 PRIORITY 2: Noise-removed query: '{artist} {simple_clean}'") # PRIORITY 3: Original query (ONLY if no album was detected or if it's different) original_track_clean = self.clean_title(original_title) diff --git a/database/__pycache__/music_database.cpython-310.pyc b/database/__pycache__/music_database.cpython-310.pyc index f4fdcf3a..4d19f993 100644 Binary files a/database/__pycache__/music_database.cpython-310.pyc and b/database/__pycache__/music_database.cpython-310.pyc differ diff --git a/database/__pycache__/music_database.cpython-312.pyc b/database/__pycache__/music_database.cpython-312.pyc index dfb5dc16..33dbdac6 100644 Binary files a/database/__pycache__/music_database.cpython-312.pyc and b/database/__pycache__/music_database.cpython-312.pyc differ diff --git a/database/music_database.py b/database/music_database.py index fe07cd0a..bec4eb70 100644 --- a/database/music_database.py +++ b/database/music_database.py @@ -458,67 +458,216 @@ class MusicDatabase: return [] def search_tracks(self, title: str = "", artist: str = "", limit: int = 50) -> List[DatabaseTrack]: - """Search tracks by title and/or artist name with fuzzy matching""" + """Search tracks by title and/or artist name with Unicode-aware fuzzy matching""" try: + if not title and not artist: + return [] + conn = self._get_connection() cursor = conn.cursor() - # Build dynamic query based on provided parameters - where_conditions = [] - params = [] - - if title: - where_conditions.append("tracks.title LIKE ?") - params.append(f"%{title}%") + # STRATEGY 1: Try basic SQL LIKE search first (fastest) + basic_results = self._search_tracks_basic(cursor, title, artist, limit) - if artist: - where_conditions.append("artists.name LIKE ?") - params.append(f"%{artist}%") + if basic_results: + logger.debug(f"πŸ” Basic search found {len(basic_results)} results") + return basic_results - if not where_conditions: - # If no search criteria, return empty list - return [] - - where_clause = " AND ".join(where_conditions) - params.append(limit) - - cursor.execute(f""" - SELECT tracks.*, artists.name as artist_name, albums.title as album_title - FROM tracks - JOIN artists ON tracks.artist_id = artists.id - JOIN albums ON tracks.album_id = albums.id - WHERE {where_clause} - ORDER BY tracks.title, artists.name - LIMIT ? - """, params) + # STRATEGY 2: If basic search fails and we have Unicode support, try normalized search + try: + from unidecode import unidecode + unicode_support = True + except ImportError: + unicode_support = False - rows = cursor.fetchall() + if unicode_support: + normalized_results = self._search_tracks_unicode_fallback(cursor, title, artist, limit) + if normalized_results: + logger.debug(f"πŸ” Unicode fallback search found {len(normalized_results)} results") + return normalized_results - tracks = [] - for row in rows: - track = DatabaseTrack( - id=row['id'], - album_id=row['album_id'], - artist_id=row['artist_id'], - title=row['title'], - track_number=row['track_number'], - duration=row['duration'], - file_path=row['file_path'], - bitrate=row['bitrate'], - created_at=datetime.fromisoformat(row['created_at']) if row['created_at'] else None, - updated_at=datetime.fromisoformat(row['updated_at']) if row['updated_at'] else None - ) - # Add artist and album info for compatibility with Plex responses - track.artist_name = row['artist_name'] - track.album_title = row['album_title'] - tracks.append(track) + # STRATEGY 3: Last resort - broader fuzzy search with Python filtering + fuzzy_results = self._search_tracks_fuzzy_fallback(cursor, title, artist, limit) + if fuzzy_results: + logger.debug(f"πŸ” Fuzzy fallback search found {len(fuzzy_results)} results") - return tracks + return fuzzy_results except Exception as e: logger.error(f"Error searching tracks with title='{title}', artist='{artist}': {e}") return [] + def _search_tracks_basic(self, cursor, title: str, artist: str, limit: int) -> List[DatabaseTrack]: + """Basic SQL LIKE search - fastest method""" + where_conditions = [] + params = [] + + if title: + where_conditions.append("tracks.title LIKE ?") + params.append(f"%{title}%") + + if artist: + where_conditions.append("artists.name LIKE ?") + params.append(f"%{artist}%") + + if not where_conditions: + return [] + + where_clause = " AND ".join(where_conditions) + params.append(limit) + + cursor.execute(f""" + SELECT tracks.*, artists.name as artist_name, albums.title as album_title + FROM tracks + JOIN artists ON tracks.artist_id = artists.id + JOIN albums ON tracks.album_id = albums.id + WHERE {where_clause} + ORDER BY tracks.title, artists.name + LIMIT ? + """, params) + + return self._rows_to_tracks(cursor.fetchall()) + + def _search_tracks_unicode_fallback(self, cursor, title: str, artist: str, limit: int) -> List[DatabaseTrack]: + """Unicode-aware fallback search - tries normalized versions""" + from unidecode import unidecode + + # Normalize search terms + title_norm = unidecode(title).lower() if title else "" + artist_norm = unidecode(artist).lower() if artist else "" + + # Try searching with normalized versions + where_conditions = [] + params = [] + + if title: + where_conditions.append("LOWER(tracks.title) LIKE ?") + params.append(f"%{title_norm}%") + + if artist: + where_conditions.append("LOWER(artists.name) LIKE ?") + params.append(f"%{artist_norm}%") + + if not where_conditions: + return [] + + where_clause = " AND ".join(where_conditions) + params.append(limit * 2) # Get more results for filtering + + cursor.execute(f""" + SELECT tracks.*, artists.name as artist_name, albums.title as album_title + FROM tracks + JOIN artists ON tracks.artist_id = artists.id + JOIN albums ON tracks.album_id = albums.id + WHERE {where_clause} + ORDER BY tracks.title, artists.name + LIMIT ? + """, params) + + rows = cursor.fetchall() + + # Filter results with proper Unicode normalization + filtered_tracks = [] + for row in rows: + db_title_norm = unidecode(row['title'].lower()) if row['title'] else "" + db_artist_norm = unidecode(row['artist_name'].lower()) if row['artist_name'] else "" + + title_matches = not title or title_norm in db_title_norm + artist_matches = not artist or artist_norm in db_artist_norm + + if title_matches and artist_matches: + filtered_tracks.append(row) + if len(filtered_tracks) >= limit: + break + + return self._rows_to_tracks(filtered_tracks) + + def _search_tracks_fuzzy_fallback(self, cursor, title: str, artist: str, limit: int) -> List[DatabaseTrack]: + """Broadest fuzzy search - partial word matching""" + # Get broader results by searching for individual words + search_terms = [] + if title: + # Split title into words and search for each + title_words = [w.strip() for w in title.lower().split() if len(w.strip()) >= 3] + search_terms.extend(title_words) + + if artist: + # Split artist into words and search for each + artist_words = [w.strip() for w in artist.lower().split() if len(w.strip()) >= 3] + search_terms.extend(artist_words) + + if not search_terms: + return [] + + # Build a query that searches for any of the words + like_conditions = [] + params = [] + + for term in search_terms[:5]: # Limit to 5 terms to avoid too broad search + like_conditions.append("(LOWER(tracks.title) LIKE ? OR LOWER(artists.name) LIKE ?)") + params.extend([f"%{term}%", f"%{term}%"]) + + if not like_conditions: + return [] + + where_clause = " OR ".join(like_conditions) + params.append(limit * 3) # Get more results for scoring + + cursor.execute(f""" + SELECT tracks.*, artists.name as artist_name, albums.title as album_title + FROM tracks + JOIN artists ON tracks.artist_id = artists.id + JOIN albums ON tracks.album_id = albums.id + WHERE {where_clause} + ORDER BY tracks.title, artists.name + LIMIT ? + """, params) + + rows = cursor.fetchall() + + # Score and filter results + scored_results = [] + for row in rows: + # Simple scoring based on how many search terms match + score = 0 + db_title_lower = row['title'].lower() + db_artist_lower = row['artist_name'].lower() + + for term in search_terms: + if term in db_title_lower or term in db_artist_lower: + score += 1 + + if score > 0: + scored_results.append((score, row)) + + # Sort by score and take top results + scored_results.sort(key=lambda x: x[0], reverse=True) + top_rows = [row for score, row in scored_results[:limit]] + + return self._rows_to_tracks(top_rows) + + def _rows_to_tracks(self, rows) -> List[DatabaseTrack]: + """Convert database rows to DatabaseTrack objects""" + tracks = [] + for row in rows: + track = DatabaseTrack( + id=row['id'], + album_id=row['album_id'], + artist_id=row['artist_id'], + title=row['title'], + track_number=row['track_number'], + duration=row['duration'], + file_path=row['file_path'], + bitrate=row['bitrate'], + created_at=datetime.fromisoformat(row['created_at']) if row['created_at'] else None, + updated_at=datetime.fromisoformat(row['updated_at']) if row['updated_at'] else None + ) + # Add artist and album info for compatibility with Plex responses + track.artist_name = row['artist_name'] + track.album_title = row['album_title'] + tracks.append(track) + return tracks + def search_albums(self, title: str = "", artist: str = "", limit: int = 50) -> List[DatabaseAlbum]: """Search albums by title and/or artist name with fuzzy matching""" try: @@ -1007,12 +1156,43 @@ class MusicDatabase: return unique_variations + def _normalize_for_comparison(self, text: str) -> str: + """Normalize text for comparison with Unicode accent handling""" + if not text: + return "" + + # Try to use unidecode for accent normalization, fallback to basic if not available + try: + from unidecode import unidecode + # Convert accents: Γ©β†’e, Γ±β†’n, ΓΌβ†’u, etc. + normalized = unidecode(text) + except ImportError: + # Fallback: basic normalization without accent handling + normalized = text + logger.warning("unidecode not available, accent matching may be limited") + + # Convert to lowercase and strip + return normalized.lower().strip() + def _calculate_track_confidence(self, search_title: str, search_artist: str, db_track: DatabaseTrack) -> float: - """Calculate confidence score for track match with enhanced cleaning""" + """Calculate confidence score for track match with enhanced cleaning and Unicode normalization""" try: - # Direct similarity - title_similarity = self._string_similarity(search_title.lower(), db_track.title.lower()) - artist_similarity = self._string_similarity(search_artist.lower(), db_track.artist_name.lower()) + # Unicode-aware normalization for accent matching (Γ©β†’e, Γ±β†’n, etc.) + search_title_norm = self._normalize_for_comparison(search_title) + search_artist_norm = self._normalize_for_comparison(search_artist) + db_title_norm = self._normalize_for_comparison(db_track.title) + db_artist_norm = self._normalize_for_comparison(db_track.artist_name) + + # Debug logging for Unicode normalization + if search_title != search_title_norm or search_artist != search_artist_norm or \ + db_track.title != db_title_norm or db_track.artist_name != db_artist_norm: + logger.debug(f"πŸ”€ Unicode normalization:") + logger.debug(f" Search: '{search_title}' β†’ '{search_title_norm}' | '{search_artist}' β†’ '{search_artist_norm}'") + logger.debug(f" Database: '{db_track.title}' β†’ '{db_title_norm}' | '{db_track.artist_name}' β†’ '{db_artist_norm}'") + + # Direct similarity with Unicode normalization + title_similarity = self._string_similarity(search_title_norm, db_title_norm) + artist_similarity = self._string_similarity(search_artist_norm, db_artist_norm) # Also try with cleaned versions (removing parentheses, brackets, etc.) clean_search_title = self._clean_track_title_for_comparison(search_title) diff --git a/ui/pages/__pycache__/sync.cpython-312.pyc b/ui/pages/__pycache__/sync.cpython-312.pyc index bedc1871..c50db33c 100644 Binary files a/ui/pages/__pycache__/sync.cpython-312.pyc and b/ui/pages/__pycache__/sync.cpython-312.pyc differ diff --git a/ui/pages/sync.py b/ui/pages/sync.py index fa7b6d6d..409f4c87 100644 --- a/ui/pages/sync.py +++ b/ui/pages/sync.py @@ -83,25 +83,51 @@ def save_sync_status(data): def clean_track_name_for_search(track_name): """ - Cleans a track name for searching by removing text in parentheses and brackets. - If cleaning the name results in an empty string, the original name is returned. + Intelligently cleans a track name for searching by removing noise while preserving important version information. + Removes: (feat. Artist), (Explicit), (Clean), etc. + Keeps: (Extended Version), (Live), (Acoustic), (Remix), etc. """ if not track_name or not isinstance(track_name, str): return track_name - # Remove content in parentheses, e.g., (feat. Artist), (Remix) - cleaned_name = re.sub(r'\s*\([^)]*\)', '', track_name).strip() - # Remove content in square brackets, e.g., [Live], [Explicit] - cleaned_name = re.sub(r'\s*\[[^\]]*\]', '', cleaned_name).strip() + cleaned_name = track_name - # If cleaning results in an empty string (e.g., track name was only "(Intro)"), - # return the original track name to avoid an empty search. - if not cleaned_name: + # Define patterns to REMOVE (noise that doesn't affect track identity) + remove_patterns = [ + r'\s*\(explicit\)', # (Explicit) + r'\s*\(clean\)', # (Clean) + r'\s*\(radio\s*edit\)', # (Radio Edit) + r'\s*\(radio\s*version\)', # (Radio Version) + r'\s*\(feat\.?\s*[^)]+\)', # (feat. Artist) or (ft. Artist) + r'\s*\(ft\.?\s*[^)]+\)', # (ft Artist) + r'\s*\(featuring\s*[^)]+\)', # (featuring Artist) + r'\s*\(with\s*[^)]+\)', # (with Artist) + r'\s*\[[^\]]*explicit[^\]]*\]', # [Explicit] in brackets + r'\s*\[[^\]]*clean[^\]]*\]', # [Clean] in brackets + ] + + # Apply removal patterns + for pattern in remove_patterns: + cleaned_name = re.sub(pattern, '', cleaned_name, flags=re.IGNORECASE).strip() + + # PRESERVE important version information (do NOT remove these) + # These patterns are intentionally NOT in the remove list: + # - (Extended Version), (Extended), (Long Version) + # - (Live), (Live Version), (Concert) + # - (Acoustic), (Acoustic Version) + # - (Remix), (Club Mix), (Dance Mix) + # - (Remastered), (Remaster) + # - (Demo), (Studio Version) + # - (Instrumental) + # - Album/year info like (2023), (Deluxe Edition) + + # If cleaning results in an empty string, return the original track name + if not cleaned_name.strip(): return track_name # Log cleaning if significant changes were made if cleaned_name != track_name: - print(f"🧹 Cleaned track name for search: '{track_name}' -> '{cleaned_name}'") + print(f"🧹 Intelligent track cleaning: '{track_name}' -> '{cleaned_name}'") return cleaned_name