consistent matching between sync and artists.

9 months ago · aafc0ca49f
parent 35fdef76ce
commit aafc0ca49f
8 changed files with 301 additions and 72 deletions
--- a/core/pycache/matching_engine.cpython-310.pyc
+++ b/core/pycache/matching_engine.cpython-310.pyc
--- a/core/pycache/matching_engine.cpython-312.pyc
+++ b/core/pycache/matching_engine.cpython-312.pyc
--- a/core/matching_engine.py
+++ b/core/matching_engine.py
@@ -336,21 +336,44 @@ class MusicMatchingEngine:
                queries.append(f"{artist} {cleaned_track}".strip())
                print(f"🎯 PRIORITY 1: Album-cleaned query: '{artist} {cleaned_track}'")
        
-        # PRIORITY 2: Try with just the first part before any dash/parentheses
-        simple_patterns = [
-            r'^([^-\(]+)',  # Everything before first dash or parenthesis
-            r'^([^-]+)',    # Everything before first dash only
-        ]
+        # PRIORITY 2: Try simplified versions, but preserve important version info
+        # Only remove content that's likely to be album names or noise, not version info
        
-        for pattern in simple_patterns:
-            match = re.search(pattern, original_title.strip())
-            if match:
-                simple_title = match.group(1).strip()
-                if simple_title and len(simple_title) >= 3:  # Avoid too-short titles
+        # Pattern 1: Remove content after " - " (likely album names)
+        dash_pattern = r'^([^-]+?)(?:\s*-\s*.+)?$'
+        match = re.search(dash_pattern, original_title.strip())
+        if match:
+            dash_title = match.group(1).strip()
+            if dash_title and len(dash_title) >= 3 and dash_title != original_title:
+                dash_clean = self.clean_title(dash_title) 
+                if dash_clean and dash_clean not in [self.clean_title(q.split(' ', 1)[1]) for q in queries if ' ' in q]:
+                    queries.append(f"{artist} {dash_clean}".strip())
+                    print(f"🎯 PRIORITY 2: Dash-cleaned query: '{artist} {dash_clean}'")
+        
+        # Pattern 2: Only remove parentheses that contain noise (feat, explicit, etc), not version info
+        # Check if parentheses contain version-related keywords before removing
+        paren_pattern = r'^(.+?)\s*\(([^)]+)\)(.*)$'
+        paren_match = re.search(paren_pattern, original_title)
+        if paren_match:
+            before_paren = paren_match.group(1).strip()
+            paren_content = paren_match.group(2).strip().lower()
+            after_paren = paren_match.group(3).strip()
+            
+            # Define what we consider "noise" vs "important version info"
+            noise_keywords = ['feat', 'ft', 'featuring', 'explicit', 'clean', 'radio edit', 'radio version']
+            version_keywords = ['extended', 'live', 'acoustic', 'remix', 'remaster', 'demo', 'instrumental', 'version', 'edit', 'mix']
+            
+            # Only remove parentheses if they contain noise, not version info
+            is_noise = any(keyword in paren_content for keyword in noise_keywords)
+            is_version = any(keyword in paren_content for keyword in version_keywords)
+            
+            if is_noise and not is_version and before_paren:
+                simple_title = (before_paren + ' ' + after_paren).strip()
+                if simple_title and len(simple_title) >= 3:
                    simple_clean = self.clean_title(simple_title)
                    if simple_clean and simple_clean not in [self.clean_title(q.split(' ', 1)[1]) for q in queries if ' ' in q]:
                        queries.append(f"{artist} {simple_clean}".strip())
-                        print(f"🎯 PRIORITY 2: Simple-cleaned query: '{artist} {simple_clean}'")
+                        print(f"🎯 PRIORITY 2: Noise-removed query: '{artist} {simple_clean}'")
        
        # PRIORITY 3: Original query (ONLY if no album was detected or if it's different)
        original_track_clean = self.clean_title(original_title)
--- a/database/pycache/music_database.cpython-310.pyc
+++ b/database/pycache/music_database.cpython-310.pyc
--- a/database/pycache/music_database.cpython-312.pyc
+++ b/database/pycache/music_database.cpython-312.pyc
--- a/database/music_database.py
+++ b/database/music_database.py
@@ -458,67 +458,216 @@ class MusicDatabase:
            return []
    
    def search_tracks(self, title: str = "", artist: str = "", limit: int = 50) -> List[DatabaseTrack]:
-        """Search tracks by title and/or artist name with fuzzy matching"""
+        """Search tracks by title and/or artist name with Unicode-aware fuzzy matching.
+
+        Strategies are tried in order and the first non-empty result is returned:
+        a basic SQL LIKE search, a unidecode-normalized search (only when
+        ``unidecode`` can be imported), and finally a word-by-word fuzzy search.
+
+        Args:
+            title: Track title to search for; may be empty.
+            artist: Artist name to search for; may be empty.
+            limit: Result limit handed to each search strategy.
+
+        Returns:
+            The matching tracks, or an empty list when both ``title`` and
+            ``artist`` are empty, when nothing matches, or when any exception
+            is raised (the exception is logged, not propagated).
+        """
        try:
+            if not title and not artist:
+                return []
+            
            conn = self._get_connection()
            cursor = conn.cursor()
            
-            # Build dynamic query based on provided parameters
-            where_conditions = []
-            params = []
-            
-            if title:
-                where_conditions.append("tracks.title LIKE ?")
-                params.append(f"%{title}%")
+            # STRATEGY 1: Try basic SQL LIKE search first (fastest)
+            basic_results = self._search_tracks_basic(cursor, title, artist, limit)
            
-            if artist:
-                where_conditions.append("artists.name LIKE ?")
-                params.append(f"%{artist}%")
+            if basic_results:
+                logger.debug(f"🔍 Basic search found {len(basic_results)} results")
+                return basic_results
            
-            if not where_conditions:
-                # If no search criteria, return empty list
-                return []
-            
-            where_clause = " AND ".join(where_conditions)
-            params.append(limit)
-            
-            cursor.execute(f"""
-                SELECT tracks.*, artists.name as artist_name, albums.title as album_title
-                FROM tracks
-                JOIN artists ON tracks.artist_id = artists.id
-                JOIN albums ON tracks.album_id = albums.id
-                WHERE {where_clause}
-                ORDER BY tracks.title, artists.name
-                LIMIT ?
-            """, params)
+            # STRATEGY 2: If basic search fails and we have Unicode support, try normalized search
+            # The import below is only an availability probe; the imported name is not used in this method.
+            try:
+                from unidecode import unidecode
+                unicode_support = True
+            except ImportError:
+                unicode_support = False
            
-            rows = cursor.fetchall()
+            if unicode_support:
+                normalized_results = self._search_tracks_unicode_fallback(cursor, title, artist, limit)
+                if normalized_results:
+                    logger.debug(f"🔍 Unicode fallback search found {len(normalized_results)} results")
+                    return normalized_results
            
-            tracks = []
-            for row in rows:
-                track = DatabaseTrack(
-                    id=row['id'],
-                    album_id=row['album_id'],
-                    artist_id=row['artist_id'],
-                    title=row['title'],
-                    track_number=row['track_number'],
-                    duration=row['duration'],
-                    file_path=row['file_path'],
-                    bitrate=row['bitrate'],
-                    created_at=datetime.fromisoformat(row['created_at']) if row['created_at'] else None,
-                    updated_at=datetime.fromisoformat(row['updated_at']) if row['updated_at'] else None
-                )
-                # Add artist and album info for compatibility with Plex responses
-                track.artist_name = row['artist_name']
-                track.album_title = row['album_title']
-                tracks.append(track)
+            # STRATEGY 3: Last resort - broader fuzzy search with Python filtering
+            fuzzy_results = self._search_tracks_fuzzy_fallback(cursor, title, artist, limit)
+            if fuzzy_results:
+                logger.debug(f"🔍 Fuzzy fallback search found {len(fuzzy_results)} results")
            
-            return tracks
+            # May be an empty list when no strategy matched anything
+            return fuzzy_results
            
        except Exception as e:
            logger.error(f"Error searching tracks with title='{title}', artist='{artist}': {e}")
            return []
    
+    def _search_tracks_basic(self, cursor, title: str, artist: str, limit: int) -> List[DatabaseTrack]:
+        """Run a plain SQL ``LIKE`` substring search on track title and/or artist name.
+
+        Args:
+            cursor: Database cursor used to execute the query.
+            title: Substring to look for in ``tracks.title``; skipped when empty.
+            artist: Substring to look for in ``artists.name``; skipped when empty.
+            limit: Maximum number of rows requested from the database.
+
+        Returns:
+            Tracks built from the matching rows (ordered by title, then artist
+            name), or an empty list when neither ``title`` nor ``artist`` is given.
+        """
+        # Pair every supplied search value with the column it filters on.
+        filters = [
+            (column, value)
+            for column, value in (("tracks.title", title), ("artists.name", artist))
+            if value
+        ]
+        if not filters:
+            return []
+
+        where_clause = " AND ".join(f"{column} LIKE ?" for column, _ in filters)
+        bindings = [f"%{value}%" for _, value in filters]
+        bindings.append(limit)
+
+        cursor.execute(f"""
+            SELECT tracks.*, artists.name as artist_name, albums.title as album_title
+            FROM tracks
+            JOIN artists ON tracks.artist_id = artists.id
+            JOIN albums ON tracks.album_id = albums.id
+            WHERE {where_clause}
+            ORDER BY tracks.title, artists.name
+            LIMIT ?
+        """, bindings)
+
+        matched_rows = cursor.fetchall()
+        return self._rows_to_tracks(matched_rows)
+    
+    def _search_tracks_unicode_fallback(self, cursor, title: str, artist: str, limit: int) -> List[DatabaseTrack]:
+        """Accent-insensitive fallback search that compares unidecode-folded text.
+
+        The search terms and the stored values are both transliterated with
+        ``unidecode`` and lowercased before a substring comparison, so "Beyonce"
+        finds "Beyoncé" and "Beyoncé" finds "Beyonce".
+
+        Args:
+            cursor: Database cursor used to execute the query.
+            title: Track title to search for; skipped when empty.
+            artist: Artist name to search for; skipped when empty.
+            limit: Maximum number of tracks to return.
+
+        Returns:
+            Up to ``limit`` tracks whose folded title/artist contain the folded
+            search terms, or an empty list when no search term is given.
+
+        Raises:
+            ImportError: If ``unidecode`` is not installed.
+        """
+        from unidecode import unidecode
+
+        def fold(value):
+            """Transliterate to ASCII and lowercase; NULL/empty values become ''."""
+            return unidecode(str(value)).lower() if value else ""
+
+        title_norm = fold(title)
+        artist_norm = fold(artist)
+
+        # SQLite's built-in LOWER() and LIKE only fold ASCII letters, so with a plain
+        # LOWER(column) pre-filter a stored "Beyoncé" never matches "%beyonce%" and is
+        # dropped before the Python check below can see it. Register the same fold as
+        # a SQL function so both sides of the LIKE are accent-free.
+        # NOTE(review): assumes a sqlite3 cursor (exposes .connection) -- confirm.
+        try:
+            cursor.connection.create_function("unidecode_fold", 1, fold)
+            sql_fold = "unidecode_fold"
+        except AttributeError:
+            # Cursor without a sqlite3-style connection: keep the ASCII-only pre-filter.
+            sql_fold = "LOWER"
+
+        where_conditions = []
+        params = []
+
+        if title:
+            where_conditions.append(f"{sql_fold}(tracks.title) LIKE ?")
+            params.append(f"%{title_norm}%")
+
+        if artist:
+            where_conditions.append(f"{sql_fold}(artists.name) LIKE ?")
+            params.append(f"%{artist_norm}%")
+
+        if not where_conditions:
+            return []
+
+        where_clause = " AND ".join(where_conditions)
+        params.append(limit * 2)  # Over-fetch: the Python check below may discard rows
+
+        cursor.execute(f"""
+            SELECT tracks.*, artists.name as artist_name, albums.title as album_title
+            FROM tracks
+            JOIN artists ON tracks.artist_id = artists.id
+            JOIN albums ON tracks.album_id = albums.id
+            WHERE {where_clause}
+            ORDER BY tracks.title, artists.name
+            LIMIT ?
+        """, params)
+
+        # Re-check in Python: LIKE treats '%' and '_' in the search text as wildcards,
+        # and the LOWER() path above does not fold accents on the database side.
+        filtered_rows = []
+        for row in cursor.fetchall():
+            title_matches = not title or title_norm in fold(row['title'])
+            artist_matches = not artist or artist_norm in fold(row['artist_name'])
+
+            if title_matches and artist_matches:
+                filtered_rows.append(row)
+                if len(filtered_rows) >= limit:
+                    break
+
+        return self._rows_to_tracks(filtered_rows)
+    
+    def _search_tracks_fuzzy_fallback(self, cursor, title: str, artist: str, limit: int) -> List[DatabaseTrack]:
+        """Broadest search: match individual words, then rank rows by how many words hit.
+
+        The title and artist are lowercased and split on whitespace; words shorter
+        than three characters are ignored. Rows containing any of the first five
+        words (in title or artist name) are fetched, scored by the number of search
+        words they contain, and the highest-scoring rows are returned.
+
+        Args:
+            cursor: Database cursor used to execute the query.
+            title: Track title to search for; may be empty.
+            artist: Artist name to search for; may be empty.
+            limit: Maximum number of tracks to return.
+
+        Returns:
+            Up to ``limit`` tracks ordered by descending score, or an empty list
+            when no usable search word remains.
+        """
+        # Title words first, then artist words; very short words are too noisy to search on.
+        search_terms = [
+            word
+            for text in (title, artist)
+            if text
+            for word in text.lower().split()
+            if len(word) >= 3
+        ]
+
+        if not search_terms:
+            return []
+
+        # Only the first 5 terms go into the SQL filter to avoid too broad a search
+        sql_terms = search_terms[:5]
+        where_clause = " OR ".join(
+            "(LOWER(tracks.title) LIKE ? OR LOWER(artists.name) LIKE ?)" for _ in sql_terms
+        )
+        params = []
+        for term in sql_terms:
+            params.extend([f"%{term}%", f"%{term}%"])
+        params.append(limit * 3)  # Get more results for scoring
+
+        cursor.execute(f"""
+            SELECT tracks.*, artists.name as artist_name, albums.title as album_title
+            FROM tracks
+            JOIN artists ON tracks.artist_id = artists.id
+            JOIN albums ON tracks.album_id = albums.id
+            WHERE {where_clause}
+            ORDER BY tracks.title, artists.name
+            LIMIT ?
+        """, params)
+
+        scored_results = []
+        for row in cursor.fetchall():
+            # NULL columns come back as None; treat them as empty text instead of
+            # crashing on .lower() (the unicode fallback guards the same way).
+            db_title_lower = (row['title'] or "").lower()
+            db_artist_lower = (row['artist_name'] or "").lower()
+
+            # Simple score: how many search terms appear in the title or the artist name
+            score = sum(
+                1 for term in search_terms
+                if term in db_title_lower or term in db_artist_lower
+            )
+            if score > 0:
+                scored_results.append((score, row))
+
+        # Stable sort: rows with equal scores keep the database's title/artist order
+        scored_results.sort(key=lambda item: item[0], reverse=True)
+        top_rows = [row for _, row in scored_results[:limit]]
+
+        return self._rows_to_tracks(top_rows)
+    
+    def _rows_to_tracks(self, rows) -> List[DatabaseTrack]:
+        """Build one ``DatabaseTrack`` per database row, preserving row order.
+
+        Args:
+            rows: Iterable of rows supporting access by column name. Each row must
+                provide the track columns plus ``artist_name`` and ``album_title``.
+
+        Returns:
+            A list of ``DatabaseTrack`` objects, each with ``artist_name`` and
+            ``album_title`` attached as extra attributes.
+        """
+        def parse_timestamp(raw):
+            # Timestamps are read as ISO-format strings; empty/NULL values become None.
+            return datetime.fromisoformat(raw) if raw else None
+
+        def build_track(row):
+            track = DatabaseTrack(
+                id=row['id'],
+                album_id=row['album_id'],
+                artist_id=row['artist_id'],
+                title=row['title'],
+                track_number=row['track_number'],
+                duration=row['duration'],
+                file_path=row['file_path'],
+                bitrate=row['bitrate'],
+                created_at=parse_timestamp(row['created_at']),
+                updated_at=parse_timestamp(row['updated_at'])
+            )
+            # Artist and album info is attached for compatibility with Plex responses
+            track.artist_name = row['artist_name']
+            track.album_title = row['album_title']
+            return track
+
+        return [build_track(row) for row in rows]
+    
    def search_albums(self, title: str = "", artist: str = "", limit: int = 50) -> List[DatabaseAlbum]:
        """Search albums by title and/or artist name with fuzzy matching"""
        try:
@@ -1007,12 +1156,43 @@ class MusicDatabase:
        
        return unique_variations
    
+    def _normalize_for_comparison(self, text: str) -> str:
+        """Normalize text for comparison: fold accents, lowercase, and strip whitespace.
+
+        Uses ``unidecode`` when it is installed. Otherwise falls back to the
+        standard library, which removes accents from decomposable characters
+        (é→e, ñ→n, ü→u) but does not transliterate other scripts.
+
+        Args:
+            text: Text to normalize; ``None`` or an empty string is accepted.
+
+        Returns:
+            The normalized text, or ``""`` for falsy input.
+        """
+        if not text:
+            return ""
+
+        try:
+            from unidecode import unidecode
+        except ImportError:
+            import unicodedata
+
+            logger.warning("unidecode not available, accent matching may be limited")
+            # Fallback: split each accented character into base letter + combining
+            # mark (NFKD), then drop the combining marks so accents still compare equal.
+            decomposed = unicodedata.normalize('NFKD', text)
+            normalized = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
+        else:
+            # Convert accents: é→e, ñ→n, ü→u, etc.
+            normalized = unidecode(text)
+
+        # Convert to lowercase and strip
+        return normalized.lower().strip()
+    
    def _calculate_track_confidence(self, search_title: str, search_artist: str, db_track: DatabaseTrack) -> float:
-        """Calculate confidence score for track match with enhanced cleaning"""
+        """Calculate confidence score for track match with enhanced cleaning and Unicode normalization"""
        try:
-            # Direct similarity
-            title_similarity = self._string_similarity(search_title.lower(), db_track.title.lower())
-            artist_similarity = self._string_similarity(search_artist.lower(), db_track.artist_name.lower())
+            # Unicode-aware normalization for accent matching (é→e, ñ→n, etc.)
+            search_title_norm = self._normalize_for_comparison(search_title)
+            search_artist_norm = self._normalize_for_comparison(search_artist)
+            db_title_norm = self._normalize_for_comparison(db_track.title)
+            db_artist_norm = self._normalize_for_comparison(db_track.artist_name)
+            
+            # Debug logging for Unicode normalization
+            if search_title != search_title_norm or search_artist != search_artist_norm or \
+               db_track.title != db_title_norm or db_track.artist_name != db_artist_norm:
+                logger.debug(f"🔤 Unicode normalization:")
+                logger.debug(f"   Search: '{search_title}' → '{search_title_norm}' | '{search_artist}' → '{search_artist_norm}'")
+                logger.debug(f"   Database: '{db_track.title}' → '{db_title_norm}' | '{db_track.artist_name}' → '{db_artist_norm}'")
+            
+            # Direct similarity with Unicode normalization
+            title_similarity = self._string_similarity(search_title_norm, db_title_norm)
+            artist_similarity = self._string_similarity(search_artist_norm, db_artist_norm)
            
            # Also try with cleaned versions (removing parentheses, brackets, etc.)
            clean_search_title = self._clean_track_title_for_comparison(search_title)
--- a/ui/pages/pycache/sync.cpython-312.pyc
+++ b/ui/pages/pycache/sync.cpython-312.pyc
--- a/ui/pages/sync.py
+++ b/ui/pages/sync.py
@@ -83,25 +83,51 @@ def save_sync_status(data):

 def clean_track_name_for_search(track_name):
     """
-    Cleans a track name for searching by removing text in parentheses and brackets.
-    If cleaning the name results in an empty string, the original name is returned.
+    Cleans a track name for searching by removing noise while preserving important version information.
+    Removes: (feat. Artist), (with Artist), (Explicit), (Clean), (Radio Edit), [Explicit], [Clean], etc.
+    Keeps: (Extended Version), (Live), (Acoustic), (Remix), etc., and subtitles that merely begin
+    with a noise word, such as (Without You) or (Feathers).
+    Falsy or non-string input is returned unchanged. If cleaning leaves an empty string,
+    the original name is returned.
     """
     if not track_name or not isinstance(track_name, str):
         return track_name

-    # Remove content in parentheses, e.g., (feat. Artist), (Remix)
-    cleaned_name = re.sub(r'\s*\([^)]*\)', '', track_name).strip()
-    # Remove content in square brackets, e.g., [Live], [Explicit]
-    cleaned_name = re.sub(r'\s*\[[^\]]*\]', '', cleaned_name).strip()
+    cleaned_name = track_name
     
-    # If cleaning results in an empty string (e.g., track name was only "(Intro)"),
-    # return the original track name to avoid an empty search.
-    if not cleaned_name:
+    # Define patterns to REMOVE (noise that doesn't affect track identity).
+    # Artist tags must be followed by a dot or whitespace and bracket keywords must be
+    # whole words, so "(Without You)", "(Feathers)" and "[Cleaner Mix]" are left alone.
+    remove_patterns = [
+        r'\s*\(explicit\)',                        # (Explicit)
+        r'\s*\(clean\)',                           # (Clean)
+        r'\s*\(radio\s*edit\)',                    # (Radio Edit)
+        r'\s*\(radio\s*version\)',                 # (Radio Version)
+        r'\s*\((?:feat|ft)(?:\.\s*|\s+)[^)]+\)',   # (feat. Artist), (feat Artist), (ft. Artist), (ft Artist)
+        r'\s*\(featuring\s+[^)]+\)',               # (featuring Artist)
+        r'\s*\(with\s+[^)]+\)',                    # (with Artist)
+        r'\s*\[[^\]]*\bexplicit\b[^\]]*\]',        # [Explicit] in brackets
+        r'\s*\[[^\]]*\bclean\b[^\]]*\]',           # [Clean] in brackets
+    ]
+    
+    # Apply removal patterns
+    for pattern in remove_patterns:
+        cleaned_name = re.sub(pattern, '', cleaned_name, flags=re.IGNORECASE).strip()
+    
+    # PRESERVE important version information (do NOT remove these)
+    # These patterns are intentionally NOT in the remove list:
+    # - (Extended Version), (Extended), (Long Version)
+    # - (Live), (Live Version), (Concert)
+    # - (Acoustic), (Acoustic Version)  
+    # - (Remix), (Club Mix), (Dance Mix)
+    # - (Remastered), (Remaster)
+    # - (Demo), (Studio Version)
+    # - (Instrumental)
+    # - Album/year info like (2023), (Deluxe Edition)
+    
+    # If cleaning results in an empty string, return the original track name
+    if not cleaned_name.strip():
         return track_name
         
     # Log cleaning if significant changes were made
     if cleaned_name != track_name:
-        print(f"🧹 Cleaned track name for search: '{track_name}' -> '{cleaned_name}'")
+        print(f"🧹 Intelligent track cleaning: '{track_name}' -> '{cleaned_name}'")
     
     return cleaned_name