diff --git a/core/__pycache__/matching_engine.cpython-310.pyc b/core/__pycache__/matching_engine.cpython-310.pyc index 25e6e581..7a24895d 100644 Binary files a/core/__pycache__/matching_engine.cpython-310.pyc and b/core/__pycache__/matching_engine.cpython-310.pyc differ diff --git a/core/__pycache__/matching_engine.cpython-312.pyc b/core/__pycache__/matching_engine.cpython-312.pyc index c18910f7..d90c8cc6 100644 Binary files a/core/__pycache__/matching_engine.cpython-312.pyc and b/core/__pycache__/matching_engine.cpython-312.pyc differ diff --git a/core/matching_engine.py b/core/matching_engine.py index ac44bddf..31fd2c98 100644 --- a/core/matching_engine.py +++ b/core/matching_engine.py @@ -134,11 +134,40 @@ class MusicMatchingEngine: return self.normalize_string(cleaned) def similarity_score(self, str1: str, str2: str) -> float: - """Calculates similarity score between two strings.""" + """Calculates similarity score between two strings with enhanced version handling.""" if not str1 or not str2: return 0.0 - return SequenceMatcher(None, str1, str2).ratio() + # Standard similarity + standard_ratio = SequenceMatcher(None, str1, str2).ratio() + + # Enhanced logic: Check if one string is a version of the other + # This handles cases like "Back & forth" vs "Back & forth original mix" + shorter, longer = (str1, str2) if len(str1) <= len(str2) else (str2, str1) + + # If the shorter string is at the start of the longer string + if longer.startswith(shorter): + # Extract the extra content + extra_content = longer[len(shorter):].strip() + + # Check if the extra content looks like version info + version_keywords = [ + 'original mix', 'radio mix', 'club mix', 'extended mix', + 'slowed', 'reverb', 'sped up', 'acoustic', 'remix', 'remaster', + 'live', 'demo', 'instrumental', 'clean', 'explicit', + 'radio edit', 'extended', 'version' + ] + + # Normalize extra content for comparison + extra_normalized = extra_content.lower().strip(' -()[]') + + # If the extra content matches version keywords, boost the similarity + for keyword in version_keywords: + if keyword in extra_normalized: + # High similarity but not perfect (to distinguish from exact matches) + return max(standard_ratio, 0.85) + + return standard_ratio def duration_similarity(self, duration1: int, duration2: int) -> float: """Calculates similarity score based on track duration (in ms).""" @@ -339,16 +368,33 @@ class MusicMatchingEngine: # PRIORITY 2: Try simplified versions, but preserve important version info # Only remove content that's likely to be album names or noise, not version info - # Pattern 1: Remove content after " - " (likely album names) - dash_pattern = r'^([^-]+?)(?:\s*-\s*.+)?$' + # Pattern 1: Intelligently handle content after " - " + # Only remove if it looks like album names, preserve version info like "slowed", "remix", etc. + dash_pattern = r'^([^-]+?)\s*-\s*(.+)$' match = re.search(dash_pattern, original_title.strip()) if match: - dash_title = match.group(1).strip() - if dash_title and len(dash_title) >= 3 and dash_title != original_title: - dash_clean = self.clean_title(dash_title) + title_part = match.group(1).strip() + dash_content = match.group(2).strip().lower() + + # Define version keywords that should be preserved + preserve_keywords = [ + 'slowed', 'reverb', 'sped up', 'speed up', 'spedup', 'slowdown', + 'remix', 'mix', 'edit', 'version', 'remaster', 'acoustic', + 'live', 'demo', 'instrumental', 'radio', 'extended', 'club', + 'original', 'clean', 'explicit', 'mashup', 'bootleg' + ] + + # Check if the dash content contains version keywords + should_preserve = any(keyword in dash_content for keyword in preserve_keywords) + + if not should_preserve and title_part and len(title_part) >= 3: + # This looks like album content, safe to remove + dash_clean = self.clean_title(title_part) if dash_clean and dash_clean not in [self.clean_title(q.split(' ', 1)[1]) for q in queries if ' ' in q]: queries.append(f"{artist} {dash_clean}".strip()) - print(f"🎯 PRIORITY 2: Dash-cleaned query: '{artist} {dash_clean}'") + print(f"🎯 PRIORITY 2: Dash-cleaned query (removed album): '{artist} {dash_clean}'") + elif should_preserve: + print(f"🎯 PRESERVED: Keeping dash content '{dash_content}' as it appears to be version info") # Pattern 2: Only remove parentheses that contain noise (feat, explicit, etc), not version info # Check if parentheses contain version-related keywords before removing @@ -360,8 +406,14 @@ class MusicMatchingEngine: after_paren = paren_match.group(3).strip() # Define what we consider "noise" vs "important version info" - noise_keywords = ['feat', 'ft', 'featuring', 'explicit', 'clean', 'radio edit', 'radio version'] - version_keywords = ['extended', 'live', 'acoustic', 'remix', 'remaster', 'demo', 'instrumental', 'version', 'edit', 'mix'] + noise_keywords = ['feat', 'ft', 'featuring', 'explicit', 'clean'] + # Expanded version keywords to match the dash preserve keywords + version_keywords = [ + 'slowed', 'reverb', 'sped up', 'speed up', 'spedup', 'slowdown', + 'remix', 'mix', 'edit', 'version', 'remaster', 'acoustic', + 'live', 'demo', 'instrumental', 'radio', 'extended', 'club', + 'original', 'mashup', 'bootleg' + ] # Only remove parentheses if they contain noise, not version info is_noise = any(keyword in paren_content for keyword in noise_keywords) @@ -374,6 +426,8 @@ class MusicMatchingEngine: if simple_clean and simple_clean not in [self.clean_title(q.split(' ', 1)[1]) for q in queries if ' ' in q]: queries.append(f"{artist} {simple_clean}".strip()) print(f"🎯 PRIORITY 2: Noise-removed query: '{artist} {simple_clean}'") + elif is_version: + print(f"🎯 PRESERVED: Keeping parentheses content '({paren_content})' as it appears to be version info") # PRIORITY 3: Original query (ONLY if no album was detected or if it's different) original_track_clean = self.clean_title(original_title) diff --git a/database/__pycache__/music_database.cpython-312.pyc b/database/__pycache__/music_database.cpython-312.pyc index 834246cf..755b6417 100644 Binary files a/database/__pycache__/music_database.cpython-312.pyc and b/database/__pycache__/music_database.cpython-312.pyc differ diff --git a/database/music_database.py b/database/music_database.py index b991bd3a..6e4b1051 100644 --- a/database/music_database.py +++ b/database/music_database.py @@ -13,6 +13,14 @@ from pathlib import Path from utils.logging_config import get_logger logger = get_logger("music_database") + +# Import matching engine for enhanced similarity logic +try: + from core.matching_engine import MusicMatchingEngine + _matching_engine = MusicMatchingEngine() +except ImportError: + logger.warning("Could not import MusicMatchingEngine, falling back to basic similarity") + _matching_engine = None # Temporarily enable debug logging for edition matching logger.setLevel(logging.DEBUG) @@ -860,7 +868,8 @@ class MusicDatabase: def _string_similarity(self, s1: str, s2: str) -> float: """ - Calculate simple string similarity using Levenshtein distance. + Calculate string similarity using enhanced matching engine logic if available, + otherwise falls back to Levenshtein distance. Returns value between 0.0 (no similarity) and 1.0 (identical) """ if s1 == s2: @@ -869,6 +878,10 @@ class MusicDatabase: if not s1 or not s2: return 0.0 + # Use enhanced similarity from matching engine if available + if _matching_engine: + return _matching_engine.similarity_score(s1, s2) + # Simple Levenshtein distance implementation len1, len2 = len(s1), len(s2) if len1 < len2: