diff --git a/core/matching_engine.py b/core/matching_engine.py index 1f8d930..9d9b47b 100644 --- a/core/matching_engine.py +++ b/core/matching_engine.py @@ -64,6 +64,17 @@ class MusicMatchingEngine: text = unidecode(text) text = text.lower() + # Expand specific abbreviations for better matching + abbreviation_map = { + r'\bpt\.': 'part', # "pt." → "part" + r'\bvol\.': 'volume', # "vol." → "volume" + r'\bfeat\.': 'featured' # "feat." → "featured" + # Removed "ft." → "featured" (ambiguous: could be "feet" in measurements) + } + + for pattern, replacement in abbreviation_map.items(): + text = re.sub(pattern, replacement, text) + # --- IMPROVEMENT V4 --- # The user correctly pointed out that replacing '$' with 's' was incorrect # as it breaks searching for stylized names like A$AP Rocky. @@ -84,9 +95,9 @@ class MusicMatchingEngine: """Returns a 'core' version of a string with only letters and numbers for a strict comparison.""" if not text: return "" - # Transliterate, lowercase, and remove everything that isn't a letter or digit. - text = unidecode(text).lower() - return re.sub(r'[^a-z0-9]', '', text) + # Use normalize_string first to get abbreviation expansion, then strip to core + normalized = self.normalize_string(text) + return re.sub(r'[^a-z0-9]', '', normalized) def clean_title(self, title: str) -> str: """Cleans title by removing common extra info using regex for fuzzy matching."""