Handle edit versions, improve cleanup & thresholds

- matching_engine.py: Add 'single edit' and 'album edit' tokens and clarify radio edit comment so edit/cut variants are recognized as different cuts rather than being silently normalized away.
- database/music_database.py: Fix SQL parameter ordering by appending server_source to params after the LIKE parameters and before LIMIT. Add a pre-step that strips "(with ...)" / "[with ...]" only when it appears inside brackets, so titles like "Stay With Me" are preserved. Stop removing edit/version tokens in the generic cleanup, and document that radio/single/album edits are treated as distinct by the similarity scorer to avoid incorrect matches.
- web_server.py: Increase DB match confidence threshold from 0.70 to 0.80 and update the runtime check accordingly.

These changes prevent edit/cut variants from being conflated with original recordings, improve title normalization for bracketed "with" featuring syntax, fix a SQL parameter-ordering bug, and raise a match confidence threshold that was too low.
pull/165/head
Broque Thomas 3 months ago
parent 35a03d6839
commit 4bff57cb70

@ -195,7 +195,9 @@ class MusicMatchingEngine:
'live', 'live at', 'live from', # Live versions (different recording)
'acoustic', 'unplugged', # Acoustic versions (different arrangement)
'slowed', 'reverb', 'sped up', 'speed up', # TikTok edits (different)
'radio edit', 'radio version', # Radio edits (different)
'radio edit', 'radio version', # Radio edits (different cut)
'single edit', # Single edits (different cut)
'album edit', # Album edits (different cut)
'instrumental', 'karaoke', # Instrumental (different)
'extended', 'extended version', # Extended (different length)
'demo', 'rough cut', # Demos (different recording)

@ -2256,7 +2256,7 @@ class MusicDatabase:
where_parts = [f"({' OR '.join(like_conditions)})"]
if server_source:
where_parts.append("tracks.server_source = ?")
params.insert(-1 if params else 0, server_source) # Insert before limit
params.append(server_source) # Append after LIKE params, before LIMIT
where_clause = " AND ".join(where_parts)
params.append(limit * 3) # Get more results for scoring
@ -3035,6 +3035,15 @@ class MusicDatabase:
"""Clean track title for comparison by normalizing brackets/dashes and removing noise"""
cleaned = title.lower().strip()
# PRE-STEP: Handle "(with Artist)" featuring BEFORE bracket removal.
# This catches "with" only when used as featuring syntax inside brackets,
# NOT when "with" is part of the song title like "Stay With Me".
# e.g. "Levitating (with DaBaby)" → "Levitating"
# "Stay (with Justin Bieber)" → "Stay"
# "Stay With Me" → unchanged (no brackets around "with")
cleaned = re.sub(r'\s*\(with\s+[^)]*\)', '', cleaned, flags=re.IGNORECASE)
cleaned = re.sub(r'\s*\[with\s+[^\]]*\]', '', cleaned, flags=re.IGNORECASE)
# STEP 1: Normalize bracket/dash styles for consistent matching
# Convert all bracket styles to spaces for better matching
cleaned = re.sub(r'\s*[\[\(]\s*', ' ', cleaned) # Convert opening brackets/parens to space
@ -3054,19 +3063,18 @@ class MusicDatabase:
r'\s*feat\..*', # Remove featuring
r'\s*featuring.*', # Remove featuring
r'\s*ft\..*', # Remove ft.
r'\s*with\s+.*', # Remove "with Artist"
# Edit versions (same recording, different edit for format)
r'\s*radio\s+edit.*', # Remove "radio edit" - same song, radio format
r'\s*single\s+edit.*', # Remove "single edit" - same song, single format
r'\s*album\s+edit.*', # Remove "album edit" - same song, album format
r'\s*edit\s*$', # Remove trailing "edit"
# Remasters (same recording, different mastering)
r'\s*\d{4}\s*remaster.*', # Remove "2015 remaster"
r'\s*remaster.*', # Remove "remaster/remastered"
r'\s*remastered.*', # Remove "remastered"
# NOTE: Edit versions (radio edit, single edit, album edit) are NOT
# removed here — they are treated as different versions by
# matching_engine.similarity_score() which applies a 0.30 penalty.
# Removing them here would override that penalty via max() and
# cause incorrect matches (e.g. radio edit matched to full version).
# Version clarifications (metadata, not different recordings)
r'\s*original\s+version.*', # Remove "original version" - clarification
r'\s*album\s+version.*', # Remove "album version" - clarification
@ -3086,6 +3094,7 @@ class MusicDatabase:
# - instrumental (different version)
# - demo (different recording)
# - extended (different length/content)
# - radio edit, single edit, album edit (different cuts)
# These are handled by matching_engine.similarity_score() which applies penalties
for pattern in patterns_to_remove:

@ -19893,12 +19893,12 @@ def _run_sync_task(playlist_id, playlist_name, tracks_json):
artist_name = str(artist)
db_track, confidence = db.check_track_exists(
original_title, artist_name,
confidence_threshold=0.7,
original_title, artist_name,
confidence_threshold=0.80,
server_source=active_server
)
if db_track and confidence >= 0.7:
if db_track and confidence >= 0.80:
print(f"✅ Database match: '{db_track.title}' (confidence: {confidence:.2f})")
# Create mock track object for playlist creation

Loading…
Cancel
Save