mirror of https://github.com/Nezreka/SoulSync.git
Merge pull request #596 from Nezreka/fix/mtv-unplugged-version-detection
Fix/mtv unplugged version detectionpull/597/head
commit
273fbd37bf
@ -0,0 +1,195 @@
|
||||
"""Strip redundant album-context suffixes from track titles.
|
||||
|
||||
Issue #589 — MTV Unplugged albums (and similar live-concert / session
|
||||
releases) have source-side track titles like ``"Shy Away (MTV Unplugged
|
||||
Live)"`` while the local DB stored title is just ``"Shy Away"``. The
|
||||
album-scoped library check at ``core/downloads/master.py`` compares
|
||||
the two with raw string similarity, the length asymmetry tanks the
|
||||
score, and tracks the user already owns get marked missing.
|
||||
|
||||
This helper normalizes a track title by stripping the parenthetical
|
||||
or dash suffix when its tokens are fully subsumed by the album
|
||||
context: at least one version marker (live / unplugged / acoustic /
|
||||
session / etc) is present in BOTH the suffix AND the album title, and
|
||||
every other suffix token is either a known marker, a year, a
|
||||
connecting noise word, or a word that appears in the album title.
|
||||
|
||||
Pure function. No I/O. Tests at the function boundary.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Iterable, Tuple
|
||||
|
||||
# Version-marker keywords. When the album title contains any of these,
|
||||
# stripping is enabled. Singular forms — plurals get matched separately
|
||||
# via stem expansion below.
|
||||
_VERSION_MARKERS = (
|
||||
'live',
|
||||
'unplugged',
|
||||
'acoustic',
|
||||
'session',
|
||||
'concert',
|
||||
'tour',
|
||||
)
|
||||
|
||||
# Markers that are implied "live" context — when the album mentions any
|
||||
# of these, a bare ``live`` token in the suffix counts as album context
|
||||
# even if the album title doesn't literally say "live". MTV Unplugged
|
||||
# albums are live recordings; same for "in concert" / "tour" releases.
|
||||
_IMPLIES_LIVE = ('unplugged', 'concert', 'tour', 'session')
|
||||
|
||||
# Connecting / filler words that don't carry meaning by themselves.
|
||||
_NOISE_TOKENS = frozenset({
|
||||
'version', 'edition', 'recording', 'recordings', 'remaster',
|
||||
'remastered', 'mix',
|
||||
'the', 'a', 'an', 'from', 'at', 'in', 'on', 'for', 'of',
|
||||
'and', 'or', 'with', 'by',
|
||||
'vol', 'pt', 'part', 'no',
|
||||
})
|
||||
|
||||
_SUFFIX_PATTERNS: Tuple[re.Pattern, ...] = (
|
||||
re.compile(r'\s*\(([^()]+)\)\s*$'),
|
||||
re.compile(r'\s*\[([^\[\]]+)\]\s*$'),
|
||||
re.compile(r'\s+-\s+(.+?)\s*$'),
|
||||
)
|
||||
|
||||
_YEAR_RE = re.compile(r'^(?:19|20)\d{2}$')
|
||||
_TOKEN_RE = re.compile(r'\w+')
|
||||
|
||||
|
||||
def _normalize(text: str) -> str:
|
||||
return (text or '').lower().strip()
|
||||
|
||||
|
||||
def _tokenize(text: str) -> set:
|
||||
return set(_TOKEN_RE.findall(_normalize(text)))
|
||||
|
||||
|
||||
def _expand_marker_set(markers: Iterable[str]) -> set:
|
||||
"""Expand each marker into its singular + plural forms."""
|
||||
out = set()
|
||||
for marker in markers:
|
||||
out.add(marker)
|
||||
if not marker.endswith('s'):
|
||||
out.add(marker + 's')
|
||||
return out
|
||||
|
||||
|
||||
_EXPANDED_MARKERS = _expand_marker_set(_VERSION_MARKERS)
|
||||
|
||||
|
||||
def album_context_markers(album_title: str) -> Tuple[str, ...]:
|
||||
"""Return the version markers present in the album title (singular form)."""
|
||||
if not album_title:
|
||||
return ()
|
||||
album_tokens = _tokenize(album_title)
|
||||
found = []
|
||||
for marker in _VERSION_MARKERS:
|
||||
if marker in album_tokens or (marker + 's') in album_tokens:
|
||||
found.append(marker)
|
||||
return tuple(found)
|
||||
|
||||
|
||||
def _suffix_is_album_redundant(
|
||||
inner: str,
|
||||
album_tokens: set,
|
||||
album_markers: Tuple[str, ...],
|
||||
) -> bool:
|
||||
"""Decide whether a suffix's tokens are all subsumed by album context.
|
||||
|
||||
Three requirements:
|
||||
1. The suffix contains at least one version-marker token. Stops
|
||||
a generic "feat. X" suffix from being stripped because the
|
||||
album happened to be live.
|
||||
2. The shared marker matches one the album implies — either
|
||||
literally in the album title, OR via the implied-live set
|
||||
(unplugged/concert/tour albums imply "live").
|
||||
3. Every other suffix token is either a marker, a year, a
|
||||
tolerated noise word, or a word that appears in the album
|
||||
title. If any token falls outside, the suffix carries
|
||||
info beyond album context (featured artist, different
|
||||
version, etc) — keep it on.
|
||||
"""
|
||||
if not inner:
|
||||
return False
|
||||
|
||||
suffix_tokens = _tokenize(inner)
|
||||
if not suffix_tokens:
|
||||
return False
|
||||
|
||||
# Markers the album effectively implies (literal + implied-live).
|
||||
implied_markers = set(album_markers)
|
||||
if any(m in implied_markers for m in _IMPLIES_LIVE):
|
||||
implied_markers.add('live')
|
||||
|
||||
suffix_markers = suffix_tokens & _EXPANDED_MARKERS
|
||||
if not suffix_markers:
|
||||
return False
|
||||
|
||||
# At least one marker must overlap with album-implied set. Plural
|
||||
# tolerance — strip trailing 's' for the comparison.
|
||||
def _stem(tok: str) -> str:
|
||||
return tok[:-1] if tok.endswith('s') and len(tok) > 1 else tok
|
||||
|
||||
if not any(_stem(t) in implied_markers for t in suffix_markers):
|
||||
return False
|
||||
|
||||
# Every remaining suffix token must be subsumed.
|
||||
for tok in suffix_tokens:
|
||||
if tok in _EXPANDED_MARKERS:
|
||||
continue
|
||||
if _YEAR_RE.match(tok):
|
||||
continue
|
||||
if tok in _NOISE_TOKENS:
|
||||
continue
|
||||
if tok in album_tokens:
|
||||
continue
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def strip_redundant_album_suffix(track_title: str, album_title: str) -> str:
|
||||
"""Strip a trailing parenthetical/bracket/dash suffix from `track_title`
|
||||
when the suffix duplicates context already implied by `album_title`.
|
||||
|
||||
Examples:
|
||||
- ("Shy Away (MTV Unplugged Live)", "MTV Unplugged") → "Shy Away"
|
||||
- ("Only If For A Night (MTV Unplugged, 2012 / Live)",
|
||||
"Ceremonials (Live At MTV Unplugged)") → "Only If For A Night"
|
||||
- ("In My Feelings (Instrumental)", "Scorpion")
|
||||
→ unchanged (instrumental NOT implied by studio album)
|
||||
- ("Hello (Live - feat. Other)", "Live At Wembley")
|
||||
→ unchanged (suffix carries featured-artist beyond album context)
|
||||
- ("Shy Away", "MTV Unplugged") → unchanged (no suffix)
|
||||
|
||||
Pure function — never raises, returns the input unchanged on any
|
||||
edge / unexpected input.
|
||||
"""
|
||||
if not track_title:
|
||||
return track_title or ''
|
||||
album_markers = album_context_markers(album_title)
|
||||
if not album_markers:
|
||||
return track_title
|
||||
|
||||
album_tokens = _tokenize(album_title)
|
||||
stripped = track_title
|
||||
|
||||
# Stacked suffixes ("Track (MTV Unplugged) [Live]") — peel one at a
|
||||
# time. Bound the loop defensively.
|
||||
for _ in range(4):
|
||||
peeled = None
|
||||
for pattern in _SUFFIX_PATTERNS:
|
||||
m = pattern.search(stripped)
|
||||
if not m:
|
||||
continue
|
||||
inner = m.group(1)
|
||||
if _suffix_is_album_redundant(inner, album_tokens, album_markers):
|
||||
peeled = stripped[: m.start()].rstrip()
|
||||
break
|
||||
if peeled is None:
|
||||
return stripped
|
||||
stripped = peeled
|
||||
return stripped
|
||||
@ -0,0 +1,168 @@
|
||||
"""Tests for the album-context-aware track-title stripping helper.
|
||||
|
||||
Issue #589 — MTV Unplugged track titles like ``"Shy Away (MTV Unplugged
|
||||
Live)"`` got false-rejected by the album-scoped library check because
|
||||
the local DB stored title is just ``"Shy Away"``. The pure helper here
|
||||
strips the redundant suffix when (and only when) the album title
|
||||
implies the same context.
|
||||
"""
|
||||
|
||||
from core.matching.album_context_title import (
|
||||
album_context_markers,
|
||||
strip_redundant_album_suffix,
|
||||
)
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# album_context_markers
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_mtv_unplugged_album_carries_unplugged_marker():
|
||||
# 'mtv' isn't a version marker on its own — the unplugged token is
|
||||
# the load-bearing one. Implied-live logic adds 'live' coverage too.
|
||||
markers = album_context_markers('MTV Unplugged')
|
||||
assert 'unplugged' in markers
|
||||
|
||||
|
||||
def test_live_at_album_carries_live_marker():
|
||||
markers = album_context_markers('Live At Wembley')
|
||||
assert 'live' in markers
|
||||
|
||||
|
||||
def test_studio_album_has_no_markers():
|
||||
assert album_context_markers('Scorpion') == ()
|
||||
assert album_context_markers('DAMN.') == ()
|
||||
assert album_context_markers('') == ()
|
||||
assert album_context_markers(None) == ()
|
||||
|
||||
|
||||
def test_acoustic_session_album_marker():
|
||||
assert 'acoustic' in album_context_markers('Acoustic Sessions Vol. 2')
|
||||
assert 'session' in album_context_markers('Acoustic Sessions Vol. 2')
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# strip_redundant_album_suffix — the headline cases from #589
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_strips_mtv_unplugged_live_suffix_when_album_is_mtv_unplugged():
|
||||
assert strip_redundant_album_suffix('Shy Away (MTV Unplugged Live)', 'MTV Unplugged') == 'Shy Away'
|
||||
|
||||
|
||||
def test_strips_complex_mtv_unplugged_suffix_with_year():
|
||||
# Reporter case 2: "Only If For A Night (MTV Unplugged, 2012 / Live)"
|
||||
assert strip_redundant_album_suffix(
|
||||
'Only If For A Night (MTV Unplugged, 2012 / Live)',
|
||||
'Ceremonials (Live At MTV Unplugged)',
|
||||
) == 'Only If For A Night'
|
||||
|
||||
|
||||
def test_strips_dash_style_live_suffix_when_album_is_live():
|
||||
assert strip_redundant_album_suffix(
|
||||
'Bohemian Rhapsody - Live At Wembley',
|
||||
'Live At Wembley Stadium',
|
||||
) == 'Bohemian Rhapsody'
|
||||
|
||||
|
||||
def test_strips_brackets_live_suffix():
|
||||
assert strip_redundant_album_suffix(
|
||||
'Hello [Live]',
|
||||
'Live At The Royal Albert Hall',
|
||||
) == 'Hello'
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Negative cases — must NOT strip when it would mask a genuine variant
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_does_not_strip_instrumental_when_album_is_studio():
|
||||
# Critical anti-regression — keeping AcoustID's vocal/instrumental
|
||||
# gate working downstream. Don't drop the marker just because the
|
||||
# title is on a studio album.
|
||||
assert strip_redundant_album_suffix(
|
||||
'In My Feelings (Instrumental)',
|
||||
'Scorpion',
|
||||
) == 'In My Feelings (Instrumental)'
|
||||
|
||||
|
||||
def test_does_not_strip_remix_when_album_is_studio():
|
||||
assert strip_redundant_album_suffix(
|
||||
'Hello (Acoustic Remix)',
|
||||
'Scorpion',
|
||||
) == 'Hello (Acoustic Remix)'
|
||||
|
||||
|
||||
def test_does_not_strip_live_when_album_does_not_imply_live():
|
||||
# User's "Live At Wembley" might be a single-track release on an
|
||||
# otherwise-studio album. Don't strip.
|
||||
assert strip_redundant_album_suffix(
|
||||
'Hello (Live At Wembley)',
|
||||
'Greatest Hits',
|
||||
) == 'Hello (Live At Wembley)'
|
||||
|
||||
|
||||
def test_does_not_strip_when_suffix_carries_extra_context():
|
||||
# Suffix has both the album marker AND a featured-artist credit;
|
||||
# the credit isn't album context, so keep the suffix.
|
||||
assert strip_redundant_album_suffix(
|
||||
'Track Name (Live - feat. Other Artist)',
|
||||
'Live At Wembley',
|
||||
) == 'Track Name (Live - feat. Other Artist)'
|
||||
|
||||
|
||||
def test_no_suffix_returns_unchanged():
|
||||
assert strip_redundant_album_suffix('Shy Away', 'MTV Unplugged') == 'Shy Away'
|
||||
|
||||
|
||||
def test_empty_or_none_inputs_handled():
|
||||
assert strip_redundant_album_suffix('', 'MTV Unplugged') == ''
|
||||
assert strip_redundant_album_suffix(None, 'MTV Unplugged') == ''
|
||||
assert strip_redundant_album_suffix('Shy Away', '') == 'Shy Away'
|
||||
assert strip_redundant_album_suffix('Shy Away', None) == 'Shy Away'
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Stacked-suffix cases
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_strips_stacked_redundant_suffixes():
|
||||
# Some sources double up: parens + brackets, both album-context
|
||||
assert strip_redundant_album_suffix(
|
||||
'Track Name (Live) [Unplugged]',
|
||||
'MTV Unplugged Live',
|
||||
) == 'Track Name'
|
||||
|
||||
|
||||
def test_stops_stripping_when_remaining_suffix_is_genuine():
|
||||
# Outer is redundant (live → album-context), inner is not (remix)
|
||||
assert strip_redundant_album_suffix(
|
||||
'Track Name (Remix) (Live)',
|
||||
'Live At Wembley',
|
||||
) == 'Track Name (Remix)'
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# Year + connector tolerance
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_year_in_suffix_does_not_block_stripping():
|
||||
assert strip_redundant_album_suffix(
|
||||
'Track Name (Live, 2012)',
|
||||
'Live At Wembley',
|
||||
) == 'Track Name'
|
||||
|
||||
|
||||
def test_version_word_in_suffix_does_not_block_stripping():
|
||||
# "Live Version" is still album-context (just the word "version"
|
||||
# in there). Strip.
|
||||
assert strip_redundant_album_suffix(
|
||||
'Track Name (Live Version)',
|
||||
'Live At Wembley',
|
||||
) == 'Track Name'
|
||||
|
||||
|
||||
def test_session_marker_preserved_for_acoustic_session_album():
|
||||
assert strip_redundant_album_suffix(
|
||||
'Hello (Acoustic Session)',
|
||||
'Acoustic Sessions Vol. 2',
|
||||
) == 'Hello'
|
||||
@ -0,0 +1,123 @@
|
||||
"""Tests for Tidal qualifier filtering across primary + fallback search.
|
||||
|
||||
Issue #589 — when a download query carries a version qualifier ("live",
|
||||
"unplugged", "acoustic", etc), the qualifier filter must apply to BOTH
|
||||
the primary search AND fallback variants. Previously it only fired on
|
||||
fallbacks, so a primary search for "Shy Away (MTV Unplugged Live)" that
|
||||
happened to surface the studio cut first would accept the wrong file
|
||||
and only get caught by AcoustID downstream.
|
||||
|
||||
Also covers the album-context extension: for concert / unplugged
|
||||
releases the live signal lives in the album title, not the track
|
||||
title. The filter inspects both ``track.name`` AND ``track.album.name``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from core.tidal_download_client import TidalDownloadClient
|
||||
|
||||
|
||||
def _make_track(name: str, album_name: str = ''):
|
||||
"""Build a minimal duck-typed track object matching what the Tidal
|
||||
SDK returns: a `name` attribute and an `album` attribute with its
|
||||
own `name`."""
|
||||
track = MagicMock()
|
||||
track.name = name
|
||||
track.album = MagicMock()
|
||||
track.album.name = album_name
|
||||
return track
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# _track_name_contains_qualifiers — legacy track-only behavior preserved
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_legacy_helper_passes_when_track_name_has_qualifier():
|
||||
assert TidalDownloadClient._track_name_contains_qualifiers(
|
||||
'Shy Away (MTV Unplugged Live)', ['live']
|
||||
) is True
|
||||
|
||||
|
||||
def test_legacy_helper_fails_when_track_name_lacks_qualifier():
|
||||
assert TidalDownloadClient._track_name_contains_qualifiers(
|
||||
'Shy Away', ['live']
|
||||
) is False
|
||||
|
||||
|
||||
def test_legacy_helper_passes_when_no_qualifiers_required():
|
||||
assert TidalDownloadClient._track_name_contains_qualifiers(
|
||||
'Anything', []
|
||||
) is True
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
# _track_matches_qualifiers — new helper inspects track + album
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_qualifier_in_track_name_alone_passes():
|
||||
track = _make_track('Shy Away (Live)', 'DAMN.')
|
||||
assert TidalDownloadClient._track_matches_qualifiers(track, ['live']) is True
|
||||
|
||||
|
||||
def test_qualifier_in_album_name_alone_passes():
|
||||
# MTV Unplugged scenario — track titled "Shy Away" but album
|
||||
# carries the live context. Pre-fix this returned False because
|
||||
# only track.name was checked.
|
||||
track = _make_track('Shy Away', 'MTV Unplugged Live')
|
||||
assert TidalDownloadClient._track_matches_qualifiers(track, ['live']) is True
|
||||
|
||||
|
||||
def test_qualifier_missing_from_both_fails():
|
||||
# User asked for live, Tidal returned the studio cut on a studio
|
||||
# album. Must reject so the search keeps looking.
|
||||
track = _make_track('Shy Away', 'Trench')
|
||||
assert TidalDownloadClient._track_matches_qualifiers(track, ['live']) is False
|
||||
|
||||
|
||||
def test_unplugged_qualifier_in_album_name():
|
||||
track = _make_track('Only If For A Night', 'MTV Unplugged')
|
||||
assert TidalDownloadClient._track_matches_qualifiers(track, ['unplugged']) is True
|
||||
|
||||
|
||||
def test_multiple_qualifiers_all_required():
|
||||
# Both "live" AND "acoustic" must be present somewhere
|
||||
track = _make_track('Hello', 'Live Acoustic Sessions')
|
||||
assert TidalDownloadClient._track_matches_qualifiers(track, ['live', 'acoustic']) is True
|
||||
track2 = _make_track('Hello', 'Live Sessions') # missing acoustic
|
||||
assert TidalDownloadClient._track_matches_qualifiers(track2, ['live', 'acoustic']) is False
|
||||
|
||||
|
||||
def test_no_qualifiers_required_always_passes():
|
||||
track = _make_track('Anything', 'Anything')
|
||||
assert TidalDownloadClient._track_matches_qualifiers(track, []) is True
|
||||
|
||||
|
||||
def test_track_with_no_album_attribute():
|
||||
# Defensive — duck-typed tracks may not all have album. Use a
|
||||
# plain object instead of MagicMock so missing .album is real.
|
||||
class BareTrack:
|
||||
name = 'Live Track'
|
||||
album = None
|
||||
assert TidalDownloadClient._track_matches_qualifiers(BareTrack(), ['live']) is True
|
||||
assert TidalDownloadClient._track_matches_qualifiers(BareTrack(), ['unplugged']) is False
|
||||
|
||||
|
||||
def test_track_with_empty_name_and_album():
|
||||
class BareTrack:
|
||||
name = ''
|
||||
album = None
|
||||
assert TidalDownloadClient._track_matches_qualifiers(BareTrack(), ['live']) is False
|
||||
|
||||
|
||||
def test_word_boundary_avoids_false_match_on_substring():
|
||||
# "session" should NOT match "obsession"
|
||||
track = _make_track('Obsession', 'Pop Hits')
|
||||
assert TidalDownloadClient._track_matches_qualifiers(track, ['session']) is False
|
||||
|
||||
|
||||
def test_extract_qualifiers_picks_up_live_unplugged():
|
||||
quals = TidalDownloadClient._extract_qualifiers('Shy Away (MTV Unplugged Live)')
|
||||
assert 'live' in quals
|
||||
assert 'unplugged' in quals
|
||||
Loading…
Reference in new issue