Refine variant release dedup

- broaden the artist-detail dedup helper to catch trailing parenthetical edition and remaster variants
- keep the legacy hyphenated suffix fallback for older metadata
- add regression coverage for language-specific Edition and remaster cases
pull/327/head
Antti Kettunen 1 month ago
parent 33b4ea6429
commit 32e2281b9c
No known key found for this signature in database
GPG Key ID: C6B2A3D250359BD7

@ -355,18 +355,36 @@ def _dedup_variant_releases(releases: List[Dict[str, Any]]) -> List[Dict[str, An
import re
from difflib import SequenceMatcher
variant_patterns = [
r'\s*[\(\[]\s*(explicit|clean|deluxe|deluxe edition|standard edition|clean version|explicit version|remastered|bonus track version)\s*[\)\]]',
variant_suffix_pattern = re.compile(
r'\s*[\(\[][^()\[\]]*\b(?:edition|editions|deluxe|remaster|remastered|'
r'explicit|clean|version|anniversary|collector|expanded|redux)\b[^()\[\]]*[\)\]]\s*$',
re.IGNORECASE,
)
legacy_suffix_pattern = re.compile(
r'\s*-\s*(explicit|clean|deluxe edition|single)\s*$',
]
re.IGNORECASE,
)
variant_keyword_pattern = re.compile(
r'\b(?:edition|editions|deluxe|remaster|remastered|explicit|clean|version|'
r'anniversary|collector|expanded|redux)\b',
re.IGNORECASE,
)
def _clean_title(title: Any) -> str:
    """Return a normalized comparison key for a release title.

    The title is coerced to ``str`` (a falsy value becomes ``''``), stripped
    and lower-cased. Trailing variant markers are then removed until the
    title stops changing, so stacked suffixes such as
    ``"Album (Deluxe Edition) [Explicit]"`` are peeled off one by one.
    Finally, runs of whitespace are collapsed to single spaces.

    Args:
        title: Raw release title; any value accepted by ``str()``.

    Returns:
        The lower-cased title with trailing variant suffixes removed.
    """
    cleaned = str(title or '').strip().lower()
    # Both patterns are anchored at the end of the string, so a single pass
    # removes at most one bracketed variant and one legacy "- suffix".
    # Repeat until a pass changes nothing to handle several stacked suffixes.
    while True:
        new_cleaned = variant_suffix_pattern.sub('', cleaned).strip()
        new_cleaned = legacy_suffix_pattern.sub('', new_cleaned).strip()
        if new_cleaned == cleaned:
            break
        cleaned = new_cleaned
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned
def _has_variant_suffix(title: Any) -> bool:
    """Report whether *title* ends with a bracketed variant marker.

    A marker is a trailing ``(...)`` or ``[...]`` group containing one of the
    keywords from ``variant_keyword_pattern``; matching is case-insensitive
    and trailing whitespace after the closing bracket is tolerated.

    Args:
        title: Raw release title; a falsy value is treated as ``''``.

    Returns:
        ``True`` when such a trailing group is present, ``False`` otherwise.
    """
    text = str(title or '').strip()
    # Wrap the shared keyword alternation in an opening bracket, filler that
    # contains no closing bracket, and a closing bracket anchored at the end.
    trailing_variant = (
        r'[\(\[][^\)\]]*'
        + variant_keyword_pattern.pattern
        + r'[^\)\]]*[\)\]]\s*$'
    )
    return re.search(trailing_variant, text, flags=re.IGNORECASE) is not None
def _is_compilation(release: Dict[str, Any]) -> bool:
title = str(_extract_lookup_value(release, 'name', 'title', default='') or '').lower()
album_type = str(_extract_lookup_value(release, 'album_type', default='') or '').lower()
@ -385,10 +403,12 @@ def _dedup_variant_releases(releases: List[Dict[str, Any]]) -> List[Dict[str, An
has_clean = 'clean' in title and not has_explicit
track_count = int(_extract_lookup_value(release, 'track_count', 'total_tracks', default=0) or 0)
release_date = str(_extract_lookup_value(release, 'release_date', default='') or '')
has_variant_suffix = _has_variant_suffix(title)
# Higher is better.
return (
1 if not _is_compilation(release) else 0,
1 if not has_variant_suffix else 0,
2 if has_explicit else (1 if not has_clean else 0),
track_count,
release_date,

@ -452,12 +452,20 @@ def test_get_artist_detail_discography_dedups_variant_releases(monkeypatch):
"total_tracks": 10,
},
{
"id": "album-deluxe",
"name": "Variant Album (Deluxe Edition)",
"id": "album-swedish",
"name": "Variant Album (Swedish Edition)",
"album_type": "album",
"image_url": "https://img.example/deluxe.jpg",
"image_url": "https://img.example/swedish.jpg",
"release_date": "2024-01-05",
"total_tracks": 14,
"total_tracks": 12,
},
{
"id": "album-remaster",
"name": "Variant Album (2023 Abbey Road Remaster)",
"album_type": "album",
"image_url": "https://img.example/remaster.jpg",
"release_date": "2024-01-05",
"total_tracks": 10,
},
],
"singles": [],
@ -469,6 +477,6 @@ def test_get_artist_detail_discography_dedups_variant_releases(monkeypatch):
result = metadata_service.get_artist_detail_discography("artist-1", "Artist One", MetadataLookupOptions())
assert result["success"] is True
assert [album["id"] for album in result["albums"]] == ["album-deluxe"]
assert result["albums"][0]["title"] == "Variant Album (Deluxe Edition)"
assert result["albums"][0]["track_count"] == 14
assert [album["id"] for album in result["albums"]] == ["album-standard"]
assert result["albums"][0]["title"] == "Variant Album"
assert result["albums"][0]["track_count"] == 10

Loading…
Cancel
Save