feat: embed MusicBrainz, Spotify/iTunes IDs, ISRC, and merged genres into audio file tags

Enrich downloaded audio files with external identifiers and improved genre metadata in a single post-processing write. During metadata enhancement, the app now looks up the MusicBrainz recording and artist MBIDs, retrieves the ISRC and MusicBrainz genres from a follow-up detail lookup, merges them with Spotify's artist-level genres (deduplicated, capped at 5), and embeds everything alongside the Spotify/iTunes track, artist, and album IDs. All MusicBrainz API calls are serialized through the existing global rate limiter, making concurrent download workers safe without needing to pause the background worker. Includes a database migration adding Spotify/iTunes ID columns to the library tables.
pull/130/head
Broque Thomas 2 weeks ago
parent d9efcbdf99
commit d08a2e91a2

@ -211,6 +211,9 @@ class ConfigManager:
"enabled": True,
"embed_album_art": True
},
"musicbrainz": {
"embed_tags": True
},
"playlist_sync": {
"create_backup": True
},

@ -1,4 +1,4 @@
from typing import Optional, Dict, Any, List
from typing import Optional, Dict, Any
import json
from datetime import datetime, timedelta
from difflib import SequenceMatcher
@ -87,26 +87,26 @@ class MusicBrainzService:
if conn:
conn.close()
def _save_to_cache(self, entity_type: str, entity_name: str, artist_name: Optional[str],
def _save_to_cache(self, entity_type: str, entity_name: str, artist_name: Optional[str],
musicbrainz_id: Optional[str], metadata: Optional[Dict], confidence: int):
"""Save MusicBrainz result to cache"""
conn = None
try:
conn = self.db._get_connection()
cursor = conn.cursor()
metadata_json = json.dumps(metadata) if metadata else None
cursor.execute("""
INSERT OR REPLACE INTO musicbrainz_cache
INSERT OR REPLACE INTO musicbrainz_cache
(entity_type, entity_name, artist_name, musicbrainz_id, metadata_json, match_confidence, last_updated)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", (entity_type, entity_name, artist_name, musicbrainz_id, metadata_json, confidence, datetime.now()))
conn.commit()
logger.debug(f"Cached {entity_type} '{entity_name}' (MBID: {musicbrainz_id}, confidence: {confidence})")
except Exception as e:
logger.error(f"Error saving to cache: {e}")
if conn:
@ -410,7 +410,7 @@ class MusicBrainzService:
try:
conn = self.db._get_connection()
cursor = conn.cursor()
cursor.execute("""
UPDATE tracks
SET musicbrainz_recording_id = ?,
@ -418,11 +418,11 @@ class MusicBrainzService:
musicbrainz_match_status = ?
WHERE id = ?
""", (mbid, datetime.now(), status, track_id))
conn.commit()
logger.debug(f"Updated track {track_id} with MBID: {mbid}, status: {status}")
except Exception as e:
logger.error(f"Error updating track {track_id}: {e}")
if conn:
@ -430,3 +430,4 @@ class MusicBrainzService:
finally:
if conn:
conn.close()

@ -10,20 +10,20 @@ logger = get_logger("musicbrainz_worker")
class MusicBrainzWorker:
"""Background worker for enriching library with MusicBrainz IDs"""
def __init__(self, database: MusicDatabase, app_name: str = "SoulSync", app_version: str = "1.0", contact_email: str = ""):
self.db = database
self.mb_service = MusicBrainzService(database, app_name, app_version, contact_email)
# Worker state
self.running = False
self.paused = False
self.should_stop = False
self.thread = None
# Current item being processed (for UI tooltip)
self.current_item = None
# Statistics
self.stats = {
'matched': 0,
@ -31,67 +31,67 @@ class MusicBrainzWorker:
'pending': 0,
'errors': 0
}
# Retry configuration
self.retry_days = 30 # Retry 'not_found' items after 30 days
logger.info("MusicBrainz background worker initialized")
def start(self):
    """Launch the worker loop on a daemon thread.

    When the worker is already flagged as running, only a warning is
    logged and no second thread is created.
    """
    if not self.running:
        self.running = True
        self.should_stop = False
        # Daemon thread so it does not keep the interpreter alive on exit
        worker_thread = threading.Thread(target=self._run, daemon=True)
        self.thread = worker_thread
        worker_thread.start()
        logger.info("MusicBrainz background worker started")
        return

    logger.warning("Worker already running")
def stop(self):
    """Ask the worker loop to exit and wait up to 5 seconds for its thread.

    A no-op when the worker is not flagged as running.
    """
    if self.running:
        logger.info("Stopping MusicBrainz worker...")

        # Raise the stop flag first so the loop exits at its next check
        self.should_stop = True
        self.running = False

        worker_thread = self.thread
        if worker_thread:
            worker_thread.join(timeout=5)

        logger.info("Music Brainz worker stopped")
def pause(self):
    """Set the paused flag; logs a warning instead if the worker is not running."""
    if self.running:
        self.paused = True
        logger.info("MusicBrainz worker paused")
    else:
        logger.warning("Worker not running, cannot pause")
def resume(self):
    """Clear the paused flag; logs a warning instead if the worker is not running."""
    if self.running:
        self.paused = False
        logger.info("MusicBrainz worker resumed")
    else:
        logger.warning("Worker not running, start it first")
def get_stats(self) -> Dict[str, Any]:
"""Get current statistics"""
# Update pending count
self.stats['pending'] = self._count_pending_items()
# Get progress breakdown by entity type
progress = self._get_progress_breakdown()
# Check if thread is actually alive (in case it crashed)
is_actually_running = self.running and (self.thread is not None and self.thread.is_alive())
return {
'enabled': True,
'running': is_actually_running and not self.paused,
@ -100,53 +100,53 @@ class MusicBrainzWorker:
'stats': self.stats.copy(),
'progress': progress
}
def _run(self):
    """Thread body: handle pending items one at a time until asked to stop."""
    logger.info("MusicBrainz worker thread started")

    while True:
        if self.should_stop:
            break

        try:
            if self.paused:
                # Idle while paused, re-checking the flags once per second
                time.sleep(1)
                continue

            # Reset the UI-facing marker before fetching the next item
            self.current_item = None

            pending = self._get_next_item()
            if pending:
                # Publish the item so the UI can show what is being processed
                self.current_item = pending
                self._process_item(pending)

                # current_item stays set during this pause so the UI can see
                # what was just processed. Rate limit: 1 request per second
                time.sleep(1)
            else:
                # Queue is empty - sleep for a bit before polling again
                logger.debug("No pending items, sleeping...")
                time.sleep(10)
        except Exception as exc:
            logger.error(f"Error in worker loop: {exc}")
            time.sleep(5)  # Back off on errors

    logger.info("MusicBrainz worker thread finished")
def _get_next_item(self) -> Optional[Dict[str, Any]]:
"""Get next item to process from priority queue"""
conn = None
try:
conn = self.db._get_connection()
cursor = conn.cursor()
# Priority 1: Unattempted artists
cursor.execute("""
SELECT id, name
@ -158,7 +158,7 @@ class MusicBrainzWorker:
row = cursor.fetchone()
if row:
return {'type': 'artist', 'id': row[0], 'name': row[1]}
# Priority 2: Unattempted albums
cursor.execute("""
SELECT a.id, a.title, ar.name AS artist_name
@ -171,7 +171,7 @@ class MusicBrainzWorker:
row = cursor.fetchone()
if row:
return {'type': 'album', 'id': row[0], 'name': row[1], 'artist': row[2]}
# Priority 3: Unattempted tracks
cursor.execute("""
SELECT t.id, t.title, ar.name AS artist_name
@ -184,7 +184,7 @@ class MusicBrainzWorker:
row = cursor.fetchone()
if row:
return {'type': 'track', 'id': row[0], 'name': row[1], 'artist': row[2]}
# Priority 4: Retry 'not_found' artists after retry_days
cutoff_date = datetime.now() - timedelta(days=self.retry_days)
cursor.execute("""
@ -199,7 +199,7 @@ class MusicBrainzWorker:
if row:
logger.info(f"Retrying artist '{row[1]}' (last attempted: {cutoff_date})")
return {'type': 'artist', 'id': row[0], 'name': row[1]}
# Priority 5: Retry 'not_found' albums
cursor.execute("""
SELECT a.id, a.title, ar.name AS artist_name
@ -213,7 +213,7 @@ class MusicBrainzWorker:
row = cursor.fetchone()
if row:
return {'type': 'album', 'id': row[0], 'name': row[1], 'artist': row[2]}
# Priority 6: Retry 'not_found' tracks
cursor.execute("""
SELECT t.id, t.title, ar.name AS artist_name
@ -227,25 +227,25 @@ class MusicBrainzWorker:
row = cursor.fetchone()
if row:
return {'type': 'track', 'id': row[0], 'name': row[1], 'artist': row[2]}
return None
except Exception as e:
logger.error(f"Error getting next item: {e}")
return None
finally:
if conn:
conn.close()
def _process_item(self, item: Dict[str, Any]):
"""Process a single item (artist, album, or track)"""
try:
item_type = item['type']
item_id = item['id']
item_name = item['name']
logger.debug(f"Processing {item_type} #{item_id}: {item_name}")
if item_type == 'artist':
result = self.mb_service.match_artist(item_name)
if result and result.get('mbid'):
@ -256,7 +256,7 @@ class MusicBrainzWorker:
self.mb_service.update_artist_mbid(item_id, None, 'not_found')
self.stats['not_found'] += 1
logger.debug(f"❌ No match for artist '{item_name}'")
elif item_type == 'album':
artist_name = item.get('artist')
result = self.mb_service.match_release(item_name, artist_name)
@ -268,7 +268,7 @@ class MusicBrainzWorker:
self.mb_service.update_album_mbid(item_id, None, 'not_found')
self.stats['not_found'] += 1
logger.debug(f"❌ No match for album '{item_name}'")
elif item_type == 'track':
artist_name = item.get('artist')
result = self.mb_service.match_recording(item_name, artist_name)
@ -280,11 +280,11 @@ class MusicBrainzWorker:
self.mb_service.update_track_mbid(item_id, None, 'not_found')
self.stats['not_found'] += 1
logger.debug(f"❌ No match for track '{item_name}'")
except Exception as e:
logger.error(f"Error processing {item['type']} #{item['id']}: {e}")
self.stats['errors'] += 1
# Mark as error in database
try:
if item['type'] == 'artist':
@ -295,46 +295,46 @@ class MusicBrainzWorker:
self.mb_service.update_track_mbid(item['id'], None, 'error')
except Exception as e2:
logger.error(f"Error updating item status: {e2}")
def _count_pending_items(self) -> int:
"""Count how many items still need processing"""
conn = None
try:
conn = self.db._get_connection()
cursor = conn.cursor()
# Count unattempted items
cursor.execute("""
SELECT
SELECT
(SELECT COUNT(*) FROM artists WHERE musicbrainz_match_status IS NULL) +
(SELECT COUNT(*) FROM albums WHERE musicbrainz_match_status IS NULL) +
(SELECT COUNT(*) FROM tracks WHERE musicbrainz_match_status IS NULL)
AS pending
""")
row = cursor.fetchone()
return row[0] if row else 0
except Exception as e:
logger.error(f"Error counting pending items: {e}")
return 0
finally:
if conn:
conn.close()
def _get_progress_breakdown(self) -> Dict[str, Dict[str, int]]:
"""Get progress breakdown by entity type"""
conn = None
try:
conn = self.db._get_connection()
cursor = conn.cursor()
progress = {}
# Artists progress
cursor.execute("""
SELECT
SELECT
COUNT(*) AS total,
SUM(CASE WHEN musicbrainz_match_status IS NOT NULL THEN 1 ELSE 0 END) AS processed
FROM artists
@ -347,10 +347,10 @@ class MusicBrainzWorker:
'total': total,
'percent': int((processed / total * 100) if total > 0 else 0)
}
# Albums progress
cursor.execute("""
SELECT
SELECT
COUNT(*) AS total,
SUM(CASE WHEN musicbrainz_match_status IS NOT NULL THEN 1 ELSE 0 END) AS processed
FROM albums
@ -363,10 +363,10 @@ class MusicBrainzWorker:
'total': total,
'percent': int((processed / total * 100) if total > 0 else 0)
}
# Tracks progress
cursor.execute("""
SELECT
SELECT
COUNT(*) AS total,
SUM(CASE WHEN musicbrainz_match_status IS NOT NULL THEN 1 ELSE 0 END) AS processed
FROM tracks
@ -379,9 +379,9 @@ class MusicBrainzWorker:
'total': total,
'percent': int((processed / total * 100) if total > 0 else 0)
}
return progress
except Exception as e:
logger.error(f"Error getting progress breakdown: {e}")
return {}

@ -297,6 +297,9 @@ class MusicDatabase:
# Add MusicBrainz columns to library tables (migration)
self._add_musicbrainz_columns(cursor)
# Add external ID columns (Spotify/iTunes) to library tables (migration)
self._add_external_id_columns(cursor)
conn.commit()
logger.info("Database initialized successfully")
@ -952,6 +955,46 @@ class MusicDatabase:
logger.error(f"Error adding MusicBrainz columns: {e}")
# Don't raise - this is a migration, database can still function
def _add_external_id_columns(self, cursor):
"""Add Spotify/iTunes external ID columns to library tables for enrichment"""
try:
# Artists table
cursor.execute("PRAGMA table_info(artists)")
artists_columns = [column[1] for column in cursor.fetchall()]
if 'spotify_artist_id' not in artists_columns:
cursor.execute("ALTER TABLE artists ADD COLUMN spotify_artist_id TEXT")
cursor.execute("ALTER TABLE artists ADD COLUMN itunes_artist_id TEXT")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_artists_spotify_id ON artists (spotify_artist_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_artists_itunes_id ON artists (itunes_artist_id)")
logger.info("Added external ID columns to artists table")
# Albums table
cursor.execute("PRAGMA table_info(albums)")
albums_columns = [column[1] for column in cursor.fetchall()]
if 'spotify_album_id' not in albums_columns:
cursor.execute("ALTER TABLE albums ADD COLUMN spotify_album_id TEXT")
cursor.execute("ALTER TABLE albums ADD COLUMN itunes_album_id TEXT")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_albums_spotify_id ON albums (spotify_album_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_albums_itunes_id ON albums (itunes_album_id)")
logger.info("Added external ID columns to albums table")
# Tracks table
cursor.execute("PRAGMA table_info(tracks)")
tracks_columns = [column[1] for column in cursor.fetchall()]
if 'spotify_track_id' not in tracks_columns:
cursor.execute("ALTER TABLE tracks ADD COLUMN spotify_track_id TEXT")
cursor.execute("ALTER TABLE tracks ADD COLUMN itunes_track_id TEXT")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_tracks_spotify_id ON tracks (spotify_track_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_tracks_itunes_id ON tracks (itunes_track_id)")
logger.info("Added external ID columns to tracks table")
except Exception as e:
logger.error(f"Error adding external ID columns: {e}")
# Don't raise - this is a migration, database can still function
def close(self):
"""Close database connection (no-op since we create connections per operation)"""
# Each operation creates and closes its own connection, so nothing to do here

@ -7767,9 +7767,9 @@ def _get_file_path_from_template(context: dict, template_type: str = 'album_path
# METADATA & COVER ART HELPERS (Ported from downloads.py)
# ===================================================================
from mutagen import File as MutagenFile
from mutagen.id3 import ID3, TIT2, TPE1, TALB, TDRC, TRCK, TCON, TPE2, TPOS, TXXX, APIC
from mutagen.id3 import ID3, TIT2, TPE1, TALB, TDRC, TRCK, TCON, TPE2, TPOS, TXXX, APIC, UFID, TSRC
from mutagen.flac import FLAC, Picture
from mutagen.mp4 import MP4, MP4Cover
from mutagen.mp4 import MP4, MP4Cover, MP4FreeForm
from mutagen.oggvorbis import OggVorbis
import urllib.request
@ -7883,6 +7883,8 @@ def _enhance_file_metadata(file_path: str, context: dict, artist: dict, album_in
if config_manager.get('metadata_enhancement.embed_album_art', True):
_embed_album_art_metadata(audio_file_raw, metadata)
_embed_source_ids(audio_file_raw, metadata)
audio_file_raw.save()
print("✅ Metadata enhanced successfully.")
@ -8025,6 +8027,28 @@ def _extract_spotify_metadata(context: dict, artist: dict, album_info: dict) ->
metadata['album_art_url'] = album_info.get('album_image_url')
# Extract source IDs (Spotify or iTunes) for tag embedding
track_info = context.get("track_info", {})
if track_info and track_info.get('id'):
# Spotify track IDs are alphanumeric strings; iTunes IDs are numeric
track_id = str(track_info['id'])
if track_id.isdigit():
metadata['itunes_track_id'] = track_id
else:
metadata['spotify_track_id'] = track_id
if artist.get('id'):
artist_id = str(artist['id'])
if artist_id.isdigit():
pass # iTunes artist ID not available in this context reliably
else:
metadata['spotify_artist_id'] = artist_id
if spotify_album and spotify_album.get('id'):
album_id = str(spotify_album['id'])
if album_id.isdigit():
pass # iTunes album ID not available in this context reliably
else:
metadata['spotify_album_id'] = album_id
return metadata
def _embed_album_art_metadata(audio_file, metadata: dict):
@ -8065,6 +8089,161 @@ def _embed_album_art_metadata(audio_file, metadata: dict):
except Exception as e:
print(f"❌ Error embedding album art: {e}")
def _embed_source_ids(audio_file, metadata: dict):
    """
    Look up the MusicBrainz recording MBID, ISRC, and genres, then embed them
    along with Spotify/iTunes source IDs as custom tags into the audio file.

    One file write, one shot. Concurrent calls are safe: the global rate
    limiter in musicbrainz_client.py serializes all MB API access.

    The MusicBrainz lookup is skipped when the 'musicbrainz.embed_tags'
    setting is false, when the title or artist is missing, or when no
    MusicBrainz service is available; Spotify/iTunes IDs already present in
    ``metadata`` are still written in those cases.

    Args:
        audio_file: Non-easy-mode MutagenFile object (ID3-tagged file, FLAC,
            OggVorbis or MP4). Only modified in memory - caller must save.
        metadata: Track metadata. Keys read: 'spotify_track_id',
            'spotify_artist_id', 'spotify_album_id', 'itunes_track_id',
            'title', 'album_artist', 'artist' and 'genre'.

    Returns:
        None. Every failure is printed and swallowed (non-fatal).
    """
    try:
        # ── 1. Collect Spotify / iTunes IDs already in metadata ──
        id_tags = {}
        for meta_key, tag_name in (
            ('spotify_track_id', 'SPOTIFY_TRACK_ID'),
            ('spotify_artist_id', 'SPOTIFY_ARTIST_ID'),
            ('spotify_album_id', 'SPOTIFY_ALBUM_ID'),
            ('itunes_track_id', 'ITUNES_TRACK_ID'),
        ):
            if metadata.get(meta_key):
                id_tags[tag_name] = metadata[meta_key]

        # ── 2. MusicBrainz lookup for MBID, genres, and ISRC ──
        # The global rate limiter in musicbrainz_client.py serializes all API
        # calls (worker + any number of post-processing threads) to 1 req/sec
        # via _api_call_lock, so no pause/resume needed.
        mb_genres = []
        isrc = None

        track_title = metadata.get('title', '')
        # Use album_artist (single primary artist) for MB lookup, not the
        # comma-joined multi-artist field which would give bad search results
        artist_name = metadata.get('album_artist', '') or metadata.get('artist', '')

        # When the setting is off, skip the MB lookup and just write the
        # Spotify/iTunes IDs (if any)
        if config_manager.get('musicbrainz.embed_tags', True) and track_title and artist_name:
            try:
                mb_service = mb_worker.mb_service if mb_worker else None
                if mb_service:
                    result = mb_service.match_recording(track_title, artist_name)
                    if result and result.get('mbid'):
                        recording_mbid = result['mbid']
                        id_tags['MUSICBRAINZ_RECORDING_ID'] = recording_mbid
                        print(f"🎵 MusicBrainz recording matched: {recording_mbid}")

                        # Lookup recording details for ISRC and genres. Kept in
                        # its own try so a failure here does not also cancel
                        # the artist MBID lookup below.
                        try:
                            details = mb_service.mb_client.get_recording(
                                recording_mbid, includes=['isrcs', 'genres']
                            )
                        except Exception as e:
                            details = None
                            print(f"⚠️ MusicBrainz lookup failed (non-fatal): {e}")

                        if details:
                            isrcs = details.get('isrcs') or []
                            if isrcs:
                                isrc = isrcs[0]
                            # Highest-voted genres first; entries without a
                            # name are dropped instead of raising KeyError
                            ranked = sorted(
                                details.get('genres') or [],
                                key=lambda entry: entry.get('count', 0) or 0,
                                reverse=True
                            )
                            mb_genres = [entry['name'] for entry in ranked if entry.get('name')]

                    # Also try to get artist MBID (may already be cached from worker)
                    artist_result = mb_service.match_artist(artist_name)
                    if artist_result and artist_result.get('mbid'):
                        id_tags['MUSICBRAINZ_ARTIST_ID'] = artist_result['mbid']
                else:
                    print("⚠️ MusicBrainz worker not available, skipping MBID lookup")
            except Exception as e:
                print(f"⚠️ MusicBrainz lookup failed (non-fatal): {e}")

        # Genres and ISRC are only ever set after a recording match, which
        # always adds an ID tag - so nothing is left to write here.
        if not id_tags:
            return

        # ── 3. Write all tags into the file ──
        written = []

        # MP3 (ID3)
        if isinstance(audio_file.tags, ID3):
            for tag_name, value in id_tags.items():
                if tag_name == 'MUSICBRAINZ_RECORDING_ID':
                    audio_file.tags.add(UFID(owner='http://musicbrainz.org', data=value.encode('ascii')))
                    written.append('UFID:http://musicbrainz.org')
                elif tag_name == 'MUSICBRAINZ_ARTIST_ID':
                    audio_file.tags.add(TXXX(encoding=3, desc='MusicBrainz Artist Id', text=[value]))
                    written.append('TXXX:MusicBrainz Artist Id')
                else:
                    audio_file.tags.add(TXXX(encoding=3, desc=tag_name, text=[str(value)]))
                    written.append(f'TXXX:{tag_name}')

        # FLAC / OGG Vorbis
        elif isinstance(audio_file, (FLAC, OggVorbis)):
            for tag_name, value in id_tags.items():
                if tag_name == 'MUSICBRAINZ_RECORDING_ID':
                    audio_file['MUSICBRAINZ_TRACKID'] = [value]
                    written.append('MUSICBRAINZ_TRACKID')
                elif tag_name == 'MUSICBRAINZ_ARTIST_ID':
                    audio_file['MUSICBRAINZ_ARTISTID'] = [value]
                    written.append('MUSICBRAINZ_ARTISTID')
                else:
                    audio_file[tag_name] = [str(value)]
                    written.append(tag_name)

        # MP4 (M4A/AAC)
        elif isinstance(audio_file, MP4):
            for tag_name, value in id_tags.items():
                if tag_name == 'MUSICBRAINZ_RECORDING_ID':
                    key = '----:com.apple.iTunes:MusicBrainz Track Id'
                elif tag_name == 'MUSICBRAINZ_ARTIST_ID':
                    key = '----:com.apple.iTunes:MusicBrainz Artist Id'
                else:
                    key = f'----:com.apple.iTunes:{tag_name}'
                audio_file[key] = [MP4FreeForm(str(value).encode('utf-8'))]
                written.append(key)

        if written:
            print(f"🔗 Embedded IDs: {', '.join(written)}")

        # ── 4. Merge genres (Spotify + MusicBrainz) and overwrite tag ──
        if mb_genres:
            # 'genre' is normally a comma-joined string; tolerate a missing
            # or None value and a list/tuple of names as well
            raw_genre = metadata.get('genre') or ''
            if isinstance(raw_genre, str):
                raw_names = raw_genre.split(',')
            else:
                raw_names = [str(name) for name in raw_genre]
            spotify_genres = [name.strip() for name in raw_names if name.strip()]

            # Case-insensitive dedupe, Spotify genres first, capped at 5
            seen = set()
            merged = []
            for name in spotify_genres + mb_genres:
                folded = name.strip().lower()
                if folded and folded not in seen:
                    seen.add(folded)
                    merged.append(name.strip().title())
                if len(merged) >= 5:
                    break

            if merged:
                genre_string = ', '.join(merged)
                if isinstance(audio_file.tags, ID3):
                    audio_file.tags.add(TCON(encoding=3, text=[genre_string]))
                elif isinstance(audio_file, (FLAC, OggVorbis)):
                    audio_file['GENRE'] = [genre_string]
                elif isinstance(audio_file, MP4):
                    audio_file['\xa9gen'] = [genre_string]
                print(f"🎶 Genres merged: {genre_string}")

        # ── 5. Write ISRC if available ──
        if isrc:
            if isinstance(audio_file.tags, ID3):
                audio_file.tags.add(TSRC(encoding=3, text=[isrc]))
            elif isinstance(audio_file, (FLAC, OggVorbis)):
                audio_file['ISRC'] = [isrc]
            elif isinstance(audio_file, MP4):
                audio_file['----:com.apple.iTunes:ISRC'] = [MP4FreeForm(isrc.encode('utf-8'))]
            print(f"🔖 ISRC: {isrc}")

    except Exception as e:
        print(f"⚠️ Error embedding source IDs (non-fatal): {e}")
def _download_cover_art(album_info: dict, target_dir: str):
"""Downloads cover.jpg into the specified directory."""
try:

Loading…
Cancel
Save