You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/core/workers/metadata_update.py

663 lines
30 KiB

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""WebMetadataUpdateWorker — lifted from web_server.py.
Body is byte-identical to the original. The module-level
``metadata_update_state`` global must be initialized via ``init()`` before
``WebMetadataUpdateWorker`` is instantiated, so the dict reference inside
the class body resolves to the same dict that web_server.py owns.
"""
import logging
import threading
import requests
from datetime import datetime
from core.matching_engine import MusicMatchingEngine
from core.runtime_state import add_activity_item
from config.settings import config_manager
logger = logging.getLogger(__name__)
# Injected at runtime via init() — points to web_server.metadata_update_state.
metadata_update_state = None
def init(state):
"""Bind the shared metadata_update_state dict from web_server."""
global metadata_update_state
metadata_update_state = state
class WebMetadataUpdateWorker:
"""Web-based metadata update worker - EXACT port of dashboard.py MetadataUpdateWorker"""
def __init__(self, artists, media_client, spotify_client, server_type, refresh_interval_days=30):
self.artists = artists
self.media_client = media_client # Can be plex_client or jellyfin_client
self.spotify_client = spotify_client
self.server_type = server_type # "plex" or "jellyfin"
self.matching_engine = MusicMatchingEngine()
self.refresh_interval_days = refresh_interval_days
self.should_stop = False
self.processed_count = 0
self.successful_count = 0
self.failed_count = 0
self.max_workers = 1
# DB-first: reuse existing metadata from SoulSync database
try:
from database.music_database import MusicDatabase
self._db = MusicDatabase()
except Exception:
self._db = None
self.thread_lock = threading.Lock()
def stop(self):
self.should_stop = True
def get_artist_name(self, artist):
"""Get artist name consistently across Plex and Jellyfin"""
return getattr(artist, 'title', 'Unknown Artist')
def run(self):
"""Process all artists one by one - EXACT copy from dashboard.py"""
global metadata_update_state
try:
# Load artists in background if not provided - EXACTLY like dashboard.py
if self.artists is None:
# Enable lightweight mode for Jellyfin to skip track caching
if self.server_type == "jellyfin":
self.media_client.set_metadata_only_mode(True)
elif self.server_type == "navidrome":
# Navidrome doesn't need special mode setting
pass
all_artists = self.media_client.get_all_artists()
logger.debug(f"Raw artists returned: {[getattr(a, 'title', 'NO_TITLE') for a in (all_artists or [])]}")
if not all_artists:
metadata_update_state['status'] = 'error'
metadata_update_state['error'] = f"No artists found in {self.server_type.title()} library"
add_activity_item("", "Metadata Update", metadata_update_state['error'], "Now")
return
# Filter artists that need processing
artists_to_process = [artist for artist in all_artists if self.artist_needs_processing(artist)]
self.artists = artists_to_process
# Emit loaded signal equivalent - EXACTLY like dashboard.py
if len(artists_to_process) == 0:
metadata_update_state['status'] = 'completed'
metadata_update_state['completed_at'] = datetime.now()
add_activity_item("", "Metadata Update", "All artists already have good metadata", "Now")
return
else:
add_activity_item("", "Metadata Update", f"Processing {len(artists_to_process)} of {len(all_artists)} artists", "Now")
if not artists_to_process:
metadata_update_state['status'] = 'completed'
metadata_update_state['completed_at'] = datetime.now()
return
total_artists = len(self.artists)
metadata_update_state['total'] = total_artists
# Process artists in parallel using ThreadPoolExecutor - EXACTLY like dashboard.py
def process_single_artist(artist):
"""Process a single artist and return results"""
if self.should_stop or metadata_update_state['status'] == 'stopping':
return None
artist_name = getattr(artist, 'title', 'Unknown Artist')
# Double-check ignore flag right before processing
if self.media_client.is_artist_ignored(artist):
return (artist_name, True, "Skipped (ignored)")
try:
success, details = self.update_artist_metadata(artist)
return (artist_name, success, details)
except Exception as e:
return (artist_name, False, f"Error: {str(e)}")
# Process artists sequentially with rate limiting
# (no ThreadPoolExecutor — API rate limits make parallelism counterproductive)
import time
for artist in self.artists:
if self.should_stop or metadata_update_state['status'] == 'stopping':
break
result = process_single_artist(artist)
if result is None:
continue
artist_name, success, details = result
with self.thread_lock:
self.processed_count += 1
if success:
self.successful_count += 1
else:
self.failed_count += 1
progress_percent = (self.processed_count / total_artists) * 100
metadata_update_state.update({
'current_artist': artist_name,
'processed': self.processed_count,
'percentage': progress_percent,
'successful': self.successful_count,
'failed': self.failed_count
})
# Rate limit: 1.5s between artists (this actually runs between artists now)
time.sleep(1.5)
# Mark as completed - equivalent to finished.emit
metadata_update_state['status'] = 'completed'
metadata_update_state['completed_at'] = datetime.now()
metadata_update_state['current_artist'] = 'Completed'
summary = f"Processed {self.processed_count} artists: {self.successful_count} updated, {self.failed_count} failed"
add_activity_item("", "Metadata Complete", summary, "Now")
except Exception as e:
logger.error(f"Metadata update failed: {e}")
metadata_update_state['status'] = 'error'
metadata_update_state['error'] = str(e)
add_activity_item("", "Metadata Error", str(e), "Now")
def artist_needs_processing(self, artist):
"""Check if an artist needs metadata processing using age-based detection - EXACT copy from dashboard.py"""
try:
# Check if artist is manually ignored
if self.media_client.is_artist_ignored(artist):
return False
# Use media client's age-based checking with configured interval
return self.media_client.needs_update_by_age(artist, self.refresh_interval_days)
except Exception as e:
logger.error(f"Error checking artist {getattr(artist, 'title', 'Unknown')}: {e}")
return True # Process if we can't determine status
def _check_db_artist(self, artist_name):
"""Check SoulSync DB for existing artist metadata (genres, spotify_artist_id).
NOTE: DB thumb_url is a Plex/Jellyfin internal path, NOT a downloadable URL.
Photos must be checked via the media server object, not the DB.
Returns (db_artist_dict, has_genres, spotify_artist_id) or (None, False, None) if not found."""
if not self._db:
return None, False, None
try:
db_artists = self._db.search_artists(artist_name, limit=5)
if not db_artists:
return None, False, None
# Find best name match
best = None
best_score = 0.0
norm_name = self.matching_engine.normalize_string(artist_name)
for dba in db_artists:
score = self.matching_engine.similarity_score(
norm_name, self.matching_engine.normalize_string(dba.name))
if score > best_score:
best_score = score
best = dba
if not best or best_score < 0.85:
return None, False, None
has_genres = bool(best.genres and len(best.genres) > 0)
# Get spotify_artist_id from raw DB row (not in dataclass)
spotify_artist_id = None
try:
raw = self._db.api_get_artist(best.id)
if raw:
spotify_artist_id = raw.get('spotify_artist_id')
except Exception as e:
logger.debug("get spotify_artist_id failed: %s", e)
return best, has_genres, spotify_artist_id
except Exception:
return None, False, None
def update_artist_metadata(self, artist):
"""Update a single artist's metadata. Checks SoulSync DB first to avoid unnecessary API calls.
DB-first strategy:
- Genres: DB stores real genre strings → can apply directly, skip Spotify
- spotify_artist_id: DB may have it from enrichment → skip search_artists() call
- Photos/album art: DB thumb_url is a media-server internal path (not downloadable)
so these MUST come from Spotify API
"""
try:
artist_name = getattr(artist, 'title', 'Unknown Artist')
# Skip processing for artists with no valid name
if artist_name == 'Unknown Artist' or not artist_name or not artist_name.strip():
return False, "Skipped: No valid artist name"
# DB-first: check what we already have cached
db_artist, db_has_genres, db_spotify_id = self._check_db_artist(artist_name)
# Check what the media server artist is currently missing
needs_photo = not self.artist_has_valid_photo(artist) if self.server_type != "jellyfin" else True
needs_genres = not getattr(artist, 'genres', None)
needs_album_art = self.server_type == "plex"
# If media server already has valid photo + genres + album art, skip entirely
if not needs_photo and not needs_genres and not needs_album_art:
self.media_client.update_artist_biography(artist)
return True, "Already up to date"
# Determine if we actually need Spotify
# Photos and album art MUST come from Spotify (DB only has internal media server paths)
# Genres CAN come from DB if available
need_spotify = needs_photo or needs_album_art or (needs_genres and not db_has_genres)
spotify_artist = None
highest_score = 0.0
if need_spotify:
# Try direct lookup by cached spotify_artist_id first (1 API call vs search)
if db_spotify_id:
try:
from core.spotify_client import Artist as SpotifyArtistDC
raw = self.spotify_client.get_artist(db_spotify_id)
if raw and 'name' in raw:
spotify_artist = SpotifyArtistDC.from_spotify_artist(raw)
highest_score = 1.0
logger.debug(f"Metadata updater: direct Spotify lookup for '{artist_name}' via cached ID {db_spotify_id}")
except Exception as e:
logger.debug(f"Direct Spotify lookup failed for {db_spotify_id}: {e}")
spotify_artist = None
# Fall back to search if direct lookup didn't work
if not spotify_artist:
spotify_artists = self.spotify_client.search_artists(artist_name, limit=5)
if not spotify_artists:
# Spotify failed — apply DB genres if available, skip photos/art
changes_made = []
if needs_genres and db_has_genres and db_artist:
if self._apply_db_genres(artist, db_artist.genres):
changes_made.append("genres (DB)")
if changes_made:
self.media_client.update_artist_biography(artist)
return True, f"Updated {', '.join(changes_made)} (Spotify unavailable)"
return False, "Not found on Spotify"
# Find the best match
best_match = None
plex_artist_normalized = self.matching_engine.normalize_string(artist_name)
for sa in spotify_artists:
spotify_artist_normalized = self.matching_engine.normalize_string(sa.name)
score = self.matching_engine.similarity_score(plex_artist_normalized, spotify_artist_normalized)
if score > highest_score:
highest_score = score
best_match = sa
if not best_match or highest_score < 0.7:
# No good Spotify match — still try DB genres
changes_made = []
if needs_genres and db_has_genres and db_artist:
if self._apply_db_genres(artist, db_artist.genres):
changes_made.append("genres (DB)")
if changes_made:
self.media_client.update_artist_biography(artist)
return True, f"Updated {', '.join(changes_made)} (no Spotify match)"
return False, f"No confident match found (best: '{getattr(best_match, 'name', 'N/A')}', score: {highest_score:.2f})"
spotify_artist = best_match
changes_made = []
# Update photo (always from Spotify — DB only has media server paths)
if needs_photo and spotify_artist:
photo_updated = self.update_artist_photo(artist, spotify_artist)
if photo_updated:
changes_made.append("photo")
# Update genres — use DB if available, otherwise Spotify
if needs_genres:
if db_has_genres and db_artist:
genres_updated = self._apply_db_genres(artist, db_artist.genres)
if genres_updated:
changes_made.append("genres (DB)")
elif spotify_artist:
# DB genres didn't result in changes, try Spotify for newer/different genres
genres_updated = self.update_artist_genres(artist, spotify_artist)
if genres_updated:
changes_made.append("genres")
elif spotify_artist:
genres_updated = self.update_artist_genres(artist, spotify_artist)
if genres_updated:
changes_made.append("genres")
# Update album artwork (only for Plex, always from Spotify)
if self.server_type == "plex" and spotify_artist:
albums_updated = self.update_album_artwork(artist, spotify_artist)
if albums_updated > 0:
changes_made.append(f"{albums_updated} album art")
elif self.server_type != "plex":
logger.info(f"Skipping album artwork updates for Jellyfin artist: {artist.title}")
if changes_made:
biography_updated = self.media_client.update_artist_biography(artist)
if biography_updated:
changes_made.append("timestamp")
source = f"match: '{spotify_artist.name}', score: {highest_score:.2f}" if spotify_artist else "DB cache"
details = f"Updated {', '.join(changes_made)} ({source})"
return True, details
else:
self.media_client.update_artist_biography(artist)
return True, "Already up to date"
except Exception as e:
return False, str(e)
def _apply_db_genres(self, artist, genres):
"""Apply genres from DB cache to media server."""
try:
if not genres:
return False
existing_genres = set(genre.tag if hasattr(genre, 'tag') else str(genre)
for genre in (getattr(artist, 'genres', None) or []))
db_genres = set(g for g in genres if g and g.strip() and len(g.strip()) > 1)
if db_genres and db_genres != existing_genres:
return self.media_client.update_artist_genres(artist, list(db_genres)[:10])
return False
except Exception:
return False
def update_artist_photo(self, artist, spotify_artist):
"""Update artist photo from Spotify - EXACT copy from dashboard.py"""
try:
# Check if artist already has a good photo (skip check for Jellyfin)
if self.server_type != "jellyfin" and self.artist_has_valid_photo(artist):
logger.info(f"Skipping {artist.title}: already has valid photo ({getattr(artist, 'thumb', 'None')})")
return False
# Get the image URL from Spotify
if not spotify_artist.image_url:
logger.warning(f"Skipping {artist.title}: no Spotify image URL available")
return False
logger.info(f"Processing {artist.title}: downloading from Spotify...")
image_url = spotify_artist.image_url
# Download and validate image
response = requests.get(image_url, timeout=10)
response.raise_for_status()
# Validate and convert image (skip conversion for Jellyfin to preserve format)
if self.server_type == "jellyfin":
# For Jellyfin, use raw image data to preserve original format
image_data = response.content
logger.info(f"Using raw image data for Jellyfin ({len(image_data)} bytes)")
else:
# For other servers, validate and convert
image_data = self.validate_and_convert_image(response.content)
if not image_data:
return False
# Upload to media server using client's method
return self.media_client.update_artist_poster(artist, image_data)
except Exception as e:
logger.error(f"Error updating photo for {getattr(artist, 'title', 'Unknown')}: {e}")
return False
def update_artist_genres(self, artist, spotify_artist):
"""Update artist genres from Spotify and albums - EXACT copy from dashboard.py"""
try:
# Get existing genres
existing_genres = set(genre.tag if hasattr(genre, 'tag') else str(genre)
for genre in (artist.genres or []))
# Get Spotify artist genres
spotify_genres = set(spotify_artist.genres or [])
# Get genres from all albums
album_genres = set()
try:
for album in artist.albums():
if hasattr(album, 'genres') and album.genres:
album_genres.update(genre.tag if hasattr(genre, 'tag') else str(genre)
for genre in album.genres)
except Exception as e:
logger.debug("artist album genre walk: %s", e)
# Combine all genres (prioritize Spotify genres)
all_genres = spotify_genres.union(album_genres)
# Filter out empty/invalid genres
all_genres = {g for g in all_genres if g and g.strip() and len(g.strip()) > 1}
# Only update if we have new genres and they're different
if all_genres and (not existing_genres or all_genres != existing_genres):
# Convert to list and limit to 10 genres
genre_list = list(all_genres)[:10]
# Use media client API to update genres
success = self.media_client.update_artist_genres(artist, genre_list)
if success:
return True
else:
return False
else:
return False
except Exception as e:
logger.error(f"Error updating genres for {getattr(artist, 'title', 'Unknown')}: {e}")
return False
def update_album_artwork(self, artist, spotify_artist):
"""Update album artwork for all albums by this artist from Spotify.
DB thumb_url is a media-server internal path, so album art must come from Spotify."""
try:
updated_count = 0
skipped_count = 0
# Get all albums for this artist
try:
albums = list(artist.albums())
except Exception:
logger.error(f"Could not access albums for artist '{artist.title}'")
return 0
if not albums:
logger.warning(f"No albums found for artist '{artist.title}'")
return 0
import time
for album in albums:
try:
album_title = getattr(album, 'title', 'Unknown Album')
# Check if album already has good artwork on the media server
if self.album_has_valid_artwork(album):
skipped_count += 1
continue
# Rate limit between album API calls
time.sleep(0.5)
# Search for this specific album on Spotify
album_query = f"album:{album_title} artist:{spotify_artist.name}"
spotify_albums = self.spotify_client.search_albums(album_query, limit=3)
if not spotify_albums:
continue
# Find the best matching album
best_album = None
highest_score = 0.0
plex_album_normalized = self.matching_engine.normalize_string(album_title)
for spotify_album in spotify_albums:
spotify_album_normalized = self.matching_engine.normalize_string(spotify_album.name)
score = self.matching_engine.similarity_score(plex_album_normalized, spotify_album_normalized)
if score > highest_score:
highest_score = score
best_album = spotify_album
# If we found a good match with artwork, download it
if best_album and highest_score > 0.7 and best_album.image_url:
if self.download_and_upload_album_artwork(album, best_album.image_url):
updated_count += 1
except Exception as e:
logger.error(f"Error processing album '{getattr(album, 'title', 'Unknown')}': {e}")
continue
return updated_count
except Exception as e:
logger.error(f"Error updating album artwork for artist '{getattr(artist, 'title', 'Unknown')}': {e}")
return 0
def album_has_valid_artwork(self, album):
"""Check if album has valid artwork - EXACT copy from dashboard.py"""
try:
if not hasattr(album, 'thumb') or not album.thumb:
return False
thumb_url = str(album.thumb)
# Completely empty or None
if not thumb_url or thumb_url.strip() == '':
return False
# Obvious placeholder text in URL
obvious_placeholders = ['no-image', 'placeholder', 'missing', 'default-album', 'blank.jpg', 'empty.png']
thumb_lower = thumb_url.lower()
for placeholder in obvious_placeholders:
if placeholder in thumb_lower:
return False
# Extremely short URLs (likely broken)
if len(thumb_url) < 20:
return False
return True
except Exception as e:
return True
def download_and_upload_album_artwork(self, album, image_url):
"""Download artwork from Spotify and upload to media server - EXACT copy from dashboard.py"""
try:
# Download image from Spotify
response = requests.get(image_url, timeout=10)
response.raise_for_status()
# Validate and convert image
image_data = self.validate_and_convert_image(response.content)
if not image_data:
return False
# Upload using media client
success = self.media_client.update_album_poster(album, image_data)
return success
except Exception as e:
logger.error(f"Error downloading/uploading artwork for album '{getattr(album, 'title', 'Unknown')}': {e}")
return False
def artist_has_valid_photo(self, artist):
"""Check if artist has a valid photo - EXACT copy from dashboard.py"""
try:
if not hasattr(artist, 'thumb') or not artist.thumb:
return False
thumb_url = str(artist.thumb)
if 'default' in thumb_url.lower() or len(thumb_url) < 50:
return False
return True
except Exception:
return False
def validate_and_convert_image(self, image_data):
"""Validate and convert image for media server compatibility - EXACT copy from dashboard.py"""
try:
from PIL import Image
import io
# Open and validate image
image = Image.open(io.BytesIO(image_data))
# Check minimum dimensions
width, height = image.size
if width < 200 or height < 200:
return None
# Convert to JPEG for consistency
if image.format != 'JPEG':
buffer = io.BytesIO()
image.convert('RGB').save(buffer, format='JPEG', quality=95)
return buffer.getvalue()
return image_data
except Exception:
return None
def upload_artist_poster(self, artist, image_data):
"""Upload poster using media client - EXACT copy from dashboard.py"""
try:
# Use media client's update method if available
if hasattr(self.media_client, 'update_artist_poster'):
return self.media_client.update_artist_poster(artist, image_data)
# Fallback for Plex: direct API call
if self.server_type == "plex":
import requests
server = self.media_client.server
upload_url = f"{server._baseurl}/library/metadata/{artist.ratingKey}/posters"
headers = {
'X-Plex-Token': server._token,
'Content-Type': 'image/jpeg'
}
response = requests.post(upload_url, data=image_data, headers=headers)
response.raise_for_status()
# Refresh artist to see changes
artist.refresh()
return True
# Jellyfin: Use Jellyfin API to upload artist image
elif self.server_type == "jellyfin":
import requests
jellyfin_config = config_manager.get_jellyfin_config()
jellyfin_base_url = jellyfin_config.get('base_url', '')
jellyfin_token = jellyfin_config.get('api_key', '')
if not jellyfin_base_url or not jellyfin_token:
logger.warning("Jellyfin configuration missing for image upload")
return False
upload_url = f"{jellyfin_base_url.rstrip('/')}/Items/{artist.ratingKey}/Images/Primary"
headers = {
'Authorization': f'MediaBrowser Token="{jellyfin_token}"',
'Content-Type': 'image/jpeg'
}
response = requests.post(upload_url, data=image_data, headers=headers)
response.raise_for_status()
return True
# Navidrome: Currently not supported (Subsonic API doesn't support image uploads)
elif self.server_type == "navidrome":
logger.info(" Navidrome does not support artist image uploads via Subsonic API")
return False
else:
# Unknown server type
return False
except Exception as e:
logger.error(f"Error uploading poster: {e}")
return False