You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/core/watchlist_scanner.py

4032 lines
191 KiB

#!/usr/bin/env python3
"""
Watchlist Scanner Service - Monitors watched artists for new releases
"""
from typing import List, Dict, Any, Optional, Callable
from datetime import datetime, timezone, timedelta
from dataclasses import dataclass
import re
import time
from difflib import SequenceMatcher
import requests
from bs4 import BeautifulSoup
from database.music_database import get_database, WatchlistArtist
from core.spotify_client import SpotifyClient
from core.metadata_service import (
get_album_tracks_for_source,
get_client_for_source,
get_primary_source,
get_source_priority,
)
from core.wishlist_service import get_wishlist_service
from core.matching_engine import MusicMatchingEngine
from utils.logging_config import get_logger
# Module-level logger shared by every function and method in this file.
logger = get_logger("watchlist_scanner")
# Rate limiting constants for watchlist operations.
# All three values are expressed in seconds.
DELAY_BETWEEN_ARTISTS = 4.0 # 4 seconds between different artists (was 2s, increased to reduce Spotify rate limit risk)
DELAY_BETWEEN_ALBUMS = 0.5 # 500ms between albums for same artist
DELAY_BETWEEN_API_BATCHES = 1.0 # 1 second between API batch operations
def clean_track_name_for_search(track_name):
    """
    Clean a track name for searching: drop noise, keep version information.

    Removes: (feat. Artist), (ft. Artist), (featuring Artist), (with Artist),
    (Explicit), (Clean), (Radio Edit), (Radio Version), [Explicit], [Clean].
    Keeps: (Extended Version), (Live), (Acoustic), (Remix), (Remastered),
    (Demo), (Instrumental), year / edition info, and anything else that is
    not listed above.

    The credit keywords (feat / ft / featuring / with) must be followed by a
    period or a word boundary, so parentheticals that merely start with the
    same letters -- "(Without Me)", "(Feature Film Cut)" -- are left alone.

    Args:
        track_name: Raw track title. Falsy values and non-strings are
            returned unchanged.

    Returns:
        The cleaned title, or the original ``track_name`` when cleaning would
        leave an empty string.
    """
    if not track_name or not isinstance(track_name, str):
        return track_name

    # Patterns to REMOVE (noise that doesn't affect track identity).
    remove_patterns = [
        r'\s*\(explicit\)',                 # (Explicit)
        r'\s*\(clean\)',                    # (Clean)
        r'\s*\(radio\s*edit\)',             # (Radio Edit)
        r'\s*\(radio\s*version\)',          # (Radio Version)
        r'\s*\(feat(?:\.|\b)\s*[^)]+\)',    # (feat. Artist) / (feat Artist)
        r'\s*\(ft(?:\.|\b)\s*[^)]+\)',      # (ft. Artist) / (ft Artist)
        r'\s*\(featuring\b\s*[^)]+\)',      # (featuring Artist)
        r'\s*\(with\b\s*[^)]+\)',           # (with Artist), but not (Without ...)
        r'\s*\[[^\]]*explicit[^\]]*\]',     # [Explicit] in brackets
        r'\s*\[[^\]]*clean[^\]]*\]',        # [Clean] in brackets
    ]

    cleaned_name = track_name
    for pattern in remove_patterns:
        cleaned_name = re.sub(pattern, '', cleaned_name, flags=re.IGNORECASE).strip()

    # Version markers such as (Live), (Remix), (Acoustic), (Remastered),
    # (Demo), (Instrumental) or (Deluxe Edition) are intentionally NOT in the
    # removal list: they distinguish one recording from another.

    # A title that consisted only of noise keeps its original text.
    if not cleaned_name.strip():
        return track_name

    if cleaned_name != track_name:
        logger.debug(f"Intelligent track cleaning: '{track_name}' -> '{cleaned_name}'")
    return cleaned_name
def is_live_version(track_name: str, album_name: str = "") -> bool:
    """
    Report whether a track (or the album it sits on) is a live recording.

    Every pattern demands explicit live-recording context: a bracketed
    "(Live" / "[Live", a dash-prefixed "- Live", "live" followed by a
    recording word such as "at" or "session", or one of the stand-alone
    markers "in concert", "concert", "on stage", "unplugged". A bare word
    "live" is deliberately not enough, so titles that use it as a verb
    ("What We Live For", "Live Forever") are not flagged.

    Args:
        track_name: Track name to check
        album_name: Album name to check (optional)

    Returns:
        True if this is a live version, False otherwise (including when
        ``track_name`` is empty).
    """
    if not track_name:
        return False

    # Track and album are searched together as one lower-cased string.
    haystack = f"{track_name} {album_name}".lower()

    live_markers = (
        r'[\(\[]live\b',        # (Live), (Live at ...), [Live Version]
        r'-\s*live\b',          # Song - Live, Song - Live at ...
        # "live" followed by a recording-context word
        r'\blive (at|from|in|on|version|session|recording|performance|album|show|tour|concert|edit|cut|take)\b',
        r'\bin concert\b',      # In Concert
        r'\bconcert\b',         # Concert (album name)
        r'\bon stage\b',        # On Stage
        r'\bunplugged\b',       # MTV Unplugged
    )
    return any(re.search(marker, haystack, re.IGNORECASE) for marker in live_markers)
def is_remix_version(track_name: str, album_name: str = "") -> bool:
    """
    Detect if a track is a remix.

    Track and album names are checked together. Anything carrying a
    "remaster" / "remastered" marker is treated as an original and returns
    False before the remix patterns are consulted.

    The primary pattern accepts "remix", "remixed" and "remixes"; the plain
    word-bounded "remix" used previously silently missed the two inflected
    forms.

    Args:
        track_name: Track name to check
        album_name: Album name to check (optional)

    Returns:
        True if this is a remix, False otherwise (including when
        ``track_name`` is empty).
    """
    if not track_name:
        return False

    # Combine track and album names for comprehensive checking
    text_to_check = f"{track_name} {album_name}".lower()

    # Remasters are originals, not remixes - bail out first.
    if re.search(r'\bremaster(ed)?\b', text_to_check, re.IGNORECASE):
        return False

    remix_patterns = [
        r'\bremix(?:ed|es)?\b',           # Remix, Remixed, Remixes
        r'\bmix\b(?!.*\bremaster)',       # Mix (but not if followed by remaster)
        r'\bedit\b',                      # Radio Edit, Extended Edit
        r'\bversion\b(?=.*\bmix\b)',      # Version with Mix (e.g., "Dance Version Mix")
        r'\bclub mix\b',                  # Club Mix
        r'\bdance mix\b',                 # Dance Mix
        r'\bradio edit\b',                # Radio Edit
        r'\bextended\b(?=.*\bmix\b)',     # Extended Mix
        r'\bdub\b',                       # Dub version
        r'\bvip mix\b',                   # VIP Mix
    ]
    return any(
        re.search(pattern, text_to_check, re.IGNORECASE)
        for pattern in remix_patterns
    )
def is_acoustic_version(track_name: str, album_name: str = "") -> bool:
    """
    Report whether a track (or its album) is an acoustic rendition.

    Looks for the whole words "acoustic", "stripped", "piano version" or
    "unplugged" in the combined, lower-cased track and album names.

    Args:
        track_name: Track name to check
        album_name: Album name to check (optional)

    Returns:
        True if this is an acoustic version, False otherwise (including when
        ``track_name`` is empty).
    """
    if not track_name:
        return False

    haystack = f"{track_name} {album_name}".lower()
    acoustic_markers = (
        r'\bacoustic\b',        # Acoustic, Acoustic Version
        r'\bstripped\b',        # Stripped version
        r'\bpiano version\b',   # Piano Version
        r'\bunplugged\b',       # MTV Unplugged (can be acoustic)
    )
    return any(re.search(marker, haystack, re.IGNORECASE) for marker in acoustic_markers)
def is_instrumental_version(track_name: str, album_name: str = "") -> bool:
    """
    Detect if a track is an instrumental version.

    Track and album names are checked together for "instrumental", the
    abbreviation "inst.", "karaoke" or "backing track".

    The abbreviation is matched as "inst." followed by anything that is not
    a word character (or by end of text). A trailing word boundary after the
    period cannot be used here: a boundary only exists between a word and a
    non-word character, so "(Inst.)" and "Inst. Version" would never match.

    Args:
        track_name: Track name to check
        album_name: Album name to check (optional)

    Returns:
        True if this is an instrumental version, False otherwise (including
        when ``track_name`` is empty).
    """
    if not track_name:
        return False

    text_to_check = f"{track_name} {album_name}".lower()
    instrumental_patterns = [
        r'\binstrumental\b',     # Instrumental, Instrumental Version
        r'\binst\.(?!\w)',       # Inst. (abbreviation)
        r'\bkaraoke\b',          # Karaoke version
        r'\bbacking track\b',    # Backing Track
    ]
    return any(
        re.search(pattern, text_to_check, re.IGNORECASE)
        for pattern in instrumental_patterns
    )
def matches_custom_exclude_terms(track_name: str, album_name: str, exclude_terms: list) -> str:
    """
    Find the first user-defined exclusion term present in a track or album name.

    Each term is stripped and lower-cased, blank terms are skipped, and the
    remainder is tested as a plain substring of the combined, lower-cased
    track and album names.

    Args:
        track_name: Track name to check
        album_name: Album name to check
        exclude_terms: List of terms to exclude (case-insensitive)

    Returns:
        The matched term (in its stripped, lower-cased form), or an empty
        string when nothing matches or no terms were supplied.
    """
    if not exclude_terms:
        return ""

    haystack = f"{track_name} {album_name}".lower()
    # Lazy so that terms after the first hit are never touched.
    candidates = (raw.strip().lower() for raw in exclude_terms)
    return next((needle for needle in candidates if needle and needle in haystack), "")
def is_compilation_album(album_name: str) -> bool:
    """
    Report whether an album title looks like a compilation / greatest-hits release.

    The title is lower-cased and searched for whole-word markers such as
    "greatest hits", "best of", "anthology", "collection" or "top <number>".

    Args:
        album_name: Album name to check

    Returns:
        True if this is a compilation album, False otherwise (including when
        ``album_name`` is empty).
    """
    if not album_name:
        return False

    title = album_name.lower()
    compilation_markers = (
        r'\bgreatest hits\b',   # Greatest Hits
        r'\bbest of\b',         # Best Of
        r'\banthology\b',       # Anthology
        r'\bcollection\b',      # Collection
        r'\bcompilation\b',     # Compilation
        r'\bthe essential\b',   # The Essential...
        r'\bcomplete\b',        # Complete Collection
        r'\bhits\b',            # Hits (standalone or at end)
        r'\btop\s+\d+\b',       # Top 10, Top 40, etc.
        r'\bvery best\b',       # Very Best Of
        r'\bdefinitive\b',      # Definitive Collection
    )
    return any(re.search(marker, title, re.IGNORECASE) for marker in compilation_markers)
# Common qualifying parentheticals appended to album names by Spotify /
# Deezer / iTunes / Discogs that the user's media server (Plex / Navidrome /
# Jellyfin) typically strips out of the file tags. Without normalization,
# fuzzy-comparing the two sides reports a false "different album" verdict —
# the watchlist scanner then thinks the track is missing and re-downloads
# it on every scan.
_ALBUM_QUALIFIER_PATTERNS = [
    r'\bmusic\s+from(?:\s+the)?(?:\s+motion\s+picture)?\b',
    r'\boriginal\s+(?:motion\s+picture\s+)?(?:soundtrack|score)\b',
    r'\bsoundtrack(?:\s+from(?:\s+the)?(?:\s+motion\s+picture)?)?\b',
    r'\bo\.?s\.?t\.?\b',
    r'\bdeluxe(?:\s+(?:edition|version))?\b',
    r'\bexpanded(?:\s+edition)?\b',
    r'\bremaster(?:ed)?(?:\s+(?:\d{4}|edition))?\b',
    r'\banniversary(?:\s+edition)?\b',
    r'\bspecial\s+edition\b',
    r'\bbonus\s+(?:track\s+)?(?:edition|version)\b',
    r'\bextended(?:\s+(?:edition|version))?\b',
    r'\bexplicit\b',
    r'\bclean\s+version\b',
]
# Single alternation of every qualifier above, matched case-insensitively.
_ALBUM_QUALIFIER_RE = re.compile(
    '|'.join(_ALBUM_QUALIFIER_PATTERNS),
    re.IGNORECASE,
)


def _normalize_album_for_match(name: str) -> str:
    """Return a canonical form of an album name suitable for fuzzy comparison.

    Steps, in order:

    1. Replace every known qualifier phrase (``Music From The Motion
       Picture``, ``Deluxe Edition``, ``Remastered 2011``, ...) with a space,
       wherever it appears.
    2. Drop any remaining parenthesized / bracketed group.
    3. Drop one trailing dash-clause (``Album - Live``). The dash must have
       whitespace on both sides: an unspaced hyphen is part of the title
       ("X-Ray", "Blink-182"), and cutting there would reduce distinct
       albums to the same stub.
    4. Lower-case, turn every run of non ``a-z0-9`` characters into a space,
       collapse whitespace and trim.

    The output is meant for comparison only — never display.

    Args:
        name: Album title as reported by a metadata source or the library.

    Returns:
        The normalized title, or ``""`` when ``name`` is empty.
    """
    if not name:
        return ""

    # Qualifier phrases go first so they vanish whether they sit in
    # brackets, after a dash, or bare.
    cleaned = _ALBUM_QUALIFIER_RE.sub(' ', name)
    # Any other bracketed group is almost always edition / commentary noise.
    cleaned = re.sub(r'\s*[\(\[][^\)\]]*[\)\]]\s*', ' ', cleaned)
    # Trailing " - clause" only; hyphens inside words are left for step 4,
    # which turns them into plain spaces.
    cleaned = re.sub(r'\s+-\s+[^-]*$', '', cleaned)
    cleaned = re.sub(r'[^a-z0-9 ]+', ' ', cleaned.lower())
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned
# Group 1: the number after a labelled marker (vol / volume / pt / part /
# disc / book / chapter / episode). Group 2: a bare number at the very end.
_VOLUME_MARKER_RE = re.compile(
    r'\b(?:vol(?:ume)?|pt|part|disc|book|chapter|episode)\.?\s*(\d+)\b|\b(\d+)\s*$',
    flags=re.IGNORECASE,
)


def _extract_volume_marker(normalized_name: str):
    """Return the last volume / part / disc / trailing number in an album name.

    Callers use it to tell ``"Greatest Hits Volume 1"`` from ``"Greatest
    Hits Volume 2"``, which a fuzzy ratio alone would accept because of the
    long shared prefix.

    Args:
        normalized_name: Album name, expected to be already normalized.

    Returns:
        The digits of the final marker as a string, or None when the name is
        empty or carries no marker.
    """
    if not normalized_name:
        return None

    # Walk every hit and keep only the final one.
    final_hit = None
    for final_hit in _VOLUME_MARKER_RE.finditer(normalized_name):
        pass
    if final_hit is None:
        return None

    labelled_number, bare_number = final_hit.groups()
    return labelled_number or bare_number
def _albums_likely_match(spotify_album: str, lib_album: str, threshold: float = 0.6) -> bool:
    """Decide whether two album names plausibly refer to the same release.

    Absorbs naming drift between metadata sources and the media-server tag
    scan, e.g. ``"Napoleon Dynamite (Music From The Motion Picture)"``
    versus ``"Napoleon Dynamite OST"``, so an album already in the library
    is not mistaken for a missing one.

    Both names are normalized first. The comparison then succeeds when the
    normalized names are equal, when one contains the other, or when their
    SequenceMatcher ratio reaches ``threshold``. It fails outright when both
    names carry a volume marker and the markers differ.

    Args:
        spotify_album: Album name from the metadata source.
        lib_album: Album name from the library.
        threshold: Minimum similarity ratio accepted as a match.

    Returns:
        True for a plausible match; False otherwise, including when either
        name is empty before or after normalization.
    """
    if not (spotify_album and lib_album):
        return False

    left = _normalize_album_for_match(spotify_album)
    right = _normalize_album_for_match(lib_album)
    if not (left and right):
        return False

    # "Greatest Hits Volume 1" vs "Volume 2" share almost every character,
    # so disagreeing markers veto the match before any fuzzy test runs.
    left_volume = _extract_volume_marker(left)
    right_volume = _extract_volume_marker(right)
    if left_volume and right_volume and left_volume != right_volume:
        return False

    # Equality, or one normalized name being contained in the other.
    if left == right or left in right or right in left:
        return True

    similarity = SequenceMatcher(None, left, right).ratio()
    return similarity >= threshold
@dataclass
class ScanResult:
    """Result of scanning a single artist.

    A plain record: the class declares fields only and adds no behaviour.
    """
    # Identity of the scanned artist.
    artist_name: str
    spotify_artist_id: str
    # Counters for the scan; how they are computed is decided by the code
    # that builds the result, not by this class.
    albums_checked: int
    new_tracks_found: int
    tracks_added_to_wishlist: int
    # Overall outcome flag.
    success: bool
    # Optional failure detail; defaults to None.
    error_message: Optional[str] = None
@dataclass
class WatchlistDiscographyResult:
    """Resolved watchlist artist discography for a specific metadata source.

    A plain record: the class declares fields only and adds no behaviour.
    """
    # Name of the metadata source the data came from.
    source: str
    # The artist's ID on that source.
    artist_id: str
    # Albums returned by the source; element type is provider-specific.
    albums: List[Any]
    # Optional artist image URL; defaults to None.
    image_url: Optional[str] = None
class WatchlistScanner:
"""Service for scanning watched artists for new releases"""
def __init__(self, spotify_client: SpotifyClient = None, metadata_service=None, database_path: str = "database/music_library.db"):
    """Prepare lazy collaborator slots and choose where the Spotify client comes from.

    Either the legacy ``spotify_client`` argument or the newer
    ``metadata_service`` argument must be truthy. When ``metadata_service``
    is given it takes precedence and its ``spotify`` attribute becomes
    ``self.spotify_client``.

    Args:
        spotify_client: Spotify client used directly (legacy style).
        metadata_service: Metadata service exposing a ``spotify`` attribute.
        database_path: Path handed to ``get_database`` on first database use.

    Raises:
        ValueError: if neither ``spotify_client`` nor ``metadata_service``
            is truthy.
    """
    self.database_path = database_path
    # Filled in on first access by the matching properties.
    self._database = None
    self._wishlist_service = None
    self._matching_engine = None
    self._rescan_cutoff_log_marker = None

    if not (metadata_service or spotify_client):
        raise ValueError("Must provide either spotify_client or metadata_service")

    if metadata_service:
        self._metadata_service = metadata_service
        # Exposed so code written against the old attribute keeps working.
        self.spotify_client = metadata_service.spotify
    else:
        self.spotify_client = spotify_client
        # Created lazily by the ``metadata_service`` property if needed.
        self._metadata_service = None

    # Run-local Spotify suppression. One rate-limit hit disables Spotify
    # for the rest of the current scan while fallback providers keep running.
    self._spotify_disabled_for_run = False
    self._spotify_disabled_reason = None
@property
def database(self):
    """Return the database handle, creating it from ``database_path`` on first access."""
    handle = self._database
    if handle is None:
        handle = get_database(self.database_path)
        self._database = handle
    return handle
@property
def wishlist_service(self):
    """Return the wishlist service, obtaining it from ``get_wishlist_service`` on first access."""
    service = self._wishlist_service
    if service is None:
        service = get_wishlist_service()
        self._wishlist_service = service
    return service
@property
def matching_engine(self):
    """Return the matching engine, constructing a ``MusicMatchingEngine`` on first access."""
    engine = self._matching_engine
    if engine is None:
        engine = MusicMatchingEngine()
        self._matching_engine = engine
    return engine
@property
def metadata_service(self):
    """Return the metadata service, constructing a ``MetadataService`` on first access."""
    service = self._metadata_service
    if service is None:
        # Imported at the point of use rather than at module import time.
        from core.metadata.service import MetadataService
        service = MetadataService()
        self._metadata_service = service
    return service
def _disable_spotify_for_run(self, reason: str) -> None:
    """Disable Spotify for rest of current run, once.

    Emits a warning while the disabled flag is still unset, then records the
    flag and the reason on the instance.

    Args:
        reason: Explanation included in the warning and stored on
            ``self._spotify_disabled_reason``.
    """
    # NOTE(review): nesting was reconstructed from a listing without
    # indentation. This assumes only the warning is guarded by the check and
    # the two assignments always run — confirm against the original file.
    if not self._spotify_disabled_for_run:
        logger.warning(f"Spotify disabled for rest of run: {reason}")
    self._spotify_disabled_for_run = True
    self._spotify_disabled_reason = reason
def _spotify_available_for_run(self) -> bool:
    """Report whether Spotify may be used during this run.

    False when Spotify has been disabled for the run or no client is set;
    otherwise whatever the client's ``is_spotify_authenticated()`` returns.
    """
    if self._spotify_disabled_for_run or not self.spotify_client:
        return False
    return self.spotify_client.is_spotify_authenticated()
def _spotify_is_primary_source(self) -> bool:
    """Report whether Spotify is usable AND is the configured primary metadata source.

    Use this, rather than ``_spotify_available_for_run``, when deciding
    whether to fetch album or artist data from Spotify. Being authenticated
    is not enough: a user may connect Spotify purely for playlist sync while
    another provider serves metadata, and fetching data from Spotify in that
    setup burns API quota for nothing.

    ``_spotify_available_for_run`` remains the right check for
    Spotify-specific features that must run regardless of primary source.

    Returns:
        True only when Spotify is available for this run and
        ``get_primary_source()`` returns ``'spotify'``. Any exception raised
        while reading the primary source counts as False.
    """
    if not self._spotify_available_for_run():
        return False
    try:
        is_primary = get_primary_source() == 'spotify'
    except Exception:
        is_primary = False
    return is_primary
def _watchlist_source_priority(self) -> List[str]:
    """Return a fresh list of scan sources, ordered by the priority derived from the primary source."""
    primary = get_primary_source()
    ordered = get_source_priority(primary)
    return list(ordered)
def _discovery_source_priority(self) -> List[str]:
    """Return discovery sources in configured priority order.

    Discovery pool writes only support Spotify, iTunes, and Deezer IDs, so
    the broader watchlist priority list is narrowed to those three while
    keeping its order.
    """
    supported = {'spotify', 'itunes', 'deezer'}
    kept: List[str] = []
    for candidate in self._watchlist_source_priority():
        if candidate in supported:
            kept.append(candidate)
    return kept
@staticmethod
def _artist_id_attribute_for_source(source: str) -> Optional[str]:
    """Map a source name to the watchlist-artist attribute holding its ID, or None if unknown."""
    attribute_by_source = dict(
        spotify='spotify_artist_id',
        itunes='itunes_artist_id',
        deezer='deezer_artist_id',
        discogs='discogs_artist_id',
    )
    return attribute_by_source.get(source)
@staticmethod
def _similar_artist_id_attribute_for_source(source: str) -> Optional[str]:
    """Map a source name to the similar-artist attribute holding its ID, or None if unknown."""
    attribute_by_source = dict(
        spotify='similar_artist_spotify_id',
        itunes='similar_artist_itunes_id',
        deezer='similar_artist_deezer_id',
    )
    return attribute_by_source.get(source)
@staticmethod
def _extract_entity_id(value: Any) -> Optional[str]:
    """Pull an identifier out of a string, a dict, or an arbitrary object.

    Strings (and None) are returned as they are. For dicts and objects the
    keys / attributes ``id``, ``artist_id`` and ``release_id`` are tried in
    that order and the first truthy value wins; when none is truthy, the
    value found under ``release_id`` is returned.
    """
    if value is None or isinstance(value, str):
        return value

    if isinstance(value, dict):
        read = value.get
    else:
        def read(key):
            return getattr(value, key, None)

    found = None
    for key in ('id', 'artist_id', 'release_id'):
        found = read(key)
        if found:
            break
    return found
def _cache_watchlist_artist_source_id(self, watchlist_artist: WatchlistArtist, source: str, source_id: str) -> None:
    """Persist a resolved artist ID for the sources that have a storage column.

    For spotify / itunes / deezer / discogs this calls the matching
    ``update_watchlist_<source>_id`` database method with the artist's row
    ID, then mirrors the value onto the in-memory ``<source>_artist_id``
    attribute. An empty ``source_id`` or any other source is a no-op.
    """
    if not source_id:
        return
    if source not in ('spotify', 'itunes', 'deezer', 'discogs'):
        return

    persist = getattr(self.database, f'update_watchlist_{source}_id')
    persist(watchlist_artist.id, source_id)
    # Keep the in-memory object in step with what was just stored.
    setattr(watchlist_artist, f'{source}_artist_id', source_id)
def _resolve_watchlist_artist_source_id(self, watchlist_artist: WatchlistArtist, source: str, client: Any) -> Optional[str]:
    """Return the artist's ID on ``source``, searching by name when none is stored.

    A stored ID on the watchlist artist is returned directly. Otherwise the
    source is searched for the artist name (one result), the ID is extracted
    from the first hit and, when the source has a storage attribute, cached
    back onto the watchlist artist.

    Returns:
        The resolved ID, or None when the search finds nothing.
    """
    attr = self._artist_id_attribute_for_source(source)
    if attr:
        already_known = getattr(watchlist_artist, attr, None)
        if already_known:
            return already_known

    hits = self._search_artists_for_source(source, watchlist_artist.artist_name, limit=1, client=client)
    if not hits:
        return None

    resolved = self._extract_entity_id(hits[0])
    if resolved and attr:
        self._cache_watchlist_artist_source_id(watchlist_artist, source, resolved)
    return resolved
def _search_artists_for_source(self, source: str, artist_name: str, limit: int = 1, client: Any = None) -> List[Any]:
    """Search one source for artists by name, keeping Spotify strict.

    When no ``client`` is passed, one is looked up for ``source``. Spotify
    searches are issued with ``allow_fallback=False``. A missing client, a
    client without ``search_artists``, an empty result, or any exception
    raised by the search all produce an empty list.
    """
    active_client = client if client is not None else get_client_for_source(source)
    if not active_client or not hasattr(active_client, 'search_artists'):
        return []

    options = {'limit': limit}
    if source == 'spotify':
        options['allow_fallback'] = False
    try:
        return active_client.search_artists(artist_name, **options) or []
    except Exception as e:
        logger.debug("Could not search %s for %s: %s", source, artist_name, e)
        return []
@staticmethod
def _get_artist_image_from_data(artist_data: Any) -> Optional[str]:
    """Extract an image URL from artist payloads across providers.

    Works the same way for dict payloads and attribute-style objects:

    1. If ``images`` is non-empty and its first entry is a dict with a
       truthy ``url``, that URL is returned.
    2. Otherwise the first truthy value among ``image_url``, ``thumb_url``,
       ``cover_image``, ``picture_xl``, ``picture_big`` and
       ``picture_medium`` is returned.

    A first image entry without a usable ``url`` no longer ends the lookup
    with None; the flat fields are still consulted. Object payloads also
    get the ``picture_*`` fields that were previously honoured for dicts
    only.

    Args:
        artist_data: Provider result as a dict or an object, or a falsy value.

    Returns:
        The image URL, or None when nothing usable is present.
    """
    if not artist_data:
        return None

    # One accessor so dicts and objects share the same lookup path.
    if isinstance(artist_data, dict):
        read = artist_data.get
    else:
        def read(key):
            return getattr(artist_data, key, None)

    images = read('images')
    if images:
        first_image = images[0]
        if isinstance(first_image, dict) and first_image.get('url'):
            return first_image['url']

    flat_fields = (
        'image_url',
        'thumb_url',
        'cover_image',
        'picture_xl',
        'picture_big',
        'picture_medium',
    )
    for field_name in flat_fields:
        candidate = read(field_name)
        if candidate:
            return candidate
    return None
def _get_artist_metadata_from_data(self, artist_data: Any) -> Dict[str, Any]:
    """Build a normalized ``name / image_url / genres / popularity`` dict from a provider result.

    ``name`` falls back through ``name``, ``artist_name`` and ``title``;
    ``popularity`` through ``popularity`` and ``rank``. ``genres`` always
    comes back as a list and ``popularity`` as an int (0 when it cannot be
    converted). A falsy payload yields the all-empty record.
    """
    if not artist_data:
        return {'name': None, 'image_url': None, 'genres': [], 'popularity': 0}

    # One accessor for both payload shapes.
    if isinstance(artist_data, dict):
        field = artist_data.get
    else:
        def field(key):
            return getattr(artist_data, key, None)

    name = field('name') or field('artist_name') or field('title')
    genres = field('genres') or []
    popularity = field('popularity') or field('rank') or 0

    # A lone genre string becomes a one-element list; other iterables are copied.
    if isinstance(genres, str):
        genres = [genres]
    elif not isinstance(genres, list):
        genres = list(genres) if genres else []

    try:
        popularity = int(popularity or 0)
    except Exception:
        popularity = 0

    image_url = self._get_artist_image_from_data(artist_data)
    return {
        'name': name,
        'image_url': image_url,
        'genres': genres,
        'popularity': popularity,
    }
def _get_artist_image_for_source(self, watchlist_artist: WatchlistArtist, source: str, client: Any, artist_id: str) -> Optional[str]:
    """Fetch the artist from ``client`` and return its image URL.

    Spotify lookups pass ``allow_fallback=False``. Returns None when the
    client or ID is missing, the client has no ``get_artist``, or the
    lookup raises.
    """
    if not (client and artist_id and hasattr(client, 'get_artist')):
        return None

    extra = {'allow_fallback': False} if source == 'spotify' else {}
    try:
        artist_data = client.get_artist(artist_id, **extra)
    except Exception as e:
        logger.debug("Could not fetch artist image for %s on %s: %s", watchlist_artist.artist_name, source, e)
        return None
    return self._get_artist_image_from_data(artist_data)
def _get_album_data_for_source(self, source: str, album_id: str, album_name: str = '') -> Optional[Dict[str, Any]]:
    """Fetch an album from ``source`` and make sure its tracks are attached when possible.

    Spotify lookups pass ``allow_fallback=False``. When the album payload is
    a dict that already has truthy ``tracks`` it is returned untouched.
    Otherwise the tracks are fetched separately; if that yields anything,
    the payload is turned into a dict (falling back to a name-only dict when
    it cannot be converted) and the tracks are stored under ``'tracks'`` —
    as they are when they come back as a dict, or wrapped as
    ``{'items': ...}`` otherwise.

    Returns:
        The album payload, or None when no client / ID / ``get_album`` is
        available, the lookup raises, or it returns nothing.
    """
    client = get_client_for_source(source)
    if not (client and album_id and hasattr(client, 'get_album')):
        return None

    extra = {'allow_fallback': False} if source == 'spotify' else {}
    try:
        album_data = client.get_album(album_id, **extra)
    except Exception as e:
        logger.debug("Could not fetch album %s on %s: %s", album_id, source, e)
        return None
    if not album_data:
        return None

    payload_is_dict = isinstance(album_data, dict)
    if payload_is_dict and album_data.get('tracks'):
        return album_data

    # Some providers return album metadata without embedded tracks.
    track_items = get_album_tracks_for_source(source, album_id)
    if not track_items:
        return album_data

    if not payload_is_dict:
        try:
            album_data = dict(album_data)
        except Exception:
            album_data = {'name': album_name or album_id}

    album_data['tracks'] = track_items if isinstance(track_items, dict) else {'items': track_items}
    return album_data
@staticmethod
def _extract_track_items(album_data: Any) -> List[Dict[str, Any]]:
    """Return an album's tracks as a list, whatever shape the payload uses.

    ``tracks`` is read from the dict key or the object attribute. A list is
    returned as is; a dict contributes its ``items`` (or else ``data``)
    entry when that entry is a list. Anything else yields an empty list.
    """
    if not album_data:
        return []

    if isinstance(album_data, dict):
        tracks = album_data.get('tracks')
    else:
        tracks = getattr(album_data, 'tracks', None)
    if not tracks:
        return []

    if isinstance(tracks, list):
        return tracks
    if isinstance(tracks, dict):
        items = tracks.get('items') or tracks.get('data') or []
        return list(items) if isinstance(items, list) else []
    return []
def _resolve_watchlist_discography_for_source(
    self,
    watchlist_artist: WatchlistArtist,
    source: str,
    last_scan_timestamp: Optional[datetime] = None,
) -> Optional[WatchlistDiscographyResult]:
    """Resolve a watchlist artist on one source and fetch its discography there.

    Returns:
        A ``WatchlistDiscographyResult`` on success, or None when the source
        has no client, the artist cannot be resolved, or the discography
        call returns None.
    """
    provider = get_client_for_source(source)
    if not provider:
        return None

    resolved_id = self._resolve_watchlist_artist_source_id(watchlist_artist, source, provider)
    if not resolved_id:
        return None

    releases = self._get_artist_discography_with_client(
        provider,
        resolved_id,
        last_scan_timestamp,
        lookback_days=watchlist_artist.lookback_days,
    )
    # None signals an API failure, so the caller can try the next source.
    # An empty list is a success: no new releases in the lookback window.
    if releases is None:
        return None

    artwork = self._get_artist_image_for_source(watchlist_artist, source, provider, resolved_id)
    return WatchlistDiscographyResult(
        source=source,
        artist_id=resolved_id,
        albums=releases,
        image_url=artwork,
    )
def get_artist_image_url(self, watchlist_artist: WatchlistArtist) -> Optional[str]:
    """
    Return the first artist image URL found while walking the configured source priority.

    Sources without a client, without a resolvable artist ID, or without an
    image are skipped.

    Returns:
        Image URL string or None if not available
    """
    for source in self._watchlist_source_priority():
        client = get_client_for_source(source)
        if client:
            artist_id = self._resolve_watchlist_artist_source_id(watchlist_artist, source, client)
            if artist_id:
                found_url = self._get_artist_image_for_source(watchlist_artist, source, client, artist_id)
                if found_url:
                    return found_url
    return None
def _get_artist_albums_for_source(
    self,
    source: str,
    artist_id: str,
    album_type: str = 'album,single,ep',
    limit: int = 50,
    # Only applies to Spotify currently
    skip_cache: bool = True,
    # Only applies to Spotify currently
    max_pages: int = 0,
) -> List[Any]:
    """Fetch an artist's albums from one source, keeping Spotify strict.

    ``skip_cache`` and ``max_pages`` are forwarded to Spotify only, together
    with ``allow_fallback=False``. A missing client or ID, a client without
    ``get_artist_albums``, an empty result, or any exception all produce an
    empty list.
    """
    client = get_client_for_source(source)
    if not (client and artist_id and hasattr(client, 'get_artist_albums')):
        return []

    request = {'album_type': album_type, 'limit': limit}
    if source == 'spotify':
        request.update(skip_cache=skip_cache, max_pages=max_pages, allow_fallback=False)
    try:
        return client.get_artist_albums(artist_id, **request) or []
    except Exception as e:
        logger.debug("Could not fetch artist albums for %s on %s: %s", artist_id, source, e)
        return []
def _get_artist_data_for_source(self, source: str, artist_id: str) -> Optional[Dict[str, Any]]:
    """Fetch artist metadata from one source, keeping Spotify strict.

    Spotify lookups pass ``allow_fallback=False``. Returns None when the
    client or ID is missing, the client has no ``get_artist``, or the
    lookup raises.
    """
    client = get_client_for_source(source)
    if not (client and artist_id and hasattr(client, 'get_artist')):
        return None

    extra = {'allow_fallback': False} if source == 'spotify' else {}
    try:
        return client.get_artist(artist_id, **extra)
    except Exception as e:
        logger.debug("Could not fetch artist data for %s on %s: %s", artist_id, source, e)
        return None
def _search_albums_for_source(self, source: str, query: str, limit: int = 1):
    """Search one source for albums, keeping Spotify strict.

    Spotify searches pass ``allow_fallback=False``. A missing client, a
    client without ``search_albums``, an empty result, or any exception all
    produce an empty list.
    """
    client = get_client_for_source(source)
    if not (client and hasattr(client, 'search_albums')):
        return []

    extra = {'allow_fallback': False} if source == 'spotify' else {}
    try:
        return client.search_albums(query, limit=limit, **extra) or []
    except Exception as e:
        logger.debug("Could not search albums for %s on %s: %s", query, source, e)
        return []
def _resolve_artist_id_for_source(
    self,
    source: str,
    artist_name: str,
    stored_id: Optional[str] = None,
    cache_callback: Optional[Callable[[str], None]] = None,
) -> Optional[str]:
    """Return an artist's ID on ``source``, searching by name when none is stored.

    A truthy ``stored_id`` is returned immediately. Otherwise the source is
    searched for one result (Spotify with ``allow_fallback=False``) and the
    ID of the first hit is returned. When an ID is found and
    ``cache_callback`` is given, the callback is invoked with it; a failing
    callback is logged and does not affect the return value.

    Returns:
        The resolved ID, or None when the source has no usable client, the
        search raises, or it finds nothing.
    """
    if stored_id:
        return stored_id

    client = get_client_for_source(source)
    if not (client and hasattr(client, 'search_artists')):
        return None

    options = {'limit': 1}
    if source == 'spotify':
        options['allow_fallback'] = False
    try:
        matches = client.search_artists(artist_name, **options)
    except Exception as e:
        logger.debug("Could not resolve %s artist ID for %s: %s", source, artist_name, e)
        return None
    if not matches:
        return None

    resolved = self._extract_entity_id(matches[0])
    if not (resolved and cache_callback):
        return resolved

    try:
        cache_callback(resolved)
    except Exception as e:
        logger.debug("Could not cache %s artist ID for %s: %s", source, artist_name, e)
    return resolved
def backfill_watchlist_artist_images(self, profile_id: int) -> int:
    """Backfill missing watchlist artist images using cached metadata and existing album art.

    Selects the profile's watchlist rows whose ``image_url`` is NULL, empty,
    the literal text 'None', or does not start with 'http'. For each row the
    following are tried in order until one produces an image URL:

    1. an artist entry with the same name in ``metadata_cache_entities``;
    2. a Deezer image URL built from the row's own ``deezer_artist_id``;
    3. a Deezer image URL built from a cached Deezer artist with that name;
    4. cached album art for an album by that artist.

    No provider API is called; only the local database is queried.

    Args:
        profile_id: Profile whose watchlist artists are examined.

    Returns:
        The number of rows for which an image was found. 0 when no row
        needed an image, and also 0 when any exception occurs (the
        exception is logged at debug level and swallowed).
    """
    try:
        conn = self.database._get_connection()
        cursor = conn.cursor()
        # Rows with a missing or obviously invalid image URL.
        cursor.execute("""
SELECT id, artist_name, spotify_artist_id, itunes_artist_id,
deezer_artist_id, discogs_artist_id
FROM watchlist_artists
WHERE profile_id = ? AND (image_url IS NULL OR image_url = '' OR image_url = 'None'
OR image_url NOT LIKE 'http%')
""", (profile_id,))
        imageless = cursor.fetchall()
        if not imageless:
            return 0
        logger.info("Backfilling images for %s watchlist artists (profile %s)...", len(imageless), profile_id)
        filled = 0
        # Rows are read by column name below — assumes the connection uses a
        # name-addressable row type (e.g. sqlite3.Row); TODO confirm.
        for row in imageless:
            name = row['artist_name']
            img = None
            # 1. Check metadata cache for artist image
            cursor.execute("""
SELECT image_url FROM metadata_cache_entities
WHERE entity_type = 'artist' AND name = ? COLLATE NOCASE
AND image_url IS NOT NULL AND image_url LIKE 'http%'
LIMIT 1
""", (name,))
            cr = cursor.fetchone()
            if cr:
                img = cr['image_url']
            # 2. Deezer direct URL (no API call needed)
            if not img and row['deezer_artist_id']:
                img = f"https://api.deezer.com/artist/{row['deezer_artist_id']}/image?size=big"
            # 3. Deezer ID from cache (artist may have a Deezer match we haven't stored on watchlist)
            if not img:
                cursor.execute("""
SELECT entity_id FROM metadata_cache_entities
WHERE entity_type = 'artist' AND source = 'deezer'
AND name = ? COLLATE NOCASE LIMIT 1
""", (name,))
                dz = cursor.fetchone()
                if dz and dz['entity_id']:
                    img = f"https://api.deezer.com/artist/{dz['entity_id']}/image?size=big"
            # 4. Album art fallback (iTunes artists have no artist images)
            if not img:
                cursor.execute("""
SELECT image_url FROM metadata_cache_entities
WHERE entity_type = 'album' AND image_url LIKE 'http%'
AND artist_name = ? COLLATE NOCASE LIMIT 1
""", (name,))
                alb = cursor.fetchone()
                if alb:
                    img = alb['image_url']
            if img:
                # First available external ID, in spotify / itunes / deezer / discogs order.
                aid = (row['spotify_artist_id'] or row['itunes_artist_id']
                       or row['deezer_artist_id'] or row['discogs_artist_id'])
                # NOTE(review): nesting below was reconstructed from a listing
                # without indentation. This assumes the commit belongs to the
                # direct-UPDATE branch and the counter to the `if img` block —
                # confirm against the original file.
                if aid:
                    self.database.update_watchlist_artist_image(aid, img)
                else:
                    # No external IDs — update by internal row ID directly
                    cursor.execute("""
UPDATE watchlist_artists SET image_url = ?, updated_at = CURRENT_TIMESTAMP
WHERE id = ?
""", (img, row['id']))
                    conn.commit()
                filled += 1
        if filled:
            logger.info("Backfilled %s/%s watchlist artist images (profile %s)", filled, len(imageless), profile_id)
        return filled
    except Exception as e:
        # Best-effort: a failed backfill is logged and reported as "nothing filled".
        logger.debug("Error backfilling watchlist artist images for profile %s: %s", profile_id, e, exc_info=True)
        return 0
def get_artist_discography_for_watchlist(self, watchlist_artist: WatchlistArtist, last_scan_timestamp: Optional[datetime] = None) -> Optional[WatchlistDiscographyResult]:
    """
    Fetch a watchlist artist's discography from the first source that can deliver it.

    Sources are tried in the configured priority order. If the artist
    carries a ``preferred_metadata_source`` of spotify, deezer, itunes or
    discogs, the priority order derived from that source is used instead.

    Args:
        watchlist_artist: WatchlistArtist object (has provider IDs when available)
        last_scan_timestamp: Only return releases after this date (for incremental scans)

    Returns:
        WatchlistDiscographyResult from the first source that yields a
        truthy result, or None (after a warning) when none does.
    """
    override = getattr(watchlist_artist, 'preferred_metadata_source', None)
    if override in ('spotify', 'deezer', 'itunes', 'discogs'):
        # Per-artist override: start from the preferred source, keep fallbacks.
        ordered_sources = list(get_source_priority(override))
    else:
        ordered_sources = self._watchlist_source_priority()

    for candidate in ordered_sources:
        found = self._resolve_watchlist_discography_for_source(watchlist_artist, candidate, last_scan_timestamp)
        if found:
            return found

    logger.warning(f"No valid client/ID for {watchlist_artist.artist_name}")
    return None
def _apply_global_watchlist_overrides(self, watchlist_artists: List[WatchlistArtist]):
    """Overwrite every artist's release-type flags with the global watchlist settings.

    Does nothing when the settings module cannot be imported or when
    ``watchlist.global_override_enabled`` is off. Otherwise the eight
    ``include_*`` flags are read from configuration once, logged, and
    assigned to each artist in the batch (the objects are mutated in place).
    """
    try:
        from config.settings import config_manager
    except Exception:
        return

    if not config_manager.get('watchlist.global_override_enabled', False):
        return

    # (artist attribute, config key, default when the key is unset)
    flag_specs = (
        ('include_albums', 'watchlist.global_include_albums', True),
        ('include_eps', 'watchlist.global_include_eps', True),
        ('include_singles', 'watchlist.global_include_singles', True),
        ('include_live', 'watchlist.global_include_live', False),
        ('include_remixes', 'watchlist.global_include_remixes', False),
        ('include_acoustic', 'watchlist.global_include_acoustic', False),
        ('include_compilations', 'watchlist.global_include_compilations', False),
        ('include_instrumentals', 'watchlist.global_include_instrumentals', False),
    )
    overrides = {
        attribute: config_manager.get(config_key, fallback)
        for attribute, config_key, fallback in flag_specs
    }

    logger.info(
        "Applying global watchlist override to %s artists "
        "(albums=%s, eps=%s, singles=%s, live=%s, remixes=%s, acoustic=%s, compilations=%s, instrumentals=%s)",
        len(watchlist_artists),
        *overrides.values(),
    )

    for artist in watchlist_artists:
        for attribute, value in overrides.items():
            setattr(artist, attribute, value)
def scan_watchlist_profile(
    self,
    profile_id: int,
    watchlist_artists: Optional[List[WatchlistArtist]] = None,
    *,
    scan_state: Optional[Dict[str, Any]] = None,
    progress_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
    cancel_check: Optional[Callable[[], bool]] = None,
    artist_index_offset: int = 0,
    total_artists_override: Optional[int] = None,
    apply_global_overrides: bool = True,
) -> List[ScanResult]:
    """Scan one watchlist profile by delegating to ``scan_watchlist_artists``.

    When ``watchlist_artists`` is None the profile's artists are loaded from
    the database; every other argument is forwarded unchanged.
    """
    artists = watchlist_artists
    if artists is None:
        artists = self.database.get_watchlist_artists(profile_id=profile_id)

    # Global overrides are applied inside scan_watchlist_artists; forwarding the
    # flag (rather than applying here as well) prevents double-application.
    forwarded = {
        'profile_id': profile_id,
        'scan_state': scan_state,
        'progress_callback': progress_callback,
        'cancel_check': cancel_check,
        'artist_index_offset': artist_index_offset,
        'total_artists_override': total_artists_override,
        'apply_global_overrides': apply_global_overrides,
    }
    return self.scan_watchlist_artists(artists, **forwarded)
def scan_watchlist_artists(
self,
watchlist_artists: List[WatchlistArtist],
*,
profile_id: int = 1,
scan_state: Optional[Dict[str, Any]] = None,
progress_callback: Optional[Callable[[str, Dict[str, Any]], None]] = None,
cancel_check: Optional[Callable[[], bool]] = None,
artist_index_offset: int = 0,
total_artists_override: Optional[int] = None,
apply_global_overrides: bool = True,
) -> List[ScanResult]:
"""Scan a list of watchlist artists using the shared web watchlist scan flow.
For each artist the discography is resolved, each album's tracks are checked
against the library, and missing tracks are offered to the wishlist. Progress
is mirrored into `scan_state` (when given) and reported through
`progress_callback` events ('scan_started', 'artist_started', 'album_started',
'artist_completed', 'artist_error', 'cancelled', 'scan_completed').
Args:
watchlist_artists: artists to scan, processed in list order.
profile_id: profile the scan is reported under; also the fallback profile
for the similar-artists step when an artist has no `profile_id` attribute.
scan_state: optional dict updated in place with status, counters, results
and a summary.
progress_callback: optional callable invoked as `callback(event_type, payload)`;
exceptions it raises are logged at debug level and ignored.
cancel_check: optional callable polled before each artist; a truthy return
stops the scan.
artist_index_offset: added to the 1-based position of each artist when
reporting `artist_index` / `current_artist_index`.
total_artists_override: when not None, reported as the total artist count
instead of `len(watchlist_artists)` in scan_state and per-artist events.
apply_global_overrides: when True (default), per-artist include_*
flags are overwritten with the global values if
`watchlist.global_override_enabled` is set. This matches the
behaviour of `scan_watchlist_profile` so every entry point respects
the user's Global Override toggle.
Returns:
One ScanResult per artist processed before completion or cancellation.
"""
# Optionally replace the per-artist include_* flags with the global values.
if apply_global_overrides:
self._apply_global_watchlist_overrides(watchlist_artists)
scan_results: List[ScanResult] = []
# Nothing to scan: publish a zeroed, completed state and return the empty result list.
if not watchlist_artists:
if scan_state is not None:
scan_state.update({
'status': 'completed',
'total_artists': 0,
'current_artist_index': 0,
'current_artist_name': '',
'current_artist_image_url': '',
'current_phase': 'completed',
'albums_to_check': 0,
'albums_checked': 0,
'current_album': '',
'current_album_image_url': '',
'current_track_name': '',
'tracks_found_this_scan': 0,
'tracks_added_this_scan': 0,
'recent_wishlist_additions': [],
'results': [],
'summary': {
'total_artists': 0,
'successful_scans': 0,
'new_tracks_found': 0,
'tracks_added_to_wishlist': 0,
},
'completed_at': datetime.now(),
'error': None,
})
return scan_results
# Initialise the shared state for this run. The scan_state.get(...) fallbacks keep
# values that are already present (start time, counters, results, summary).
if scan_state is not None:
scan_state.update({
'status': 'scanning',
'started_at': scan_state.get('started_at') or datetime.now(),
'total_artists': total_artists_override if total_artists_override is not None else len(watchlist_artists),
'current_artist_index': scan_state.get('current_artist_index', artist_index_offset),
'current_artist_name': scan_state.get('current_artist_name', ''),
'current_artist_image_url': scan_state.get('current_artist_image_url', ''),
'current_phase': 'starting',
'albums_to_check': 0,
'albums_checked': 0,
'current_album': '',
'current_album_image_url': '',
'current_track_name': '',
'tracks_found_this_scan': scan_state.get('tracks_found_this_scan', 0),
'tracks_added_this_scan': scan_state.get('tracks_added_this_scan', 0),
'recent_wishlist_additions': scan_state.get('recent_wishlist_additions', []),
'results': scan_state.get('results', []),
'summary': scan_state.get('summary', {}),
'error': None,
})
# Best-effort progress notification: a failing callback is logged and never aborts the scan.
def _emit(event_type: str, **payload):
if progress_callback:
try:
progress_callback(event_type, payload)
except Exception:
logger.debug("Watchlist scan progress callback failed for %s", event_type, exc_info=True)
_emit('scan_started', profile_id=profile_id, total_artists=len(watchlist_artists))
# Keep this as a plain source list; resolve the client right before each use.
providers_to_backfill = [
source for source in self._watchlist_source_priority()
if source in {'spotify', 'itunes', 'deezer', 'discogs'}
]
# Fill in missing provider IDs first; a failure for one provider is logged as a warning.
for provider in providers_to_backfill:
try:
logger.info("Checking for missing %s IDs in watchlist...", provider)
self._backfill_missing_ids(watchlist_artists, provider)
except Exception as backfill_error:
logger.warning("Error during %s ID backfilling: %s", provider, backfill_error)
# Delay tuning: both delays are doubled for a full-discography lookback, then scaled
# further for watchlists with more than 100 / 200 artists.
lookback_period = self._get_lookback_period_setting()
is_full_discography = (lookback_period == 'all')
artist_count = len(watchlist_artists)
base_artist_delay = DELAY_BETWEEN_ARTISTS
base_album_delay = DELAY_BETWEEN_ALBUMS
if is_full_discography:
base_artist_delay *= 2.0
base_album_delay *= 2.0
if artist_count > 200:
base_artist_delay *= 1.5
base_album_delay *= 1.25
elif artist_count > 100:
base_artist_delay *= 1.25
artist_delay = base_artist_delay
album_delay = base_album_delay
logger.info(
"Scan parameters: %s artists, lookback=%s, delays: %.1fs/artist, %.1fs/album",
artist_count,
lookback_period,
artist_delay,
album_delay,
)
for i, artist in enumerate(watchlist_artists):
# Cancellation is honoured between artists: record a partial summary and stop.
if cancel_check and cancel_check():
logger.info("Watchlist scan cancelled after %s/%s artists", i, len(watchlist_artists))
if scan_state is not None:
successful_scans = [r for r in scan_results if r.success]
scan_state['status'] = 'cancelled'
scan_state['current_phase'] = 'cancelled'
scan_state['summary'] = {
'total_artists': i,
'successful_scans': len(successful_scans),
'new_tracks_found': sum(r.new_tracks_found for r in successful_scans),
'tracks_added_to_wishlist': sum(r.tracks_added_to_wishlist for r in successful_scans),
'cancelled': True,
}
_emit('cancelled', processed=i, total=len(watchlist_artists))
break
# Identifier reported in results: the first provider ID that is set, falling back
# to the internal row id as a string.
source_artist_id = (
artist.spotify_artist_id
or artist.itunes_artist_id
or artist.deezer_artist_id
or artist.discogs_artist_id
or str(artist.id)
)
try:
discography_result = self.get_artist_discography_for_watchlist(artist, artist.last_scan_timestamp)
# No source could provide a discography: record a failed result and move on.
if discography_result is None:
scan_results.append(ScanResult(
artist_name=artist.artist_name,
spotify_artist_id=source_artist_id,
albums_checked=0,
new_tracks_found=0,
tracks_added_to_wishlist=0,
success=False,
error_message="Failed to get artist discography",
))
_emit(
'artist_error',
artist_name=artist.artist_name,
profile_id=profile_id,
error_message="Failed to get artist discography",
)
continue
# A plain list result carries no source information, so album details are fetched
# through self.metadata_service; otherwise the result supplies source, albums,
# artist ID and image.
if isinstance(discography_result, list):
albums = discography_result
artist_image_url = self.get_artist_image_url(artist) or ''
album_fetcher = lambda album_id, album_name='': self.metadata_service.get_album(album_id)
else:
source = discography_result.source
albums = discography_result.albums
source_artist_id = discography_result.artist_id
artist_image_url = discography_result.image_url or self.get_artist_image_url(artist) or ''
# `source` is bound as a default argument so the lambda keeps this artist's source.
album_fetcher = lambda album_id, album_name='', source=source: self._get_album_data_for_source(source, album_id, album_name)
# 1-based position of this artist, shifted by the caller-supplied offset.
absolute_index = artist_index_offset + i + 1
if scan_state is not None:
scan_state.update({
'current_artist_index': absolute_index,
'current_artist_name': artist.artist_name,
'current_artist_image_url': artist_image_url,
'current_phase': 'fetching_discography',
'albums_to_check': 0,
'albums_checked': 0,
'current_album': '',
'current_album_image_url': '',
'current_track_name': '',
})
_emit(
'artist_started',
artist_name=artist.artist_name,
artist_index=absolute_index,
total_artists=total_artists_override if total_artists_override is not None else len(watchlist_artists),
profile_id=profile_id,
artist_image_url=artist_image_url,
)
if scan_state is not None:
scan_state.update({
'current_phase': 'checking_albums',
'albums_to_check': len(albums),
'albums_checked': 0,
})
# Per-artist counters reported in the ScanResult and the 'artist_completed' event.
artist_new_tracks = 0
artist_added_tracks = 0
for album_index, album in enumerate(albums):
try:
album_data = album_fetcher(album.id, getattr(album, 'name', ''))
tracks = self._extract_track_items(album_data)
# Skip albums for which no album data or no tracks came back.
if not album_data or not tracks:
logger.debug("Skipping album %s (id=%s): no track data returned", album.name, album.id)
continue
# Prefer the name from the fetched album data; fall back to the listing's name.
album_name = getattr(album, 'name', '')
if isinstance(album_data, dict):
album_name = album_data.get('name', album_name)
else:
album_name = getattr(album_data, 'name', album_name)
if self._has_placeholder_tracks(tracks):
logger.info("Skipping album with placeholder tracks: %s", album_name)
continue
# Release-type filter, decided from the number of tracks.
if not self._should_include_release(len(tracks), artist):
continue
# Cover art: the 'url' of the first image, when that image is a dict.
album_image_url = ''
album_images = []
if isinstance(album_data, dict):
album_images = album_data.get('images') or []
else:
album_images = getattr(album_data, 'images', None) or []
if album_images:
first_image = album_images[0]
if isinstance(first_image, dict):
album_image_url = first_image.get('url', '')
if scan_state is not None:
scan_state.update({
'albums_checked': album_index + 1,
'current_album': album_name,
'current_album_image_url': album_image_url,
'current_phase': f'checking_album_{album_index + 1}_of_{len(albums)}',
})
_emit(
'album_started',
artist_name=artist.artist_name,
album_name=album_name,
album_index=album_index + 1,
total_albums=len(albums),
album_image_url=album_image_url,
)
for track in tracks:
# Content-type filter for the individual track.
if not self._should_include_track(track, album_data, artist):
continue
# NOTE(review): track.get(...) assumes each track is a dict here — confirm
# against _extract_track_items.
track_name = track.get('name', 'Unknown Track')
if scan_state is not None:
scan_state['current_track_name'] = track_name
if self.is_track_missing_from_library(track, album_name=album_name):
artist_new_tracks += 1
if scan_state is not None:
scan_state['tracks_found_this_scan'] += 1
if self.add_track_to_wishlist(track, album_data, artist):
artist_added_tracks += 1
if scan_state is not None:
scan_state['tracks_added_this_scan'] += 1
track_artists = track.get('artists', [])
track_artist_name = track_artists[0].get('name', 'Unknown Artist') if track_artists else 'Unknown Artist'
if scan_state is not None:
# Newest addition goes first; the list is capped at 10 entries.
scan_state['recent_wishlist_additions'].insert(0, {
'track_name': track_name,
'artist_name': track_artist_name,
'album_image_url': album_image_url,
})
if len(scan_state['recent_wishlist_additions']) > 10:
scan_state['recent_wishlist_additions'].pop()
# Pause between albums, except after the last one.
if album_index < len(albums) - 1:
time.sleep(album_delay)
# An error in one album is logged as a warning and the scan moves to the next album.
except Exception as e:
logger.warning("Error checking album %s: %s", album.name, e)
continue
self.update_artist_scan_timestamp(artist)
scan_results.append(ScanResult(
artist_name=artist.artist_name,
spotify_artist_id=source_artist_id or artist.spotify_artist_id or '',
albums_checked=len(albums),
new_tracks_found=artist_new_tracks,
tracks_added_to_wishlist=artist_added_tracks,
success=True,
))
_emit(
'artist_completed',
artist_name=artist.artist_name,
artist_index=absolute_index,
total_artists=total_artists_override if total_artists_override is not None else len(watchlist_artists),
profile_id=profile_id,
albums_checked=len(albums),
new_tracks_found=artist_new_tracks,
tracks_added_to_wishlist=artist_added_tracks,
)
# Similar-artist refresh is best-effort: failures are logged as warnings. A fetch is
# skipped when has_fresh_similar_artists reports fresh data (30-day threshold).
try:
if scan_state is not None:
scan_state['current_phase'] = 'fetching_similar_artists'
artist_profile_id = getattr(artist, 'profile_id', profile_id)
if self.database.has_fresh_similar_artists(source_artist_id, days_threshold=30, profile_id=artist_profile_id):
logger.info("Similar artists for %s are cached and fresh (profile %s)", artist.artist_name, artist_profile_id)
self._backfill_similar_artists_fallback_ids(source_artist_id, profile_id=artist_profile_id)
else:
logger.info("Fetching similar artists for %s (profile %s)...", artist.artist_name, artist_profile_id)
self.update_similar_artists(artist, profile_id=artist_profile_id, source_artist_id=source_artist_id)
logger.info("Similar artists updated for %s", artist.artist_name)
except Exception as similar_error:
logger.warning("Failed to update similar artists for %s: %s", artist.artist_name, similar_error)
# Pause between artists, except after the last one.
if i < len(watchlist_artists) - 1:
if scan_state is not None:
scan_state['current_phase'] = 'rate_limiting'
time.sleep(artist_delay)
# Any other failure while scanning this artist is recorded as a failed ScanResult
# and reported through an 'artist_error' event.
except Exception as e:
logger.error("Error scanning artist %s: %s", artist.artist_name, e)
scan_results.append(ScanResult(
artist_name=artist.artist_name,
spotify_artist_id=source_artist_id,
albums_checked=0,
new_tracks_found=0,
tracks_added_to_wishlist=0,
success=False,
error_message=str(e),
))
_emit(
'artist_error',
artist_name=artist.artist_name,
artist_index=artist_index_offset + i + 1,
total_artists=total_artists_override if total_artists_override is not None else len(watchlist_artists),
profile_id=profile_id,
error_message=str(e),
)
# Finalise the shared state: append this run's results to any already stored and
# compute totals over the successful scans only.
if scan_state is not None:
successful_scans = [r for r in scan_results if r.success]
total_new_tracks = sum(r.new_tracks_found for r in successful_scans)
total_added_to_wishlist = sum(r.tracks_added_to_wishlist for r in successful_scans)
scan_state['results'] = list(scan_state.get('results', [])) + scan_results
if scan_state.get('status') != 'cancelled':
scan_state['status'] = 'completed'
scan_state['completed_at'] = datetime.now()
scan_state['current_phase'] = 'completed'
scan_state['summary'] = {
'total_artists': len(scan_results),
'successful_scans': len(successful_scans),
'new_tracks_found': total_new_tracks,
'tracks_added_to_wishlist': total_added_to_wishlist,
}
_emit(
'scan_completed',
profile_id=profile_id,
total_artists=len(watchlist_artists),
total_scanned=len(scan_results),
successful_scans=len([r for r in scan_results if r.success]),
new_tracks_found=sum(r.new_tracks_found for r in scan_results if r.success),
tracks_added_to_wishlist=sum(r.tracks_added_to_wishlist for r in scan_results if r.success),
)
return scan_results
def get_artist_discography(
    self,
    spotify_artist_id: str,
    last_scan_timestamp: Optional[datetime] = None,
    lookback_days: Optional[int] = None,
) -> Optional[List]:
    """
    Fetch an artist's discography through the Spotify client.

    Thin wrapper around ``_get_artist_discography_with_client`` that supplies
    ``self.spotify_client`` and converts any exception into a logged error.

    Args:
        spotify_artist_id: Spotify artist ID
        last_scan_timestamp: Only return releases after this date (for incremental scans)
        lookback_days: Optional per-artist override for lookback period

    Returns:
        Whatever the underlying helper returns, or None if it raised
    """
    try:
        discography = self._get_artist_discography_with_client(
            self.spotify_client,
            spotify_artist_id,
            last_scan_timestamp,
            lookback_days=lookback_days,
        )
    except Exception as e:
        logger.error(f"Error getting discography for artist {spotify_artist_id}: {e}")
        return None
    else:
        return discography
def _get_artist_discography_with_client(self, client, artist_id: str, last_scan_timestamp: Optional[datetime] = None, lookback_days: Optional[int] = None) -> Optional[List]:
    """
    Fetch an artist's releases through ``client`` and narrow them to the scan window.

    The cutoff is chosen as follows:
      * global lookback setting is 'all' -> no cutoff, full discography;
      * a previous scan exists -> releases after ``last_scan_timestamp``,
        widened when a pending rescan cutoff is earlier or is 'all';
      * first scan -> now minus ``lookback_days`` (or the global setting, in days).

    Args:
        client: Metadata client exposing ``get_artist_albums``
        artist_id: Artist ID for the given client
        last_scan_timestamp: Timestamp of the artist's previous scan, if any
        lookback_days: Per-artist override for lookback period (None = use global setting)

    Returns:
        Released albums inside the window, ``[]`` when the client reports none,
        or None when the fetch failed or any error was raised
    """
    try:
        # Work out the window BEFORE fetching. Spotify returns albums newest-first,
        # so a time-bounded scan only needs the first page (50 albums).
        lookback_period = self._get_lookback_period_setting()
        fetch_everything = lookback_period == 'all'

        if fetch_everything:
            cutoff = None
        elif last_scan_timestamp is None:
            # First scan for this artist: fall back to the lookback period.
            window_days = lookback_days if lookback_days is not None else int(lookback_period)
            cutoff = datetime.now(timezone.utc) - timedelta(days=window_days)
            logger.info(f"Using lookback period: {window_days} days (cutoff: {cutoff})")
        else:
            cutoff = last_scan_timestamp
            # A changed lookback period may require a one-time wider window.
            pending_cutoff = self._get_rescan_cutoff()
            if pending_cutoff == 'all':
                # The marker keeps this message from repeating for every artist.
                if self._rescan_cutoff_log_marker != 'all':
                    logger.info("Lookback period changed to 'all' — returning full discography")
                    self._rescan_cutoff_log_marker = 'all'
                cutoff = None
                fetch_everything = True
            elif pending_cutoff is not None:
                # Compare as timezone-aware values; naive ones are treated as UTC.
                aware_scan_ts = cutoff
                if aware_scan_ts.tzinfo is None:
                    aware_scan_ts = aware_scan_ts.replace(tzinfo=timezone.utc)
                if pending_cutoff.tzinfo is None:
                    pending_cutoff = pending_cutoff.replace(tzinfo=timezone.utc)
                if pending_cutoff < aware_scan_ts:
                    log_marker = pending_cutoff.isoformat()
                    if self._rescan_cutoff_log_marker != log_marker:
                        logger.info(f"Lookback period change detected — expanding cutoff from {cutoff} to {pending_cutoff}")
                        self._rescan_cutoff_log_marker = log_marker
                    cutoff = pending_cutoff

        # Fetch albums — limit pagination unless full discography is needed
        scope_note = " (full)" if fetch_everything else " (recent only, max 1 page)"
        logger.debug(f"Fetching discography for artist {artist_id}" + scope_note)

        # Only clients exposing `sp` are given skip_cache / max_pages.
        fetch_options = {}
        if hasattr(client, 'sp'):
            fetch_options['skip_cache'] = True
            fetch_options['max_pages'] = 0 if fetch_everything else 1
        albums = client.get_artist_albums(artist_id, album_type='album,single', limit=50, **fetch_options)

        if albums is None:
            logger.warning(f"API failure fetching albums for artist {artist_id}")
            return None
        if not albums:
            logger.debug(f"No albums found for artist {artist_id}")
            return []

        # Small pause after the discography fetch to be extra safe.
        time.sleep(0.3)  # 300ms breathing room

        # Filter by release date if we have a cutoff timestamp
        if cutoff:
            in_window = [entry for entry in albums if self.is_album_after_timestamp(entry, cutoff)]
            logger.info(f"Filtered {len(albums)} albums to {len(in_window)} released after {cutoff}")
            albums = in_window

        # Skip future/unreleased albums — no real audio available yet
        now = datetime.now(timezone.utc)
        released = []
        for entry in albums:
            if not self._is_future_release(entry, now):
                released.append(entry)
        skipped = len(albums) - len(released)
        if skipped:
            logger.info(f"Skipped {skipped} future/unreleased albums (will be picked up after release)")
        return released
    except Exception as e:
        logger.error(f"Error getting discography for artist {artist_id}: {e}")
        return None
def _backfill_missing_ids(self, artists: List[WatchlistArtist], provider: str):
    """
    Look up and store ``provider`` IDs for every artist that lacks one.

    Example: a user has 50 artists with only Spotify IDs. When iTunes becomes
    active, all 50 are matched to iTunes in one batch.

    Each successful match is written to the database and set on the artist
    object. Artists that cannot be matched, or whose lookup raises, are listed
    in a final warning.
    """
    # Artists are selected purely on the missing ID for this provider,
    # regardless of which other IDs they have.
    id_field = {
        'spotify': 'spotify_artist_id',
        'itunes': 'itunes_artist_id',
        'deezer': 'deezer_artist_id',
        'discogs': 'discogs_artist_id',
    }.get(provider)
    if not id_field:
        logger.debug(f"Backfill not supported for provider: {provider}")
        return

    pending = [entry for entry in artists if not getattr(entry, id_field, None)]
    if not pending:
        logger.info(f"All artists already have {provider} IDs")
        return

    logger.info(f"Backfilling {len(pending)} artists with {provider} IDs...")

    matchers = {
        'spotify': self._match_to_spotify,
        'itunes': self._match_to_itunes,
        'deezer': self._match_to_deezer,
        'discogs': self._match_to_discogs,
    }
    updaters = {
        'spotify': self.database.update_watchlist_spotify_id,
        'itunes': self.database.update_watchlist_itunes_id,
        'deezer': self.database.update_watchlist_deezer_id,
        'discogs': self.database.update_watchlist_discogs_id,
    }
    find_id = matchers.get(provider)
    store_id = updaters.get(provider)
    if not (find_id and store_id):
        logger.debug(f"No match/update function available for provider: {provider}")
        return

    linked = 0
    unresolved = []
    for entry in pending:
        try:
            found_id = find_id(entry.artist_name)
            if not found_id:
                unresolved.append(entry.artist_name)
            else:
                store_id(entry.id, found_id)
                setattr(entry, id_field, found_id)
                linked += 1
                logger.info(f"Matched '{entry.artist_name}' to {provider}: {found_id}")
            # Brief pause between lookups.
            time.sleep(0.3)
        except Exception as e:
            logger.warning(f"Could not match '{entry.artist_name}' to {provider}: {e}")
            unresolved.append(entry.artist_name)

    logger.info(f"Backfilled {linked}/{len(pending)} artists with {provider} IDs")
    if unresolved:
        # Only the first ten names are listed.
        logger.warning(f"Could not confidently match {len(unresolved)} artists: {', '.join(unresolved[:10])}"
                       f"{'...' if len(unresolved) > 10 else ''} — use Watchlist Settings to link manually")
@staticmethod
def _normalize_artist_name(name: str) -> str:
"""Normalize artist name for comparison."""
if not name:
return ""
s = name.lower().strip()
# Remove "the " prefix
s = re.sub(r'^the\s+', '', s)
# Remove non-alphanumeric except spaces
s = re.sub(r'[^\w\s]', '', s)
# Collapse whitespace
s = re.sub(r'\s+', ' ', s).strip()
return s
@staticmethod
def _artist_name_similarity(name_a: str, name_b: str) -> float:
    """Score how alike two artist names are after normalization (0.0-1.0).

    Returns 0.0 when either name normalizes to an empty string, 1.0 when the
    normalized names are equal, and the SequenceMatcher ratio otherwise.
    """
    from difflib import SequenceMatcher

    left = WatchlistScanner._normalize_artist_name(name_a)
    right = WatchlistScanner._normalize_artist_name(name_b)
    if not (left and right):
        return 0.0
    return 1.0 if left == right else SequenceMatcher(None, left, right).ratio()
def _best_artist_match(self, results, artist_name: str) -> Optional[str]:
"""Pick the best matching artist from search results using name similarity.
Returns the artist ID only if we're confident it's the right match.
"""
if not results:
return None
# Exact normalized match gets immediate acceptance
for r in results:
if self._normalize_artist_name(r.name) == self._normalize_artist_name(artist_name):
logger.info(f" Exact match: '{r.name}' (id={r.id})")
return r.id
# Score all results by name similarity + popularity bonus
candidates = []
for r in results:
sim = self._artist_name_similarity(artist_name, r.name)
# Small popularity bonus (max 0.05) to break ties between similar names
pop_bonus = (getattr(r, 'popularity', 0) / 100) * 0.05
score = sim + pop_bonus
candidates.append((r, sim, score))
logger.debug(f" Candidate: '{r.name}' sim={sim:.2f} pop={getattr(r, 'popularity', 0)} score={score:.3f}")
# Sort by score descending
candidates.sort(key=lambda x: x[2], reverse=True)
best, best_sim, best_score = candidates[0]
# Require high similarity to accept (0.85 threshold)
if best_sim >= 0.85:
logger.info(f" Best match: '{best.name}' (sim={best_sim:.2f}, id={best.id})")
return best.id
# Between 0.70-0.85: accept only if it's clearly better than runner-up
if best_sim >= 0.70 and len(candidates) > 1:
runner_up_sim = candidates[1][1]
if best_sim - runner_up_sim >= 0.15:
logger.info(f" Best match (clear winner): '{best.name}' (sim={best_sim:.2f}, id={best.id})")
return best.id
logger.warning(f" No confident match for '{artist_name}' — best was '{best.name}' (sim={best_sim:.2f})")
return None
def _match_to_spotify(self, artist_name: str) -> Optional[str]:
    """Search Spotify for ``artist_name`` and return a confidently matched ID.

    Returns None when no Spotify client is available, when no candidate is
    convincing, or when the lookup raises (logged as a warning).
    """
    try:
        spotify = get_client_for_source('spotify')
        if spotify:
            found = spotify.search_artists(artist_name, limit=5, allow_fallback=False)
            return self._best_artist_match(found, artist_name)
        return None
    except Exception as e:
        logger.warning(f"Could not match {artist_name} to Spotify: {e}")
        return None
def _match_to_itunes(self, artist_name: str) -> Optional[str]:
    """Search iTunes for ``artist_name`` and return a confidently matched ID.

    The search goes through ``self._metadata_service.itunes``. Returns None
    when that service is missing, when no candidate is convincing, or when the
    lookup raises (logged as a warning).
    """
    try:
        service = getattr(self, '_metadata_service', None)
        if not service:
            logger.warning("Cannot match to iTunes - MetadataService not available")
            return None
        found = service.itunes.search_artists(artist_name, limit=5)
        return self._best_artist_match(found, artist_name)
    except Exception as e:
        logger.warning(f"Could not match {artist_name} to iTunes: {e}")
        return None
def _match_to_deezer(self, artist_name: str) -> Optional[str]:
    """Search Deezer for ``artist_name`` and return a confidently matched ID.

    The metadata service's fallback client is used when it is a DeezerClient;
    otherwise the registry's Deezer client performs the search. Returns None
    when no candidate is convincing or when the lookup raises (logged as a
    warning).
    """
    try:
        service = getattr(self, '_metadata_service', None)
        if service:
            # The attribute is named 'itunes' but may hold a DeezerClient.
            fallback_client = service.itunes
            from core.deezer_client import DeezerClient
            if isinstance(fallback_client, DeezerClient):
                found = fallback_client.search_artists(artist_name, limit=5)
                return self._best_artist_match(found, artist_name)

        # Fallback: use cached Deezer client
        from core.metadata.registry import get_deezer_client
        deezer = get_deezer_client()
        found = deezer.search_artists(artist_name, limit=5)
        return self._best_artist_match(found, artist_name)
    except Exception as e:
        logger.warning(f"Could not match {artist_name} to Deezer: {e}")
        return None
def _match_to_discogs(self, artist_name: str) -> Optional[str]:
    """Search Discogs for ``artist_name`` and return a confidently matched ID.

    Returns None when no candidate is convincing or when the lookup raises
    (logged as a warning).
    """
    try:
        from core.metadata.registry import get_discogs_client

        discogs = get_discogs_client()
        found = discogs.search_artists(artist_name, limit=5)
        return self._best_artist_match(found, artist_name)
    except Exception as e:
        logger.warning(f"Could not match {artist_name} to Discogs: {e}")
        return None
def _get_lookback_period_setting(self) -> str:
"""
Get the discovery lookback period setting from database.
Returns:
str: Period value ('7', '30', '90', '180', or 'all')
"""
try:
with self.database._get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT value FROM metadata WHERE key = 'discovery_lookback_period'")
row = cursor.fetchone()
if row:
return row['value']
else:
# Default to 30 days if not set
return '30'
except Exception as e:
logger.warning(f"Error getting lookback period setting, defaulting to 30 days: {e}")
return '30'
def _get_rescan_cutoff(self):
"""
Check if a lookback period change requires a one-time wider scan window.
When the lookback period is expanded, a 'watchlist_rescan_cutoff' metadata key
is set with the new cutoff date. This method returns that cutoff so the scanner
can use the wider window for artists scanned before the change. After a full
scan cycle, the key is cleared by _clear_rescan_cutoff().
Returns:
datetime cutoff if a rescan is pending with a specific date,
'all' string if lookback was set to entire discography,
None if no rescan is pending
"""
try:
with self.database._get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT value FROM metadata WHERE key = 'watchlist_rescan_cutoff'")
row = cursor.fetchone()
if row is not None:
val = row['value']
if val == '':
return 'all' # Lookback set to 'all' — scan everything
return datetime.fromisoformat(val)
except Exception as e:
logger.debug(f"Error reading rescan cutoff: {e}")
return None
def _clear_rescan_cutoff(self):
    """Delete the one-time 'watchlist_rescan_cutoff' key and reset the log marker.

    Best-effort: any failure is logged at debug level and swallowed.
    """
    try:
        with self.database._get_connection() as conn:
            cur = conn.cursor()
            cur.execute("DELETE FROM metadata WHERE key = 'watchlist_rescan_cutoff'")
            conn.commit()
            logger.info("Cleared watchlist rescan cutoff flag")
            # Allow the "lookback changed" message to be logged again next time.
            self._rescan_cutoff_log_marker = None
    except Exception as e:
        logger.debug(f"Error clearing rescan cutoff: {e}")
def is_album_after_timestamp(self, album, timestamp: datetime) -> bool:
    """Check if album was released after the given timestamp.

    Release dates are interpreted as midnight UTC at their given precision
    (year, year-month, full date, or ISO 8601 with a time part that is
    ignored). A naive ``timestamp`` is treated as UTC.

    Args:
        album: Album object with a ``release_date`` attribute, or a dict with
            a 'release_date' key.
        timestamp: Cutoff to compare against.

    Returns:
        True when the release date is strictly after ``timestamp``. Also True
        whenever the date is missing, unrecognised or unparseable, so that
        uncertain albums are included rather than silently dropped.
    """
    # Read the date once and defensively: the error handler below must never
    # raise itself because the album lacks the attribute.
    if isinstance(album, dict):
        release_date_str = album.get('release_date')
    else:
        release_date_str = getattr(album, 'release_date', None)

    if not release_date_str:
        return True  # Include albums with unknown release dates to be safe

    try:
        # Parse release date - Spotify provides different precisions
        if len(release_date_str) == 4:  # Year only (e.g., "2023")
            album_date = datetime(int(release_date_str), 1, 1, tzinfo=timezone.utc)
        elif len(release_date_str) == 7:  # Year-month (e.g., "2023-10")
            year, month = release_date_str.split('-')
            album_date = datetime(int(year), int(month), 1, tzinfo=timezone.utc)
        elif len(release_date_str) == 10:  # Full date (e.g., "2023-10-15")
            album_date = datetime.strptime(release_date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        elif 'T' in release_date_str:  # ISO 8601 with time (e.g., "2017-12-08T08:00:00Z" from iTunes)
            # Strip the time portion and parse just the date
            date_part = release_date_str.split('T')[0]
            album_date = datetime.strptime(date_part, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        else:
            logger.warning(f"Unknown release date format: {release_date_str}")
            return True  # Include if we can't parse

        # Ensure timestamp has timezone info
        if timestamp.tzinfo is None:
            timestamp = timestamp.replace(tzinfo=timezone.utc)

        return album_date > timestamp
    except Exception as e:
        logger.warning(f"Error comparing album date {release_date_str} with timestamp {timestamp}: {e}")
        return True  # Include if we can't determine
def _is_future_release(self, album, now: datetime) -> bool:
"""Check if an album's release date is in the future. Returns False for unknown dates (safe default)."""
try:
if not album.release_date:
return False # Unknown date — assume released
release_date_str = album.release_date
if len(release_date_str) == 4:
album_date = datetime(int(release_date_str), 1, 1, tzinfo=timezone.utc)
elif len(release_date_str) == 7:
year, month = release_date_str.split('-')
album_date = datetime(int(year), int(month), 1, tzinfo=timezone.utc)
elif len(release_date_str) == 10:
album_date = datetime.strptime(release_date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
elif 'T' in release_date_str:
date_part = release_date_str.split('T')[0]
album_date = datetime.strptime(date_part, "%Y-%m-%d").replace(tzinfo=timezone.utc)
else:
return False # Can't parse — assume released
return album_date > now
except Exception:
return False # Error — assume released
def _has_placeholder_tracks(self, tracks: list) -> bool:
"""Check if an album's tracks are mostly placeholders (unreleased/unannounced tracklist).
Spotify uses 'Track 1', 'Track 2', etc. for tracks whose names haven't been revealed."""
if not tracks or len(tracks) == 0:
return False
import re
placeholder_count = 0
for track in tracks:
name = track.get('name', '') if isinstance(track, dict) else getattr(track, 'name', '')
# Match "Track 1", "Track 2", ..., "Track 99" (case-insensitive)
if re.match(r'^track\s+\d+$', name.strip(), re.IGNORECASE):
placeholder_count += 1
# If more than half the tracks are placeholders, skip the album
# (some albums legitimately have a track called "Track X" but not most of them)
return placeholder_count > len(tracks) / 2
def _should_include_release(self, track_count: int, watchlist_artist: WatchlistArtist) -> bool:
    """
    Decide whether a release passes the artist's release-type preferences.

    The release type is inferred from the track count:
    - Singles: 1-3 tracks
    - EPs: 4-6 tracks
    - Albums: 7+ tracks

    Args:
        track_count: Number of tracks in the release
        watchlist_artist: WatchlistArtist object with user preferences

    Returns:
        The matching include_* preference (True when the preference is not
        set on the artist), or True if the check itself fails
    """
    try:
        # Missing preferences default to "include" for backwards compatibility.
        wants_albums = getattr(watchlist_artist, 'include_albums', True)
        wants_eps = getattr(watchlist_artist, 'include_eps', True)
        wants_singles = getattr(watchlist_artist, 'include_singles', True)

        if track_count >= 7:
            return wants_albums  # album
        if track_count >= 4:
            return wants_eps  # EP (4-6 tracks)
        return wants_singles  # single (1-3 tracks)
    except Exception as e:
        logger.warning(f"Error checking release inclusion: {e}")
        return True  # Default to including on error
def _should_include_track(self, track, album_data, watchlist_artist: WatchlistArtist) -> bool:
    """
    Decide whether a track passes the artist's content-type filters.

    Checks run in this order, and the first one that matches rejects the track:
    - Compilation albums (judged from the album name alone)
    - Live versions
    - Remixes
    - Acoustic versions
    - Instrumental versions
    - Custom exclusion terms from the 'watchlist.exclude_terms' setting

    Args:
        track: Track object or dict
        album_data: Album data object or dict
        watchlist_artist: WatchlistArtist object with user preferences

    Returns:
        True if track should be included, False if should be skipped.
        Any unexpected error results in True.
    """
    try:
        track_name = track.get('name', '') if isinstance(track, dict) else getattr(track, 'name', '')
        album_name = (
            album_data.get('name', '') if isinstance(album_data, dict)
            else getattr(album_data, 'name', '')
        )

        # Each content type is excluded unless the artist opted in.
        wants_live = getattr(watchlist_artist, 'include_live', False)
        wants_remixes = getattr(watchlist_artist, 'include_remixes', False)
        wants_acoustic = getattr(watchlist_artist, 'include_acoustic', False)
        wants_compilations = getattr(watchlist_artist, 'include_compilations', False)
        wants_instrumentals = getattr(watchlist_artist, 'include_instrumentals', False)

        # Album-level filter first.
        if not wants_compilations and is_compilation_album(album_name):
            logger.debug(f"Skipping compilation album: {album_name}")
            return False

        # Track-level content type filters.
        if not wants_live and is_live_version(track_name, album_name):
            logger.debug(f"Skipping live version: {track_name}")
            return False

        if not wants_remixes and is_remix_version(track_name, album_name):
            logger.debug(f"Skipping remix: {track_name}")
            return False

        if not wants_acoustic and is_acoustic_version(track_name, album_name):
            logger.debug(f"Skipping acoustic version: {track_name}")
            return False

        if not wants_instrumentals and is_instrumental_version(track_name, album_name):
            logger.debug(f"Skipping instrumental version: {track_name}")
            return False

        # User-defined exclusion terms; a failure here never blocks the track.
        try:
            from config.settings import config_manager as _cfg
            raw_terms = _cfg.get('watchlist.exclude_terms', '')
            if raw_terms:
                exclude_terms = [
                    term for term in (piece.strip() for piece in raw_terms.split(',')) if term
                ]
                matched_term = matches_custom_exclude_terms(track_name, album_name, exclude_terms)
                if matched_term:
                    logger.debug(f"Skipping track '{track_name}' — matched custom exclusion term: '{matched_term}'")
                    return False
        except Exception as e:
            logger.warning(f"Error checking custom exclusion terms: {e}")

        # Nothing rejected the track.
        return True
    except Exception as e:
        logger.warning(f"Error checking track content type inclusion: {e}")
        return True  # Default to including on error
def is_track_missing_from_library(self, track, album_name: Optional[str] = None) -> bool:
    """
    Check if a track is missing from the local library.
    Uses the same matching logic as the download missing tracks modals.

    Lookup order:
    1. External-ID match against library rows.
    2. External-ID match against the download provenance table, accepted
       only when the recorded file still exists on disk.
    3. Fuzzy title/artist lookup for every artist and title variation.

    Args:
        track: Track dict (``name`` / ``artists`` keys) or object
            (``name`` / ``artists`` attributes).
        album_name: Album the watchlist is asking about. Passed to the
            fuzzy lookup as an album hint when duplicates are not allowed,
            and compared against the library copy's album when they are.

    Returns:
        False when the track is considered present in the library, True
        when no lookup matched. Any unexpected error also returns True
        (the track is assumed missing).
    """
    try:
        # Handle both dict and object track formats
        if isinstance(track, dict):
            original_title = track.get('name', 'Unknown')
            track_artists = track.get('artists', [])
            artists_to_search = [artist.get('name', 'Unknown') for artist in track_artists] if track_artists else ["Unknown"]
        else:
            original_title = track.name
            artists_to_search = [artist.name for artist in track.artists] if track.artists else ["Unknown"]
        # Generate title variations (same logic as sync page)
        title_variations = [original_title]
        # Only add cleaned version if it removes clear noise
        cleaned_for_search = clean_track_name_for_search(original_title)
        if cleaned_for_search.lower() != original_title.lower():
            title_variations.append(cleaned_for_search)
        # Use matching engine's conservative clean_title
        base_title = self.matching_engine.clean_title(original_title)
        if base_title.lower() not in [t.lower() for t in title_variations]:
            title_variations.append(base_title)
        # dict.fromkeys drops exact duplicates while keeping first-seen order
        unique_title_variations = list(dict.fromkeys(title_variations))
        # Search for each artist with each title variation
        from config.settings import config_manager
        active_server = config_manager.get_active_media_server()
        allow_duplicates = config_manager.get('wishlist.allow_duplicate_tracks', True)
        # Provider-neutral external-ID short-circuit: before doing
        # title+artist+album fuzzy comparison, ask the library if any
        # row carries a matching external ID (Spotify, Deezer, iTunes,
        # Tidal, Qobuz, MusicBrainz, AudioDB, Hydrabase, ISRC). When
        # the library has stale album metadata for an existing file
        # (e.g. file tagged on the wrong album by an old import), the
        # fuzzy block declares the track missing and re-downloads it
        # on every scan — but the file's external IDs unambiguously
        # identify it as the same recording. See plan-watchlist-id-
        # match.md for the reported scenario.
        try:
            from core.library.track_identity import (
                extract_external_ids,
                find_library_track_by_external_id,
                find_provenance_by_external_id,
            )
            import os as _os_local
            # Pass the configured primary source as a hint so the
            # extractor can disambiguate raw Spotify / iTunes API
            # responses that don't carry a provider / source field
            # of their own (Deezer / Discogs / Hydrabase clients
            # already tag tracks with _source).
            try:
                _source_hint = get_primary_source()
            except Exception:
                _source_hint = None
            source_ids = extract_external_ids(track, source_hint=_source_hint)
            if source_ids:
                matched = find_library_track_by_external_id(
                    self.database,
                    external_ids=source_ids,
                    server_source=active_server,
                )
                if matched is not None:
                    logger.info(
                        f"[ExtID Match] Track found in library by external ID: "
                        f"'{original_title}' by '{artists_to_search[0] if artists_to_search else 'Unknown'}' "
                        f"(matched on: {', '.join(sorted(source_ids.keys()))})"
                    )
                    return False  # Track exists in library
                # Second-tier fallback: provenance table. Catches the
                # window between "SoulSync downloaded the file" and
                # "media-server scan + sync populated the tracks row
                # with IDs". File still has to exist on disk —
                # otherwise a user who deleted a file would never get
                # it back.
                prov = find_provenance_by_external_id(
                    self.database, external_ids=source_ids,
                )
                if prov is not None:
                    prov_path = prov.get('file_path')
                    if prov_path and _os_local.path.exists(prov_path):
                        logger.info(
                            f"[Provenance Match] Track found in download provenance: "
                            f"'{original_title}' by '{artists_to_search[0] if artists_to_search else 'Unknown'}' "
                            f"(matched on: {', '.join(sorted(source_ids.keys()))})"
                        )
                        return False
        except Exception as ext_id_err:
            # The ID probe is best-effort; any failure falls through to the fuzzy lookup.
            logger.debug(f"External-ID match probe failed (falling through to fuzzy): {ext_id_err}")
        for artist_name in artists_to_search:
            for query_title in unique_title_variations:
                # When allow_duplicates is on, skip album hint so we get title+artist matches only
                search_album = None if allow_duplicates else album_name
                db_track, confidence = self.database.check_track_exists(query_title, artist_name, confidence_threshold=0.7, server_source=active_server, album=search_album)
                if db_track and confidence >= 0.7:
                    # When allow_duplicates is on, only skip if we believe
                    # the library copy is on the same album the watchlist
                    # is asking about. Album name drift between Spotify
                    # and the media-server scan ("Napoleon Dynamite (Music
                    # From The Motion Picture)" vs "Napoleon Dynamite OST")
                    # used to fail a strict 0.85 fuzzy threshold and force
                    # an infinite redownload loop.
                    if allow_duplicates and album_name:
                        lib_album = getattr(db_track, 'album_title', '') or ''
                        if lib_album:
                            if _albums_likely_match(album_name, lib_album):
                                logger.info(f"[AllowDup] Album match — skipping: '{original_title}' (wanted: '{album_name}', library: '{lib_album}')")
                            else:
                                logger.info(f"[AllowDup] Different album — allowing: '{original_title}' (wanted: '{album_name}', library: '{lib_album}')")
                                continue  # Different album — allow it
                        else:
                            # No album info in library — can't compare, allow it
                            logger.info(f"[AllowDup] No album info in library — allowing: '{original_title}'")
                            continue
                    # Reached when duplicates are disallowed, no album name was
                    # given, or the library album was judged to be the same one.
                    logger.debug(f"Track found in library: '{original_title}' by '{artist_name}' (confidence: {confidence:.2f})")
                    return False  # Track exists in library
        # No match found with any variation or artist
        logger.info(f"Track missing from library: '{original_title}' by '{artists_to_search[0] if artists_to_search else 'Unknown'}' - adding to wishlist")
        return True  # Track is missing
    except Exception as e:
        # Handle both dict and object track formats for error logging
        track_name = track.get('name', 'Unknown') if isinstance(track, dict) else getattr(track, 'name', 'Unknown')
        logger.warning(f"Error checking if track exists: {track_name}: {e}")
        return True  # Assume missing if we can't check
def add_track_to_wishlist(self, track, album, watchlist_artist: WatchlistArtist) -> bool:
    """Add a missing track to the wishlist.

    Normalises the track and album (each may be a dict or an object) into a
    Spotify-shaped track dict and stores it through
    ``self.database.add_to_wishlist`` with watchlist context.

    Args:
        track: Track dict or object describing the missing track.
        album: Album dict or object the track belongs to.
        watchlist_artist: Watched artist that triggered the scan; supplies
            the artist name/ID for the source info and the profile ID
            (falls back to profile 1 when the attribute is absent).

    Returns:
        The result of ``add_to_wishlist``; False when any error occurs.
    """
    try:
        # Handle both dict and object track/album formats
        if isinstance(track, dict):
            track_id = track.get('id', '')
            track_name = track.get('name', 'Unknown')
            track_artists = track.get('artists', [])
            track_duration = track.get('duration_ms', 0)
            track_explicit = track.get('explicit', False)
            track_external_urls = track.get('external_urls', {})
            track_popularity = track.get('popularity', 0)
            track_preview_url = track.get('preview_url', None)
            track_number = track.get('track_number', 1)
            disc_number = track.get('disc_number', 1)
            track_uri = track.get('uri', '')
        else:
            track_id = track.id
            track_name = track.name
            track_artists = [{'name': artist.name, 'id': artist.id} for artist in track.artists]
            track_duration = getattr(track, 'duration_ms', 0)
            track_explicit = getattr(track, 'explicit', False)
            track_external_urls = getattr(track, 'external_urls', {})
            track_popularity = getattr(track, 'popularity', 0)
            track_preview_url = getattr(track, 'preview_url', None)
            track_number = getattr(track, 'track_number', 1)
            disc_number = getattr(track, 'disc_number', 1)
            track_uri = getattr(track, 'uri', '')

        if isinstance(album, dict):
            album_name = album.get('name', 'Unknown')
            album_id = album.get('id', '')
            album_release_date = album.get('release_date', '')
            album_images = album.get('images', [])
            album_type = album.get('album_type', 'album')  # 'album', 'single', or 'ep'
            total_tracks = album.get('total_tracks', 0)
            album_artists = album.get('artists', [])
        else:
            album_name = album.name
            album_id = album.id
            album_release_date = album.release_date
            album_images = getattr(album, 'images', [])
            album_type = getattr(album, 'album_type', 'album')
            total_tracks = getattr(album, 'total_tracks', 0)
            album_artists = getattr(album, 'artists', [])

        # Create Spotify track data structure
        spotify_track_data = {
            'id': track_id,
            'name': track_name,
            'artists': track_artists,
            'album': {
                'name': album_name,
                'id': album_id,
                'release_date': album_release_date,
                'images': album_images,
                'album_type': album_type,  # Store album type for category filtering
                'total_tracks': total_tracks,  # Store track count for accurate categorization
                'artists': album_artists,
            },
            'duration_ms': track_duration,
            'explicit': track_explicit,
            'external_urls': track_external_urls,
            'popularity': track_popularity,
            'preview_url': track_preview_url,
            'track_number': track_number,
            'disc_number': disc_number,
            'uri': track_uri,
            'is_local': False,
        }

        # Add to wishlist with watchlist context (scoped to artist's profile)
        success = self.database.add_to_wishlist(
            spotify_track_data=spotify_track_data,
            failure_reason="Missing from library (found by watchlist scan)",
            source_type="watchlist",
            source_info={
                'watchlist_artist_name': watchlist_artist.artist_name,
                'watchlist_artist_id': watchlist_artist.spotify_artist_id,
                'album_name': album_name,
                'scan_timestamp': datetime.now().isoformat(),
            },
            profile_id=getattr(watchlist_artist, 'profile_id', 1),
        )

        if success:
            # Log-only lookup: tolerate non-dict artist entries so a logging
            # detail can never turn a successful insert into a False result.
            lead_artist = track_artists[0] if track_artists else None
            if isinstance(lead_artist, dict):
                first_artist = lead_artist.get('name', 'Unknown')
            else:
                first_artist = str(lead_artist) if lead_artist else 'Unknown'
            logger.debug(f"Added track to wishlist: {track_name} by {first_artist}")
        else:
            logger.warning(f"Failed to add track to wishlist: {track_name}")
        return success
    except Exception as e:
        # The failure may have happened before ``track_name`` was bound
        # (e.g. a track object without ``id``), so resolve the name again
        # here instead of raising UnboundLocalError from the handler.
        if isinstance(track, dict):
            failed_name = track.get('name', 'Unknown')
        else:
            failed_name = getattr(track, 'name', 'Unknown')
        logger.error(f"Error adding track to wishlist: {failed_name}: {e}")
        return False
def update_artist_scan_timestamp(self, artist) -> bool:
    """Stamp a watchlist artist row with the current time as its last scan.

    Args:
        artist: WatchlistArtist object, or a string spotify_artist_id for backward compat

    Returns:
        True when at least one row was updated, False when no row matched
        or the update failed.
    """
    try:
        with self.database._get_connection() as conn:
            cursor = conn.cursor()

            # Objects exposing ``id`` are addressed by database primary key
            # (always reliable); anything else is treated as a raw provider ID.
            is_record = hasattr(artist, 'id')
            if is_record:
                statement = """
                    UPDATE watchlist_artists
                    SET last_scan_timestamp = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP
                    WHERE id = ?
                """
                params = (artist.id,)
            else:
                # Backward compat: raw string ID (try spotify, then itunes)
                statement = """
                    UPDATE watchlist_artists
                    SET last_scan_timestamp = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP
                    WHERE spotify_artist_id = ? OR itunes_artist_id = ?
                """
                params = (artist, artist)

            cursor.execute(statement, params)
            artist_label = (
                f"{artist.artist_name} (id={artist.id})" if is_record else f"ID {artist}"
            )
            conn.commit()

            row_touched = cursor.rowcount > 0
            if not row_touched:
                logger.warning(f"No artist found for {artist_label}")
                return False

            logger.debug(f"Updated scan timestamp for artist {artist_label}")
            return True
    except Exception as e:
        logger.error(f"Error updating scan timestamp: {e}")
        return False
def _fetch_similar_artists_from_musicmap(self, artist_name: str, limit: int = 20) -> List[Dict[str, Any]]:
    """
    Fetch similar artists from MusicMap and match them against configured metadata providers.

    The MusicMap page for the artist is downloaded and every anchor inside
    the element with id ``gnodMap`` is read as a candidate name. Only the
    first ``limit`` candidates are looked up on the providers, and a
    candidate is kept only if at least one provider returned an ID, so the
    result may contain fewer than ``limit`` entries.

    Args:
        artist_name: The artist name to find similar artists for
        limit: Maximum number of similar artists to return (default: 20)

    Returns:
        List of matched artist dictionaries with provider-specific IDs when available.
        Each dict has the keys ``name``, ``spotify_id``, ``itunes_id``,
        ``deezer_id``, ``image_url``, ``genres`` and ``popularity``.
        An empty list is returned on any request or processing error, when
        the map element is missing, or when no provider returned a result
        for the searched artist itself.
    """
    try:
        logger.info(f"Fetching similar artists from MusicMap for: {artist_name}")
        # Construct MusicMap URL
        from urllib.parse import quote_plus
        url_artist = quote_plus(artist_name.strip())
        musicmap_url = f'https://www.music-map.com/{url_artist}'
        # Set headers to mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }
        # Fetch MusicMap page
        response = requests.get(musicmap_url, headers=headers, timeout=10)
        response.raise_for_status()
        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        gnod_map = soup.find(id='gnodMap')
        if not gnod_map:
            logger.warning(f"Could not find artist map on MusicMap for {artist_name}")
            return []
        # Extract similar artist names
        all_anchors = gnod_map.find_all('a')
        searched_artist_lower = artist_name.lower().strip()
        similar_artist_names = []
        for anchor in all_anchors:
            artist_text = anchor.get_text(strip=True)
            # Skip if this is the searched artist
            if artist_text.lower() == searched_artist_lower:
                continue
            similar_artist_names.append(artist_text)
        logger.info(f"Found {len(similar_artist_names)} similar artists from MusicMap")
        source_priority = self._discovery_source_priority()
        # Maps a provider name to the key used for its ID in the result dicts.
        source_id_keys = {
            'spotify': 'spotify_id',
            'itunes': 'itunes_id',
            'deezer': 'deezer_id',
        }
        # Look the searched artist up on every provider first: the returned ID
        # is used below to reject candidates that resolve back to that same
        # artist, and a provider is only used for matching if this lookup
        # returned something.
        searched_source_ids = {}
        available_sources = []
        for source in source_priority:
            search_results = self._search_artists_for_source(source, artist_name, limit=1)
            if search_results:
                searched_source_ids[source] = self._extract_entity_id(search_results[0])
                available_sources.append(source)
            else:
                searched_source_ids[source] = None
        if not available_sources:
            logger.warning(f"No metadata providers available for MusicMap matching: {artist_name}")
            return []
        matched_artists = []
        seen_names = set()  # lower-cased MusicMap names that already produced a match
        provider_match_counts = {source: 0 for source in available_sources}
        for artist_name_to_match in similar_artist_names[:limit]:
            try:
                name_lower = artist_name_to_match.lower().strip()
                if name_lower in seen_names:
                    continue
                artist_data = {
                    'name': artist_name_to_match,
                    'spotify_id': None,
                    'itunes_id': None,
                    'deezer_id': None,
                    'image_url': None,
                    'genres': [],
                    'popularity': 0,
                }
                for source in available_sources:
                    search_results = self._search_artists_for_source(source, artist_name_to_match, limit=1)
                    if not search_results:
                        continue
                    matched_artist = search_results[0]
                    matched_id = self._extract_entity_id(matched_artist)
                    # Ignore hits without an ID and hits that are the searched artist itself.
                    if not matched_id or matched_id == searched_source_ids.get(source):
                        continue
                    id_key = source_id_keys.get(source)
                    if not id_key:
                        continue
                    artist_data[id_key] = matched_id
                    provider_match_counts[source] += 1
                    metadata = self._get_artist_metadata_from_data(matched_artist)
                    # The provider's spelling replaces the scraped name once;
                    # for the other fields the first provider with a value wins.
                    if metadata['name'] and artist_data['name'] == artist_name_to_match:
                        artist_data['name'] = metadata['name']
                    if metadata['image_url'] and not artist_data['image_url']:
                        artist_data['image_url'] = metadata['image_url']
                    if metadata['genres'] and not artist_data['genres']:
                        artist_data['genres'] = metadata['genres']
                    if metadata['popularity'] and not artist_data['popularity']:
                        artist_data['popularity'] = metadata['popularity']
                # Keep the candidate only if at least one provider supplied an ID.
                if any(artist_data.get(key) for key in source_id_keys.values()):
                    seen_names.add(name_lower)
                    matched_artists.append(artist_data)
                    provider_summary = ", ".join(
                        f"{source}: {artist_data.get(source_id_keys[source])}"
                        for source in available_sources
                        if artist_data.get(source_id_keys[source])
                    )
                    logger.debug(f" Matched: {artist_data['name']} ({provider_summary})")
            except Exception as match_error:
                # One failing candidate must not abort the rest of the batch.
                logger.debug(f"Error matching {artist_name_to_match}: {match_error}")
                continue
        # Log detailed matching statistics
        provider_stats = ", ".join(
            f"{source}: {provider_match_counts[source]}"
            for source in available_sources
        )
        logger.info(f"Matched {len(matched_artists)} similar artists - {provider_stats}")
        return matched_artists
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching from MusicMap: {e}")
        return []
    except Exception as e:
        logger.error(f"Error fetching similar artists from MusicMap: {e}")
        return []
def _update_similar_artist_source_id(self, similar_artist_id: int, source: str, source_id: str) -> bool:
"""Persist a resolved similar-artist ID for a supported source."""
if source == 'deezer':
return self.database.update_similar_artist_deezer_id(similar_artist_id, source_id)
if source == 'itunes':
return self.database.update_similar_artist_itunes_id(similar_artist_id, source_id)
return False
def _backfill_similar_artists_fallback_ids(self, source_artist_id: str, profile_id: int = 1) -> int:
    """
    Fill in missing fallback-provider IDs on cached similar artists.

    Walks the configured source priority, restricted to the providers that
    have writable similar-artist ID columns (iTunes and Deezer), so that
    previously cached rows stay usable when the active metadata provider
    changes.

    Returns:
        Number of rows that received an ID, or 0 when no eligible provider
        is configured or an unexpected error interrupts the run.
    """
    eligible_sources = [
        candidate
        for candidate in self._discovery_source_priority()
        if candidate in {'itunes', 'deezer'}
    ]
    if not eligible_sources:
        logger.debug("No fallback metadata providers available for similar-artist backfill")
        return 0

    grand_total = 0
    try:
        for source in eligible_sources:
            client = get_client_for_source(source)
            if not client:
                logger.debug("Skipping %s similar-artist backfill - client unavailable", source)
                continue

            pending_rows = self.database.get_similar_artists_missing_fallback_ids(
                source_artist_id,
                source,
                profile_id=profile_id,
            )
            if not pending_rows:
                continue

            logger.info("Backfilling %s IDs for %s similar artists", source, len(pending_rows))
            source_hits = 0
            for row in pending_rows:
                try:
                    hits = self._search_artists_for_source(source, row.similar_artist_name, limit=1, client=client)
                    resolved_id = self._extract_entity_id(hits[0]) if hits else None
                    if resolved_id and self._update_similar_artist_source_id(row.id, source, resolved_id):
                        source_hits += 1
                        logger.debug(" Backfilled %s ID %s for %s", source, resolved_id, row.similar_artist_name)
                except Exception as e:
                    # A single failed lookup only skips that row.
                    logger.debug(" Could not backfill %s ID for %s: %s", source, row.similar_artist_name, e)

            if source_hits > 0:
                logger.info("Backfilled %s similar artists with %s IDs", source_hits, source)
            grand_total += source_hits

        return grand_total
    except Exception as e:
        logger.error("Error backfilling similar artists IDs: %s", e)
        return 0
def update_similar_artists(
    self,
    watchlist_artist: WatchlistArtist,
    limit: int = 10,
    profile_id: int = 1,
    source_artist_id: Optional[str] = None,
) -> bool:
    """
    Look up and persist similar artists for one watchlist artist.

    Runs after each artist scan to build the discovery pool. Similar
    artists come from MusicMap and are matched against the available
    metadata providers before being stored with their rank.

    Returns:
        True when the run completed (including when nothing was found),
        False when an unexpected error aborted it.
    """
    try:
        logger.info(f"Fetching similar artists for {watchlist_artist.artist_name}")

        # List of artist dicts carrying provider IDs, in similarity order.
        candidates = self._fetch_similar_artists_from_musicmap(watchlist_artist.artist_name, limit=limit)
        if not candidates:
            logger.debug(f"No similar artists found for {watchlist_artist.artist_name}")
            return True  # Not an error, just no recommendations

        logger.info(f"Found {len(candidates)} similar artists for {watchlist_artist.artist_name}")

        # Prefer the ID that matched the scan source; otherwise take the first
        # known provider ID, and finally the database row ID as a string.
        owner_id = source_artist_id
        if not owner_id:
            for id_attr in ('spotify_artist_id', 'itunes_artist_id', 'deezer_artist_id', 'discogs_artist_id'):
                owner_id = getattr(watchlist_artist, id_attr)
                if owner_id:
                    break
            else:
                owner_id = str(watchlist_artist.id)

        saved = 0
        for rank, candidate in enumerate(candidates, 1):
            try:
                # candidate has 'name', provider IDs, 'image_url', 'genres', 'popularity'
                stored = self.database.add_or_update_similar_artist(
                    source_artist_id=owner_id,
                    similar_artist_name=candidate['name'],
                    similar_artist_spotify_id=candidate.get('spotify_id'),
                    similar_artist_itunes_id=candidate.get('itunes_id'),
                    similarity_rank=rank,
                    profile_id=profile_id,
                    image_url=candidate.get('image_url'),
                    genres=candidate.get('genres'),
                    popularity=candidate.get('popularity', 0),
                    similar_artist_deezer_id=candidate.get('deezer_id'),
                )
                if not stored:
                    continue
                saved += 1
                deezer_id = candidate.get('deezer_id')
                fallback_id = deezer_id or candidate.get('itunes_id')
                fallback_label = 'Deezer' if deezer_id else 'iTunes'
                logger.debug(f" #{rank}: {candidate['name']} (Spotify: {candidate.get('spotify_id')}, {fallback_label}: {fallback_id})")
            except Exception as e:
                logger.warning(f"Error storing similar artist {candidate.get('name', 'Unknown')}: {e}")

        logger.info(f"Stored {saved}/{len(candidates)} similar artists for {watchlist_artist.artist_name}")
        return True
    except Exception as e:
        logger.error(f"Error fetching similar artists for {watchlist_artist.artist_name}: {e}")
        return False
def populate_discovery_pool(self, top_artists_limit: int = 50, albums_per_artist: int = 10, profile_id: int = 1, progress_callback=None):
"""
Populate discovery pool with tracks from top similar artists.
Called after watchlist scan completes.
Supports Spotify, iTunes, and Deezer sources - populates for whichever is available.
- Checks if pool was updated in last 24 hours (prevents over-polling)
- Includes albums, singles, and EPs for comprehensive coverage
- Appends to existing pool instead of replacing it
- Cleans up tracks older than 365 days (maintains 1 year rolling window)
"""
try:
from datetime import datetime, timedelta
import random
# Check if we should run discovery pool population (prevents over-polling)
skip_pool_population = not self.database.should_populate_discovery_pool(hours_threshold=24, profile_id=profile_id)
if skip_pool_population:
logger.info("Discovery pool was populated recently (< 24 hours ago). Skipping pool population.")
logger.info("But still refreshing recent albums cache and curated playlists...")
if progress_callback:
progress_callback('skip', 'Discovery pool recently updated, skipping')
# Still run these even when skipping main pool population
if progress_callback:
progress_callback('phase', 'Caching recent albums...')
self.cache_discovery_recent_albums(profile_id=profile_id)
if progress_callback:
progress_callback('phase', 'Curating playlists...')
self.curate_discovery_playlists(profile_id=profile_id)
return
logger.info("Populating discovery pool from similar artists...")
discovery_sources = self._discovery_source_priority()
if not discovery_sources:
logger.warning("No music sources available to populate discovery pool")
return
logger.info("Discovery source priority: %s", discovery_sources)
# Get top similar artists for this profile's watchlist (ordered by occurrence_count)
similar_artists = self.database.get_top_similar_artists(limit=top_artists_limit, profile_id=profile_id)
if not similar_artists:
logger.info("No similar artists found to populate discovery pool from similar artists")
logger.info("But still caching recent albums from watchlist artists and curating playlists...")
if progress_callback:
progress_callback('skip', 'No similar artists found')
# Still run these even without similar artists - they use watchlist artists
if progress_callback:
progress_callback('phase', 'Caching recent albums...')
self.cache_discovery_recent_albums(profile_id=profile_id)
if progress_callback:
progress_callback('phase', 'Curating playlists...')
self.curate_discovery_playlists(profile_id=profile_id)
return
logger.info(f"Processing {len(similar_artists)} top similar artists for discovery pool")
total_tracks_added = 0
for artist_idx, similar_artist in enumerate(similar_artists, 1):
try:
logger.info(f"[{artist_idx}/{len(similar_artists)}] Processing {similar_artist.similar_artist_name} (occurrence: {similar_artist.occurrence_count})")
if progress_callback:
progress_callback('artist', f'{similar_artist.similar_artist_name} ({artist_idx}/{len(similar_artists)})')
# Resolve the first source that can actually produce albums.
selected_source = None
selected_artist_id = None
selected_albums = []
artist_genres: List[str] = []
for source in discovery_sources:
source_attr = self._artist_id_attribute_for_source(source)
stored_id = getattr(similar_artist, source_attr, None) if source_attr else None
cache_callback = None
if source == 'itunes':
cache_callback = lambda found_id, artist_id=similar_artist.id: self.database.update_similar_artist_itunes_id(artist_id, found_id)
elif source == 'deezer':
cache_callback = lambda found_id, artist_id=similar_artist.id: self.database.update_similar_artist_deezer_id(artist_id, found_id)
artist_id = self._resolve_artist_id_for_source(
source,
similar_artist.similar_artist_name,
stored_id=stored_id,
cache_callback=cache_callback,
)
if not artist_id:
continue
all_albums = self._get_artist_albums_for_source(
source,
artist_id,
album_type='album,single,ep',
limit=50,
skip_cache=False,
max_pages=2,
)
if not all_albums:
logger.debug(f"No albums found for {similar_artist.similar_artist_name} on {source}")
continue
artist_data = self._get_artist_data_for_source(source, artist_id)
if artist_data and 'genres' in artist_data:
artist_genres = artist_data['genres']
albums = [a for a in all_albums if hasattr(a, 'album_type') and a.album_type == 'album']
singles_eps = [a for a in all_albums if hasattr(a, 'album_type') and a.album_type in ['single', 'ep']]
selected_albums = []
latest_releases = all_albums[:3]
selected_albums.extend(latest_releases)
remaining_slots = albums_per_artist - len(selected_albums)
if remaining_slots > 0:
remaining_content = all_albums[3:]
if len(remaining_content) > remaining_slots:
selected_albums.extend(random.sample(remaining_content, remaining_slots))
else:
selected_albums.extend(remaining_content)
selected_source = source
selected_artist_id = artist_id
logger.info(
f" [{source}] Selected {len(selected_albums)} releases from {len(all_albums)} available "
f"(albums: {len(albums)}, singles/EPs: {len(singles_eps)})"
)
break
if not selected_source or not selected_artist_id or not selected_albums:
logger.debug(f"No valid source/albums for {similar_artist.similar_artist_name}, skipping")
continue
# Process each selected album from the winning source.
for album_idx, album in enumerate(selected_albums, 1):
try:
album_data = self._get_album_data_for_source(selected_source, album.id, album_name=album.name)
if not album_data:
continue
tracks = self._extract_track_items(album_data)
logger.debug(f" Album {album_idx}: {album_data.get('name', 'Unknown')} ({len(tracks)} tracks)")
if self._has_placeholder_tracks(tracks):
logger.info(f" Skipping album with placeholder tracks: {album_data.get('name', 'Unknown')}")
continue
is_new = False
try:
release_date_str = album_data.get('release_date', '')
if release_date_str and len(release_date_str) >= 10:
release_date = datetime.strptime(release_date_str[:10], "%Y-%m-%d")
is_new = (datetime.now() - release_date).days <= 30
except Exception as e:
logger.debug("album release_date parse failed: %s", e)
for track in tracks:
try:
enhanced_track = {
**track,
'album': {
'id': album_data['id'],
'name': album_data.get('name', 'Unknown Album'),
'images': album_data.get('images', []),
'release_date': album_data.get('release_date', ''),
'album_type': album_data.get('album_type', 'album'),
'total_tracks': album_data.get('total_tracks', 0)
},
'_source': selected_source
}
raw_popularity = album_data.get('popularity', 0)
if selected_source in ('itunes', 'deezer') and raw_popularity == 0:
synth_pop = 45
if is_new:
synth_pop += 25
else:
try:
release_str = album_data.get('release_date', '')
if release_str and len(release_str) >= 10:
rel_date = datetime.strptime(release_str[:10], "%Y-%m-%d")
age_days = (datetime.now() - rel_date).days
if age_days <= 90:
synth_pop += 15
elif age_days <= 365:
synth_pop += 5
except Exception as e:
logger.debug("synthetic popularity age calc failed: %s", e)
if similar_artist.occurrence_count >= 3:
synth_pop += 10
elif similar_artist.occurrence_count >= 2:
synth_pop += 5
raw_popularity = min(synth_pop, 100)
track_data = {
'track_name': track.get('name', 'Unknown Track'),
'artist_name': similar_artist.similar_artist_name,
'album_name': album_data.get('name', 'Unknown Album'),
'album_cover_url': album_data.get('images', [{}])[0].get('url') if album_data.get('images') else None,
'duration_ms': track.get('duration_ms', 0),
'popularity': raw_popularity,
'release_date': album_data.get('release_date', ''),
'is_new_release': is_new,
'track_data_json': enhanced_track,
'artist_genres': artist_genres
}
if selected_source == 'spotify':
track_data['spotify_track_id'] = track.get('id')
track_data['spotify_album_id'] = album_data.get('id')
track_data['spotify_artist_id'] = selected_artist_id
elif selected_source == 'deezer':
track_data['deezer_track_id'] = track.get('id')
track_data['deezer_album_id'] = album_data.get('id')
track_data['deezer_artist_id'] = selected_artist_id
else:
track_data['itunes_track_id'] = track.get('id')
track_data['itunes_album_id'] = album_data.get('id')
track_data['itunes_artist_id'] = selected_artist_id
if self.database.add_to_discovery_pool(track_data, source=selected_source, profile_id=profile_id):
total_tracks_added += 1
except Exception as track_error:
logger.debug(f"Error adding track to discovery pool: {track_error}")
continue
time.sleep(DELAY_BETWEEN_ALBUMS)
except Exception as album_error:
logger.warning(f"Error processing album on {selected_source}: {album_error}")
continue
if artist_idx < len(similar_artists):
time.sleep(DELAY_BETWEEN_ARTISTS)
except Exception as artist_error:
logger.warning(f"Error processing artist {similar_artist.similar_artist_name}: {artist_error}")
continue
logger.info(f"Discovery pool from similar artists complete: {total_tracks_added} tracks added")
if progress_callback:
progress_callback('success', f'Discovery pool: {total_tracks_added} tracks from {len(similar_artists)} artists')
# Note: Watchlist artist albums are already in discovery pool from the watchlist scan itself
# No need to re-fetch them here to avoid duplicate API calls
# Add tracks from random database albums for extra variety (reduced to 5 to save API calls)
logger.info("Adding tracks from database albums to discovery pool...")
try:
with self.database._get_connection() as conn:
cursor = conn.cursor()
cursor.execute("""
SELECT DISTINCT a.title, ar.name as artist_name
FROM albums a
JOIN artists ar ON a.artist_id = ar.id
ORDER BY RANDOM()
LIMIT 5
""")
db_albums = cursor.fetchall()
logger.info(f"Processing {len(db_albums)} database albums for discovery pool")
for db_idx, album_row in enumerate(db_albums, 1):
try:
query = f"{album_row['title']} {album_row['artist_name']}"
album_data = None
tracks = []
db_source = None
artist_id_for_genres = None
for source in discovery_sources:
try:
search_query = query if source != 'spotify' else f"album:{album_row['title']} artist:{album_row['artist_name']}"
search_results = self._search_albums_for_source(source, search_query, limit=1)
if not search_results:
continue
album_candidate = search_results[0]
album_data = self._get_album_data_for_source(source, album_candidate.id, album_name=album_row['title'])
if not album_data:
continue
tracks = self._extract_track_items(album_data)
if not tracks:
continue
db_source = source
if album_data.get('artists'):
artist_id_for_genres = album_data['artists'][0].get('id')
break
except Exception as e:
logger.debug(f"{source} search failed for {album_row['title']}: {e}")
if not tracks or not album_data:
continue
artist_genres = []
try:
if artist_id_for_genres:
artist_data = self._get_artist_data_for_source(db_source, artist_id_for_genres)
if artist_data and 'genres' in artist_data:
artist_genres = artist_data['genres']
except Exception as e:
logger.debug(f"Could not fetch genres for album artist: {e}")
is_new = False
try:
release_date_str = album_data.get('release_date', '')
if release_date_str and len(release_date_str) >= 10:
release_date = datetime.strptime(release_date_str[:10], "%Y-%m-%d")
is_new = (datetime.now() - release_date).days <= 30
except Exception as e:
logger.debug("album release_date parse failed: %s", e)
for track in tracks:
try:
enhanced_track = {
**track,
'album': {
'id': album_data['id'],
'name': album_row['title'],
'images': album_data.get('images', []),
'release_date': album_data.get('release_date', ''),
'album_type': album_data.get('album_type', 'album'),
'total_tracks': album_data.get('total_tracks', 0)
},
'_source': db_source
}
track_data = {
'track_name': track.get('name', 'Unknown Track'),
'artist_name': album_row['artist_name'],
'album_name': album_row['title'],
'album_cover_url': album_data.get('images', [{}])[0].get('url') if album_data.get('images') else None,
'duration_ms': track.get('duration_ms', 0),
'popularity': album_data.get('popularity', 0),
'release_date': album_data.get('release_date', ''),
'is_new_release': is_new,
'track_data_json': enhanced_track,
'artist_genres': artist_genres
}
if db_source == 'spotify':
track_data['spotify_track_id'] = track.get('id')
track_data['spotify_album_id'] = album_data.get('id')
track_data['spotify_artist_id'] = artist_id_for_genres or ''
elif db_source == 'deezer':
track_data['deezer_track_id'] = track.get('id')
track_data['deezer_album_id'] = album_data.get('id')
track_data['deezer_artist_id'] = artist_id_for_genres or ''
else:
track_data['itunes_track_id'] = track.get('id')
track_data['itunes_album_id'] = album_data.get('id')
track_data['itunes_artist_id'] = artist_id_for_genres or ''
if self.database.add_to_discovery_pool(track_data, source=db_source, profile_id=profile_id):
total_tracks_added += 1
except Exception:
continue
time.sleep(DELAY_BETWEEN_ALBUMS)
except Exception as album_error:
logger.debug(f"Error processing database album {album_row['title']}: {album_error}")
continue
# Rate limit between albums
if db_idx < len(db_albums):
time.sleep(DELAY_BETWEEN_ARTISTS)
except Exception as db_error:
logger.warning(f"Error processing database albums: {db_error}")
logger.info(f"Discovery pool population complete: {total_tracks_added} total tracks added from all sources")
# Clean up tracks older than 365 days (maintain 1 year rolling window)
logger.info("Cleaning up discovery tracks older than 365 days...")
deleted_count = self.database.cleanup_old_discovery_tracks(days_threshold=365)
logger.info(f"Cleaned up {deleted_count} old tracks from discovery pool")
# Get final track count for metadata
with self.database._get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) as count FROM discovery_pool")
final_count = cursor.fetchone()['count']
# Update timestamp to mark when pool was last populated
self.database.update_discovery_pool_timestamp(track_count=final_count, profile_id=profile_id)
logger.info(f"Discovery pool now contains {final_count} total tracks (built over time)")
# Cache recent albums for discovery page
logger.info("Caching recent albums for discovery page...")
if progress_callback:
progress_callback('phase', 'Caching recent albums...')
self.cache_discovery_recent_albums(profile_id=profile_id)
# Curate playlists for consistent daily experience
logger.info("Curating discovery playlists...")
if progress_callback:
progress_callback('phase', 'Curating playlists...')
self.curate_discovery_playlists(profile_id=profile_id)
except Exception as e:
logger.error(f"Error populating discovery pool: {e}")
import traceback
traceback.print_exc()
def update_discovery_pool_incremental(self, profile_id: int = 1):
    """
    Lightweight incremental update for discovery pool - runs every 6 hours.

    Quick check for new releases from watchlist artists only:
    - Much faster than full populate_discovery_pool (only checks watchlist, not similar artists)
    - Only fetches latest 5 releases per artist
    - Only adds tracks from releases in last 7 days
    - Respects 6-hour cooldown to avoid over-polling

    For every watchlist artist the discovery sources are tried in priority
    order; the first source that returns releases is used for that artist.
    The pool timestamp is refreshed only when at least one track was added.

    Args:
        profile_id: Profile whose watchlist is scanned and whose discovery
            pool receives the tracks.

    Returns:
        None. Failures are logged and swallowed at artist, release and track
        level so a single bad item does not abort the whole run.
    """
    try:
        from datetime import datetime, timedelta

        # Check if we should run (prevents over-polling Spotify)
        if not self.database.should_populate_discovery_pool(hours_threshold=6, profile_id=profile_id):
            logger.info("Discovery pool was updated recently (< 6 hours ago). Skipping incremental update.")
            return

        logger.info("Starting incremental discovery pool update (watchlist artists only)...")
        watchlist_artists = self.database.get_watchlist_artists(profile_id=profile_id)
        if not watchlist_artists:
            logger.info("No watchlist artists to check for incremental update")
            return

        discovery_sources = self._discovery_source_priority()
        if not discovery_sources:
            logger.warning("No discovery sources available for incremental update")
            return

        cutoff_date = datetime.now() - timedelta(days=7)  # Only last week's releases
        total_tracks_added = 0

        for artist_idx, artist in enumerate(watchlist_artists, 1):
            try:
                logger.info(f"[{artist_idx}/{len(watchlist_artists)}] Checking {artist.artist_name} for new releases...")
                selected_source = None
                selected_artist_id = None
                recent_releases = []
                artist_genres: List[str] = []

                for source in discovery_sources:
                    source_attr = self._artist_id_attribute_for_source(source)
                    stored_id = getattr(artist, source_attr, None) if source_attr else None

                    # Persist a freshly resolved ID on the watchlist entry.
                    # ``artist`` and ``source`` are bound as defaults so the
                    # callback keeps this iteration's values.
                    cache_callback = None
                    if source in ('spotify', 'itunes', 'deezer'):
                        cache_callback = (
                            lambda found_id, artist=artist, source=source:
                            self._cache_watchlist_artist_source_id(artist, source, found_id)
                        )

                    artist_id = self._resolve_artist_id_for_source(
                        source,
                        artist.artist_name,
                        stored_id=stored_id,
                        cache_callback=cache_callback,
                    )
                    if not artist_id:
                        continue

                    recent_releases = self._get_artist_albums_for_source(
                        source,
                        artist_id,
                        album_type='album,single,ep',
                        limit=5,
                        skip_cache=True,
                        max_pages=1,
                    )
                    if not recent_releases:
                        continue

                    # Genres are optional metadata; a failure here must not
                    # discard the releases we just found.
                    try:
                        artist_data = self._get_artist_data_for_source(source, artist_id)
                        if artist_data and 'genres' in artist_data:
                            artist_genres = artist_data['genres']
                    except Exception as e:
                        logger.debug(f"Could not fetch genres for {artist.artist_name} on {source}: {e}")

                    selected_source = source
                    selected_artist_id = artist_id
                    break

                if not recent_releases or not selected_source or not selected_artist_id:
                    continue

                # Any source other than spotify/deezer is stored under the itunes_* keys.
                id_prefix = selected_source if selected_source in ('spotify', 'deezer') else 'itunes'

                for release in recent_releases:
                    try:
                        # Check if release is within cutoff
                        if not self.is_album_after_timestamp(release, cutoff_date):
                            continue  # Skip older releases

                        # Get full album data with tracks
                        album_data = self._get_album_data_for_source(selected_source, release.id, album_name=release.name)
                        if not album_data or 'tracks' not in album_data:
                            continue

                        tracks = album_data['tracks'].get('items', [])
                        logger.debug(f" New release: {release.name} ({len(tracks)} tracks)")

                        # Determine if this is a new release (within last 30 days).
                        # Only the first 10 characters are parsed so ISO
                        # timestamps ("2017-12-08T08:00:00Z") are handled the
                        # same way as plain "YYYY-MM-DD" dates.
                        is_new = False
                        try:
                            release_date_str = album_data.get('release_date', '')
                            if release_date_str and len(release_date_str) >= 10:
                                release_date = datetime.strptime(release_date_str[:10], "%Y-%m-%d")
                                is_new = (datetime.now() - release_date).days <= 30
                        except Exception as e:
                            logger.debug("new-release date parse: %s", e)

                        # Add each track to discovery pool
                        for track in tracks:
                            try:
                                # Enhance track object with full album data (including album_type)
                                enhanced_track = {
                                    **track,
                                    'album': {
                                        'id': album_data['id'],
                                        'name': album_data.get('name', 'Unknown Album'),
                                        'images': album_data.get('images', []),
                                        'release_date': album_data.get('release_date', ''),
                                        'album_type': album_data.get('album_type', 'album'),
                                        'total_tracks': album_data.get('total_tracks', 0)
                                    }
                                }
                                track_data = {
                                    'track_name': track['name'],
                                    'artist_name': artist.artist_name,
                                    'album_name': album_data.get('name', 'Unknown Album'),
                                    'album_cover_url': album_data.get('images', [{}])[0].get('url') if album_data.get('images') else None,
                                    'duration_ms': track.get('duration_ms', 0),
                                    'popularity': album_data.get('popularity', 0),
                                    'release_date': album_data.get('release_date', ''),
                                    'is_new_release': is_new,
                                    'track_data_json': enhanced_track,  # Store enhanced track with full album data
                                    'artist_genres': artist_genres
                                }
                                track_data[f'{id_prefix}_track_id'] = track['id']
                                track_data[f'{id_prefix}_album_id'] = album_data['id']
                                track_data[f'{id_prefix}_artist_id'] = selected_artist_id

                                if self.database.add_to_discovery_pool(track_data, source=selected_source, profile_id=profile_id):
                                    total_tracks_added += 1
                            except Exception as track_error:
                                logger.debug(f"Error adding track to discovery pool: {track_error}")
                                continue
                    except Exception as release_error:
                        logger.warning(f"Error processing release: {release_error}")
                        continue

                # Small delay between artists
                if artist_idx < len(watchlist_artists):
                    time.sleep(DELAY_BETWEEN_ARTISTS)
            except Exception as artist_error:
                logger.warning(f"Error checking {artist.artist_name}: {artist_error}")
                continue

        logger.info(f"Incremental update complete: {total_tracks_added} new tracks added from watchlist artists")

        # Update timestamp
        if total_tracks_added > 0:
            # Get current track count
            with self.database._get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT COUNT(*) as count FROM discovery_pool")
                current_count = cursor.fetchone()['count']
            self.database.update_discovery_pool_timestamp(track_count=current_count, profile_id=profile_id)
            logger.info(f"Discovery pool now contains {current_count} total tracks")
    except Exception as e:
        logger.error(f"Error during incremental discovery pool update: {e}")
        import traceback
        traceback.print_exc()
def cache_discovery_recent_albums(self, profile_id: int = 1):
    """
    Cache recent albums from watchlist and similar artists for discover page.

    Uses the configured source priority and caches the first source that
    can return albums for each artist. Only albums released inside an
    adaptive look-back window are cached: 30 days by default, 60 days when
    the listening profile averages fewer than 5 plays/day, 21 days when it
    averages more than 20.

    The existing cache for the profile is cleared only after at least one
    discovery source is confirmed to be available, so a run that cannot
    fetch anything leaves the previous cache in place.

    Args:
        profile_id: Profile whose recent-album cache is rebuilt.

    Returns:
        None. Unexpected errors are logged and swallowed.
    """
    try:
        from datetime import datetime, timedelta

        logger.info("Caching recent albums for discover page...")

        # Resolve the sources first: clearing the cache when nothing can be
        # re-fetched would just empty the discover page.
        discovery_sources = self._discovery_source_priority()
        if not discovery_sources:
            logger.warning("No music sources available to cache recent albums")
            return

        # Clear existing cache for this profile
        self.database.clear_discovery_recent_albums(profile_id=profile_id)

        # Adaptive window based on listening velocity
        days_lookback = 30
        try:
            profile = self._get_listening_profile(profile_id)
            if profile['has_data']:
                if profile['avg_daily_plays'] < 5:
                    days_lookback = 60  # Casual listener - show more
                elif profile['avg_daily_plays'] > 20:
                    days_lookback = 21  # Heavy listener - keep it fresh
                logger.info(f"Recent albums window: {days_lookback} days (avg {profile['avg_daily_plays']:.1f} plays/day)")
        except Exception as e:
            logger.debug("listening profile lookback adjust failed: %s", e)
        cutoff_date = datetime.now() - timedelta(days=days_lookback)

        cached_count = {source: 0 for source in discovery_sources}
        albums_checked = 0
        # 'resolved': an ID was obtained for a source although none was stored.
        # 'failed': no ID could be obtained for a source.
        resolution_stats = {'resolved': 0, 'failed': 0}

        # Get artists to check (scoped to profile)
        watchlist_artists = self.database.get_watchlist_artists(profile_id=profile_id)
        # We only need a modest sample here; this path fans out into per-source album lookups.
        similar_artists = self.database.get_top_similar_artists(limit=25, profile_id=profile_id)
        logger.info(f"Checking albums from {len(watchlist_artists)} watchlist + {len(similar_artists)} similar artists")

        def process_album(album, artist_name, artist_spotify_id, artist_itunes_id, source, artist_deezer_id=None):
            """Cache a single album if its release date falls inside the window."""
            nonlocal albums_checked
            try:
                albums_checked += 1
                release_str = album.release_date if hasattr(album, 'release_date') else None
                if not release_str:
                    return False
                # Handle iTunes/Deezer ISO format (2017-12-08T08:00:00Z)
                if 'T' in release_str:
                    release_str = release_str.split('T')[0]
                if len(release_str) < 10:
                    return False
                release_date = datetime.strptime(release_str[:10], "%Y-%m-%d")
                if release_date < cutoff_date:
                    return False
                album_data = {
                    'album_spotify_id': album.id if source == 'spotify' else None,
                    'album_itunes_id': album.id if source == 'itunes' else None,
                    'album_deezer_id': album.id if source == 'deezer' else None,
                    'album_name': album.name,
                    'artist_name': artist_name,
                    'artist_spotify_id': artist_spotify_id,
                    'artist_itunes_id': artist_itunes_id,
                    'artist_deezer_id': artist_deezer_id,
                    'album_cover_url': album.image_url if hasattr(album, 'image_url') else None,
                    'release_date': release_str[:10],
                    'album_type': album.album_type if hasattr(album, 'album_type') else 'album'
                }
                if self.database.cache_discovery_recent_album(album_data, source=source, profile_id=profile_id):
                    cached_count[source] += 1
                    logger.debug(f"Cached [{source}] recent album: {album.name} by {artist_name} ({release_str})")
                    return True
            except Exception as e:
                logger.debug(f"Error processing album: {e}")
            return False

        def cache_albums_for_artist(artist, artist_name, id_attr_for_source, make_cache_callback,
                                    stored_id_attrs, log_label=''):
            """Cache albums for one artist from the first source that returns any."""
            for source in discovery_sources:
                source_attr = id_attr_for_source(source)
                stored_id = getattr(artist, source_attr, None) if source_attr else None
                artist_id = self._resolve_artist_id_for_source(
                    source,
                    artist_name,
                    stored_id=stored_id,
                    cache_callback=make_cache_callback(artist, source),
                )
                if not artist_id:
                    resolution_stats['failed'] += 1
                    continue
                if not stored_id:
                    resolution_stats['resolved'] += 1

                albums = self._get_artist_albums_for_source(
                    source,
                    artist_id,
                    album_type='album,single,ep',
                    limit=20,
                    skip_cache=True,
                    max_pages=2,
                )
                if not albums:
                    logger.debug(f"No recent albums found for {log_label}{artist_name} on {source}")
                    continue

                # Spotify uses the resolved ID; iTunes/Deezer prefer the ID
                # already stored on the record and fall back to the resolved one.
                if source == 'spotify':
                    source_artist_id = artist_id
                elif source in ('itunes', 'deezer'):
                    source_artist_id = getattr(artist, stored_id_attrs[source], None) or artist_id
                else:
                    source_artist_id = None

                if source == 'spotify':
                    spotify_id = source_artist_id
                else:
                    spotify_id = getattr(artist, stored_id_attrs['spotify'], None)

                for album in albums:
                    process_album(
                        album,
                        artist_name,
                        spotify_id,
                        source_artist_id if source == 'itunes' else None,
                        source,
                        artist_deezer_id=source_artist_id if source == 'deezer' else None,
                    )
                return

        def watchlist_callback(artist, source):
            """Build the callback that stores a resolved ID on a watchlist artist."""
            if source not in ('spotify', 'itunes', 'deezer'):
                return None
            return lambda found_id: self._cache_watchlist_artist_source_id(artist, source, found_id)

        def similar_callback(artist, source):
            """Build the callback that stores a resolved ID on a similar artist."""
            if source == 'itunes':
                similar_id = artist.id
                return lambda found_id: self.database.update_similar_artist_itunes_id(similar_id, found_id)
            if source == 'deezer':
                similar_id = artist.id
                return lambda found_id: self.database.update_similar_artist_deezer_id(similar_id, found_id)
            return None

        # Process watchlist artists
        for artist in watchlist_artists:
            cache_albums_for_artist(
                artist,
                artist.artist_name,
                self._artist_id_attribute_for_source,
                watchlist_callback,
                {'spotify': 'spotify_artist_id', 'itunes': 'itunes_artist_id', 'deezer': 'deezer_artist_id'},
            )
            time.sleep(DELAY_BETWEEN_ARTISTS)

        # Process similar artists
        for artist in similar_artists:
            cache_albums_for_artist(
                artist,
                artist.similar_artist_name,
                self._similar_artist_id_attribute_for_source,
                similar_callback,
                {
                    'spotify': 'similar_artist_spotify_id',
                    'itunes': 'similar_artist_itunes_id',
                    'deezer': 'similar_artist_deezer_id',
                },
                log_label='similar ',
            )
            time.sleep(DELAY_BETWEEN_ARTISTS)

        total_cached = sum(cached_count.values())
        logger.info(f"Cached {total_cached} recent albums from {albums_checked} albums checked")
        logger.info(f"Recent albums ID resolution stats: {resolution_stats['resolved']} resolved, {resolution_stats['failed']} failed")
    except Exception as e:
        logger.error(f"Error caching discovery recent albums: {e}")
        import traceback
        traceback.print_exc()
def _get_listening_profile(self, profile_id: int = 1) -> dict:
    """Build a listening profile from the user's play history for personalized discovery.

    The profile is built from the last 30 days of listening stats, the top
    20 artists and the genre breakdown reported by the database.

    Args:
        profile_id: Accepted for interface symmetry; the database queries
            issued here are not scoped by it.

    Returns:
        A dict that always contains the same keys:
        ``has_data``, ``top_artist_names`` (lower-cased set),
        ``artist_play_counts`` (lower-cased name -> play count),
        ``top_genres`` (lower-cased set of the first five genres),
        ``genre_weights`` (lower-cased genre -> percentage),
        ``avg_daily_plays`` and ``listening_diversity``.
        When there is no listening data, or building the profile fails,
        ``has_data`` is False and every other value is empty/zero.
    """
    def _empty_profile() -> dict:
        # Built fresh on every call so callers never share mutable containers,
        # and so both fallback paths expose the identical key set.
        return {
            'has_data': False,
            'top_artist_names': set(),
            'artist_play_counts': {},
            'top_genres': set(),
            'genre_weights': {},
            'avg_daily_plays': 0,
            'listening_diversity': 0,
        }

    try:
        stats = self.database.get_listening_stats('30d')
        total_plays = (stats.get('total_plays') or 0) if stats else 0
        if total_plays == 0:
            return _empty_profile()

        top_artists = self.database.get_top_artists('30d', 20) or []
        # Build play count lookup for artist penalty scoring; rows without a
        # name cannot be matched against anything, so they are skipped.
        artist_play_counts = {a['name'].lower(): a['play_count'] for a in top_artists if a['name']}
        top_artist_names = set(artist_play_counts)

        genre_breakdown = [g for g in (self.database.get_genre_breakdown('30d') or []) if g['genre']]
        top_genres = {g['genre'].lower() for g in genre_breakdown[:5]}
        genre_weights = {g['genre'].lower(): g['percentage'] for g in genre_breakdown}

        return {
            'has_data': True,
            'top_artist_names': top_artist_names,
            'artist_play_counts': artist_play_counts,
            'top_genres': top_genres,
            'genre_weights': genre_weights,
            'avg_daily_plays': total_plays / 30,
            'listening_diversity': stats.get('unique_artists', 0),
        }
    except Exception as e:
        logger.debug(f"Could not build listening profile: {e}")
        return _empty_profile()
def curate_discovery_playlists(self, profile_id: int = 1):
    """
    Curate consistent playlist selections that stay the same until next discovery pool update.

    Supports the discovery metadata sources in priority order and creates
    separate curated playlists for each source.
    - Release Radar: Prioritizes freshness + popularity from recent releases
    - Discovery Weekly: Balanced mix of popular picks, deep cuts, and mid-tier tracks
    - Because You Listen To: playlists seeded by the three most-played
      artists, built only when listening data exists
    Uses listening stats (if available) to personalize scoring.

    Playlists are saved as ``release_radar_<source>`` and
    ``discovery_weekly_<source>``; the first source's playlists are also
    copied to the un-suffixed keys for backward compatibility.

    Args:
        profile_id: Profile the playlists are curated for.

    Returns:
        None. Unexpected errors are logged and swallowed.
    """
    try:
        import random
        from datetime import datetime

        logger.info("Curating discovery playlists...")

        # Build listening profile for personalization
        profile = self._get_listening_profile(profile_id)
        if profile['has_data']:
            logger.info(f"Listening profile: {len(profile['top_artist_names'])} top artists, "
                        f"{len(profile['top_genres'])} top genres, "
                        f"{profile['avg_daily_plays']:.1f} avg daily plays")

        # Determine available sources
        sources_to_process = self._discovery_source_priority()
        if not sources_to_process:
            logger.warning("No discovery sources available to curate playlists")
            return

        # Pre-build artist genre cache from local DB for genre affinity scoring
        _artist_genre_cache = {}
        if profile['has_data']:
            try:
                import json as _json
                _conn = self.database._get_connection()
                try:
                    _cur = _conn.cursor()
                    _cur.execute("SELECT name, genres FROM artists WHERE genres IS NOT NULL AND genres != ''")
                    for _row in _cur.fetchall():
                        if not _row[0]:
                            continue
                        try:
                            _parsed = _json.loads(_row[1])
                            if isinstance(_parsed, list):
                                _artist_genre_cache[_row[0].lower()] = {
                                    g.lower() for g in _parsed if g and isinstance(g, str)
                                }
                        except (ValueError, TypeError):
                            # Not JSON: treat the column as a comma-separated list.
                            _artist_genre_cache[_row[0].lower()] = {
                                g.strip().lower() for g in _row[1].split(',') if g.strip()
                            }
                finally:
                    # Close even when the query or parsing fails.
                    _conn.close()
                logger.debug(f"Built genre cache for {len(_artist_genre_cache)} artists")
            except Exception as e:
                logger.debug("artist genre cache build failed: %s", e)

        # Serendipity-weighted selection within each tier
        def _serendipity_sort(tracks_list):
            """Sort by serendipity: prefer unknown artists in genres user likes."""
            if not profile['has_data']:
                random.shuffle(tracks_list)
                return tracks_list
            for t in tracks_list:
                score = 1.0
                t_artist = (t.artist_name or '').lower()
                t_genres = _artist_genre_cache.get(t_artist, set())
                # Boost artists outside the user's top artists (true discovery)
                if t_artist not in profile['top_artist_names']:
                    score += 0.5
                # Boost genres user likes but hasn't explored
                if t_genres & profile['top_genres']:
                    score += 0.3
                # Penalize artists user already plays heavily
                if profile['artist_play_counts'].get(t_artist, 0) > 10:
                    score -= 0.4
                t._serendipity = score + random.random() * 0.2  # Small random factor
            tracks_list.sort(key=lambda t: getattr(t, '_serendipity', 1.0), reverse=True)
            return tracks_list

        logger.info(f"Curating playlists for sources: {sources_to_process}")

        for source in sources_to_process:
            logger.info(f"Curating Release Radar for {source}...")

            # Any source other than spotify/deezer uses the itunes columns.
            album_id_key = {'spotify': 'album_spotify_id', 'deezer': 'album_deezer_id'}.get(source, 'album_itunes_id')
            id_prefix = source if source in ('spotify', 'deezer') else 'itunes'
            track_id_attr = {
                'spotify': 'spotify_track_id',
                'itunes': 'itunes_track_id',
                'deezer': 'deezer_track_id',
            }.get(source)

            # 1. Curate Release Radar - 50 tracks from recent albums
            recent_albums = self.database.get_discovery_recent_albums(limit=50, source=source, profile_id=profile_id)
            release_radar_tracks = []
            if not recent_albums:
                logger.warning(f"[{source.upper()}] No recent albums found for Release Radar - check cache_discovery_recent_albums()")

            if recent_albums:
                # Group albums by artist for variety
                albums_by_artist = {}
                for album in recent_albums:
                    albums_by_artist.setdefault(album['artist_name'], []).append(album)

                # Get tracks from each album
                artist_track_data = {}
                for artist, albums in albums_by_artist.items():
                    artist_track_data[artist] = []
                    for album in albums:
                        try:
                            # Get album data from the same source that won discovery
                            album_id = album.get(album_id_key)
                            if not album_id:
                                continue
                            album_data = self._get_album_data_for_source(source, album_id, album_name=album.get('album_name', ''))
                            if not album_data or 'tracks' not in album_data:
                                continue

                            # Calculate days since release for recency score
                            days_old = 14
                            try:
                                release_date_str = album.get('release_date', '')
                                if release_date_str and len(release_date_str) >= 10:
                                    release_date = datetime.strptime(release_date_str[:10], "%Y-%m-%d")
                                    days_old = (datetime.now() - release_date).days
                            except Exception as e:
                                logger.debug("release-date parse: %s", e)

                            # Album-level score components (identical for every track)
                            recency_score = max(0, 100 - (days_old * 7))
                            is_single = album.get('album_type', 'album') == 'single'
                            single_bonus = 20 if is_single else 0

                            # Personalization bonuses (from listening profile)
                            genre_bonus = 0
                            artist_bonus = 0
                            overplay_penalty = 0
                            if profile['has_data']:
                                artist_lower = artist.lower()
                                # Genre affinity: check album/API genres, then use cached DB genres
                                artist_genres_lower = {g.lower() for g in (album.get('genres') or album_data.get('genres') or [])}
                                if not artist_genres_lower:
                                    artist_genres_lower = _artist_genre_cache.get(artist_lower, set())
                                if artist_genres_lower & profile['top_genres']:
                                    genre_bonus = 10
                                # Artist familiarity: boost tracks from artists user listens to
                                if artist_lower in profile['top_artist_names']:
                                    artist_bonus = 15
                                # Overplay penalty: reduce score for artists user has heard too much
                                if profile['artist_play_counts'].get(artist_lower, 0) > 20:
                                    overplay_penalty = -10

                            album_popularity = album_data.get('popularity') or 0

                            for track in album_data['tracks'].get('items', []):
                                track_id = track.get('id')
                                if not track_id:
                                    continue

                                # A missing or null track popularity falls back to the album's.
                                popularity_score = track.get('popularity')
                                if popularity_score is None:
                                    popularity_score = album_popularity
                                # iTunes/Deezer have no popularity - use recency-based synthetic score
                                if source in ('itunes', 'deezer') and popularity_score == 0:
                                    popularity_score = max(40, 70 - days_old)

                                total_score = (recency_score * 0.45) + (popularity_score * 0.25) + single_bonus + genre_bonus + artist_bonus + overplay_penalty
                                full_track = {
                                    'id': track_id,
                                    'name': track.get('name', 'Unknown'),
                                    'artists': track.get('artists', [{'name': artist}]),
                                    'album': {
                                        'id': album_data.get('id', ''),
                                        'name': album_data.get('name', 'Unknown Album'),
                                        'images': album_data.get('images', []),
                                        'release_date': album_data.get('release_date', ''),
                                        'album_type': album_data.get('album_type', 'album'),
                                    },
                                    'duration_ms': track.get('duration_ms', 0),
                                    'popularity': popularity_score,
                                    'score': total_score,
                                    'source': source
                                }
                                artist_track_data[artist].append(full_track)
                        except Exception as e:
                            logger.debug(f"Error processing album for {artist}: {e}")
                            continue

                # Balance by artist - max 6 tracks per artist
                balanced_track_data = []
                for _artist, tracks in artist_track_data.items():
                    sorted_tracks = sorted(tracks, key=lambda t: t['score'], reverse=True)
                    balanced_track_data.extend(sorted_tracks[:6])

                # Sort by score and shuffle
                balanced_track_data.sort(key=lambda t: t['score'], reverse=True)
                top_tracks = balanced_track_data[:75]
                random.shuffle(top_tracks)

                # Take final 50 tracks
                release_radar_tracks = [track['id'] for track in top_tracks[:50]]

                # Add tracks to discovery pool
                for track_data in top_tracks[:50]:
                    try:
                        artist_name = track_data['artists'][0].get('name', 'Unknown') if track_data['artists'] else 'Unknown'
                        formatted_track = {
                            'track_name': track_data['name'],
                            'artist_name': artist_name,
                            'album_name': track_data['album'].get('name', 'Unknown'),
                            'album_cover_url': track_data['album']['images'][0]['url'] if track_data['album'].get('images') else None,
                            'duration_ms': track_data.get('duration_ms', 0),
                            'popularity': track_data.get('popularity', 0),
                            'release_date': track_data['album'].get('release_date', ''),
                            'is_new_release': True,
                            'track_data_json': track_data,
                            'artist_genres': []
                        }
                        formatted_track[f'{id_prefix}_track_id'] = track_data['id']
                        formatted_track[f'{id_prefix}_album_id'] = track_data['album'].get('id', '')
                        self.database.add_to_discovery_pool(formatted_track, source=source, profile_id=profile_id)
                    except Exception as e:
                        # Best effort: the playlist itself is still saved below.
                        logger.debug(f"Error adding Release Radar track to discovery pool: {e}")
                        continue

            # Save with source suffix for multi-source support
            playlist_key = f'release_radar_{source}'
            self.database.save_curated_playlist(playlist_key, release_radar_tracks, profile_id=profile_id)
            logger.info(f"Release Radar ({source}) curated: {len(release_radar_tracks)} tracks")

            # 2. Curate Discovery Weekly - 50 tracks from discovery pool
            logger.info(f"Curating Discovery Weekly for {source}...")
            discovery_tracks = self.database.get_discovery_pool_tracks(limit=2000, new_releases_only=False, source=source, profile_id=profile_id)
            if not discovery_tracks:
                logger.warning(f"[{source.upper()}] No discovery pool tracks found for Discovery Weekly - check populate_discovery_pool()")

            discovery_weekly_tracks = []
            if discovery_tracks:
                # Separate tracks by popularity tiers
                popular_picks = []
                balanced_mix = []
                deep_cuts = []
                for track in discovery_tracks:
                    # Missing or null popularity is treated as mid-tier (50)
                    # instead of raising on the comparison below.
                    popularity = getattr(track, 'popularity', None)
                    if popularity is None:
                        popularity = 50
                    if popularity >= 60:
                        popular_picks.append(track)
                    elif popularity >= 40:
                        balanced_mix.append(track)
                    else:
                        deep_cuts.append(track)
                logger.info(f"Discovery pool ({source}): {len(popular_picks)} popular, {len(balanced_mix)} mid-tier, {len(deep_cuts)} deep cuts")

                _serendipity_sort(popular_picks)
                _serendipity_sort(balanced_mix)
                _serendipity_sort(deep_cuts)

                selected_tracks = []
                selected_tracks.extend(popular_picks[:20])
                selected_tracks.extend(balanced_mix[:20])
                selected_tracks.extend(deep_cuts[:10])
                random.shuffle(selected_tracks)

                # Extract appropriate track IDs based on source
                if track_id_attr:
                    for track in selected_tracks:
                        source_track_id = getattr(track, track_id_attr, None)
                        if source_track_id:
                            discovery_weekly_tracks.append(source_track_id)

            playlist_key = f'discovery_weekly_{source}'
            self.database.save_curated_playlist(playlist_key, discovery_weekly_tracks, profile_id=profile_id)
            logger.info(f"Discovery Weekly ({source}) curated: {len(discovery_weekly_tracks)} tracks")

        # 3. "Because You Listen To" - personalized sections based on top played artists
        if profile['has_data']:
            logger.info("Building 'Because You Listen To' playlists...")
            top_played = self.database.get_top_artists('30d', 3)

            # Use the first source (in priority order) that has pool tracks.
            active_source_for_bylt = None
            all_pool_tracks = []
            for candidate_source in sources_to_process:
                all_pool_tracks = self.database.get_discovery_pool_tracks(
                    limit=2000, new_releases_only=False,
                    source=candidate_source, profile_id=profile_id
                )
                if all_pool_tracks:
                    active_source_for_bylt = candidate_source
                    break
            if not active_source_for_bylt:
                logger.warning("No discovery pool tracks found for Because You Listen To")
                all_pool_tracks = []

            bylt_track_id_attr = {
                'spotify': 'spotify_track_id',
                'itunes': 'itunes_track_id',
                'deezer': 'deezer_track_id',
            }.get(active_source_for_bylt)

            # Build source_artist_id -> artist_name mapping from watchlist
            _wa_id_to_name = {}
            try:
                _wa_list = self.database.get_watchlist_artists(profile_id=profile_id)
                for _wa in _wa_list:
                    _wa_id_to_name[str(_wa.id)] = (_wa.artist_name or '').lower()
            except Exception as e:
                logger.debug("watchlist artist id-to-name map failed: %s", e)

            all_similar = self.database.get_top_similar_artists(limit=200, profile_id=profile_id)

            for i, played_artist in enumerate(top_played):
                try:
                    artist_name = played_artist['name']
                    artist_lower = artist_name.lower()

                    # Find similar artists to this played artist via the similar_artists table
                    similar_names = set()
                    for s in all_similar:
                        # Check if this similar artist's source matches our played artist
                        src_id = str(getattr(s, 'source_artist_id', ''))
                        src_name = _wa_id_to_name.get(src_id, '')
                        sim_name = getattr(s, 'similar_artist_name', '') or ''
                        if src_name == artist_lower and sim_name:
                            similar_names.add(sim_name.lower())

                    if not similar_names:
                        # Fallback: find pool tracks from same genre
                        played_genres = _artist_genre_cache.get(artist_lower, set())
                        if played_genres:
                            for t in all_pool_tracks:
                                t_artist_lower = (t.artist_name or '').lower()
                                if t_artist_lower != artist_lower and _artist_genre_cache.get(t_artist_lower, set()) & played_genres:
                                    similar_names.add(t_artist_lower)
                                if len(similar_names) >= 20:
                                    break

                    if not similar_names:
                        continue

                    # Pick tracks from those similar artists in the pool
                    matching_tracks = []
                    for t in all_pool_tracks:
                        if (t.artist_name or '').lower() in similar_names and bylt_track_id_attr:
                            source_track_id = getattr(t, bylt_track_id_attr, None)
                            if source_track_id:
                                matching_tracks.append(source_track_id)
                        if len(matching_tracks) >= 15:
                            break

                    if matching_tracks:
                        random.shuffle(matching_tracks)
                        playlist_key = f'because_you_listen_to_{i}'
                        self.database.save_curated_playlist(playlist_key, matching_tracks[:10], profile_id=profile_id)
                        # Store the source artist name in metadata
                        self.database.set_metadata(f'bylt_artist_{i}', artist_name)
                        logger.info(f"'Because You Listen To {artist_name}': {len(matching_tracks[:10])} tracks")
                except Exception as e:
                    logger.debug(f"Error building BYLT for {played_artist.get('name', '?')}: {e}")

        # Also save without suffix for backward compatibility (use first active source).
        active_source = sources_to_process[0]
        release_radar_key = f'release_radar_{active_source}'
        discovery_weekly_key = f'discovery_weekly_{active_source}'

        # Copy active source playlists to non-suffixed keys
        release_radar_ids = self.database.get_curated_playlist(release_radar_key, profile_id=profile_id) or []
        discovery_weekly_ids = self.database.get_curated_playlist(discovery_weekly_key, profile_id=profile_id) or []
        self.database.save_curated_playlist('release_radar', release_radar_ids, profile_id=profile_id)
        self.database.save_curated_playlist('discovery_weekly', discovery_weekly_ids, profile_id=profile_id)

        logger.info("Playlist curation complete")
    except Exception as e:
        logger.error(f"Error curating discovery playlists: {e}")
        import traceback
        traceback.print_exc()
def sync_spotify_library_cache(self, profile_id=1):
    """Sync user's saved Spotify albums into the local cache.

    A full sync (``since_timestamp=None``) is performed when no previous sync
    is recorded, when no full sync is recorded, when the last full sync is
    more than 7 days old, or when its stored timestamp cannot be parsed.
    Otherwise only albums saved since the last sync are requested.  After a
    full sync that returned albums, cached albums that are absent from the
    fetched set are removed.

    Args:
        profile_id: Profile whose cached library rows are upserted / pruned.

    Returns:
        None.  Any exception raised while syncing is logged and swallowed.

    NOTE(review): the sync timestamps are stored under global metadata keys
    (no profile_id is passed to get_metadata/set_metadata) — confirm that is
    intended for multi-profile setups.
    """
    if not self._spotify_available_for_run():
        logger.debug("Spotify not authenticated, skipping library cache sync")
        return

    logger.info("Syncing Spotify library cache...")
    try:
        last_sync = self.database.get_metadata('spotify_library_last_sync')
        last_full_sync = self.database.get_metadata('spotify_library_last_full_sync')

        # Determine if we need a full sync (first time or every 7 days)
        do_full_sync = False
        if not last_sync:
            do_full_sync = True
            logger.info("First-time Spotify library sync — fetching all saved albums")
        elif not last_full_sync:
            # last_sync exists but last_full_sync doesn't — first run with this code
            do_full_sync = True
            logger.info("Full re-sync triggered (no full sync recorded)")
        else:
            try:
                last_full_dt = datetime.fromisoformat(last_full_sync)
                if datetime.now() - last_full_dt > timedelta(days=7):
                    do_full_sync = True
                    logger.info("Full re-sync triggered (>7 days since last full sync)")
            except (ValueError, TypeError):
                # Unparseable (or tz-aware) stored value: fall back to a full sync.
                do_full_sync = True

        # Stamp the run BEFORE talking to Spotify.  If the timestamp were taken
        # after the fetch, an album saved while the fetch was in flight would
        # fall before the recorded last_sync and be skipped by the next
        # incremental sync.  Re-fetching a few albums is harmless (upsert).
        sync_started_at = datetime.now().isoformat()

        # Fetch albums from Spotify
        since_timestamp = None if do_full_sync else last_sync
        # Normalise a None result to an empty list so the len() below cannot fail.
        albums = self.spotify_client.get_saved_albums(since_timestamp=since_timestamp) or []

        if not albums and not do_full_sync:
            logger.info("No new saved albums since last sync")
            return

        if albums:
            self.database.upsert_spotify_library_albums(albums, profile_id=profile_id)

        # On full sync, remove albums that are no longer saved.  Skipped when
        # nothing was fetched, so an empty response never wipes the cache and
        # the full sync is retried on the next run.
        if do_full_sync and albums:
            fetched_ids = {a['spotify_album_id'] for a in albums}
            self.database.remove_spotify_library_albums_not_in(fetched_ids, profile_id=profile_id)
            self.database.set_metadata('spotify_library_last_full_sync', sync_started_at)

        # Update last sync timestamp
        self.database.set_metadata('spotify_library_last_sync', sync_started_at)
        logger.info(f"Spotify library cache sync complete — {len(albums)} albums processed")
    except Exception as e:
        logger.error(f"Error syncing Spotify library cache: {e}")
def _populate_seasonal_content(self):
    """
    Refresh seasonal discovery content as one step of the watchlist scan.

    - The current season (if the seasonal service reports one) is populated
      and curated when the service says it is stale at a 7-day threshold.
    - Every other season listed in SEASONAL_CONFIG is populated and curated
      when stale at a 14-day threshold.

    Any exception is logged, its traceback printed, and then swallowed.
    """
    try:
        from core.seasonal_discovery import get_seasonal_discovery_service

        logger.info("Checking seasonal content update...")
        service = get_seasonal_discovery_service(self.spotify_client, self.database)

        def _refresh(season, announcement):
            # Populate the season's content, then build its playlist.
            logger.info(announcement)
            service.populate_seasonal_content(season)
            service.curate_seasonal_playlist(season)

        # The season we are in right now gets the tighter staleness window.
        active_season = service.get_current_season()
        if active_season:
            if service.should_populate_seasonal_content(active_season, days_threshold=7):
                _refresh(active_season, f"Populating current season: {active_season}")
            else:
                logger.info(f"Current season '{active_season}' is up to date")

        # Remaining seasons are refreshed less often (14-day window).
        from core.seasonal_discovery import SEASONAL_CONFIG
        for season in SEASONAL_CONFIG.keys():
            # The active season was already dealt with above.
            if season != active_season and service.should_populate_seasonal_content(
                season, days_threshold=14
            ):
                _refresh(season, f"Populating season: {season}")

        logger.info("Seasonal content update complete")
    except Exception as exc:
        logger.error(f"Error populating seasonal content: {exc}")
        import traceback
        traceback.print_exc()
def _generate_lastfm_radio_playlists(self):
    """Generate Last.fm Radio playlists from the user's top 3 most-played tracks.

    Runs at most once per week (throttled via config key
    'lastfm_radio.last_generated').  The throttle timestamp is only advanced
    when at least one playlist was saved, so a run that produced nothing is
    retried on the next scan.  An unreadable throttle timestamp is ignored
    and generation proceeds.

    Requires a Last.fm API key ('lastfm.api_key') to be configured.
    Playlists are saved through ListenBrainzManager.save_lastfm_radio_playlist.

    Returns:
        None.  Per-track failures are logged as warnings and skipped; any
        other exception is logged, its traceback printed, and swallowed.
    """
    try:
        from datetime import datetime, timedelta
        from config.settings import config_manager
        from database.music_database import get_database

        # Weekly throttle
        last_generated_str = config_manager.get('lastfm_radio.last_generated', '')
        if last_generated_str:
            try:
                last_generated = datetime.fromisoformat(last_generated_str)
                if datetime.now() - last_generated < timedelta(days=7):
                    logger.info("Last.fm radio: skipping — generated within the last 7 days")
                    return
            except (ValueError, TypeError):
                # ValueError: malformed string.  TypeError: non-string value or a
                # tz-aware timestamp subtracted from naive now().  Either way the
                # stored value is unusable, so proceed rather than abort the run.
                logger.debug(
                    "Last.fm radio: ignoring unreadable throttle timestamp %r",
                    last_generated_str,
                )

        # Require Last.fm API key
        api_key = config_manager.get('lastfm.api_key', '')
        if not api_key:
            logger.info("Last.fm radio: skipping — no API key configured")
            return

        # Get top 3 most-played tracks over the last 30 days
        db = get_database()
        top_tracks = db.get_top_tracks(time_range='30d', limit=3)
        if not top_tracks:
            logger.info("Last.fm radio: skipping — no listening history found")
            return

        logger.info(f"Last.fm radio: generating playlists for {len(top_tracks)} top tracks")

        from core.lastfm_client import LastFMClient
        from core.listenbrainz_manager import ListenBrainzManager

        client = LastFMClient(api_key=api_key)
        # Use profile_id=1 as a sensible default; the scanner runs globally
        lb_manager = ListenBrainzManager(str(db.database_path), profile_id=1)

        generated = 0
        for track in top_tracks:
            track_name = track.get('name', '')
            artist_name = track.get('artist', '')
            if not track_name or not artist_name:
                continue

            # One failing seed track must not stop the remaining ones.
            try:
                similar = client.get_similar_tracks(artist_name, track_name, limit=25)
                if not similar:
                    logger.info(f"Last.fm radio: no similar tracks for '{artist_name} - {track_name}'")
                    continue

                playlist_mbid = lb_manager.save_lastfm_radio_playlist(track_name, artist_name, similar)
                logger.info(
                    f"Last.fm radio: saved '{track_name}' by '{artist_name}' "
                    f"{playlist_mbid} ({len(similar)} tracks)"
                )
                generated += 1
            except Exception as track_err:
                logger.warning(f"Last.fm radio: error processing '{track_name}': {track_err}")

        # Only start the 7-day cool-down once something was actually produced.
        if generated > 0:
            config_manager.set('lastfm_radio.last_generated', datetime.now().isoformat())
            logger.info(f"Last.fm radio: generated {generated} playlists, throttle updated")

    except Exception as e:
        logger.error(f"Error in _generate_lastfm_radio_playlists: {e}")
        import traceback
        traceback.print_exc()
# Module-level singleton; stays None until get_watchlist_scanner() first builds it
_watchlist_scanner_instance = None


def get_watchlist_scanner(spotify_client: SpotifyClient) -> WatchlistScanner:
    """Return the shared WatchlistScanner, constructing it on first use.

    Args:
        spotify_client: Handed to the WatchlistScanner constructor on the
            first call only; later calls return the existing instance and
            do not use this argument.

    Returns:
        The module-wide WatchlistScanner instance.
    """
    global _watchlist_scanner_instance
    # Fast path: the scanner already exists.
    if _watchlist_scanner_instance is not None:
        return _watchlist_scanner_instance
    _watchlist_scanner_instance = WatchlistScanner(spotify_client)
    return _watchlist_scanner_instance